In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import f_regression
from sklearn.preprocessing import Imputer
from sklearn.pipeline import Pipeline

rent_data = pd.read_csv('https://ndownloader.figshare.com/files/7586326', sep=',')

# NOTE REMOVE ALL ROWS FOR WHICH TENURE 2 (sc116) IS == 9 & n== 3 (OWNER OCCUPIED AND OCCUPY RENT FREE) 
# These will likely not contribute to any useful information about rent prices
rent_data = rent_data[rent_data['sc116'] == 2]

# NOTE DELETING IN REVERSE ORDER
# household income specifics
rent_data = rent_data.drop(rent_data.ix[:, 'uf52h_h':'uf52h_g'], axis = 1)
# household member attributes
rent_data = rent_data.drop(rent_data.ix[:, 'uf30': 'hflag13'], axis = 1)
#household income attributes
rent_data = rent_data.drop(rent_data.ix[:, 'tot_per': 'uf40a'], axis = 1)
# ethnicity, sex, children
rent_data = rent_data.drop(rent_data.ix[:, 'sc27': 'rec_race_c'], axis = 1)
# smells, water leakage,(Specific to an apartment currently rented) phone line, social security 
rent_data = rent_data.drop(rent_data.ix[:, 'sc548': 'uf54'], axis = 1)
# related to current condition of rented apartment
rent_data = rent_data.drop(rent_data.ix[:, 'sc188': 'sc194'], axis = 1)
# assistance and amount of assistance 1
rent_data = rent_data.drop(rent_data.ix[:, 'sc181': 'uf17a'], axis = 1)
# assistance and amount of assistance 2
rent_data = rent_data.drop(rent_data.ix[:, 'sc174': 'uf64'], axis = 1)
# cost of seperate fuels such as gas, eletric. Note: the indicator variables of seperate utility charges remain
# (These may be valuable for the pricing and may hold most of the information)
rent_data = rent_data.drop(['uf12', 'uf13', 'uf14', 'uf15','uf16'], axis = 1)
# related to mortgage of building and interest rate
rent_data = rent_data.drop(rent_data.ix[:, 'sc127': 'uf7a'], axis = 1)
# householder sex, householder details
rent_data = rent_data.drop(rent_data.ix[:, 'hhr2': 'uf10'], axis = 1)
# remove first column
rent_data = rent_data.drop(['recid'], axis = 1)
#owner in building
rent_data = rent_data.drop(['sc147'], axis = 1)

y = rent_data.ix[:,'uf17'].values

# for c in rent_data.columns:
#     print("---- %s ---")
#     print(rent_data[c].value_counts())

# Look through current features and replace missing values with np.nan
# NOTE: FOR UF1_1 TO UF1_22 I ONLY REMOVED "RESPONSE NOT RECORDED" AND KEPT 
# "CONDITION NOT REPORTED" TO AVOID ONLY 1 CATEGORY AND LOSS OF INFORMATION
rent_data.ix[:, 'uf1_1':'uf1_22'] = rent_data.ix[:,'uf1_1':'uf1_22'].replace([9], np.nan)
rent_data['sc23'] = rent_data['sc23'].replace([8], np.nan)
rent_data['sc24'] = rent_data['sc24'].replace([8], np.nan)
rent_data['sc36'] = rent_data['sc36'].replace([8], np.nan)
rent_data['sc37'] = rent_data['sc37'].replace([8], np.nan)
rent_data['sc38'] = rent_data['sc38'].replace([8], np.nan)
rent_data['sc173'] = rent_data['sc173'].replace([8,3], np.nan)
rent_data['sc171'] = rent_data['sc171'].replace([8,3], np.nan)
rent_data['sc154'] = rent_data['sc154'].replace([8,9], np.nan)
rent_data['sc156'] = rent_data['sc156'].replace([9], np.nan)
rent_data['sc157'] = rent_data['sc157'].replace([8,9], np.nan)
rent_data['sc185'] = rent_data['sc185'].replace([8], np.nan)
rent_data['sc186'] = rent_data['sc186'].replace([8], np.nan)
rent_data['sc197'] = rent_data['sc197'].replace([4,8], np.nan)
rent_data['sc198'] = rent_data['sc198'].replace([8], np.nan)
rent_data['sc187'] = rent_data['sc187'].replace([8], np.nan)
rent_data['sc196'] = rent_data['sc196'].replace([8], np.nan)
rent_data['rec15'] = rent_data['rec15'].replace([10,11,12], np.nan)
rent_data['rec21'] = rent_data['rec21'].replace([8], np.nan)
rent_data['rec54'] = rent_data['rec54'].replace([7], np.nan)
rent_data['rec53'] = rent_data['sc196'].replace([9], np.nan)
# Replace all 2's in hflag to 1 (since 2 is the default value of 1)
rent_data.ix[:,'hflag6':] = rent_data.ix[:,'hflag6':].replace([2], [1])

rent_data = rent_data.drop('uf17', 1)
# Impute missing data using the most frequent
rent_data = pd.DataFrame(Imputer(missing_values = np.nan, strategy = "most_frequent", axis = 0).fit_transform(rent_data))


# rent_data = rent_data.drop('uf17', 1)

# HANDLE CATEGORICAL DATA
rent_dummies = pd.get_dummies(rent_data)

X = rent_dummies.values



X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)
estimators = [('scaler' , MinMaxScaler()) , ('clf', LassoCV())]
pipeline = Pipeline(estimators)
scores = cross_val_score(pipeline, X_train, y_train, cv=10)
mean = np.mean(scores)
# print(mean)
pipeline.fit(X_train, y_train)
accuracy = pipeline.score(X_test, y_test)
# print(accuracy)

        

def predict_rent():
    """that returns your test data, the true labels and your predicted labels (all as numpy arrays).
    
    Returns: test data, the true labels, and the predicted labels (all as numpy arrays).
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)
    estimators = [('scaler' , MinMaxScaler()) ,('clf', RidgeCV())]
    pipeline = Pipeline(estimators)
    fit = pipeline.fit(X_train, y_train)
    prediction = pipeline.steps[1][1].predict(X_test)
    return X_train, y_test , np.array(prediction)

def score_rent():
    """Returns the R^2 of our machine learning model
    
    Args:
        
    Returns:
       R^2 and a function 
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)
    estimators = [('scaler' , MinMaxScaler()) ,('clf', ElasticNetCV())]
    pipeline = Pipeline(estimators)
    fit = pipeline.fit(X_train, y_train)
    accuracy = pipeline.score(X_test, y_test)
    return accuracy

[  772.74848695  1129.40733536  2406.62408428 ...,  1735.14426575
  1099.26266357  1543.89009709]
