# Preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_selection import VarianceThreshold, SelectKBest, SelectPercentile, SelectFromModel, f_regression, RFE, RFECV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, Lasso
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import r2_score  # r2_score lives in sklearn.metrics, not model_selection
import numpy as np
import pickle
import csv

# Seed NumPy's legacy global RNG.
# NOTE(review): none of the cells shown here draw from it (the train/test
# split passes its own random_state), so this looks like a precaution rather
# than something the outputs below depend on.
np.random.seed(42)

In [2]:
# Load the prepared training table; the 'Id' column becomes the row index
# so it is never treated as a feature.
train = pd.read_csv(
    filepath_or_buffer='../datasets/final_train_2.csv',
    index_col='Id',
)

In [3]:
# Quick look at the first rows to confirm the load and the 'Id' index.
train.head()

Unnamed: 0_level_0,ms_subclass,lot_area,year_built,year_remod/add,bsmtfin_sf_1,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,1st_flr_sf,2nd_flr_sf,...,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_WD,overall_qual,overall_cond
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
109,60,13517,1976,2005,533.0,0.0,192.0,725.0,725,754,...,0,0,0,0,0,0,0,1,6,8
544,60,11492,1996,1997,637.0,0.0,276.0,913.0,913,1209,...,0,0,0,0,0,0,0,1,7,5
153,20,7922,1953,2007,731.0,0.0,326.0,1057.0,1057,0,...,0,0,0,0,0,0,0,1,5,7
318,60,9802,2006,2007,0.0,0.0,384.0,384.0,744,700,...,0,0,0,0,0,0,0,1,5,5
255,50,14235,1900,1993,0.0,0.0,676.0,676.0,831,614,...,0,0,0,0,0,0,0,1,6,8


In [4]:
# Separate the regression target from the predictors.
y = train['saleprice']
X = train.drop(columns='saleprice')

In [5]:
# Hold out part of the data for evaluation. No test_size is given, so the
# library default applies (the shapes shown further down are 1537 / 513 rows).
# random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=24,
)

### Variance Threshold

In [6]:
# Feature selector that discards columns whose variance is too low to be
# informative. The cutoff is applied to the raw (unscaled) columns.
# NOTE(review): for a 0/1 dummy column, variance is p * (1 - p), so 0.03
# roughly means "fewer than ~3% of rows in the minority class" -- confirm
# this is the intended reading of the cutoff.
threshold = VarianceThreshold(threshold=0.03)

In [7]:
# Fit the selector on the training split only (so the held-out rows do not
# influence which columns are kept) and drop the low-variance columns.
X_train_threshold = threshold.fit_transform(X_train)

In [8]:
# (rows, retained columns) after variance filtering.
X_train_threshold.shape 

(1537, 151)

In [9]:
# Apply the column selection learned on the training split to the held-out
# split; transform() does not refit.
X_test_threshold = threshold.transform(X_test)

In [10]:
# Boolean mask over the input columns: True where the column was kept.
threshold.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False, False,  True, False, False,  True,
        True, False, False,  True,  True, False,  True, False, False,
        True,  True,  True, False,  True, False, False, False,  True,
        True,  True, False,  True,  True,  True, False, False, False,
       False,  True, False,  True,  True,  True,  True, False, False,
        True, False, False,  True,  True, False,  True, False,  True,
        True, False,  True,  True,  True, False, False, False,  True,
        True,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False,  True, False,
        True,  True,  True,  True, False,  True, False, False,  True,
       False,  True, False,  True, False,  True, False, False, False,
       False, False,

In [11]:
# Labels of the columns that survived the variance filter. They are kept so
# the same column subset can be mirrored onto the test data later.
included = X.columns[threshold.get_support(indices=True)]
included

Index(['ms_subclass', 'lot_area', 'year_built', 'year_remod/add',
       'bsmtfin_sf_1', 'bsmtfin_sf_2', 'bsmt_unf_sf', 'total_bsmt_sf',
       '1st_flr_sf', '2nd_flr_sf',
       ...
       'fence_GdPrv', 'fence_GdWo', 'fence_MnPrv', 'fence_NF',
       'misc_feature_NoFeat', 'sale_type_COD', 'sale_type_New',
       'sale_type_WD ', 'overall_qual', 'overall_cond'],
      dtype='object', length=151)

In [12]:
# Scaler used to standardize the retained features; it is fitted in the
# next cell and pickled at the end of the notebook.
ss = StandardScaler()

In [13]:
# Learn the scaling statistics from the training split only, then scale it.
X_train_threshold_sc = ss.fit_transform(X_train_threshold)

In [14]:
# Scale the held-out split with the statistics fitted on the training split
# (transform only, no refit).
X_test_threshold_sc = ss.transform(X_test_threshold)

In [15]:
# Sanity check: filtered training matrix, (rows, retained columns).
X_train_threshold.shape

(1537, 151)

In [16]:
# Sanity check: held-out split before filtering, for comparison with the
# filtered width shown in the next cell.
X_test.shape

(513, 295)

In [17]:
# Sanity check: held-out split after filtering; the column count should
# match the filtered training matrix.
X_test_threshold.shape

(513, 151)

In [18]:
# Sanity check: scaling must not change the training matrix's shape.
X_train_threshold_sc.shape

(1537, 151)

In [19]:
# Sanity check: scaling must not change the held-out matrix's shape.
X_test_threshold_sc.shape

(513, 151)

### Export data and scaler for use in future notebooks

In [20]:
# Write the scaled training features as a header-less CSV: one line per row
# of the array, one field per retained feature.
# newline='' is what the csv module requires of the file object it writes to.
# Without it, platforms that translate '\n' on write (e.g. Windows) produce
# '\r\r\n' line endings and the file reads back with a blank row between
# records. Plain 'w' is enough because the file is never read back here.
with open('../datasets/X_train_threshold_2_sc.csv', 'w', newline='') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerows(X_train_threshold_sc)

In [21]:
# Write the scaled held-out features as a header-less CSV: one line per row
# of the array, one field per retained feature.
# newline='' is what the csv module requires of the file object it writes to.
# Without it, platforms that translate '\n' on write (e.g. Windows) produce
# '\r\r\n' line endings and the file reads back with a blank row between
# records. Plain 'w' is enough because the file is never read back here.
with open('../datasets/X_test_threshold_2_sc.csv', 'w', newline='') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerows(X_test_threshold_sc)

In [22]:
# Save the training targets as bare values: no index column, no header row.
y_train.to_csv(
    '../datasets/y_train_2.csv',
    header=False,
    index=False,
)

In [23]:
# Save the held-out targets as bare values: no index column, no header row.
y_test.to_csv(
    '../datasets/y_test_2.csv',
    header=False,
    index=False,
)

In [24]:
# Persist the fitted scaler so later notebooks can apply the same scaling
# without refitting.
# NOTE(review): a pickled estimator is normally only safe to load with the
# library version that wrote it -- confirm the consuming notebooks share
# this environment.
with open('../assets/scaler.pkl', 'wb+') as scaler_file:
    pickle.dump(ss, scaler_file)

### Save out column labels to align data properly (between train/test)

In [25]:
# Persist the retained column labels as a plain Python list, so that
# unpickling them does not require a pandas Index object.
with open('../assets/columns.pkl', 'wb+') as columns_file:
    pickle.dump(included.tolist(), columns_file)