In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import r2_score
import os

In [3]:
# Resolve the relative dataset location to an absolute path, then read it.
test_csv_path = os.path.abspath('./datasets/test.csv')
test = pd.read_csv(test_csv_path)
# sample = pd.read_csv(os.path.abspath('./datasets/sample_sub_reg.csv'))

In [4]:
# Lower-case every column name and swap spaces for underscores in one pass.
test.columns = test.columns.str.lower().str.replace(' ', '_')

In [5]:
# Remove the `pid` column; rebinding `test` avoids the in-place mutation.
test = test.drop(columns=['pid'])

In [6]:
def find_outliers(num_list):
    """Return the values lying more than three standard deviations from the mean.

    Parameters
    ----------
    num_list : object exposing ``.mean()``, ``.std()`` and iteration
        In this notebook it is always a pandas Series.

    Returns
    -------
    list
        The offending values, in their original order.
    """
    center = num_list.mean()
    spread = num_list.std()
    upper_bound = center + (spread * 3)
    lower_bound = center - (spread * 3)

    flagged = []
    for value in num_list:
        # NaN compares False on both sides, so it is never flagged.
        if value > upper_bound or value < lower_bound:
            flagged.append(value)
    return flagged

# Show the `gr_liv_area` values more than three standard deviations from the column mean.
find_outliers(test['gr_liv_area'])

[3627, 4476, 3112, 3447, 3086, 4316, 3194]

In [7]:
# Sanity check: list any garage build years later than 2010.
test.loc[test['garage_yr_blt'] > 2010, 'garage_yr_blt']

Series([], Name: garage_yr_blt, dtype: float64)

In [8]:
# Show the `wood_deck_sf` values more than three standard deviations from the column mean.
find_outliers(test['wood_deck_sf'])

[574, 501, 483, 670, 690, 467, 468, 486, 490, 474, 502, 460, 486, 511]

In [9]:
# Treat a bedroom count of 0 as missing and fill it with the mean bedroom count
# of rows that share the same `totrms_abvgrd` value.
# - np.nan (rather than None) keeps the column numeric, so the group mean is
#   well defined.
# - The result is assigned back instead of calling `replace(..., inplace=True)`
#   on a column selection, which does not update `test` under pandas
#   Copy-on-Write.
bedrooms = test['bedroom_abvgr'].replace(0, np.nan)
test['bedroom_abvgr'] = bedrooms.groupby(test['totrms_abvgrd']).transform(
    lambda grp: grp.fillna(grp.mean()))


In [10]:
# Display rows reporting zero full baths both above grade and in the basement.
no_full_bath = (test['bsmt_full_bath'] == 0) & (test['full_bath'] == 0)
inspect_cols = ['ms_subclass', 'gr_liv_area', 'bedroom_abvgr',
                'half_bath', 'bsmt_full_bath']
test.loc[no_full_bath, inspect_cols]

Unnamed: 0,ms_subclass,gr_liv_area,bedroom_abvgr,half_bath,bsmt_full_bath


In [11]:
# Columns described by the original author as not applicable to most houses.
sparse_cols = ['pool_qc', 'pool_area', 'alley', 'fence',
               'misc_feature', 'misc_val']

# Columns described by the original author as duplicating another column.
# NOTE(review): the original inline comments paired these with "exter qual",
# "garage area" and "garage qual" in an order that looks shifted by one;
# presumably garage_cond ~ garage_qual, exter_cond ~ exter_qual and
# garage_area ~ garage_cars — confirm.
duplicate_cols = ['garage_cond', 'exter_cond', 'garage_area']

test = test.drop(columns=sparse_cols + duplicate_cols)

In [12]:
def get_column_names(df, word):
    """Return the column names of ``df`` that contain ``word``, ignoring case.

    Parameters
    ----------
    df : object with a ``columns`` iterable of strings (e.g. a DataFrame)
    word : str
        Substring to look for.

    Returns
    -------
    list of str
        Matching column names in their original order and original casing.
    """
    matches = []
    for name in df.columns:
        if word.lower() in name.lower():
            matches.append(name)
    return matches

# Collect the basement-related and garage-related column names in one pass.
bsmt_cols, gar_cols = (get_column_names(test, keyword)
                       for keyword in ('bsmt', 'garage'))

In [13]:
# Deck and porch columns: any column whose name contains 'porch' or 'deck'.
deck_porch_cols = [
    name for name in test.columns
    if any(token in name for token in ('porch', 'deck'))
]

In [14]:
# Replace every missing value in the basement columns with 0.
test[bsmt_cols] = test[bsmt_cols].fillna(0)

In [15]:
# Display rows where fewer than two of the garage columns hold a value.
mostly_missing_garage = test[gar_cols].notna().sum(axis=1) < 2
test.loc[mostly_missing_garage, gar_cols]

Unnamed: 0,garage_type,garage_yr_blt,garage_finish,garage_cars,garage_qual
29,,,,0,
45,,,,0,
66,,,,0,
68,,,,0,
105,,,,0,
109,,,,0,
113,,,,0,
144,,,,0,
152,,,,0,
156,,,,0,


In [16]:
# Replace every missing value in the garage columns with 0.
test[gar_cols] = test[gar_cols].fillna(0)

In [17]:
# Fill missing fireplace counts and quality ratings with 0.
# Assigned back rather than `fillna(..., inplace=True)` on a column selection,
# which does not update `test` under pandas Copy-on-Write.
for col in ('fireplaces', 'fireplace_qu'):
    test[col] = test[col].fillna(0)

In [18]:
# Fill missing lot frontage with the mean frontage of the same `ms_subclass`.
frontage_by_subclass = test.groupby('ms_subclass')['lot_frontage']
test['lot_frontage'] = frontage_by_subclass.transform(
    lambda grp: grp.fillna(grp.mean()))

In [19]:
# Count the `lot_frontage` values that are still missing.
test['lot_frontage'].isna().sum()

0

In [20]:
# Fill missing masonry-veneer type and area with 0.
# Assigned back rather than `fillna(..., inplace=True)` on a column selection,
# which does not update `test` under pandas Copy-on-Write.
for col in ('mas_vnr_type', 'mas_vnr_area'):
    test[col] = test[col].fillna(0)

In [21]:
# Fill missing `bsmt_cond` with 0 (the basement columns are also filled in an
# earlier cell). Assigned back because an in-place call on a column selection
# does not update `test` under pandas Copy-on-Write.
test['bsmt_cond'] = test['bsmt_cond'].fillna(0)

In [22]:
# Fill missing `garage_type` with 0 (the garage columns are also filled in an
# earlier cell). Assigned back because an in-place call on a column selection
# does not update `test` under pandas Copy-on-Write.
test['garage_type'] = test['garage_type'].fillna(0)

In [23]:
# Fill missing `fireplaces` with 0 (this column is also filled in an earlier
# cell). Assigned back because an in-place call on a column selection does not
# update `test` under pandas Copy-on-Write.
test['fireplaces'] = test['fireplaces'].fillna(0)

In [24]:
# Encode the central-air flag as 1 ('Y') / 0 ('N'); other values are left as-is.
# Assigned back rather than `replace(..., inplace=True)` on a column selection,
# which does not update `test` under pandas Copy-on-Write.
test['central_air'] = test['central_air'].replace({'Y': 1, 'N': 0})

In [25]:
# Integer codes for the shared Ex/Gd/TA/Fa/Po rating labels.
# NOTE(review): 'Fa' maps to 0, the same value earlier cells use to fill
# missing entries, so the two become indistinguishable — confirm this matches
# the encoding applied to the training data before changing it.
scale_mapper = {"Ex": 3,
                "Gd": 2,
                "TA": 1,
                "Fa": 0,
                'Po': -1}

# Every encoding below is assigned back to the frame. Calling
# `replace(..., inplace=True)` on a column selection acts on a temporary object
# and does not update `test` under pandas Copy-on-Write.
rating_cols = ['kitchen_qual', 'bsmt_qual', 'bsmt_cond', 'heating_qc',
               'garage_qual', 'exter_qual', 'fireplace_qu']
for col in rating_cols:
    test[col] = test[col].replace(scale_mapper)

# Column-specific label -> integer mappings; labels not listed are left as-is.
misc_mappers = {
    'paved_drive': {"Y": 3, "P": 2, "N": 1},
    'land_slope': {'Gtl': 3, 'Mod': 2, "Sev": 1},
    'lot_shape': {'Reg': 4, 'IR1': 3, "IR2": 2, 'IR3': 1},
    'land_contour': {'Lvl': 4, 'Bnk': 3, "HLS": 2, 'Low': 1},
    'functional': {'Typ': 8, 'Min1': 7,
                   'Min2': 6, 'Mod': 5,
                   'Maj1': 4, 'Maj2': 3,
                   'Sev': 2, 'Sal': 1},
    'street': {'Grvl': 1, 'Pave': 2},
}
for col, mapper in misc_mappers.items():
    test[col] = test[col].replace(mapper)


def month_to_season(month):
    """Map a month number to a season label.

    12, 1 and 2 give 'Winter'; 3-5 give 'Spring'; 6-9 give 'Summer';
    anything else (10, 11, or a value that fails every comparison, such as
    NaN) gives 'Fall'.
    """
    if month == 12 or month <= 2:
        return 'Winter'
    if 3 <= month <= 5:
        return 'Spring'
    if 6 <= month <= 9:
        return 'Summer'
    return 'Fall'


# Replace the numeric sale month with its season label.
test['mo_sold'] = test['mo_sold'].map(month_to_season)

# Missing exposure becomes 0; the four labels become 1-4.
test['bsmt_exposure'] = test['bsmt_exposure'].fillna(0).replace(
    {'Gd': 4,
     'Av': 3,
     'Mn': 2,
     'No': 1})


In [26]:
# Encode garage finish labels as integers (Fin=3, RFn=2, Unf=1).
# Assigned back rather than `replace(..., inplace=True)` on a column selection,
# which does not update `test` under pandas Copy-on-Write.
test['garage_finish'] = test['garage_finish'].replace({'Fin': 3,
                                                       'RFn': 2,
                                                       'Unf': 1})

In [27]:
# Integer codes for the basement finish-type labels.
# NOTE(review): 'GLQ' and 'ALQ' share the code 5, so the two labels are merged;
# this may be a typo for GLQ=6 — confirm against the encoding used on the
# training data before changing it.
bsmnt_finish_mapper = {'GLQ': 5,
                       'ALQ': 5,
                       'BLQ': 4,
                       'Rec': 3,
                       'LwQ': 2,
                       'Unf': 1}

# Assigned back rather than `replace(..., inplace=True)` on a column selection,
# which does not update `test` under pandas Copy-on-Write.
for col in ('bsmtfin_type_1', 'bsmtfin_type_2'):
    test[col] = test[col].replace(bsmnt_finish_mapper)

In [28]:
# Integer codes for the utilities labels.
util_mapper = {"AllPub": 4,
               "NoSewr": 3,
               "NoSeWa": 2,
               'ELO': 1}

# Use the mapper defined above instead of repeating the literal, and assign the
# result back: `replace(..., inplace=True)` on a column selection does not
# update `test` under pandas Copy-on-Write.
test['utilities'] = test['utilities'].replace(util_mapper)

In [29]:
# Display every row that still contains at least one missing value.
test.loc[test.isna().any(axis=1)]

Unnamed: 0,id,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,lot_config,...,garage_qual,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,mo_sold,yr_sold,sale_type
634,1578,80,RL,73.0,9735,2,4,4,4,Inside,...,1,3,100,0,0,0,0,Spring,2008,WD


In [30]:
# Replace every remaining missing value, in any column, with 0.
test = test.fillna(0)

In [31]:
# Total number of missing values left in the whole frame.
int(test.isna().sum().sum())

0

In [32]:
# Sum of the four bathroom-count columns.
total_baths = (
    test['full_bath']
    + test['bsmt_full_bath']
    + test['bsmt_half_bath']
    + test['half_bath']
)
# Basement square footage that is not recorded as unfinished.
bsmt_finished = test['total_bsmt_sf'] - test['bsmt_unf_sf']

# Derived columns, listed in the order they are appended to the frame.
# Note: 'finished_n_total' and 'bsmt_size_fin_sf' are the same product with
# the operands swapped; both are kept so the output columns are unchanged.
engineered = {
    'total_baths': total_baths,
    'bsmt_finished': bsmt_finished,
    'size_n_qual': test['gr_liv_area'] * test['overall_qual'],
    'bed_n_bath': total_baths * test['bedroom_abvgr'],
    'fin_bsmt_fin_qual': bsmt_finished * test['bsmtfin_type_1'],
    'bsmt_fin_n_ceiling': bsmt_finished * test['bsmt_qual'],
    'bsmt_finished_with_bath': bsmt_finished * test['bsmt_full_bath'],
    'bsmt_finished_with_halfbath': bsmt_finished * test['bsmt_half_bath'],
    'overall_qual_cond': test['overall_qual'] * test['overall_cond'],
    'finished_n_total': test['total_bsmt_sf'] * bsmt_finished,
    'bsmt_size_fin_sf': bsmt_finished * test['total_bsmt_sf'],
}
for feature_name, feature_values in engineered.items():
    test[feature_name] = feature_values

In [33]:
# Write the cleaned frame to disk, leaving the row index out of the file.
test.to_csv(r'./datasets/cleaned_test.csv', index = False)