In [1]:
#import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_absolute_error

In [4]:
#define wrangle function
def wrangle(dataset):
    """Load a CSV file and return a cleaned, imputed DataFrame.

    Processing steps, in order:

    1. Read ``dataset`` with ``pd.read_csv``.
    2. Drop the columns listed in ``cols_to_drop``.
    3. Fill missing values in non-numeric columns with the column's most
       frequent value (mode).
    4. Fill missing values in numeric columns with the column mean.

    Parameters
    ----------
    dataset : str, path object or file-like object
        Anything accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The frame without the dropped columns and with gaps imputed.
        A column that has no observed values at all is left as-is, since
        there is nothing to impute from.

    Raises
    ------
    KeyError
        If any of the columns in ``cols_to_drop`` is absent from the file.
    """
    # Columns removed from the frame before any imputation takes place.
    cols_to_drop = ['PoolQC', 'Alley', 'MiscFeature', 'Fence',
                    'FireplaceQu', 'MasVnrType', 'Id']

    # Drop first so no means/modes are computed for columns that are
    # discarded anyway; chaining avoids an in-place mutation.
    df = pd.read_csv(dataset).drop(columns=cols_to_drop)

    # Handle categorical missingness: everything that is not numeric.
    # Selecting by exclusion keeps the two groups complementary regardless
    # of how text columns are typed.
    categ_vars = df.select_dtypes(exclude='number').columns
    for cat_col in categ_vars:
        modes = df[cat_col].mode()
        # mode() is empty when the column has no observed values;
        # indexing it would raise, so such a column is skipped.
        if not modes.empty:
            df[cat_col] = df[cat_col].fillna(modes.iloc[0])

    # Handle numerical missingness. 'number' covers every numeric dtype.
    # NOTE: this imputes every numeric column that remains, including a
    # target column if one is present and has gaps.
    num_cols = df.select_dtypes('number').columns
    for num in num_cols:
        df[num] = df[num].fillna(df[num].mean())

    return df

In [5]:
# Run the training file through the cleaning pipeline, then preview the top rows
df = wrangle(dataset='train.csv')
df.head(n=5)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [6]:
# Summarise each column's dtype and non-null count to verify the cleaned frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 74 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1460 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   LotShape       1460 non-null   object 
 6   LandContour    1460 non-null   object 
 7   Utilities      1460 non-null   object 
 8   LotConfig      1460 non-null   object 
 9   LandSlope      1460 non-null   object 
 10  Neighborhood   1460 non-null   object 
 11  Condition1     1460 non-null   object 
 12  Condition2     1460 non-null   object 
 13  BldgType       1460 non-null   object 
 14  HouseStyle     1460 non-null   object 
 15  OverallQual    1460 non-null   int64  
 16  OverallCond    1460 non-null   int64  
 17  YearBuilt      1460 non-null   int64  
 18  YearRemo

In [7]:
# Pairwise correlation matrix of the numeric predictors (target column excluded)
predictors = df.drop(columns='SalePrice')
numeric_predictors = predictors.select_dtypes(['int', 'float'])
num_feat = numeric_predictors.corr()
num_feat

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
MSSubClass,1.0,-0.357056,-0.139781,0.032628,-0.059316,0.02785,0.040581,0.022895,-0.069836,-0.065649,...,-0.098672,-0.012579,-0.0061,-0.012037,-0.043825,-0.02603,0.008283,-0.007683,-0.013585,-0.021407
LotFrontage,-0.357056,1.0,0.306795,0.234196,-0.05282,0.117598,0.082746,0.179283,0.215828,0.04334,...,0.323663,0.077106,0.137454,0.00979,0.062335,0.037684,0.180868,0.001168,0.010158,0.006768
LotArea,-0.139781,0.306795,1.0,0.105806,-0.005636,0.014228,0.013788,0.10396,0.214103,0.11117,...,0.180403,0.171698,0.084774,-0.01834,0.020423,0.04316,0.077672,0.038068,0.001205,-0.014261
OverallQual,0.032628,0.234196,0.105806,1.0,-0.091932,0.572323,0.550684,0.410238,0.239666,-0.059119,...,0.562022,0.238923,0.308819,-0.113937,0.030371,0.064886,0.065166,-0.031406,0.070815,-0.027347
OverallCond,-0.059316,-0.05282,-0.005636,-0.091932,1.0,-0.375983,0.073741,-0.127788,-0.046231,0.040229,...,-0.151521,-0.003334,-0.032589,0.070356,0.025504,0.054811,-0.001985,0.068777,-0.003511,0.04395
YearBuilt,0.02785,0.117598,0.014228,0.572323,-0.375983,1.0,0.592855,0.314745,0.249503,-0.049107,...,0.478954,0.22488,0.188686,-0.387268,0.031355,-0.050364,0.00495,-0.034383,0.012398,-0.013618
YearRemodAdd,0.040581,0.082746,0.013788,0.550684,0.073741,0.592855,1.0,0.179186,0.128451,-0.067759,...,0.3716,0.205726,0.226298,-0.193919,0.045286,-0.03874,0.005829,-0.010286,0.02149,0.035743
MasVnrArea,0.022895,0.179283,0.10396,0.410238,-0.127788,0.314745,0.179186,1.0,0.263582,-0.072302,...,0.372567,0.159349,0.124965,-0.109849,0.018795,0.061453,0.011723,-0.029815,-0.00594,-0.008184
BsmtFinSF1,-0.069836,0.215828,0.214103,0.239666,-0.046231,0.249503,0.128451,0.263582,1.0,-0.050117,...,0.29697,0.204306,0.111761,-0.102303,0.026451,0.062021,0.140491,0.003571,-0.015727,0.014359
BsmtFinSF2,-0.065649,0.04334,0.11117,-0.059119,0.040229,-0.049107,-0.067759,-0.072302,-0.050117,1.0,...,-0.018227,0.067898,0.003093,0.036543,-0.029993,0.088871,0.041709,0.00494,-0.015211,0.031706
