In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.simplefilter(action='ignore')

In [4]:
# NOTE(review): absolute, machine-specific path -- the notebook only runs where this file exists.
train_csv_path = r"D:\Machine Learning Development\original\Machine-Learning-Pipeline-Overview\train.csv"
data = pd.read_csv(train_csv_path)

# Preview the first five rows.
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [10]:
# Hold out 10% of the rows as a test split (fixed seed for reproducibility).
# The whole frame is passed as the feature set, so the SalePrice column is
# still present inside X_train / X_test in addition to y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    data['SalePrice'],
    test_size=0.1,
    random_state=0,
)
X_test.shape, X_train.shape

((146, 81), (1314, 81))

In [15]:
# Object-dtype (categorical) columns that have at least one missing value
# in the training split.
vars_with_na = [
    col
    for col in X_train.columns
    if X_train[col].isnull().any() and data[col].dtypes == 'O'
]

# Fraction of missing values per column in the training split.
X_train[vars_with_na].isnull().mean()
    

Alley           0.938356
MasVnrType      0.004566
BsmtQual        0.024353
BsmtCond        0.024353
BsmtExposure    0.025114
BsmtFinType1    0.024353
BsmtFinType2    0.025114
Electrical      0.000761
FireplaceQu     0.472603
GarageType      0.056317
GarageFinish    0.056317
GarageQual      0.056317
GarageCond      0.056317
PoolQC          0.995434
Fence           0.814307
MiscFeature     0.961187
dtype: float64

In [16]:
# Replace missing categorical values with an explicit 'Missing' label in both splits.
for split in (X_train, X_test):
    split[vars_with_na] = split[vars_with_na].fillna('Missing')

In [19]:
# Confirm that no missing values remain in the imputed categorical columns
# of the training split. Only the last expression of a cell is displayed, so
# a single check is kept here.
X_train[vars_with_na].isnull().sum()

Alley           0
MasVnrType      0
BsmtQual        0
BsmtCond        0
BsmtExposure    0
BsmtFinType1    0
BsmtFinType2    0
Electrical      0
FireplaceQu     0
GarageType      0
GarageFinish    0
GarageQual      0
GarageCond      0
PoolQC          0
Fence           0
MiscFeature     0
dtype: int64

In [24]:
# Imputed categorical columns that still contain missing values in the test split
# (an empty list means the imputation covered everything).
[col for col in vars_with_na if X_test[col].isnull().any()]

[]

In [26]:
# Non-object (numerical) columns that have at least one missing value
# in the training split.
vars_Numerical_with_na = [
    col
    for col in X_train.columns
    if X_train[col].isnull().any() and data[col].dtypes != 'O'
]

# Fraction of missing values per column in the training split.
X_train[vars_Numerical_with_na].isnull().mean()

LotFrontage    0.177321
MasVnrArea     0.004566
GarageYrBlt    0.056317
dtype: float64

In [34]:
for col in vars_Numerical_with_na:
    # The fill value is learned from the training split only and reused for the test split.
    mode_value = X_train[col].mode()[0]
    print(mode_value)

    indicator = col+'_na'
    for split in (X_train, X_test):
        # Binary flag recording where the value was originally missing;
        # it must be computed before the column itself is filled.
        split[indicator] = np.where(split[col].isnull(), 1, 0)
        split[col] = split[col].fillna(mode_value)

# Remaining missing values per imputed column in the training split.
X_train[vars_Numerical_with_na].isnull().sum()


60.0
0.0
2005.0


LotFrontage    0
MasVnrArea     0
GarageYrBlt    0
dtype: int64

In [36]:
# Peek at the missing-value indicator columns created during numerical imputation.
na_indicator_columns = ['LotFrontage_na', 'MasVnrArea_na', 'GarageYrBlt_na']
X_train[na_indicator_columns].head()

Unnamed: 0,LotFrontage_na,MasVnrArea_na,GarageYrBlt_na
930,0,0,0
656,0,0,0
45,0,0,0
1348,0,0,0
55,0,0,0


In [38]:
def elapsed_year(df,var):
    """Replace a year column by the number of years between it and the sale year.

    Parameters
    ----------
    df : DataFrame containing a 'YrSold' column and the column ``var``.
    var : name of the year column to convert.

    Returns
    -------
    The same DataFrame object (modified in place), where ``df[var]`` now holds
    ``df['YrSold'] - df[var]``. When ``var`` is 'YrSold' itself the frame is
    returned unchanged.
    """
    # 'YrSold' is the reference point: subtracting it from itself would
    # overwrite the sale year with zeros and destroy the column.
    if var == 'YrSold':
        return df
    df[var]=df['YrSold']-df[var]
    return df

In [40]:
# Columns whose name marks them as a year ('Year...' or '...Yr...').
year_markers = ('Year', 'Yr')
year_variables = [
    col for col in data.columns
    if any(marker in col for marker in year_markers)
]
year_variables

['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold']

In [42]:
# Convert each year column into "years before the sale".
for i in year_variables:
    # year_variables also lists 'YrSold', the reference column itself;
    # transforming it would replace the sale year with YrSold - YrSold = 0.
    if i == 'YrSold':
        continue
    X_train = elapsed_year(X_train,i)
    X_test = elapsed_year(X_test,i)


In [46]:
# Numerical columns: every column whose dtype is not object.
numerical_variables = [col for col in data.columns if data[col].dtypes != 'O']
print('Number of Numerical Variables : ',len(numerical_variables))

# Year columns and the Id column are neither discrete nor continuous features.
excluded_variables = year_variables+['Id']

# Discrete: fewer than 20 distinct values in the full data set.
discrete_variables = [
    col for col in numerical_variables
    if len(data[col].unique())<20 and col not in excluded_variables
]
print('Number of discrete variables: ', len(discrete_variables))

# Continuous: the remaining numerical columns.
non_continuous = excluded_variables+discrete_variables
continuous_variables = [col for col in numerical_variables if col not in non_continuous]
print('Number of Continuous Variables:', len(continuous_variables))

Number of Numerical Variables :  38
Number of discrete variables:  14
Number of Continuous Variables: 19


In [48]:
# Log-transform the continuous variables to reduce skew.
# np.log is only finite for strictly positive input: log(0) is -inf and
# log(negative) is NaN. Some numerical columns do contain zeros (MasVnrArea,
# for instance, was just imputed with a mode of 0.0), so any column with a
# non-positive value in either split is left untransformed instead of being
# silently filled with -inf/NaN.
for var in continuous_variables:
    has_non_positive = (X_train[var] <= 0).any() or (X_test[var] <= 0).any()
    if has_non_positive:
        continue
    X_train[var] = np.log(X_train[var])
    X_test[var] = np.log(X_test[var])

In [50]:
# Sanity check after the log transform (test split): list the columns that
# contain any non-finite value. isnull() alone would miss the -inf that
# np.log produces for zeros, so finiteness is checked instead (NaN and
# +/-inf are all reported).
[var for var in ['LotFrontage', 'LotArea', '1stFlrSF',
                 'GrLivArea', 'SalePrice'] if not np.isfinite(X_test[var]).all()]

[]

In [51]:
# Sanity check after the log transform (training split): list the columns
# that contain any non-finite value. isnull() alone would miss the -inf that
# np.log produces for zeros, so finiteness is checked instead (NaN and
# +/-inf are all reported).
[var for var in ['LotFrontage', 'LotArea', '1stFlrSF',
                 'GrLivArea', 'SalePrice'] if not np.isfinite(X_train[var]).all()]

[]

In [52]:
# Categorical columns: every column stored with object dtype.
categorical_variables = [
    col for col, col_dtype in data.dtypes.items() if col_dtype == 'O'
]

print('Number of Categorical Variables: ', len(categorical_variables))

Number of Categorical Variables:  43


In [54]:
# Preview the categorical columns of the raw (un-split, un-imputed) data.
data.loc[:, categorical_variables].head()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


In [55]:
def analyze_rare_labels(df, var, rare_percent):
    """Return the labels of ``var`` whose share of rows is below ``rare_percent``.

    The share of a label is the number of rows carrying it (with a non-null
    'SalePrice') divided by the total number of rows in ``df``. Rows where
    ``var`` is missing belong to no label. ``df`` is not modified.

    Returns a Series indexed by label holding the share of each rare label.
    """
    total_rows = len(df)
    label_share = df.groupby(var)['SalePrice'].count().div(total_rows)
    is_rare = label_share < rare_percent
    return label_share[is_rare]


# Print, for every categorical column, the labels carried by fewer than 2% of the rows,
# followed by two blank lines as a separator.
for col in categorical_variables:
    print(analyze_rare_labels(data, col, 0.02), end='\n\n\n')

MSZoning
C (all)    0.006849
RH         0.010959
Name: SalePrice, dtype: float64


Street
Grvl    0.00411
Name: SalePrice, dtype: float64


Series([], Name: SalePrice, dtype: float64)


LotShape
IR3    0.006849
Name: SalePrice, dtype: float64


Series([], Name: SalePrice, dtype: float64)


Utilities
NoSeWa    0.000685
Name: SalePrice, dtype: float64


LotConfig
FR3    0.00274
Name: SalePrice, dtype: float64


LandSlope
Sev    0.008904
Name: SalePrice, dtype: float64


Neighborhood
Blmngtn    0.011644
Blueste    0.001370
BrDale     0.010959
ClearCr    0.019178
MeadowV    0.011644
NPkVill    0.006164
SWISU      0.017123
StoneBr    0.017123
Veenker    0.007534
Name: SalePrice, dtype: float64


Condition1
PosA    0.005479
PosN    0.013014
RRAe    0.007534
RRAn    0.017808
RRNe    0.001370
RRNn    0.003425
Name: SalePrice, dtype: float64


Condition2
Artery    0.001370
Feedr     0.004110
PosA      0.000685
PosN      0.001370
RRAe      0.000685
RRAn      0.000685
RRNn      0.001370
Name: Sal

In [58]:
def find_frequent_labels(df, var, rare_perc):
    """Return the labels of ``var`` whose share of rows exceeds ``rare_perc``.

    The share of a label is the number of rows carrying it (with a non-null
    'SalePrice') divided by the total number of rows in ``df``. ``df`` is not
    modified.

    Returns an Index of the frequent labels.
    """
    label_share = df.groupby(var)['SalePrice'].count().div(len(df))
    is_frequent = label_share.gt(rare_perc)
    return label_share.loc[is_frequent].index

In [59]:
for col in categorical_variables:
    # Frequent labels are learned from the training split only (share above 1%).
    frequent_ls = find_frequent_labels(X_train, col, 0.01)

    # Every label outside that set is collapsed into 'Rare', in both splits.
    for split in (X_train, X_test):
        split[col] = np.where(split[col].isin(frequent_ls), split[col], 'Rare')