In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation / FutureWarning messages,
# so behavior changes in newer library versions can go unnoticed.
warnings.filterwarnings("ignore")
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's "whitegrid" style for the plots that follow.
sns.set_style(style="whitegrid")

In [23]:
# Read "train.csv" (path relative to the current working directory) into df_train.
df_train = pd.read_csv("train.csv")

In [24]:
# Show the dtype and non-null count of every column.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [25]:
# Separate the columns into numerical and categorical (text) features.
categorical_features = []
numerical_features = []

for feature, content in df_train.items():
    if pd.api.types.is_numeric_dtype(content):
        numerical_features.append(feature)
    elif pd.api.types.is_object_dtype(content) or pd.api.types.is_string_dtype(content):
        # Object dtype is checked explicitly: recent pandas versions report an
        # object column as "string" only when every element is a str, so a text
        # column containing NaN would otherwise end up in neither list.
        categorical_features.append(feature)

# Sanity check: every column must have been assigned to one of the two lists.
print("matches" if ((len(categorical_features) + len(numerical_features)) == len(df_train.columns)) else "doesnt match")

matches


In [26]:
# Label-encode the categorical columns: cast each one to the pandas "category"
# dtype and keep only its integer codes.
for feature in categorical_features:
    category_view = df_train[feature].astype("category")
    df_train[feature] = category_view.cat.codes
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   int8   
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   int8   
 6   Alley          1460 non-null   int8   
 7   LotShape       1460 non-null   int8   
 8   LandContour    1460 non-null   int8   
 9   Utilities      1460 non-null   int8   
 10  LotConfig      1460 non-null   int8   
 11  LandSlope      1460 non-null   int8   
 12  Neighborhood   1460 non-null   int8   
 13  Condition1     1460 non-null   int8   
 14  Condition2     1460 non-null   int8   
 15  BldgType       1460 non-null   int8   
 16  HouseStyle     1460 non-null   int8   
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [27]:
# Replace the -1 codes (which were NaN before the label encoding) with np.nan
# so that pandas' "dropna()" and "isna()" methods can be used on these columns.
for feature in categorical_features:
    # Assign the result back instead of calling replace(..., inplace=True) on the
    # column selection: the chained in-place form operates on an intermediate
    # object and does not update df_train when pandas Copy-on-Write is active.
    df_train[feature] = df_train[feature].replace(-1, np.nan)
df_train

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,3,65.0,8450,1,,3,3,0,...,0,,,,0,2,2008,8,4,208500
1,2,20,3,80.0,9600,1,,3,3,0,...,0,,,,0,5,2007,8,4,181500
2,3,60,3,68.0,11250,1,,0,3,0,...,0,,,,0,9,2008,8,4,223500
3,4,70,3,60.0,9550,1,,0,3,0,...,0,,,,0,2,2006,8,0,140000
4,5,60,3,84.0,14260,1,,0,3,0,...,0,,,,0,12,2008,8,4,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,3,62.0,7917,1,,3,3,0,...,0,,,,0,8,2007,8,4,175000
1456,1457,20,3,85.0,13175,1,,3,3,0,...,0,,2.0,,0,2,2010,8,4,210000
1457,1458,70,3,66.0,9042,1,,3,3,0,...,0,,0.0,2.0,2500,5,2010,8,4,266500
1458,1459,20,3,68.0,9717,1,,3,3,0,...,0,,,,0,4,2010,8,4,142125


In [28]:
# Module-level accumulator: check_nan() appends to this list the names of the
# columns whose NaN percentage is at least 90.
more_than_90_percent_nan_features = []

In [29]:
#Let's make a function to see how many nan values in each column
def check_nan(df, threshold=90.0):
    """Print the NaN count and NaN percentage of every column of ``df``.

    Columns whose NaN percentage is at least ``threshold`` are recorded in the
    module-level list ``more_than_90_percent_nan_features``. Each column name is
    recorded at most once, so calling this function several times (or re-running
    the cell) does not pile up duplicate entries in that list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.
    threshold : float, default 90.0
        Minimum NaN percentage (0-100) for a column to be recorded. The list
        keeps its historical name even when a different threshold is used.

    Returns
    -------
    None
        The report is printed; flagged names are collected in the global list.
    """
    for col in df.columns:
        nan_count = df[col].isna().sum()
        nan_percent = (nan_count / len(df)) * 100
        print("{} has {} nan values:--{:0.2f}% of the whole data--".format(col, nan_count, nan_percent))
        # Membership test keeps the accumulator free of duplicates across calls.
        if nan_percent >= threshold and col not in more_than_90_percent_nan_features:
            more_than_90_percent_nan_features.append(col)

In [30]:
# Print the per-column NaN report for df_train; as a side effect this call
# records the mostly-empty columns in more_than_90_percent_nan_features.
check_nan(df_train)

Id has 0 nan values:--0.00% of the whole data--
MSSubClass has 0 nan values:--0.00% of the whole data--
MSZoning has 0 nan values:--0.00% of the whole data--
LotFrontage has 259 nan values:--17.74% of the whole data--
LotArea has 0 nan values:--0.00% of the whole data--
Street has 0 nan values:--0.00% of the whole data--
Alley has 1369 nan values:--93.77% of the whole data--
LotShape has 0 nan values:--0.00% of the whole data--
LandContour has 0 nan values:--0.00% of the whole data--
Utilities has 0 nan values:--0.00% of the whole data--
LotConfig has 0 nan values:--0.00% of the whole data--
LandSlope has 0 nan values:--0.00% of the whole data--
Neighborhood has 0 nan values:--0.00% of the whole data--
Condition1 has 0 nan values:--0.00% of the whole data--
Condition2 has 0 nan values:--0.00% of the whole data--
BldgType has 0 nan values:--0.00% of the whole data--
HouseStyle has 0 nan values:--0.00% of the whole data--
OverallQual has 0 nan values:--0.00% of the whole data--
OverallCo

In [31]:
# These columns have at least 90% missing data. We keep them for now and try to
# fill them with ML. Later we can predict on df_test both with and without these
# columns in the dataset and see which variant performs better.
more_than_90_percent_nan_features

['Alley', 'PoolQC', 'MiscFeature']

In [32]:
# Every column is numerical now, so ML algorithms can be applied to the frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   int8   
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   int8   
 6   Alley          91 non-null     float64
 7   LotShape       1460 non-null   int8   
 8   LandContour    1460 non-null   int8   
 9   Utilities      1460 non-null   int8   
 10  LotConfig      1460 non-null   int8   
 11  LandSlope      1460 non-null   int8   
 12  Neighborhood   1460 non-null   int8   
 13  Condition1     1460 non-null   int8   
 14  Condition2     1460 non-null   int8   
 15  BldgType       1460 non-null   int8   
 16  HouseStyle     1460 non-null   int8   
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [33]:
# Fill the NaN values of a single column with an ML model first; afterwards a
# function is written that does the same for every column with NaN values.
# NOTE(review): this note originally named the PoolQC column, but the code below
# works on "MasVnrType".
# ---------------------------------------
# Rows where "MasVnrType" is present become the training rows (so y_train contains
# no NaN and a model can be fitted to it); rows where it is missing are the rows
# to predict.
# The predictor columns may still contain NaN values; those are filled with
# straightforward methods (numerical NaN -> column median, categorical NaN -> -1).

train_indexes = df_train["MasVnrType"].dropna().index
test_indexes = df_train.loc[df_train["MasVnrType"].isna()].index

In [34]:
# Predictors are all columns except the target "MasVnrType" and the "Id" column;
# the target vector holds the observed "MasVnrType" values only.
X_train = df_train.loc[train_indexes].drop(columns=["MasVnrType", "Id"]).copy()
y_train = df_train.loc[train_indexes, "MasVnrType"].copy()
X_test = df_train.loc[test_indexes].drop(columns=["MasVnrType", "Id"]).copy()

In [35]:
# Fill the NaN values that remain in the predictor columns.
# Results are assigned back to the column instead of using
# X[col].fillna(..., inplace=True): the chained in-place form works on an
# intermediate object and leaves X_train / X_test untouched when pandas
# Copy-on-Write is active.

# Categorical predictors: NaN becomes the sentinel code -1.
for feature in categorical_features:
    if feature != "MasVnrType":
        X_train[feature] = X_train[feature].fillna(-1)
        X_test[feature] = X_test[feature].fillna(-1)

# Numerical predictors: NaN becomes the column median taken over all of df_train.
# "Id" was dropped from X_train / X_test, so it is skipped here.
for feature in numerical_features:
    if feature != "Id":
        fill_value = df_train[feature].median()
        X_train[feature] = X_train[feature].fillna(fill_value)
        X_test[feature] = X_test[feature].fillna(fill_value)

In [36]:
# Verify that no NaN values are left in the rows that will be predicted.
check_nan(X_test)

MSSubClass has 0 nan values:--0.00% of the whole data--
MSZoning has 0 nan values:--0.00% of the whole data--
LotFrontage has 0 nan values:--0.00% of the whole data--
LotArea has 0 nan values:--0.00% of the whole data--
Street has 0 nan values:--0.00% of the whole data--
Alley has 0 nan values:--0.00% of the whole data--
LotShape has 0 nan values:--0.00% of the whole data--
LandContour has 0 nan values:--0.00% of the whole data--
Utilities has 0 nan values:--0.00% of the whole data--
LotConfig has 0 nan values:--0.00% of the whole data--
LandSlope has 0 nan values:--0.00% of the whole data--
Neighborhood has 0 nan values:--0.00% of the whole data--
Condition1 has 0 nan values:--0.00% of the whole data--
Condition2 has 0 nan values:--0.00% of the whole data--
BldgType has 0 nan values:--0.00% of the whole data--
HouseStyle has 0 nan values:--0.00% of the whole data--
OverallQual has 0 nan values:--0.00% of the whole data--
OverallCond has 0 nan values:--0.00% of the whole data--
YearBui

In [37]:
from sklearn.ensemble import RandomForestClassifier
# A fixed seed makes the fitted forest (and therefore the imputed values)
# identical on every Restart & Run All; the value itself is arbitrary.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [38]:
# Predict "MasVnrType" for the rows where it is missing.
y_preds = model.predict(X_test)

In [39]:
# Number of predicted values (one per row of X_test).
len(y_preds)

8

In [58]:
def fill_nan(df, irrelevant_columns=("Id",), random_state=None):
    """Label-encode the text columns of ``df`` and impute every NaN with random forests.

    Steps:

    1. Columns are split into numerical and categorical (object / string) ones.
    2. Every categorical column is replaced by its integer category codes;
       values that were missing stay NaN.
    3. Each categorical column that has NaN values is imputed with a
       ``RandomForestClassifier`` trained on the rows where it is present.
    4. Each numerical column that has NaN values is imputed the same way with
       a ``RandomForestRegressor``.

    While a column is being imputed, NaN values in the predictor columns are
    filled with -1 (categorical) or the column median (numerical).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame. It is modified IN PLACE and also returned.
    irrelevant_columns : str or iterable of str, default ("Id",)
        Columns that are never used as predictors. Names that are not present
        in ``df`` are ignored.
    random_state : int or None, default None
        Passed to every random forest; set it to get reproducible imputations.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``, with all columns numerical.

    Notes
    -----
    * A column without a single observed value cannot be learned and is left
      as NaN.
    * Category codes are derived from the values present in ``df`` only, so two
      frames encoded by separate calls may map the same label to different codes.
    * Imputed categorical columns keep a float dtype (codes such as ``2.0``).
    """
    if isinstance(irrelevant_columns, str):
        irrelevant_columns = [irrelevant_columns]
    irrelevant_columns = list(irrelevant_columns)

    categorical_features = []
    numerical_features = []
    for feature, content in df.items():
        if pd.api.types.is_numeric_dtype(content):
            numerical_features.append(feature)
        elif pd.api.types.is_object_dtype(content) or pd.api.types.is_string_dtype(content):
            # Object dtype is checked explicitly: recent pandas versions report
            # an object column as "string" only when every element is a str,
            # which would skip exactly the text columns that contain NaN.
            categorical_features.append(feature)

    for feature in categorical_features:
        codes = df[feature].astype("category").cat.codes
        # cat.codes marks missing values with -1; turn them back into NaN.
        # The result is assigned back rather than replaced in place on a
        # column selection, which does not write through under Copy-on-Write.
        df[feature] = codes.replace(-1, np.nan)

    # Categorical columns first, so they are complete before the regressors run.
    for feature in categorical_features:
        _impute_column(df, feature, categorical_features, numerical_features,
                       irrelevant_columns, classify=True, random_state=random_state)
    for feature in numerical_features:
        _impute_column(df, feature, categorical_features, numerical_features,
                       irrelevant_columns, classify=False, random_state=random_state)
    return df


def _impute_column(df, feature, categorical_features, numerical_features,
                   irrelevant_columns, classify, random_state):
    """Fill the NaN entries of ``df[feature]`` in place with random-forest predictions."""
    missing = df[feature].isna()
    if not missing.any() or missing.all():
        # Nothing to fill, or no observed value to train on.
        return

    excluded = {feature, *irrelevant_columns}
    cat_cols = [col for col in categorical_features if col not in excluded]
    num_cols = [col for col in numerical_features if col not in excluded]
    usable = set(cat_cols) | set(num_cols)
    # Keep the frame's own column order for the predictor matrix.
    predictor_columns = [col for col in df.columns if col in usable]
    if not predictor_columns:
        return

    # Simple fills for the predictors: -1 for categorical codes, the median of
    # the whole column for numerical values (columns with no median are skipped).
    fill_values = {col: -1 for col in cat_cols}
    fill_values.update(df[num_cols].median().dropna().to_dict())
    predictors = df[predictor_columns].fillna(value=fill_values)
    # A predictor that still holds NaN (no observed value at all) is unusable.
    predictors = predictors.dropna(axis="columns")
    if predictors.shape[1] == 0:
        return

    # Imported lazily so frames that need no model do not require scikit-learn.
    from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
    estimator = RandomForestClassifier if classify else RandomForestRegressor
    model = estimator(random_state=random_state)
    model.fit(predictors.loc[~missing], df.loc[~missing, feature])
    df.loc[missing, feature] = model.predict(predictors.loc[missing])

In [80]:
# Reload the raw CSV files and impute both frames with fill_nan().
# fill_nan() assigns into the frame it receives and returns that same frame, so
# df_train_filled / df_test_filled refer to the same objects as df_train / df_test.
# NOTE(review): each frame is label-encoded by its own fill_nan() call, so the
# same category label may receive a different code in train and in test --
# confirm this is acceptable before modelling on both.
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
df_train_filled = fill_nan(df_train)
df_test_filled = fill_nan(df_test)

In [81]:
# Confirm that no NaN values remain in the imputed training frame.
check_nan(df_train_filled)

Id has 0 nan values:--0.00% of the whole data--
MSSubClass has 0 nan values:--0.00% of the whole data--
MSZoning has 0 nan values:--0.00% of the whole data--
LotFrontage has 0 nan values:--0.00% of the whole data--
LotArea has 0 nan values:--0.00% of the whole data--
Street has 0 nan values:--0.00% of the whole data--
Alley has 0 nan values:--0.00% of the whole data--
LotShape has 0 nan values:--0.00% of the whole data--
LandContour has 0 nan values:--0.00% of the whole data--
Utilities has 0 nan values:--0.00% of the whole data--
LotConfig has 0 nan values:--0.00% of the whole data--
LandSlope has 0 nan values:--0.00% of the whole data--
Neighborhood has 0 nan values:--0.00% of the whole data--
Condition1 has 0 nan values:--0.00% of the whole data--
Condition2 has 0 nan values:--0.00% of the whole data--
BldgType has 0 nan values:--0.00% of the whole data--
HouseStyle has 0 nan values:--0.00% of the whole data--
OverallQual has 0 nan values:--0.00% of the whole data--
OverallCond has 

In [82]:
# Confirm that no NaN values remain in the imputed test frame.
check_nan(df_test_filled)

Id has 0 nan values:--0.00% of the whole data--
MSSubClass has 0 nan values:--0.00% of the whole data--
MSZoning has 0 nan values:--0.00% of the whole data--
LotFrontage has 0 nan values:--0.00% of the whole data--
LotArea has 0 nan values:--0.00% of the whole data--
Street has 0 nan values:--0.00% of the whole data--
Alley has 0 nan values:--0.00% of the whole data--
LotShape has 0 nan values:--0.00% of the whole data--
LandContour has 0 nan values:--0.00% of the whole data--
Utilities has 0 nan values:--0.00% of the whole data--
LotConfig has 0 nan values:--0.00% of the whole data--
LandSlope has 0 nan values:--0.00% of the whole data--
Neighborhood has 0 nan values:--0.00% of the whole data--
Condition1 has 0 nan values:--0.00% of the whole data--
Condition2 has 0 nan values:--0.00% of the whole data--
BldgType has 0 nan values:--0.00% of the whole data--
HouseStyle has 0 nan values:--0.00% of the whole data--
OverallQual has 0 nan values:--0.00% of the whole data--
OverallCond has 

In [103]:
# Write the imputed frames to the working directory.
# NOTE(review): the file names carry no ".csv" extension, and to_csv() is called
# with its defaults, which also write the row index as an unnamed first column --
# confirm both are intended before changing them, since later code may read these files.
df_train_filled.to_csv("df_train_filled")
df_test_filled.to_csv("df_test_filled")