In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Notebook-wide display configuration.
# NOTE(review): this silences every warning for the rest of the session,
# including pandas/scikit-learn deprecation and copy warnings raised by later cells.
warnings.filterwarnings("ignore")
# Render matplotlib figures inside the notebook output.
%matplotlib inline
# Use seaborn's white background with grid lines for all plots.
sns.set_style(style="whitegrid")

In [2]:
# Read the two pre-processed tables from the working directory.
# (The files are named without an extension but are parsed as CSV.)
df_train_filled, df_test_filled = (
    pd.read_csv(csv_path) for csv_path in ("df_train_filled", "df_test_filled")
)

In [3]:
# Discard the "Unnamed: 0" column that read_csv produced for both tables.
df_train_filled = df_train_filled.drop(columns="Unnamed: 0")
df_test_filled = df_test_filled.drop(columns="Unnamed: 0")

In [4]:
# Summarise dtypes and non-null counts of the training frame to check that the
# columns are numeric and contain no missing values.
df_train_filled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   int64  
 3   LotFrontage    1460 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   int64  
 6   Alley          1460 non-null   float64
 7   LotShape       1460 non-null   int64  
 8   LandContour    1460 non-null   int64  
 9   Utilities      1460 non-null   int64  
 10  LotConfig      1460 non-null   int64  
 11  LandSlope      1460 non-null   int64  
 12  Neighborhood   1460 non-null   int64  
 13  Condition1     1460 non-null   int64  
 14  Condition2     1460 non-null   int64  
 15  BldgType       1460 non-null   int64  
 16  HouseStyle     1460 non-null   int64  
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [5]:
# Same summary for the test frame.
# In the saved output it has 80 columns (the training frame has 81), and
# MSZoning / Utilities are float64 here while they are int64 in the training frame.
df_test_filled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1459 non-null   float64
 3   LotFrontage    1459 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   int64  
 6   Alley          1459 non-null   float64
 7   LotShape       1459 non-null   int64  
 8   LandContour    1459 non-null   int64  
 9   Utilities      1459 non-null   float64
 10  LotConfig      1459 non-null   int64  
 11  LandSlope      1459 non-null   int64  
 12  Neighborhood   1459 non-null   int64  
 13  Condition1     1459 non-null   int64  
 14  Condition2     1459 non-null   int64  
 15  BldgType       1459 non-null   int64  
 16  HouseStyle     1459 non-null   int64  
 17  OverallQual    1459 non-null   int64  
 18  OverallC

In [6]:
# All the features are numeric. Next, we target-encode the categorical columns that were label-encoded earlier.

In [91]:
# Load the raw (un-encoded) train and test files.
# df_train is used below to find which columns are categorical;
# df_test supplies the Id column for the submission file.
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [92]:
# Collect the names of the categorical (non-numeric) columns of the raw
# training data, in column order.
#
# The test is "not numeric" rather than `is_string_dtype`: on pandas >= 2.0
# `is_string_dtype` inspects the elements of object columns, so string columns
# that also contain NaN (e.g. Alley, PoolQC) may no longer be reported, which
# would silently shrink this list.
categorical_features = [
    feature
    for feature, content in df_train.items()
    if not pd.api.types.is_numeric_dtype(content)
]
categorical_features

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [93]:
# List the distinct values currently stored in MSZoning.
# NOTE(review): the saved output shows price-like floats rather than small label
# codes, which suggests this cell was last run after the column had already been
# target-encoded -- confirm by re-running from a fresh kernel.
df_train_filled["MSZoning"].value_counts().index

Float64Index([174000.0, 120500.0, 205950.0, 136500.0, 74700.0], dtype='float64')

In [94]:
# Spot-check: median SalePrice of the training rows whose MSZoning equals 3.
# NOTE(review): the saved output is nan, i.e. no row matched 3 when this ran --
# presumably because MSZoning had already been overwritten with encoded values.
df_train_filled.loc[df_train_filled["MSZoning"] == 3, "SalePrice"].median()

nan

In [95]:
# Target-encode every categorical column: each label code is replaced by the
# median SalePrice of the training rows that carry that code. The same mapping
# (learned on the training frame only) is then applied to the test frame.
#
# NOTE(review): the medians are computed on the full training frame, before the
# train/validation split made in a later cell, so the hold-out rows contribute
# to their own encoding and the validation score is likely optimistic.

# Fallback for test values that never occur in the training column: `.map`
# turns those into NaN, which would otherwise flow into the model input.
overall_median_price = df_train_filled["SalePrice"].median()

for feature in categorical_features:
    # One pass per column instead of one boolean mask per category.
    target_encoded_values = (
        df_train_filled.groupby(feature)["SalePrice"].median().to_dict()
    )
    df_train_filled[feature] = df_train_filled[feature].map(target_encoded_values)
    df_test_filled[feature] = (
        df_test_filled[feature]
        .map(target_encoded_values)
        .fillna(overall_median_price)
    )

In [96]:
# PoolQC, Alley and MiscFeature have more than 90% NaN values.

In [97]:
# Columns excluded from the feature matrix: the Id column, the SalePrice target,
# and the three columns flagged above as more than 90% NaN.
features_to_drop = ["Id", "SalePrice", "PoolQC", "Alley", "MiscFeature"]

In [98]:
from sklearn.model_selection import train_test_split

In [99]:
# Target vector: the sale price of every training row.
y = df_train_filled["SalePrice"]
# Feature matrix: everything except the columns listed in features_to_drop.
X = df_train_filled.drop(columns=features_to_drop)

In [100]:
# Hold out 20% of the labelled rows for validation. The fixed seed makes the
# split -- and every score computed from it -- reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [101]:
from sklearn.ensemble import RandomForestRegressor

In [78]:
from sklearn.linear_model import LinearRegression

# Baseline: ordinary least squares fitted on the training split.
lm = LinearRegression().fit(X_train, y_train)

# R^2 on the training split and on the hold-out split, in that order.
train_r2 = lm.score(X_train, y_train)
holdout_r2 = lm.score(X_test, y_test)
train_r2, holdout_r2

(0.9048278615399882, 0.6316468987209078)

In [79]:
# Keep the linear-model predictions in their own Series instead of writing them
# into X_train / X_test. Adding them as a column changed the feature matrices
# used by the model cells below (which the final fit on X and the submission
# features do not contain), and made this cell fail when run a second time
# because lm.predict would then receive an extra column.
lm_train_predictions = pd.Series(
    lm.predict(X_train), index=X_train.index, name="lm_predictions"
)
lm_test_predictions = pd.Series(
    lm.predict(X_test), index=X_test.index, name="lm_predictions"
)

In [102]:
# Random forest with default hyper-parameters. The fixed seed makes the
# bootstrap samples and feature sub-sampling -- and so the scores -- repeatable.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)
# R^2 on the hold-out split.
model.score(X_test, y_test)

0.8114116621010194

In [103]:
# R^2 on the training split, for comparison with the hold-out score.
# (Saved outputs: about 0.98 here versus about 0.81 on the hold-out split.)
model.score(X_train, y_train)

0.9818670153039258

In [104]:
# Refit the same estimator on all labelled rows (X, y) before predicting the test set.
model.fit(X, y)

In [105]:
# Display the raw (un-encoded) test frame; only its Id column is used below.
df_test

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [106]:
# Build the feature matrix for the test set. The test frame has no SalePrice
# column, so that name is left out of the drop list.
#
# The exclusion is done on a derived list rather than with
# features_to_drop.remove(...): mutating the shared list made this cell raise
# ValueError on a second run and would leave SalePrice inside X if the
# feature/target cell were re-run afterwards.
test_features_to_drop = [col for col in features_to_drop if col != "SalePrice"]

# NOTE: the name X_train is kept because the prediction cell reads it; from
# here on it holds the *test* features, not the training split.
X_train = df_test_filled.drop(test_features_to_drop, axis=1)

In [107]:
# Predict sale prices for the test rows (X_train was rebound to the test
# feature matrix in the previous cell).
y_preds = model.predict(X_train)

In [108]:
# Start the submission table from the Id column of the raw test file.
predictions = df_test[["Id"]].copy()

In [109]:
# Attach the predicted prices; y_preds is assigned positionally, one value per row.
predictions["SalePrice"] = y_preds

In [110]:
# Write the Id / SalePrice table to disk without the DataFrame index.
predictions.to_csv("submission_5_2.csv", index=False)