# Imports

In [1]:
import pandas as pd 
import numpy as np 


# Loading Data

In [2]:


from sklearn.model_selection import train_test_split

df = pd.read_csv("../data/train.csv")

train_df,test_df = train_test_split(df,random_state=42)

In [3]:
train_df.isna().sum().sort_values()
train_df.dropna(axis=1).isna().sum().sort_values()

Id               0
MSSubClass       0
MSZoning         0
LotArea          0
Street           0
                ..
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
SalePrice        0
Length: 62, dtype: int64

# Preprocessing

In [4]:
y_train = train_df.pop("SalePrice")
y_test = test_df.pop("SalePrice")

In [5]:
train_df_num = train_df.select_dtypes("number").set_index("Id")
train_df_cat = train_df.select_dtypes("object")
test_df_num = test_df.select_dtypes("number").set_index("Id")
test_df_cat = test_df.select_dtypes("object")


## Numeric Preprocessing

### Scaling

In [6]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaler.fit(train_df_num)
train_df_num_scaled = pd.DataFrame(scaler.transform(train_df_num)
                                   ,columns=train_df_num.columns)
test_df_num_scaled = pd.DataFrame(scaler.transform(test_df_num)
                                   ,columns=test_df_num.columns)

train_df_num_scaled

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,1.475911,-1.086855,-0.683950,0.637073,-0.515364,1.107889,1.020374,-0.520898,-0.944261,-0.284678,...,-0.192617,0.458853,-0.428838,-0.344584,-0.121808,-0.280154,-0.073333,-0.123489,-0.508139,0.137143
1,-0.871228,0.301747,-0.054883,-0.094926,0.390453,0.094543,0.682585,-0.025462,0.469362,2.166141,...,0.030695,1.295148,-0.718262,-0.344584,-0.121808,-0.280154,15.000378,-0.123489,-2.000860,-1.372124
2,-0.167086,-0.412391,-0.152524,-0.094926,-0.515364,-1.049557,-1.681937,-0.602500,-0.533502,-0.284678,...,-0.938576,-0.738046,-0.718262,-0.344584,-0.121808,-0.280154,-0.073333,-0.123489,1.357763,0.891777
3,-0.871228,,0.144198,-0.826925,-0.515364,-0.363097,-0.330782,-0.602500,-0.979219,-0.284678,...,-0.910068,-0.738046,-0.718262,0.242811,-0.121808,-0.280154,-0.073333,1.704639,0.611402,-0.617490
4,-0.871228,0.182724,-0.090142,-0.094926,0.390453,-0.428474,-1.295893,0.813865,0.349193,-0.284678,...,-0.838798,-0.738046,-0.718262,4.142457,-0.121808,-0.280154,-0.073333,1.399951,-0.881319,-0.617490
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1090,-0.871228,0.301747,-0.129289,-0.094926,-0.515364,1.140578,1.020374,-0.602500,-0.926782,-0.284678,...,-0.178363,-0.738046,-0.399895,-0.344584,-0.121808,-0.280154,-0.073333,-0.123489,-1.254499,-0.617490
1091,-0.167086,-0.214020,-0.266078,-1.558925,-2.326999,-1.409132,-1.681937,-0.602500,0.379781,-0.284678,...,0.467817,2.568772,-0.081528,-0.344584,-0.121808,-0.280154,-0.073333,-0.123489,2.104124,0.891777
1092,-0.871228,-0.412391,-0.232808,-0.826925,1.296270,-0.526540,0.248285,-0.602500,-0.614343,-0.284678,...,0.448812,-0.738046,-0.718262,-0.344584,-0.121808,-0.280154,-0.073333,-0.123489,-0.881319,-1.372124
1093,-0.167086,-0.610763,-0.280725,0.637073,2.202087,-1.736018,0.634329,-0.602500,-0.979219,-0.284678,...,-1.242660,-0.738046,2.754834,-0.344584,-0.121808,-0.280154,-0.073333,-0.123489,-0.134958,-0.617490


### Filling Na (Numeric)

In [7]:
from sklearn.impute import SimpleImputer

impute = SimpleImputer(strategy='mean')
impute.fit(train_df_num_scaled)
train_df_num_scaled = pd.DataFrame(impute.transform(train_df_num_scaled)
                                   ,columns=train_df_num_scaled.columns)
test_df_num_scaled = pd.DataFrame(impute.transform(test_df_num_scaled)
                                                   ,columns=test_df_num_scaled.columns)


## Categorical columns

In [8]:
na_col = train_df_cat.isna().sum()/len(train_df_cat)
na_col = list(na_col[na_col>0].index)

In [9]:
train_df_cat.drop(columns=na_col,inplace=True)
test_df_cat.drop(columns=na_col,inplace=True)

In [10]:
more_than_seven_cat_col = list(train_df_cat.nunique()[train_df_cat.nunique() > 7].index)
train_df_cat.drop(columns=more_than_seven_cat_col,inplace=True)
test_df_cat.drop(columns=more_than_seven_cat_col,inplace=True)

In [11]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse_output=False,
                   drop="first",
                   handle_unknown="ignore"
                   )

ohe.fit(train_df_cat)

train_df_cat = pd.DataFrame(ohe.transform(train_df_cat),columns=ohe.get_feature_names_out())
test_df_cat = pd.DataFrame(ohe.transform(test_df_cat),columns=ohe.get_feature_names_out())



In [12]:
train_df_cat.shape, train_df_num_scaled.shape

((1095, 76), (1095, 36))

## Concatenate to create our train

In [13]:
X_train = train_df_num_scaled.join(train_df_cat)
X_test = test_df_num_scaled.join(test_df_cat)

In [14]:
X_train.shape , X_test.shape

((1095, 112), (365, 112))

In [15]:
X_train.shape

(1095, 112)

# Models 
## Baseline

In [16]:
# A la mano
print((y_test - y_test.mean()).abs().mean())
# sklearn
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test
                    ,np.ones_like(y_test)*y_test.mean()
                   )

59303.87802589604


np.float64(59303.87802589604)

## Linear Regression

In [17]:
from sklearn.linear_model import LinearRegression

# Instantiate the model
lin = LinearRegression()
# Fit the model on train
lin.fit(X_train,y_train)
# Make a prediction on test
y_pred = lin.predict(X_test)

lin.coef_

array([-3.35823253e+03,  2.49503672e+03,  5.60330323e+03,  1.38208832e+04,
        6.10032224e+03,  7.22396264e+03,  2.18524586e+03,  3.36136791e+03,
       -8.57671980e+16, -2.88257515e+16, -8.33260383e+16,  8.31413486e+16,
        4.25441259e+17,  4.84620577e+17,  5.38344733e+16, -5.79669604e+17,
        1.47716800e+02, -1.39891672e+03, -2.51558974e+02, -1.90913420e+03,
       -5.37487662e+03, -4.69093669e+03, -4.57711600e+02,  1.56442183e+03,
        1.65253390e+03, -6.41721748e+02,  6.20117327e+03,  1.90773380e+03,
       -6.16226975e+02, -4.37382572e+02,  1.09544360e+03,  2.29269980e+03,
        2.71123940e+03, -1.29600000e+03, -7.96000000e+02, -1.92000000e+02,
        2.90540000e+04,  1.61840000e+04,  2.31372500e+04,  1.75360000e+04,
        3.11040000e+04,  6.35600000e+03, -9.70000000e+02, -1.18400000e+03,
        1.66480000e+04, -5.62600000e+03,  2.32800000e+03, -2.08905000e+04,
        1.10880000e+04, -4.40000000e+03, -8.82996875e+03,  2.89600000e+03,
        1.16880000e+04, -

In [18]:
# MAE on the Train
print(f" The MAE on Train is {mean_absolute_error(y_train,lin.predict(X_train)):.2f}")
print(f" The MAE on Test is {mean_absolute_error(y_test,y_pred):.2f}")


 The MAE on Train is 16183.45
 The MAE on Test is 20635.16


### Decision Tree

In [19]:
from sklearn.tree import DecisionTreeRegressor 

tree = DecisionTreeRegressor(max_depth=10)

tree.fit(X_train,y_train)
y_pred = tree.predict(X_test)

# MAE on the Train
print(f" The MAE on Train is {mean_absolute_error(y_train,lin.predict(X_train)):.2f}")
print(f" The MAE on Test is {mean_absolute_error(y_test,y_pred):.2f}")


 The MAE on Train is 16183.45
 The MAE on Test is 23041.05


### Random Forest

In [20]:
from sklearn.ensemble import RandomForestRegressor

forest = RandomForestRegressor()


forest.fit(X_train,y_train)


# MAE on the Train
y_pred = forest.predict(X_test)
print(f" The MAE on Train is {mean_absolute_error(y_train,lin.predict(X_train)):.2f}")
print(f" The MAE on Test is {mean_absolute_error(y_test,y_pred):.2f}")


 The MAE on Train is 16183.45
 The MAE on Test is 17009.05


### KNN Regressor


In [21]:
from sklearn.neighbors import KNeighborsRegressor

# Instantie
knn = KNeighborsRegressor()
# Fit on the Train 
knn.fit(X_train,y_train) 
# Score  (R2 par defautl pour une regression)
print(knn.score(X_test,y_test))
from sklearn.metrics import mean_squared_error
# MAE on the Train
y_pred = knn.predict(X_test)
print(f" The MSE on Train is {mean_absolute_error(y_train,knn.predict(X_train)):.2f}")
print(f" The MSE on Test is {mean_absolute_error(y_test,y_pred):.2f}")


0.8149183687161994
 The MSE on Train is 18622.51
 The MSE on Test is 21313.85


## GridSearch 

In [22]:
RandomForestRegressor()

In [23]:
from sklearn.model_selection import GridSearchCV 

params = {"n_estimators":[50,100,200],
           #"max_depth": [None,10,15,20],
           #"min_samples_leaf":[1,2,5,10], 
           #"bootstrap": [True,False]

}


search = GridSearchCV(RandomForestRegressor()
                     ,params
                     ,verbose = 10)

search.fit(X_train
          ,y_train)


Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5; 1/3] START n_estimators=50.............................................
[CV 1/5; 1/3] END ..............n_estimators=50;, score=0.862 total time=   0.8s
[CV 2/5; 1/3] START n_estimators=50.............................................
[CV 2/5; 1/3] END ..............n_estimators=50;, score=0.534 total time=   0.8s
[CV 3/5; 1/3] START n_estimators=50.............................................
[CV 3/5; 1/3] END ..............n_estimators=50;, score=0.843 total time=   0.8s
[CV 4/5; 1/3] START n_estimators=50.............................................
[CV 4/5; 1/3] END ..............n_estimators=50;, score=0.872 total time=   0.8s
[CV 5/5; 1/3] START n_estimators=50.............................................
[CV 5/5; 1/3] END ..............n_estimators=50;, score=0.866 total time=   0.8s
[CV 1/5; 2/3] START n_estimators=100............................................
[CV 1/5; 2/3] END .............n_estimators=100;,

In [24]:
search.best_estimator_

In [25]:
search.best_params_

{'n_estimators': 200}

In [26]:
search.score(X_test,y_test)

0.8962232237631281

In [27]:
mean_absolute_error(search.predict(X_test),y_test)

np.float64(17085.408301369862)

## Training on whole Data

In [28]:
def f(a,b=4) :
    print(a,b)
    
f(**{"a":3,"b":5})

3 5


In [29]:
best_model = RandomForestRegressor(**search.best_params_)
X_total = pd.concat([X_train,X_test])
y_total = pd.concat([y_train,y_test])
best_model.fit(X_total,y_total)

# Pipeline 

A pipeline is a way to wrap our processor + model in one 

## Saving model without Pipeline

In [30]:
import pickle, joblib

# Reminder : Saving model
with open("model.pickle","wb") as file : 
    joblib.dump(best_model,file)

In [31]:

# Loading model 
with open("model.pickle","rb") as file : 
    new_model = joblib.load(file)
    


## Pipeline

In [32]:
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.compose import make_column_selector
from sklearn.compose import ColumnTransformer


In [33]:
num_preproc_pipe = Pipeline([("Imputer",SimpleImputer())
                             ,("Scaling",StandardScaler())
                            ])

cat_preproc_pipe = Pipeline([("imputer",SimpleImputer(strategy="most_frequent"))
                             ,("Encode",OneHotEncoder(drop="first",
                                                      handle_unknown="ignore"
                                                      ))
    
                            ])
preproc_pipe = ColumnTransformer([("NumPreproc",num_preproc_pipe,make_column_selector(dtype_include="number"))
                    ,("CatPreproc",cat_preproc_pipe,make_column_selector(dtype_include="object"))
                                 ])
preproc_pipe

In [34]:
from sklearn.ensemble import VotingRegressor


final_pipe = Pipeline([("Fulpreproc",preproc_pipe)
                      ,("Voting",VotingRegressor([("rand",RandomForestRegressor(min_samples_leaf=5))
                                                 ,("lin",LinearRegression())
                                                  ,("knn",KNeighborsRegressor())
                                                 ])
                       )])
final_pipe

In [35]:
final_pipe.fit(X_total,y_total)

In [36]:
import os

In [40]:
final_preproc = final_pipe.steps[0][1]

if not os.path.exists("../models") :
    os.mkdir("../models")

with open("../models/preproc.pickle","wb") as file :
    joblib.dump(final_preproc,file)

with open("../models/preproc.pickle","rb") as file :
    loaded_final_preproc = joblib.load(file)



    
final_model = final_pipe.steps[1][1]
final_model