# Imports

In [None]:
import pandas as pd 
import numpy as np 


# Loading Data

In [None]:


from sklearn.model_selection import train_test_split

# Load the full labelled dataset from disk.
df = pd.read_csv(filepath_or_buffer="../data/train.csv")

# Hold out a test split; the fixed seed keeps the partition reproducible.
train_df, test_df = train_test_split(df, random_state=42)

In [None]:
# Per-column count of missing values, sorted ascending.
# Note: only the last expression of a cell is rendered, so this result is not shown.
train_df.isnull().sum().sort_values()
# Same count after discarding every column that holds at least one NaN.
train_df.dropna(axis="columns").isnull().sum().sort_values()

# Preprocessing

In [None]:
# Detach the target from both splits: `pop` removes the "SalePrice" column
# from the feature frame in place and returns it.
y_train, y_test = train_df.pop("SalePrice"), test_df.pop("SalePrice")

In [None]:
# Numeric features; "Id" is moved to the index so it is not part of the columns.
train_df_num = train_df.select_dtypes(include="number").set_index(keys="Id")
test_df_num = test_df.select_dtypes(include="number").set_index(keys="Id")

# Categorical (object-dtype) features.
train_df_cat = train_df.select_dtypes(include="object")
test_df_cat = test_df.select_dtypes(include="object")


## Numeric Preprocessing

### Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics on the training split only, then apply them
# to both splits.
scaler = StandardScaler()
train_df_num_scaled = pd.DataFrame(
    scaler.fit_transform(train_df_num),
    columns=train_df_num.columns,
)
test_df_num_scaled = pd.DataFrame(
    scaler.transform(test_df_num),
    columns=test_df_num.columns,
)

# The rebuilt frames get a fresh default index (no `index=` is passed).
train_df_num_scaled

### Filling Na (Numeric)

In [None]:
from sklearn.impute import SimpleImputer

# Replace the remaining NaNs with the per-column mean learned on the
# (already scaled) training split.
impute = SimpleImputer(strategy="mean")
train_df_num_scaled = pd.DataFrame(
    impute.fit_transform(train_df_num_scaled),
    columns=train_df_num_scaled.columns,
)
test_df_num_scaled = pd.DataFrame(
    impute.transform(test_df_num_scaled),
    columns=test_df_num_scaled.columns,
)


## Categorical columns

In [None]:
# Names of the categorical columns that have at least one missing value in
# the training split (i.e. a missing-value fraction above zero).
na_col = [col for col in train_df_cat.columns if train_df_cat[col].isna().any()]

In [None]:
# Remove the categorical columns listed in `na_col` from both splits.
# The frames are rebound rather than mutated with `inplace=True`, and
# `errors="ignore"` lets this cell be re-run once the columns are already
# gone (a second in-place drop would raise KeyError).
train_df_cat = train_df_cat.drop(columns=na_col, errors="ignore")
test_df_cat = test_df_cat.drop(columns=na_col, errors="ignore")

In [None]:
# Keep only low-cardinality categoricals: drop every column with more than
# 7 distinct values in the training split (presumably to limit the number of
# one-hot columns created afterwards).
# The cardinalities are computed once, and the frames are rebound instead of
# being mutated in place.
cat_cardinality = train_df_cat.nunique()
more_than_seven_cat_col = list(cat_cardinality[cat_cardinality > 7].index)
train_df_cat = train_df_cat.drop(columns=more_than_seven_cat_col)
test_df_cat = test_df_cat.drop(columns=more_than_seven_cat_col)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Dense one-hot encoding: the first level of every feature is dropped, and
# categories unseen during fit do not raise at transform time.
ohe = OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False)

# Fit on the training split only, then encode both splits with the same
# learned categories and output column names.
train_df_cat = pd.DataFrame(
    ohe.fit_transform(train_df_cat),
    columns=ohe.get_feature_names_out(),
)
test_df_cat = pd.DataFrame(
    ohe.transform(test_df_cat),
    columns=ohe.get_feature_names_out(),
)

In [None]:
# Shapes of the encoded categorical part and the scaled numeric part.
(train_df_cat.shape, train_df_num_scaled.shape)

## Concatenate to create our train and test sets

In [None]:
# Numeric and categorical parts were both rebuilt as DataFrames without an
# explicit index, so this index-based join lines the rows up by position.
X_train = train_df_num_scaled.join(other=train_df_cat)
X_test = test_df_num_scaled.join(other=test_df_cat)

In [None]:
# Both matrices should have the same number of columns.
(X_train.shape, X_test.shape)

In [None]:
# (rows, columns) of the final training feature matrix.
X_train.shape

# Models 
## Baseline

In [None]:
from sklearn.metrics import mean_absolute_error

# Baseline "model": always predict the mean sale price learned on the
# TRAINING split. Using the test mean would leak test information into the
# baseline and make it look slightly better than it really is.
baseline_pred = y_train.mean()

# By hand
print((y_test - baseline_pred).abs().mean())
# Same value computed with sklearn
mean_absolute_error(y_test, np.full(len(y_test), baseline_pred))

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Build the model and fit it on the training split in one step
# (`fit` returns the estimator itself).
lin = LinearRegression().fit(X_train, y_train)
# Predictions for the held-out split, reused by the evaluation cell below.
y_pred = lin.predict(X_test)

# Learned coefficients.
lin.coef_

In [None]:
# Mean absolute error of the linear model on each split; `y_pred` holds the
# test predictions computed in the previous cell.
mae_train = mean_absolute_error(y_train, lin.predict(X_train))
mae_test = mean_absolute_error(y_test, y_pred)
print(f" The MAE on Train is {mae_train:.2f}")
print(f" The MAE on Test is {mae_test:.2f}")


### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Regression tree limited to a depth of 10.
tree = DecisionTreeRegressor(max_depth=10)

tree.fit(X_train, y_train)
y_pred = tree.predict(X_test)

# MAE on both splits. The train score must come from `tree` itself, not from
# the previously fitted linear model.
print(f" The MAE on Train is {mean_absolute_error(y_train,tree.predict(X_train)):.2f}")
print(f" The MAE on Test is {mean_absolute_error(y_test,y_pred):.2f}")


### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default hyper-parameters.
forest = RandomForestRegressor()

forest.fit(X_train, y_train)
y_pred = forest.predict(X_test)

# MAE on both splits. The train score must come from `forest` itself, not
# from the previously fitted linear model.
print(f" The MAE on Train is {mean_absolute_error(y_train,forest.predict(X_train)):.2f}")
print(f" The MAE on Test is {mean_absolute_error(y_test,y_pred):.2f}")


### KNN Regressor


In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# Instantiate
knn = KNeighborsRegressor()
# Fit on the train split
knn.fit(X_train, y_train)
# Score (R2 is the default metric for a regressor)
print(knn.score(X_test, y_test))

# MAE on both splits. The values are computed with mean_absolute_error, so
# they are labelled "MAE" to stay comparable with the other models.
y_pred = knn.predict(X_test)
print(f" The MAE on Train is {mean_absolute_error(y_train,knn.predict(X_train)):.2f}")
print(f" The MAE on Test is {mean_absolute_error(y_test,y_pred):.2f}")


## GridSearch 

In [None]:
# Display the repr of a default-constructed RandomForestRegressor.
RandomForestRegressor()

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: only the forest size is searched. The other candidate
# dimensions are kept below, disabled.
params = {
    "n_estimators": [50, 100, 200],
    # "max_depth": [None, 10, 15, 20],
    # "min_samples_leaf": [1, 2, 5, 10],
    # "bootstrap": [True, False],
}

# Exhaustive cross-validated search with verbose progress logging.
search = GridSearchCV(RandomForestRegressor(), params, verbose=10)

search.fit(X_train, y_train)


In [None]:
# Best estimator found by the grid search.
search.best_estimator_

In [None]:
# Hyper-parameter combination of the best estimator.
search.best_params_

In [None]:
# Score of the search's best estimator on the held-out split.
search.score(X_test,y_test)

In [None]:
# MAE of the tuned model on the held-out split, with the arguments in
# sklearn's (y_true, y_pred) order; the value is the same either way.
mean_absolute_error(y_test, search.predict(X_test))

## Training on whole Data

In [None]:
def f(a, b=4):
    """Print ``a`` and ``b`` on one line, separated by a space."""
    print(a, b)


# Demo of dictionary unpacking: every key is passed as a keyword argument,
# which is how `**search.best_params_` is used in the next cell.
f(**dict(a=3, b=5))

In [None]:
# Stack the train and test splits back together so the final model sees every
# labelled row.
X_total = pd.concat([X_train, X_test])
y_total = pd.concat([y_train, y_test])

# Forest configured with the best hyper-parameters found by the grid search.
best_model = RandomForestRegressor(**search.best_params_)
best_model.fit(X_total, y_total)

# Pipeline 

A pipeline wraps our preprocessing steps and the model into a single object.

## Saving model without Pipeline

In [None]:
import pickle, joblib

# Reminder: persist the fitted model to disk.
with open("model.pickle", mode="wb") as model_file:
    joblib.dump(best_model, model_file)

In [None]:

# Restore the model from disk. Only load pickles from a trusted source:
# unpickling can execute arbitrary code.
with open("model.pickle", mode="rb") as model_file:
    new_model = joblib.load(model_file)
    


## Pipeline

In [None]:
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.compose import make_column_selector
from sklearn.compose import ColumnTransformer


In [None]:
# Numeric branch: impute (SimpleImputer defaults), then standardise.
num_preproc_pipe = Pipeline(steps=[
    ("Imputer", SimpleImputer()),
    ("Scaling", StandardScaler()),
])

# Categorical branch: fill with the most frequent value, then one-hot encode.
cat_preproc_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("Encode", OneHotEncoder(handle_unknown="ignore", drop="first")),
])

# Route each column to its branch according to its dtype.
preproc_pipe = ColumnTransformer(transformers=[
    ("NumPreproc", num_preproc_pipe, make_column_selector(dtype_include="number")),
    ("CatPreproc", cat_preproc_pipe, make_column_selector(dtype_include="object")),
])
preproc_pipe

In [None]:
from sklearn.ensemble import VotingRegressor

# The three regressors combined by the voting ensemble.
voting_members = [
    ("rand", RandomForestRegressor(min_samples_leaf=5)),
    ("lin", LinearRegression()),
    ("knn", KNeighborsRegressor()),
]

# Full pipeline: preprocessing followed by the voting ensemble.
final_pipe = Pipeline(steps=[
    ("Fulpreproc", preproc_pipe),
    ("Voting", VotingRegressor(voting_members)),
])
final_pipe

In [None]:
# Fit the whole pipeline on the RAW labelled data. Its first step already
# imputes, scales and encodes, so feeding it the pre-processed `X_total`
# would scale the numeric features a second time and leave the categorical
# branch with no object columns to encode; the fitted preprocessor could then
# not be applied to raw data.
# "Id" is moved to the index, as in the manual preprocessing, so it is not
# used as a feature.
X_raw = df.drop(columns=["SalePrice"]).set_index("Id")
y_raw = df["SalePrice"]
final_pipe.fit(X_raw, y_raw)

In [None]:
import os

In [None]:
# Fitted preprocessing step (first step of the pipeline).
final_preproc = final_pipe.steps[0][1]

# Make sure the directory that is written to below exists. The created path
# must be the same "../models" used by `open`, otherwise the write fails.
os.makedirs("../models", exist_ok=True)

with open("../models/preproc.pickle", "wb") as file:
    joblib.dump(final_preproc, file)

# Round trip: reload what was just written.
with open("../models/preproc.pickle", "rb") as file:
    loaded_final_preproc = joblib.load(file)

loaded_final_preproc