In [71]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error  #for test not use it 

from sklearn.metrics import r2_score
import pickle
from typing import Union

In [72]:
# Load data after preprocessing
data = pd.read_csv('data/data_ready.csv')

    Create polynomial model with given degree on the features and target.
    Using PolynomialFeatures() to generate polynomial included interaction features only.
    it can fit the model on the new features from PolynomialFeatures() generated.
    and predict target data by the new features from PolynomialFeatures() generated.



In [73]:
def polynomial_model(model,
                    degree:int|tuple[int, int], 
                    X:pd.DataFrame,
                    y:list,
                    train_model = True,
                    predict_model = False
                    ) -> Union[None,np.array]:

    poly = PolynomialFeatures(degree, interaction_only=True)
    poly.fit(X)
    new_features = poly.transform(X)
    
    if train_model:
        model.fit(new_features, y)
    
    if predict_model:
        predict_data = model.predict(new_features) 
        return predict_data

    Training all models in the dictionary on the features and target variable.
    Keeping in view the degree number.


In [74]:
def train_models(models:dict,
                X_train:pd.DataFrame, 
                y_train:list) -> dict:

    for model_id in models:
        degree = models[model_id].get('degree') if models[model_id].get('degree') else None
        
        polynomial_model(model=models[model_id]['model'],
                            degree=degree,
                            X=X_train,
                            y=y_train) if degree else models[model_id]['model'].fit(X_train, y_train)

        print('Trained -> ', models[model_id])

    return models

In [75]:
def evaluation_models(models:dict,
                      X:pd.DataFrame, 
                      y:list,
                      train:bool = False) -> dict:

    for model_id in models:
        degree = models[model_id].get('degree') if models[model_id].get('degree') else None
        
        y_predict = polynomial_model(model = models[model_id]['model'],
                                    X=X,
                                    y=y,
                                    degree=degree,
                                    train_model=False,
                                    predict_model=True
                                    ) if degree else models[model_id]['model'].predict(X)

        score = r2_score(y_predict, y)
        
        if train:
            models[model_id]["Train_Score"] = score
        else:
            models[model_id]["Test_Score"] = score
    
    return models

In [76]:
# Select All features exclude target feature
X = data[data.drop('price', axis=1).columns]
y = data['price']

# Spliting data to training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [77]:
# Dictionary contains all models we want to create
models = {
        'linear': {'model': LinearRegression()},
        'ploy_2': {'model': LinearRegression(), 'degree' : 2},
        'ploy_3': {'model': LinearRegression(), 'degree' : 3},
        'knn'   : {'model': KNeighborsRegressor(n_neighbors=3)},
        'D_tree': {'model': DecisionTreeRegressor(random_state=0)}
        }

In [78]:
# Training models
models = train_models(models, X_train, y_train)

Trained ->  {'model': LinearRegression()}
Trained ->  {'model': LinearRegression(), 'degree': 2}
Trained ->  {'model': LinearRegression(), 'degree': 3}
Trained ->  {'model': KNeighborsRegressor(n_neighbors=3)}
Trained ->  {'model': DecisionTreeRegressor(random_state=0)}


In [79]:
# Evaluation models on test dataset by score r^2 metric
models = evaluation_models(models, X_test, y_test)

In [80]:
# Evaluation models on train dataset by score r^2 metric
models = evaluation_models(models, X_train, y_train, train=True)

In [81]:
# Display score value on the test dataset for each model 
for model in models.keys():
    print('Train Score for ', model,' : ', models[model]['Train_Score']*100)
    print('Test Score for ', model,' : ', models[model]['Test_Score']*100, end='\n\n')

Train Score for  linear  :  60.33948563538966
Test Score for  linear  :  54.780913569779386

Train Score for  ploy_2  :  88.89576893130324
Test Score for  ploy_2  :  0.5936241485962057

Train Score for  ploy_3  :  99.49023371763224
Test Score for  ploy_3  :  -0.054120905492904825

Train Score for  knn  :  21.900187077445477
Test Score for  knn  :  -96.57201428681459

Train Score for  D_tree  :  100.0
Test Score for  D_tree  :  60.978668550175556



In [82]:
# Pickle best model to use it
pickle.dump(models['D_tree']['model'], open('./pkls/model.pkl', 'wb'))

In [83]:
# Pickle polynomial model to use it
poly = PolynomialFeatures(2, interaction_only=True)
poly.fit(X_train)
pickle.dump(poly, open('./pkls/polynomial_features.pkl', 'wb'))

In [84]:
models_metadata = {}

models_metadata = models

models_metadata['Test Size'] = 0.3
models_metadata['Metric Evaluation'] = 'r^2 score'
models_metadata['Data Shape Used'] = data.shape

In [85]:
with open("./metadata/models_metadata.txt", 'w') as f: 
    for key, value in models_metadata.items(): 
        f.write('%s:%s\n' % (key, value))