In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import pickle
from typing import Union

In [2]:
# Load data after preprocessing
data = pd.read_csv('../../data/data/data_ready.csv')

In [3]:
def polynomial_model(model,
                    degree:int|tuple[int, int], 
                    X:pd.DataFrame,
                    y:list,
                    train_model = True,
                    predict_model = False
                    ) -> Union[None,np.array]:
    """Create polynomial model with given degree on the features and target.
    Using PolynomialFeatures() to generate polynomial included interaction features only.
    it can fit the model on the new features from PolynomialFeatures() generated.
    and predict target data by the new features from PolynomialFeatures() generated.

    Args:
        model (ScikitModel): model
        degree ([int or tuple(int, int)]): could be single number assign to maximal degree, 
                                            or tuple of numbers assign to (min_degree, max_degree).
        X (pd.DataFrame): samples of features dataframe.
        y (list, optional): samples of target column.
        train_model (bool, optional): train model on new features from polynomial features model. Defaults to True.
        predict_model (bool, optional): predict target data on new features from polynomial features model. Defaults to False.

    Returns:
        (None or np.array): nothing when train model and an array contains predicted values when predict model.
    """
    poly = PolynomialFeatures(degree, interaction_only=True)
    poly.fit(X)
    new_features = poly.transform(X)
    
    if train_model:
        model.fit(new_features, y)
    
    if predict_model:
        predict_data = model.predict(new_features) 
        return predict_data

In [4]:
def train_models(models:dict,
                X_train:pd.DataFrame, 
                y_train:list) -> dict:
    """Training all models in the dictionary on the features and target variable.
    Keeping in view the degree number, if any.

    Args:
        models (dict): Contains model name (key), model and degree number (optional).
        X_train (pd.DataFrame): features dataframe.
        y_train (list): target column.

    Returns:
        (dict): New dictionary contains models trained.
    """
    for model_id in models:
        degree = models[model_id].get('degree') if models[model_id].get('degree') else None
        
        polynomial_model(model=models[model_id]['model'],
                            degree=degree,
                            X=X_train,
                            y=y_train) if degree else models[model_id]['model'].fit(X_train, y_train)

        print('Trained -> ', models[model_id])

    return models

In [5]:
def evaluation_models(models:dict,
                      X:pd.DataFrame, 
                      y:list,
                      train:bool = False) -> dict:
    """Evaluation all models in the given dictionary by score R.
    Keeping in view the degree number, if any.

    Args:
        models (dict): Contains model name (key), model and degree number (optional).
        X (pd.DataFrame): Features dataframe.
        y (list): Target column.
        train (bool, optional): Whether to save the score in train_score or test_score. 
                                if set to True, the score will saved in train_score value dictionary. Defaults to False.

    Returns:
        (dict): Same given dictionary in addition to the score value for each model.
    """
    for model_id in models:
        degree = models[model_id].get('degree') if models[model_id].get('degree') else None
        
        y_predict = polynomial_model(model = models[model_id]['model'],
                                    X=X,
                                    y=y,
                                    degree=degree,
                                    train_model=False,
                                    predict_model=True
                                    ) if degree else models[model_id]['model'].predict(X)

        score = r2_score(y_predict, y)
        
        if train:
            models[model_id]["Train_Score"] = score
        else:
            models[model_id]["Test_Score"] = score
    
    return models

In [6]:
# Select All features exclude target feature
X = data[data.drop('price', axis=1).columns]
y = data['price']

# Spliting data to training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [7]:
# Dictionary contains all models we want to create
models = {
        'linear': {'model': LinearRegression()},
        'ridge1': {'model': Ridge(), 'degree' : 1},
        'lasso1': {'model': Lasso(), 'degree' : 1},
        'ploy_2': {'model': LinearRegression(), 'degree' : 2},
        'ridge2': {'model': Ridge(), 'degree' : 2},
        'lasso2': {'model': Lasso(), 'degree' : 2},
        'ridge3': {'model': Ridge(), 'degree' : 3},
        'lasso3': {'model': Lasso(), 'degree' : 3},
        'ploy_3': {'model': LinearRegression(), 'degree' : 3},
        'knn'   : {'model': KNeighborsRegressor(n_neighbors=3)},
        'D_tree': {'model': DecisionTreeRegressor(random_state=0)}
        }

In [8]:
# Training models
models = train_models(models, X_train, y_train)

Trained ->  {'model': LinearRegression()}
Trained ->  {'model': Ridge(), 'degree': 1}
Trained ->  {'model': Lasso(), 'degree': 1}
Trained ->  {'model': LinearRegression(), 'degree': 2}
Trained ->  {'model': Ridge(), 'degree': 2}


  model = cd_fast.enet_coordinate_descent(


Trained ->  {'model': Lasso(), 'degree': 2}
Trained ->  {'model': Ridge(), 'degree': 3}


  model = cd_fast.enet_coordinate_descent(


Trained ->  {'model': Lasso(), 'degree': 3}
Trained ->  {'model': LinearRegression(), 'degree': 3}
Trained ->  {'model': KNeighborsRegressor(n_neighbors=3)}
Trained ->  {'model': DecisionTreeRegressor(random_state=0)}


In [46]:
# Evaluation models on test dataset by score r^2 metric
models = evaluation_models(models, X_test, y_test)

In [47]:
# Evaluation models on train dataset by score r^2 metric
models = evaluation_models(models, X_train, y_train, train=True)

In [48]:
# Display score value on the test dataset for each model 
for model in models.keys():
    print('Train Score for ', model,' : ', models[model]['Train_Score']*100)
    print('Test Score for ', model,' : ', models[model]['Test_Score']*100, end='\n\n')

Train Score for  linear  :  58.033892705985465
Test Score for  linear  :  52.19344833114866

Train Score for  ridge1  :  57.63069237170526
Test Score for  ridge1  :  51.7137641210706

Train Score for  lasso1  :  57.95717858936595
Test Score for  lasso1  :  52.11878098028333

Train Score for  ploy_2  :  83.95027175120154
Test Score for  ploy_2  :  -0.05630833328733598

Train Score for  ridge2  :  81.0858664798923
Test Score for  ridge2  :  59.58241025058977

Train Score for  lasso2  :  81.86919348748094
Test Score for  lasso2  :  57.337443774084996

Train Score for  ridge3  :  94.46404420266309
Test Score for  ridge3  :  0.412887677129703

Train Score for  lasso3  :  94.40237317459082
Test Score for  lasso3  :  34.459363281033006

Train Score for  ploy_3  :  97.41109163977613
Test Score for  ploy_3  :  -0.08998636839514518

Train Score for  knn  :  72.53226352356124
Test Score for  knn  :  38.157019444321726

Train Score for  D_tree  :  99.99813098066798
Test Score for  D_tree  :  57.78

In [49]:
# Pickle best model to use it
pickle.dump(models['ridge2']['model'], open('../pkls/model.pkl', 'wb'))

In [50]:
models_metadata = {}

models_metadata = models

models_metadata['Test Size'] = 0.3
models_metadata['Metric Evaluation'] = 'r^2 score'
models_metadata['Data Shape Used'] = data.shape

In [51]:
with open("../metadata/models_metadata.txt", 'w') as f: 
    for key, value in models_metadata.items(): 
        f.write('%s:%s\n' % (key, value))

In [52]:
# Pickle polynomial model to use it
poly = PolynomialFeatures(2, interaction_only=True)
poly.fit(X_train)
pickle.dump(poly, open('../pkls/polynomial_features.pkl', 'wb'))