In [1]:
import pickle

import pandas as pd
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from ml.data.data import Data
from ml.model.evaluate.model_evaluator import ModelEvaluator
from ml.model.evaluate.model_evaluator import ScoreCriteria
from ml.model.evaluate.score import RMSE, MAE
from ml.model.preprocess.preprocessor import Preprocessor
from ml.model.preprocess.train_test_splitter import TrainTestSplitter

In [2]:
# Name of the target column; reused by the preprocessing and splitting cells.
class_column = 'SalePrice'
# Where the feature-column list is persisted.
feature_columns_path = 'feature_columns.pickle'

# Build the project Data wrapper and pull out the frame it exposes.
data = Data(class_column)
dataset = data.dataset
# Keep the target column aside so it can be re-attached after preprocessing.
class_col_df = dataset[class_column]

data.save_feature_columns(feature_columns_path)

2024-01-27 14:31:14,636 - training - INFO - [Data] - Found data.csv locally.
2024-01-27 14:31:14,644 - training - INFO - [Data] - Succesfully saved feature columns to feature_columns.pickle


In [3]:
# Feature-only view of the data: the pipeline is fitted without the target column.
feature_df = dataset.drop(columns=[class_column])

set_preprocessor: Preprocessor = Preprocessor(
    data=dataset,
    class_column=class_column,
)
preprocess_pipeline = set_preprocessor.get_preprocess_pipeline()
fit_preprocess_pipeline = preprocess_pipeline.fit(feature_df)

# Persist the fitted pipeline so it can be reloaded later from disk.
set_preprocessor.save_pipeline(
    fit_preprocess_pipeline,
    'preprocess_pipeline.pickle',
)

2024-01-27 14:31:25,777 - training - INFO - [PreProcessor] - Successfully saved pipeline to preprocess_pipeline.pickle


In [4]:
# Run the fitted pipeline over the dataset, then re-attach the target column
# that was set aside earlier so features and target live in one frame.
transformed_features = fit_preprocess_pipeline.transform(dataset)
processed_df = pd.concat(
    [transformed_features, class_col_df],
    axis=1,
)

In [5]:
# Show the combined frame (transformed feature columns plus the target column).
processed_df

Unnamed: 0_level_0,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,LotConfig_Corner,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,...,Exterior1st_Wd Sdng,Exterior1st_WdShing,MSSubClass,LotArea,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF2,TotalBsmtSF,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,60.0,8450.0,5.0,2003.0,2003.0,0.0,856.0,208500.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,20.0,9600.0,8.0,1976.0,1976.0,0.0,1262.0,181500.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,60.0,11250.0,5.0,2001.0,2002.0,0.0,920.0,223500.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,70.0,9550.0,5.0,1915.0,1970.0,0.0,756.0,140000.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,60.0,14260.0,5.0,2000.0,2000.0,0.0,1145.0,250000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,160.0,1936.0,7.0,1970.0,1970.0,0.0,546.0,
2915,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,160.0,1894.0,5.0,1970.0,1970.0,0.0,546.0,
2916,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,20.0,20000.0,7.0,1960.0,1996.0,0.0,1224.0,
2917,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,85.0,10441.0,5.0,1992.0,1992.0,0.0,912.0,


In [6]:
# Split the processed frame using the project's TrainTestSplitter.
# The target name comes from the shared `class_column` variable instead of a
# repeated 'SalePrice' literal, so it is defined in exactly one place.
splitter = TrainTestSplitter(df=processed_df, class_column=class_column)
# split() returns five objects: train/validation features and targets, plus a
# separate test feature set (the log reports the test set's row count).
X_train, X_valid, Y_train, Y_valid, X_test = splitter.split(test_size=0.2)

2024-01-27 14:31:32,621 - training - INFO - [Data] - Split data into train and test sets with
2024-01-27 14:31:32,623 - training - INFO - [Data] - Creating test set with 1459 rows.


In [16]:
# Seed for stochastic estimators so that re-running the notebook yields the
# same fitted models, scores and best-model choice.
RANDOM_STATE = 42

# Candidate regressors keyed by a short label; the label is what the
# evaluation cell uses to report scores and pick the best model.
models = {
    'SVR': svm.SVR(),
    # Random forests bootstrap samples and subsample features at random;
    # without a fixed random_state the results differ between runs.
    'RFR': RandomForestRegressor(n_estimators=10, random_state=RANDOM_STATE),
    'LR': LinearRegression()
}

In [17]:
evaluator: ModelEvaluator = ModelEvaluator()

# Score label -> score object handed to the evaluator, in evaluation order.
score_types = {'RMSE': RMSE, 'MAE': MAE}

# Nested mapping: model label -> {score label -> evaluated score}.
scores = {}
for model_name, model in models.items():
    # Register the (initially empty) score table before fitting the model.
    scores[model_name] = model_scores = {}
    model.fit(X_train, Y_train)
    for score_label, score_type in score_types.items():
        model_scores[score_label] = evaluator.evaluate(model, X_valid, Y_valid, score_type)

# TODO: make this function accept Score object instead of string
best_model_name: str = evaluator.choose_best_model(scores, ScoreCriteria.MIN, 'RMSE')

In [18]:
# Index the dict directly: an unknown name raises KeyError here instead of
# dict.get() returning None and that None being handed to save_model.
best_model = models[best_model_name]
evaluator.save_model(best_model, 'final_model2.pickle')

In [10]:
# A single hand-written input record (raw, un-encoded feature values) wrapped
# in a list so it can be turned into a one-row DataFrame.
test_case = [
    dict(
        MSSubClass=60,
        MSZoning="RL",
        LotArea=9627,
        LotConfig="Inside",
        BldgType="1Fam",
        OverallCond=5,
        YearBuilt=1993,
        YearRemodAdd=1994,
        Exterior1st="HdBoard",
        BsmtFinSF2=0.0,
        TotalBsmtSF=996.0,
    ),
]

In [11]:
# Turn the list of record dicts into a DataFrame (one row per dict, one
# column per key) and display it.
x = pd.DataFrame(data=test_case)
x

Unnamed: 0,MSSubClass,MSZoning,LotArea,LotConfig,BldgType,OverallCond,YearBuilt,YearRemodAdd,Exterior1st,BsmtFinSF2,TotalBsmtSF
0,60,RL,9627,Inside,1Fam,5,1993,1994,HdBoard,0.0,996.0


In [12]:
# Reload the pickled preprocessing pipeline from disk.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files produced by this project, never pickles from an untrusted source.
# The `with` block closes the file handle deterministically instead of
# leaving it to the garbage collector.
with open('preprocess_pipeline.pickle', 'rb') as pipeline_file:
    sdsd = pickle.load(pipeline_file)
