In [1]:
import pandas as pd

from sklearn.impute import SimpleImputer

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import cross_val_score

from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor

In [2]:
# Load the training data; the bare expression on the last line renders the frame as the cell output.
train_df = pd.read_csv(filepath_or_buffer='../data/train.csv')
train_df

Unnamed: 0,LandContour,GarageYrBlt,FullBath,1stFlrSF,BsmtFinSF1,TotRmsAbvGrd,ExterQual,OpenPorchSF,Heating,Condition2,...,GarageCars,KitchenQual,KitchenAbvGr,GrLivArea,2ndFlrSF,CentralAir,BsmtQual,TotalBsmtSF,Fireplaces,SalePrice
0,Lvl,2003.0,2,856,706,8,Gd,61,GasA,Norm,...,2,Gd,1,1710,854,Y,Gd,856,0,208500
1,Lvl,1976.0,2,1262,978,6,TA,0,GasA,Norm,...,2,TA,1,1262,0,Y,Gd,1262,1,181500
2,Lvl,2001.0,2,920,486,6,Gd,42,GasA,Norm,...,2,Gd,1,1786,866,Y,Gd,920,1,223500
3,Lvl,1998.0,1,961,216,7,TA,35,GasA,Norm,...,3,Gd,1,1717,756,Y,TA,756,1,140000
4,Lvl,2000.0,2,1145,655,9,Gd,84,GasA,Norm,...,3,Gd,1,2198,1053,Y,Gd,1145,1,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,Lvl,1999.0,2,953,0,7,TA,40,GasA,Norm,...,2,TA,1,1647,694,Y,Gd,953,1,175000
1456,Lvl,1978.0,2,2073,790,7,TA,0,GasA,Norm,...,2,TA,1,2073,0,Y,Gd,1542,2,210000
1457,Lvl,1941.0,2,1188,275,9,Ex,60,GasA,Norm,...,1,Gd,1,2340,1152,Y,TA,1152,2,266500
1458,Lvl,1950.0,1,1078,49,5,TA,0,GasA,Norm,...,1,Gd,1,1078,0,Y,TA,1078,0,142125


In [3]:
# Separate the regression target (SalePrice) from the feature columns (everything else).
y_train = train_df.loc[:, 'SalePrice']
X_train = train_df.drop(columns=['SalePrice'])

In [4]:
# Columns routed through the numeric branch (mean imputation, then standardization).
num_cols = [
    'GarageYrBlt',
    '1stFlrSF',
    'BsmtFinSF1',
    'TotRmsAbvGrd',
    'OpenPorchSF',
    'GarageCars',
    'GrLivArea',
    '2ndFlrSF',
    'TotalBsmtSF',
]

# Every other feature column is routed through the categorical branch,
# in the order it appears in X_train.
cat_cols = [name for name in X_train.columns if name not in num_cols]

# Numeric branch: fill missing values with the column mean, then scale.
numeric_transformer = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())

# Categorical branch: fill missing values with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored instead of raising.
categorical_transformer = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore', sparse_output=False),
)

# Apply each branch to its own column subset and concatenate the results.
preprocessor = make_column_transformer(
    (numeric_transformer, num_cols),
    (categorical_transformer, cat_cols),
)

preprocessor

In [5]:
# Choosing the best model
# Every candidate regressor is wrapped in the same preprocessing pipeline and scored
# with 3-fold cross-validation. cross_val_score uses the estimator's default scorer,
# which is R^2 for these regressors.

# Fixed seed so the stochastic models (forest, tree, MLP) give the same scores,
# and therefore the same "best" model, on every Restart & Run All.
SEED = 42

models = [
    RandomForestRegressor(random_state=SEED),
    LinearRegression(),
    SVR(),
    DecisionTreeRegressor(random_state=SEED),
    MLPRegressor(random_state=SEED),
]
# NOTE(review): SVR and MLPRegressor score far below the other models in the recorded
# run; the unscaled SalePrice target may be the reason — confirm before ruling them out.

metrics = []     # mean R^2 over the folds, one entry per model
std_l = []       # standard deviation of R^2 over the folds
pipeline_l = []  # the (unfitted) pipeline that was evaluated for each model

for model in models:
    regressor_pipeline = make_pipeline(
        preprocessor,
        model
    )

    # cross_val_score fits clones of the pipeline, so regressor_pipeline itself stays unfitted.
    cv = cross_val_score(regressor_pipeline, X_train, y_train, cv=3)

    metrics.append(cv.mean())
    std_l.append(cv.std())
    pipeline_l.append(regressor_pipeline)

metrics_df = pd.DataFrame({'model': models, 'r2_score': metrics, 'r2_std': std_l, 'pipeline': pipeline_l})
metrics_df



Unnamed: 0,model,r2_score,r2_std,pipeline
0,RandomForestRegressor(),0.851785,0.026543,(ColumnTransformer(transformers=[('pipeline-1'...
1,LinearRegression(),0.79824,0.049901,(ColumnTransformer(transformers=[('pipeline-1'...
2,SVR(),-0.053913,0.023465,(ColumnTransformer(transformers=[('pipeline-1'...
3,DecisionTreeRegressor(),0.727042,0.009188,(ColumnTransformer(transformers=[('pipeline-1'...
4,MLPRegressor(),-5.089873,0.284398,(ColumnTransformer(transformers=[('pipeline-1'...


In [6]:
# Select the pipeline with the highest mean cross-validated R^2 and refit it on the
# full training set. idxmax() returns the index label of the best row, so the lookup
# works whichever model wins; indexing a boolean-filtered Series with [0] is a label
# lookup and raises KeyError unless the winner happens to sit in row 0.
best_row = metrics_df['r2_score'].idxmax()
best_pipeline = metrics_df.loc[best_row, 'pipeline']
best_pipeline.fit(X_train, y_train)
best_pipeline

# A hyperparameter grid search would be a natural next step, but this is enough for now.

In [7]:
import pickle

# Serialize the fitted pipeline into the server folder
# (presumably loaded by the serving code there — verify).
model_path = '../../server_CHECK_SECOND/best_regressor_pipeline.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(best_pipeline, model_file)

In [8]:
# The model is deployed in the cloud; see the "server_CHECK_SECOND" folder.
# This cell sends the test set to the prediction endpoint and stores the returned
# predictions in ../data/result.csv.

import ast

import requests

test_df = pd.read_csv('../data/test.csv')

#server_url = "http://localhost:5000/predict" # LOCAL
server_url = 'https://sirpoopy.pythonanywhere.com/web_bee_predict_sale_price' # CLOUD

# The test frame is uploaded as a JSON string in the multipart "file" field.
# The timeout stops the notebook from hanging forever if the server does not answer.
response = requests.post(server_url, files={"file": test_df.to_json()}, timeout=60)

if response.status_code == 200:
    # Parse the body as a Python literal instead of eval(): the text comes from a remote
    # server, and eval() would execute arbitrary code it returned.
    predictions = ast.literal_eval(response.text)
    result = pd.DataFrame({'id': test_df.index, 'SalePrice': predictions})
    result.to_csv('../data/result.csv', index=False)
else:
    # Include the status code and the start of the body so the failure can be diagnosed.
    raise Exception(
        f"Prediction request to {server_url} failed with HTTP {response.status_code}: "
        f"{response.text[:200]}"
    )

In [9]:
# Read the saved predictions back from disk to check what was written.
pd.read_csv(filepath_or_buffer='../data/result.csv')

Unnamed: 0,id,SalePrice
0,0,122673.66
1,1,154773.50
2,2,180265.40
3,3,175480.00
4,4,194945.67
...,...,...
1454,1454,81858.93
1455,1455,86099.00
1456,1456,153753.20
1457,1457,108280.00
