In [1]:
import pandas as pd

from sklearn.impute import SimpleImputer

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import cross_val_score

from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor

In [2]:
# Read the training set; the relative path is resolved against the kernel's
# working directory. The bare name on the last line renders the frame.
train_df = pd.read_csv(filepath_or_buffer='../data/train.csv')
train_df

Unnamed: 0,LandContour,GarageYrBlt,FullBath,1stFlrSF,BsmtFinSF1,TotRmsAbvGrd,ExterQual,OpenPorchSF,Heating,Condition2,...,GarageCars,KitchenQual,KitchenAbvGr,GrLivArea,2ndFlrSF,CentralAir,BsmtQual,TotalBsmtSF,Fireplaces,SalePrice
0,Lvl,2003.0,2,856,706,8,Gd,61,GasA,Norm,...,2,Gd,1,1710,854,Y,Gd,856,0,208500
1,Lvl,1976.0,2,1262,978,6,TA,0,GasA,Norm,...,2,TA,1,1262,0,Y,Gd,1262,1,181500
2,Lvl,2001.0,2,920,486,6,Gd,42,GasA,Norm,...,2,Gd,1,1786,866,Y,Gd,920,1,223500
3,Lvl,1998.0,1,961,216,7,TA,35,GasA,Norm,...,3,Gd,1,1717,756,Y,TA,756,1,140000
4,Lvl,2000.0,2,1145,655,9,Gd,84,GasA,Norm,...,3,Gd,1,2198,1053,Y,Gd,1145,1,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,Lvl,1999.0,2,953,0,7,TA,40,GasA,Norm,...,2,TA,1,1647,694,Y,Gd,953,1,175000
1456,Lvl,1978.0,2,2073,790,7,TA,0,GasA,Norm,...,2,TA,1,2073,0,Y,Gd,1542,2,210000
1457,Lvl,1941.0,2,1188,275,9,Ex,60,GasA,Norm,...,1,Gd,1,2340,1152,Y,TA,1152,2,266500
1458,Lvl,1950.0,1,1078,49,5,TA,0,GasA,Norm,...,1,Gd,1,1078,0,Y,TA,1078,0,142125


In [3]:
# Separate the target from the predictors: SalePrice is the value to predict,
# every remaining column of train_df is kept as a candidate feature.
y_train = train_df['SalePrice']
X_train = train_df.drop(columns=['SalePrice'])

In [4]:
# Columns handled as continuous numeric features.
num_cols = [
    'GarageYrBlt',
    '1stFlrSF',
    'BsmtFinSF1',
    'TotRmsAbvGrd',
    'OpenPorchSF',
    'GarageCars',
    'GrLivArea',
    '2ndFlrSF',
    'TotalBsmtSF',
]

# Every other feature column is handled as categorical (original column order kept).
cat_cols = [name for name in X_train.columns if name not in num_cols]

# Numeric branch: fill gaps with the column mean, then standardise.
numeric_transformer = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())

# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown='ignore' lets categories not seen during fit pass through transform
# without raising.
categorical_transformer = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore', sparse_output=False),
)

# Route each column group through its own branch.
preprocessor = make_column_transformer(
    (numeric_transformer, num_cols),
    (categorical_transformer, cat_cols),
)

preprocessor

In [5]:
# Choosing the best model
# Compare several regressors behind the same preprocessing using 3-fold
# cross-validation and collect mean / std of the score for each of them.

# Fixed seed for the stochastic estimators (forest, tree, MLP) so that the
# scores -- and therefore the model picked as "best" -- are the same on every
# Restart & Run All.
RANDOM_STATE = 42

models = [
    RandomForestRegressor(random_state=RANDOM_STATE),
    LinearRegression(),
    SVR(),
    DecisionTreeRegressor(random_state=RANDOM_STATE),
    MLPRegressor(random_state=RANDOM_STATE),
]
metrics = []
std_l = []
pipeline_l = []

for model in models:
    regressor_pipeline = make_pipeline(
        preprocessor,
        model
    )

    # No `scoring` is passed, so the estimator's default scorer is used
    # (R^2 for scikit-learn regressors), hence the r2_* column names below.
    cv = cross_val_score(regressor_pipeline, X_train, y_train, cv=3)

    metrics.append(cv.mean())
    std_l.append(cv.std())
    pipeline_l.append(regressor_pipeline)

# One row per candidate; the pipeline object is kept so the winner can be refit later.
metrics_df = pd.DataFrame({'model': models, 'r2_score': metrics, 'r2_std': std_l, 'pipeline': pipeline_l})
metrics_df



Unnamed: 0,model,r2_score,r2_std,pipeline
0,RandomForestRegressor(),0.851299,0.026259,(ColumnTransformer(transformers=[('pipeline-1'...
1,LinearRegression(),0.79824,0.049901,(ColumnTransformer(transformers=[('pipeline-1'...
2,SVR(),-0.053913,0.023465,(ColumnTransformer(transformers=[('pipeline-1'...
3,DecisionTreeRegressor(),0.684621,0.039622,(ColumnTransformer(transformers=[('pipeline-1'...
4,MLPRegressor(),-5.084605,0.280612,(ColumnTransformer(transformers=[('pipeline-1'...


In [6]:
# Select the pipeline with the highest mean R^2 and fit it on the full training set.
# idxmax() returns the index *label* of the best row, so the lookup works wherever
# the winner sits in the table. Indexing the filtered Series with [0] is a label
# lookup and raises KeyError unless the winner happens to have index label 0.
best_idx = metrics_df['r2_score'].idxmax()
best_pipeline = metrics_df.loc[best_idx, 'pipeline']
best_pipeline.fit(X_train, y_train)
best_pipeline

# A grid search could tune the hyperparameters further, but this is enough for now

In [9]:
import pickle

# Serialise the fitted pipeline into the server directory so the prediction
# service can load it. The path is relative to the kernel's working directory.
with open('../../server/best_regressor_pipeline.pkl', mode='wb') as model_file:
    pickle.dump(best_pipeline, model_file)

In [69]:
import requests

# Smoke-test the deployed prediction endpoint with the training data.
#server_url = "http://localhost:5000/predict"
server_url = 'https://sirpoopy.pythonanywhere.com/web_bee_predict_sale_price'

# The frame is serialised to JSON and sent as the multipart "file" field.
# NOTE(review): train_df still contains the SalePrice target column -- confirm
# the server ignores it, or send X_train instead.
# An explicit timeout is required: requests waits indefinitely by default, which
# would hang the notebook if the server never answers.
response = requests.post(server_url, files={"file": train_df.to_json()}, timeout=60)

if response.status_code == 200:
    print(response.text)
else:
    # The payload is JSON, not a CSV upload, so the message describes the request itself.
    print(f"Prediction request failed: {response.status_code} - {response.text}")

[206870.0, 177102.95, 217425.0, 169444.2, 258903.26, 140499.93, 303327.72, 210272.96, 148165.5, 122989.5, 130449.0, 345066.27, 139169.0, 252406.13, 155509.5, 130310.76, 149486.5, 96897.3, 155442.0, 140203.5, 324448.12, 130053.0, 230769.78, 135564.0, 146081.35, 249456.34, 134619.5, 309574.91, 190404.5, 66156.27, 67718.66, 139308.8, 186649.47, 176600.0, 274208.79, 314697.19, 145879.8, 154203.5, 118355.25, 89088.5, 161510.5, 165418.0, 141649.3333333333, 124905.58, 137776.5, 301904.75, 260662.41, 246454.31, 118712.94, 128954.33, 174001.5, 116658.76, 99016.2, 373513.11, 133720.0, 172262.0, 176702.0, 195295.0, 424982.86, 124744.0, 163490.0, 92786.86, 207205.3, 152054.82, 220823.12, 315874.95, 208445.0, 220706.5, 85964.5, 232701.6, 250377.8, 117536.5, 186070.0, 143122.5, 131051.07, 92720.0, 127614.98, 123749.0, 134278.05, 112197.0, 199464.0, 160580.0, 237111.0, 129651.45, 169964.32, 266229.98, 174854.9, 163029.83, 103080.33, 119433.58, 107190.0, 118163.37, 150866.0, 162521.74, 207807.0, 18034

In [74]:
# Number of predictions returned (should match the number of rows sent).
# The body is parsed as JSON instead of being passed to eval(): evaluating text
# received from a remote server would execute whatever code it chose to send.
len(response.json())

1460