In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, validation_curve
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

In [2]:
df = pd.read_csv('df_after_cleaning.csv').drop(['Unnamed: 0', 'source'], axis=1)
df

Unnamed: 0,postcode,house_is,property_subtype,price,rooms_number,area,equipped_kitchen_has,furnished,open_fire,terrace,...,land_surface,facades_number,swimming_pool_has,region,building_state_agg,postcode_median_price,building_state_median_price,property_subtype_median_price,building_property_subtype_median_facades,property_subtype_median_facades
0,4180,True,MIXED_USE_BUILDING,295000.0,3.0,242.0,True,False,False,True,...,1403.0,,False,W,good,229000.0,320000.0,310000.0,2.0,2.0
1,8730,True,VILLA,675000.0,4.0,349.0,True,False,False,False,...,1526.0,,False,F,good,241000.0,320000.0,540000.0,4.0,4.0
2,4020,True,APARTMENT_BLOCK,250000.0,5.0,303.0,True,False,False,False,...,760.0,,False,W,to_renovate,195000.0,230000.0,357500.0,,
3,1200,True,HOUSE,545000.0,4.0,235.0,True,True,False,False,...,63.0,,False,B,renovated,445000.0,310000.0,288000.0,3.0,3.0
4,1190,True,MIXED_USE_BUILDING,500000.0,2.0,220.0,True,False,False,False,...,193.0,,False,B,good,360000.0,320000.0,310000.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10048,4000,False,APARTMENT,245000.0,2.0,103.0,False,False,False,True,...,0.0,2.0,False,W,good,225000.0,320000.0,282500.0,2.0,2.0
10049,8790,False,APARTMENT,250000.0,1.0,300.0,False,False,False,False,...,0.0,2.0,False,F,good,257000.0,320000.0,282500.0,2.0,2.0
10050,2018,False,APARTMENT,298000.0,1.0,71.0,True,False,False,True,...,0.0,1.0,False,F,good,443475.0,320000.0,282500.0,2.0,2.0
10051,2000,False,FLAT_STUDIO,150000.0,1.0,40.0,True,False,False,False,...,0.0,2.0,False,F,to_renovate,497000.0,230000.0,149000.0,2.0,2.0


In [3]:
df.dtypes

postcode                                      int64
house_is                                       bool
property_subtype                             object
price                                       float64
rooms_number                                float64
area                                        float64
equipped_kitchen_has                           bool
furnished                                      bool
open_fire                                      bool
terrace                                        bool
terrace_area                                float64
garden                                         bool
garden_area                                 float64
land_surface                                float64
facades_number                              float64
swimming_pool_has                              bool
region                                       object
building_state_agg                           object
postcode_median_price                       float64
building_sta

In [4]:
# keeping only numerical features for this test
features_to_delete = ['postcode', 'house_is', 'property_subtype', 'equipped_kitchen_has', 'furnished', 'open_fire', 'terrace', 'terrace_area', 'garden', 'garden_area', 'facades_number', 'swimming_pool_has', 'region', 'building_state_agg']
df.drop(features_to_delete, axis=1, inplace=True)
df

Unnamed: 0,price,rooms_number,area,land_surface,postcode_median_price,building_state_median_price,property_subtype_median_price,building_property_subtype_median_facades,property_subtype_median_facades
0,295000.0,3.0,242.0,1403.0,229000.0,320000.0,310000.0,2.0,2.0
1,675000.0,4.0,349.0,1526.0,241000.0,320000.0,540000.0,4.0,4.0
2,250000.0,5.0,303.0,760.0,195000.0,230000.0,357500.0,,
3,545000.0,4.0,235.0,63.0,445000.0,310000.0,288000.0,3.0,3.0
4,500000.0,2.0,220.0,193.0,360000.0,320000.0,310000.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...
10048,245000.0,2.0,103.0,0.0,225000.0,320000.0,282500.0,2.0,2.0
10049,250000.0,1.0,300.0,0.0,257000.0,320000.0,282500.0,2.0,2.0
10050,298000.0,1.0,71.0,0.0,443475.0,320000.0,282500.0,2.0,2.0
10051,150000.0,1.0,40.0,0.0,497000.0,230000.0,149000.0,2.0,2.0


In [5]:
df.dropna(axis=0, inplace=True)
df

Unnamed: 0,price,rooms_number,area,land_surface,postcode_median_price,building_state_median_price,property_subtype_median_price,building_property_subtype_median_facades,property_subtype_median_facades
0,295000.0,3.0,242.0,1403.0,229000.0,320000.0,310000.0,2.0,2.0
1,675000.0,4.0,349.0,1526.0,241000.0,320000.0,540000.0,4.0,4.0
3,545000.0,4.0,235.0,63.0,445000.0,310000.0,288000.0,3.0,3.0
4,500000.0,2.0,220.0,193.0,360000.0,320000.0,310000.0,2.0,2.0
5,189000.0,3.0,200.0,100.0,229000.0,230000.0,288000.0,3.0,3.0
...,...,...,...,...,...,...,...,...,...
10048,245000.0,2.0,103.0,0.0,225000.0,320000.0,282500.0,2.0,2.0
10049,250000.0,1.0,300.0,0.0,257000.0,320000.0,282500.0,2.0,2.0
10050,298000.0,1.0,71.0,0.0,443475.0,320000.0,282500.0,2.0,2.0
10051,150000.0,1.0,40.0,0.0,497000.0,230000.0,149000.0,2.0,2.0


In [6]:
X = df.loc[:, df.columns != 'price'].to_numpy()
y = df['price'].to_numpy()

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4444, test_size=0.2)

In [8]:
X_train, y_train, X_test, y_test

(array([[3.000e+00, 1.250e+02, 0.000e+00, ..., 2.825e+05, 2.000e+00,
         2.000e+00],
        [3.000e+00, 1.160e+02, 0.000e+00, ..., 2.825e+05, 2.000e+00,
         2.000e+00],
        [2.000e+00, 8.600e+01, 0.000e+00, ..., 2.825e+05, 2.000e+00,
         2.000e+00],
        ...,
        [3.000e+00, 1.130e+02, 0.000e+00, ..., 2.825e+05, 2.000e+00,
         2.000e+00],
        [1.000e+00, 5.900e+01, 0.000e+00, ..., 2.825e+05, 2.000e+00,
         2.000e+00],
        [2.000e+00, 1.150e+02, 0.000e+00, ..., 2.825e+05, 2.000e+00,
         2.000e+00]]),
 array([285000., 345000., 289500., ..., 545000., 115000., 329000.]),
 array([[3.000e+00, 1.600e+02, 4.400e+02, ..., 2.880e+05, 3.000e+00,
         3.000e+00],
        [3.000e+00, 1.900e+02, 0.000e+00, ..., 2.825e+05, 2.000e+00,
         2.000e+00],
        [2.000e+00, 1.970e+02, 0.000e+00, ..., 2.825e+05, 2.000e+00,
         2.000e+00],
        ...,
        [1.000e+00, 1.100e+02, 1.100e+02, ..., 2.880e+05, 3.000e+00,
         3.000e+00],
   

In [9]:
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

score_train = lin_reg.score(X_train, y_train)
score_test = lin_reg.score(X_test, y_test)
print(f'train score:\t{score_train}\ntest score:\t{score_test}')

train score:	0.6242022626845257
test score:	0.5989326427282686


In [10]:
np.random.seed(4444)

sgd_reg = SGDRegressor()
sgd_reg.fit(X_train, y_train)
score_train = sgd_reg.score(X_train, y_train)
score_test = sgd_reg.score(X_test, y_test)
print(f'train score:\t{score_train}\ntest score:\t{score_test}')

train score:	-1.0424810304919478e+29
test score:	-1.0732132456067988e+29


In [11]:
sgd_reg.get_params()

{'alpha': 0.0001,
 'average': False,
 'early_stopping': False,
 'epsilon': 0.1,
 'eta0': 0.01,
 'fit_intercept': True,
 'l1_ratio': 0.15,
 'learning_rate': 'invscaling',
 'loss': 'squared_loss',
 'max_iter': 1000,
 'n_iter_no_change': 5,
 'penalty': 'l2',
 'power_t': 0.25,
 'random_state': None,
 'shuffle': True,
 'tol': 0.001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [12]:
degree = 2
poly_reg = make_pipeline(PolynomialFeatures(degree), LinearRegression())

poly_reg.fit(X_train, y_train)
score_train = poly_reg.score(X_train, y_train)
score_test = poly_reg.score(X_test, y_test)
print(f'train score:\t{score_train}\ntest score:\t{score_test}')

train score:	0.6703932493336051
test score:	0.626174224874755


In [13]:
degree = 3
poly_reg = make_pipeline(PolynomialFeatures(degree), LinearRegression())

poly_reg.fit(X_train, y_train)
score_train = poly_reg.score(X_train, y_train)
score_test = poly_reg.score(X_test, y_test)
print(f'train score:\t{score_train}\ntest score:\t{score_test}')

train score:	0.6883819849692823
test score:	0.6388119869221324


In [14]:
degree = 4
poly_reg = make_pipeline(PolynomialFeatures(degree), LinearRegression())

poly_reg.fit(X_train, y_train)
score_train = poly_reg.score(X_train, y_train)
score_test = poly_reg.score(X_test, y_test)
print(f'train score:\t{score_train}\ntest score:\t{score_test}')

train score:	0.7135329598027893
test score:	0.6349783989655258


In [15]:
degree = 5
poly_reg = make_pipeline(PolynomialFeatures(degree), LinearRegression())

poly_reg.fit(X_train, y_train)
score_train = poly_reg.score(X_train, y_train)
score_test = poly_reg.score(X_test, y_test)
print(f'train score:\t{score_train}\ntest score:\t{score_test}')

train score:	0.7272745427130718
test score:	0.5487414329150007


In [16]:
degree = 6
poly_reg = make_pipeline(PolynomialFeatures(degree), LinearRegression())

poly_reg.fit(X_train, y_train)
score_train = poly_reg.score(X_train, y_train)
score_test = poly_reg.score(X_test, y_test)
print(f'train score:\t{score_train}\ntest score:\t{score_test}')

train score:	0.7349731275963831
test score:	-0.639364357600601


In [17]:
poly_reg.get_params()

{'memory': None,
 'steps': [('polynomialfeatures', PolynomialFeatures(degree=6)),
  ('linearregression', LinearRegression())],
 'verbose': False,
 'polynomialfeatures': PolynomialFeatures(degree=6),
 'linearregression': LinearRegression(),
 'polynomialfeatures__degree': 6,
 'polynomialfeatures__include_bias': True,
 'polynomialfeatures__interaction_only': False,
 'polynomialfeatures__order': 'C',
 'linearregression__copy_X': True,
 'linearregression__fit_intercept': True,
 'linearregression__n_jobs': None,
 'linearregression__normalize': False}

In [18]:
pipe = make_pipeline(PolynomialFeatures(), LinearRegression())

In [None]:
np.random.seed(4444)
degree_range = np.arange(0,8)
train_scores, valid_scores = validation_curve(pipe,
                                              X_train, y_train,
                                              param_name='polynomialfeatures__degree',
                                              param_range=degree_range,
                                              cv=3)

Traceback (most recent call last):
  File "/home/jo/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/jo/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 330, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/home/jo/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 292, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/home/jo/anaconda3/lib/python3.8/site-packages/joblib/memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "/home/jo/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 740, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "/home/jo/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 693, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "/home/jo/anaconda3/lib/python3.8/site-packa