# Регрессия

In [108]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import math

from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [63]:
PATH = "../../Datasets/bikes/"
tr_name = "train.csv"

In [75]:
tr = pd.read_csv(f"{PATH}{tr_name}")

In [76]:
tr['datetime'] = pd.to_datetime(tr['datetime'])
tr['hour'] = tr.datetime.dt.hour
tr['month'] = tr.datetime.dt.month
tr['year'] = tr.datetime.dt.year
tr['dayofweek'] = tr.datetime.dt.dayofweek
tr['day'] = tr.datetime.dt.day
tr['cnt'] = tr['count']
tr.drop(['datetime', 'count', 'casual', 'registered'], axis = 1, inplace = True)

In [77]:
tr.head()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,hour,month,year,dayofweek,day,cnt
0,1,0,0,1,9.84,14.395,81,0.0,0,1,2011,5,1,16
1,1,0,0,1,9.02,13.635,80,0.0,1,1,2011,5,1,40
2,1,0,0,1,9.02,13.635,80,0.0,2,1,2011,5,1,32
3,1,0,0,1,9.84,14.395,75,0.0,3,1,2011,5,1,13
4,1,0,0,1,9.84,14.395,75,0.0,4,1,2011,5,1,1


In [None]:
for i, col in enumerate(tr.columns[:-1]):
    plt.subplot(5, 3, i+1)
    plt.scatter(tr[col], tr['count'])
    plt.title(col)

## Модели

In [126]:
linreg = LinearRegression()
lasso = Lasso(random_state = 42)
lasso_cv = LassoCV(random_state = 42)
ridge = Ridge(random_state = 42)
ridge_cv = RidgeCV()
ranforreg = RandomForestRegressor()

scaler = StandardScaler()
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

models = [linreg, lasso, lasso_cv, ridge, ridge_cv, ranforreg]
models_name = ['linreg', 'lasso', 'lasso_cv', 'ridge', 'ridge_cv', 'forest']

In [78]:
cat_features = ['season', 'holiday', 'workingday', 'weather', 'hour', 'month', 'year', 'dayofweek', 'day']

In [79]:
X, Y = tr.drop('cnt', axis = 1), tr.cnt

In [85]:
X_scal = scaler.fit_transform(X)
X = pd.DataFrame(X_scal, columns = ['season', 'holiday', 'workingday', 'weather',
                                                'temp', 'atemp', 'humidity', 'windspeed',
                                                'hour', 'month', 'year', 'dayofweek', 'day'])

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [116]:
encoder.fit(X[cat_features])
X = pd.concat([X.drop(cat_features, axis = 1),
          pd.DataFrame(encoder.transform(X[cat_features]))], axis=1).reindex()

In [117]:
X.head()

Unnamed: 0,temp,atemp,humidity,windspeed,0,1,2,3,4,5,...,66,67,68,69,70,71,72,73,74,75
0,-1.333661,-1.092737,0.993213,-1.567754,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-1.438907,-1.182421,0.941249,-1.567754,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-1.438907,-1.182421,0.941249,-1.567754,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-1.333661,-1.092737,0.68143,-1.567754,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-1.333661,-1.092737,0.68143,-1.567754,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [128]:
def fit_and_res(model, x, y):
    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.25, random_state=42)
    model.fit(x_train, y_train)
    mse = mean_squared_error(y_val, model.predict(x_val))
    #print(mse, math.sqrt(mse))
    return mse, math.sqrt(mse)

In [140]:
res = []
for model in models:
    res.append(fit_and_res(model, X, Y))
res = pd.DataFrame(np.array(res).T, columns = models_name, index = ['mse', 'sqrt'])
res



Unnamed: 0,linreg,lasso,lasso_cv,ridge,ridge_cv,forest
mse,10106.638724,10919.755375,10111.29299,10108.022462,10108.022462,2780.410294
sqrt,100.53178,104.497633,100.554925,100.538662,100.538662,52.729596
