## Regression

In [1]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split

# Read the training table (the path is relative to the current working directory).
train = pd.read_csv('./data/train_1.csv')

# Regression target.
y = train['SalePrice']

# Predictors: every column except `Id` and the target itself.
train1 = train.drop(columns=['Id', 'SalePrice'])

# Expand the non-numeric columns into indicator columns and give the frame a
# fresh positional index.
X = pd.get_dummies(train1).reset_index(drop=True)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Baseline: ordinary least squares on the one-hot encoded feature matrix.
lm = LinearRegression()
# NOTE(review): `pr` is not used again in the visible cells; per the scikit-learn
# docs `fit` returns the estimator itself, so it is presumably just an alias of `lm`.
pr = lm.fit(X_train, y_train)
pred = lm.predict(X_test)
# RMSE between log(actual) and log(predicted) values on the held-out rows.
# NOTE(review): np.log yields NaN for non-positive predictions, which an
# unconstrained linear model can produce - confirm predictions stay positive.
np.sqrt(mean_squared_error(np.log(y_test), np.log(pred)))

0.12618361140972376

In [10]:
# RMSE on the original (non-log) scale of the target, reusing `pred` from the
# linear-model cell above.
np.sqrt(mean_squared_error(y_test, pred))

24966.33741019887

In [16]:
def benchmark(model, X=None, y=None):
    """Return the RMSE between log(actual) and log(predicted) values for ``model``.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    X : array-like, optional
        Features to predict on. Defaults to the notebook-level ``X_test``.
    y : array-like, optional
        True target values matching ``X``. Defaults to the notebook-level
        ``y_test``.

    Returns
    -------
    float
        ``sqrt(mean_squared_error(log(y), log(model.predict(X))))``.

    Raises
    ------
    ValueError
        If any true or predicted value is non-positive or non-finite, because
        the logarithm is undefined there and the score would be meaningless.
    """
    # Fall back to the hold-out split defined at the top of the notebook so
    # existing ``benchmark(model)`` calls keep working unchanged.
    if X is None:
        X = X_test
    if y is None:
        y = y_test

    actual = np.asarray(y, dtype=float)
    pred = np.asarray(model.predict(X), dtype=float)

    # A linear model is unconstrained and can predict values <= 0; fail loudly
    # here instead of letting np.log turn them into NaN.
    for label, values in (("target", actual), ("prediction", pred)):
        invalid = ~(np.isfinite(values) & (values > 0))
        if invalid.any():
            raise ValueError(
                "log-RMSE requires strictly positive, finite values; "
                f"found {int(invalid.sum())} invalid {label} value(s)"
            )

    logrmse = np.sqrt(mean_squared_error(np.log(actual), np.log(pred)))
    return logrmse

# Score the baseline linear model with the helper; this is the same computation
# as the inline log-RMSE expression in the linear-model cell.
benchmark(lm)

0.12618361140972376

### Preprocessing

In [18]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler

# Same linear regression, but with RobustScaler applied to the features first.
lm_model = make_pipeline(RobustScaler(), LinearRegression())
lm_model.fit(X_train, y_train)
# Held-out log-RMSE of the scaled pipeline.
benchmark(lm_model)

0.12618361141005513

In [17]:
# Log-RMSE of the unscaled linear model minus that of the scaled pipeline;
# a positive value would mean the scaled pipeline has the lower error.
# NOTE(review): this cell's execution count (17) is lower than that of the cell
# defining `lm_model` (18), so the notebook was run out of order - re-run top to
# bottom to confirm the result.
benchmark(lm) - benchmark(lm_model)

-3.313738172749936e-13

### Ridge Regression 

$$\lVert y - Xw \rVert_2^2 + \alpha \lVert w \rVert_2^2$$

In [20]:
from sklearn.linear_model import Ridge

# Naive ridge regression: one hand-picked alpha, features left unscaled.

ridge_model = Ridge(alpha=0.1)
ridge_model.fit(X_train, y_train)
# Log-RMSE of the plain linear model minus that of ridge; a positive value
# means ridge has the lower error on the held-out rows.
benchmark(lm) - benchmark(ridge_model)

-0.00037290862760808174

In [22]:
# Ridge regression on RobustScaler-transformed features, using the same alpha
# as the unscaled ridge model.
ridge_model_pipe = make_pipeline(RobustScaler(), Ridge(alpha=0.1))
ridge_model_pipe.fit(X_train, y_train)
# Log-RMSE of unscaled ridge minus that of the scaled pipeline; a positive
# value means the scaled pipeline has the lower error.
benchmark(ridge_model) - benchmark(ridge_model_pipe)

-2.8512902528476936e-06

In [24]:
# Ridge regression with the penalty strength picked by cross-validation
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import KFold

# Candidate penalty strengths handed to RidgeCV.
r_alphas = [0.01, 0.1, 1, 3, 5, 7, 10, 100]

# Shuffled 10-fold splitter; the fixed seed keeps the folds repeatable.
kfolds = KFold(n_splits=10, shuffle=True, random_state=123)

ridge_model_cv = make_pipeline(
    RobustScaler(),
    RidgeCV(alphas=r_alphas, cv=kfolds),
)
ridge_model_cv.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('robustscaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)), ('ridgecv', RidgeCV(alphas=array([1.e-02, 1.e-01, 1.e+00, 3.e+00, 5.e+00, 7.e+00, 1.e+01, 1.e+02]),
    cv=KFold(n_splits=10, random_state=123, shuffle=True),
    fit_intercept=True, gcv_mode=None, normalize=False, scoring=None,
    store_cv_values=False))])

In [25]:
# Held-out log-RMSE of the cross-validated ridge pipeline.
benchmark(ridge_model_cv)

0.12438552197375949