In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

In [4]:
#reading in data
house = pd.read_csv('../datasets/house_Clean.csv', keep_default_na=False, na_values=[''])
pd.set_option('display.max_columns', None)

In [5]:
house.drop(columns = 'Unnamed: 0', inplace=True)

In [6]:
house.head(1)

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,Neighborhood,Condition 1,Condition 2,Bldg Type,House Style,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Roof Style,Roof Matl,Exterior 1st,Exterior 2nd,Mas Vnr Type,Mas Vnr Area,Exter Qual,Exter Cond,Foundation,Bsmt Qual,Bsmt Cond,Bsmt Exposure,BsmtFin Type 1,BsmtFin SF 1,BsmtFin Type 2,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,Heating,Heating QC,Central Air,Electrical,1st Flr SF,2nd Flr SF,Low Qual Fin SF,Gr Liv Area,Bsmt Full Bath,Bsmt Half Bath,Full Bath,Half Bath,Bedroom AbvGr,Kitchen AbvGr,Kitchen Qual,TotRms AbvGrd,Functional,Fireplaces,Fireplace Qu,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Garage Qual,Garage Cond,Paved Drive,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,0.0,13517,Pave,,IR1,Lvl,AllPub,CulDSac,Gtl,Sawyer,RRAe,Norm,1Fam,2Story,6,8,1976,2005,Gable,CompShg,HdBoard,Plywood,BrkFace,289.0,Gd,TA,CBlock,TA,TA,No,GLQ,533.0,Unf,0.0,192.0,725.0,GasA,Ex,Y,SBrkr,725,754,0,1479,0.0,0.0,2,1,3,1,Gd,6,Typ,0,,Attchd,1976.0,RFn,2.0,475.0,TA,TA,Y,0,44,0,0,0,0,,,,0,3,2010,WD,130500


In [7]:
house=pd.get_dummies(columns=['Bsmt Exposure','BsmtFin Type 1','Garage Type','Garage Qual'], drop_first=True, data=house)

#### Polynomial Features

In [8]:
house[['Overall Qual', 'Gr Liv Area','Total Bsmt SF',
       'Garage Cars', 'Year Built','Year Remod/Add','Full Bath',
       'Garage Type_Attchd','Garage Qual_TA',
       'Bsmt Exposure_Gd','BsmtFin Type 1_GLQ']]

Unnamed: 0,Overall Qual,Gr Liv Area,Total Bsmt SF,Garage Cars,Year Built,Year Remod/Add,Full Bath,Garage Type_Attchd,Garage Qual_TA,Bsmt Exposure_Gd,BsmtFin Type 1_GLQ
0,6,1479,725.0,2.0,1976,2005,2,1,1,0,1
1,7,2122,913.0,2.0,1996,1997,2,1,1,0,1
2,5,1057,1057.0,1.0,1953,2007,1,0,1,0,1
3,5,1444,384.0,2.0,2006,2007,2,0,1,0,0
4,6,1445,676.0,2.0,1900,1993,2,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
2037,8,1728,1884.0,2.0,2007,2007,2,1,1,0,1
2038,4,861,861.0,2.0,1940,1950,1,0,1,0,0
2039,6,1913,896.0,2.0,1928,1950,1,0,0,0,0
2040,4,1200,1200.0,1.0,1956,1956,1,1,1,0,0


In [9]:
#Create variables

X= house[['Overall Qual', 'Gr Liv Area','Total Bsmt SF',
       'Garage Cars', 'Year Built','Year Remod/Add','Full Bath',
       'Garage Type_Attchd','Garage Qual_TA',
       'Bsmt Exposure_Gd','BsmtFin Type 1_GLQ']]
y= house['SalePrice']

In [10]:
#DSIR-Lancelot/4.02-lesson-regularization
# Instantiate our PolynomialFeatures object to create all two-way terms.

poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)

In [11]:
# Fit and transform our X data.
X_overfit = poly.fit_transform(X)

In [12]:
poly.get_feature_names(X.columns)

['Overall Qual',
 'Gr Liv Area',
 'Total Bsmt SF',
 'Garage Cars',
 'Year Built',
 'Year Remod/Add',
 'Full Bath',
 'Garage Type_Attchd',
 'Garage Qual_TA',
 'Bsmt Exposure_Gd',
 'BsmtFin Type 1_GLQ',
 'Overall Qual^2',
 'Overall Qual Gr Liv Area',
 'Overall Qual Total Bsmt SF',
 'Overall Qual Garage Cars',
 'Overall Qual Year Built',
 'Overall Qual Year Remod/Add',
 'Overall Qual Full Bath',
 'Overall Qual Garage Type_Attchd',
 'Overall Qual Garage Qual_TA',
 'Overall Qual Bsmt Exposure_Gd',
 'Overall Qual BsmtFin Type 1_GLQ',
 'Gr Liv Area^2',
 'Gr Liv Area Total Bsmt SF',
 'Gr Liv Area Garage Cars',
 'Gr Liv Area Year Built',
 'Gr Liv Area Year Remod/Add',
 'Gr Liv Area Full Bath',
 'Gr Liv Area Garage Type_Attchd',
 'Gr Liv Area Garage Qual_TA',
 'Gr Liv Area Bsmt Exposure_Gd',
 'Gr Liv Area BsmtFin Type 1_GLQ',
 'Total Bsmt SF^2',
 'Total Bsmt SF Garage Cars',
 'Total Bsmt SF Year Built',
 'Total Bsmt SF Year Remod/Add',
 'Total Bsmt SF Full Bath',
 'Total Bsmt SF Garage Type_Attc

In [13]:
#DSIR-Lancelot/4.02-lesson-regularization
# Create train/test splits.

X_train, X_test, y_train, y_test = train_test_split(
    X_overfit,
    y,
    test_size=0.8, # Making sure that I have much LESS for training so we'll overfit
    random_state=420
)

In [14]:
# Scale the data, creating a new variable
#fitting the X_train data and transforming both train and test

sc = StandardScaler()
Z_train = sc.fit_transform(X_train)
Z_test = sc.transform(X_test)

#### Linear Regression

In [15]:
lr = LinearRegression()
lr.fit(Z_train, y_train) # Fit on the SCALED data

LinearRegression()

In [16]:
# How does the model score on the training and test data?
print(f"Training R-Squared: {lr.score(Z_train, y_train)}")
print(" ")
print(f"Testing R-Squared: {lr.score(Z_test, y_test)}")

Training R-Squared: 0.9358518091249616
 
Testing R-Squared: 0.8547805677318965


- the above r2's are telling us that the model is over fit due to high training score and low testing

##### Ridge regression

- alpha is the strength of the regularization
- the higher alpha, the lower the coefs (betas) - the more we are regularizing the data
- grid searching can find the best value of alpha
- try different values here to see what gives you the best results
- introducing bias to reduce the variance

In [17]:
# Instantiate.
rr = Ridge()

# Fit on training, transformed data
rr.fit(Z_train, y_train)

# Evaluate model using R2.
print(f"Ridge Training R-Squared: {rr.score(Z_train, y_train)}")
print(f"Ridge Testing R-Squared: {rr.score(Z_test, y_test)}")

Ridge Training R-Squared: 0.9244799992571301
Ridge Testing R-Squared: 0.8696872238830917


In [18]:
#Using grid search to find the best alpha for ridge regression
#https://machinelearningmastery.com/how-to-tune-algorithm-parameters-with-scikit-learn/rr = Ridge()

rr_alphas = np.logspace(0, 100, 100)

rr_gridsearch = GridSearchCV(estimator=rr, param_grid=dict(alpha=rr_alphas),scoring = 'r2', cv=5)

rr_gridsearch.fit(Z_train, y_train)

# summarize the results of the grid search
print(rr_gridsearch.best_score_)
print(rr_gridsearch.best_estimator_.alpha)

0.8796681039044154
10.235310218990262


In [19]:
rr2 = Ridge(rr_gridsearch.best_estimator_.alpha) # Fit a single ridge model with the best alpha
rr2.fit(Z_train, y_train)
rr2.predict(X_test)

array([4.95430296e+10, 5.83956518e+10, 5.20223366e+10, ...,
       6.37586361e+10, 4.97176782e+10, 6.92554366e+10])

In [20]:
rr2.coef_

array([-3126.64828677, -3176.41879895, -4388.72320275, -5253.9784105 ,
        3535.33188983,  3000.09742537, -7078.85822927,  1100.92571828,
        2051.73994217,  1051.97766425, -4093.6704417 ,  9481.21118279,
       21156.27936135, 20757.21193789, 10843.61894018, -1468.22371622,
       -1498.5616158 ,  9483.33354903,    25.1686754 , -8024.22962599,
        8297.71196641,  9559.09770782,  3671.01156207, 11287.98691116,
        2870.43982905,   -33.6369139 ,  -486.30609539,  9601.47074599,
       -1030.28141728,  3109.47307774,  7705.61629128, -3190.40506798,
       -5038.83378887, 12883.7927408 , -2123.2643545 , -2733.71378924,
         174.10037067,  1594.59897963, -5475.41151444,  -321.84900864,
       17495.83953942,  1773.68713825, -2918.28620963, -3723.82025636,
         793.56013593,  1979.06608755,   865.86321151, -4967.60434429,
       -1683.54221016,  3530.42751228,  4008.37509379, -5549.00119471,
        1989.88573496,  1851.86694186,  1128.68662288, -3216.29948269,
      

In [44]:
rr2_coefs = list(zip(poly.get_feature_names(X.columns), rr2.coef_))

In [46]:
rr2_coefs_df=pd.DataFrame(data = rr2_coefs,columns=['Variable','Coef'])

In [48]:
pd.set_option('display.max_rows', None)
rr2_coefs_df.sort_values(by = 'Coef',ascending=False).head(10)

Unnamed: 0,Variable,Coef
12,Overall Qual Gr Liv Area,21156.279361
13,Overall Qual Total Bsmt SF,20757.211938
40,Total Bsmt SF BsmtFin Type 1_GLQ,17495.839539
33,Total Bsmt SF Garage Cars,12883.792741
23,Gr Liv Area Total Bsmt SF,11287.986911
14,Overall Qual Garage Cars,10843.61894
27,Gr Liv Area Full Bath,9601.470746
21,Overall Qual BsmtFin Type 1_GLQ,9559.097708
17,Overall Qual Full Bath,9483.333549
11,Overall Qual^2,9481.211183


#### LASSO regression

In [36]:
#DSIR-Lancelot/4.02-lesson-regularization
# Instantiate.
l1 = Lasso()

# Fit on training, transformed data
l1.fit(Z_train, y_train)

# Evaluate model using R2.
print(f"LASSO Training R-Squared: {l1.score(Z_train, y_train)}")
print(f"LASSO Testing R-Squared: {l1.score(Z_test, y_test)}")

LASSO Training R-Squared: 0.9263567054783934
LASSO Testing R-Squared: 0.8540745733440381


  model = cd_fast.enet_coordinate_descent(


In [37]:
# # Set up a list of Lasso alphas to check.
lasso_cv = LassoCV()

l_alphas = np.logspace(-3, 5, 20)

# # Cross-validate over our list of Lasso alphas.
lasso_cv = LassoCV(alphas=l_alphas, cv=5)

# # Fit model using best ridge alpha!
lasso_cv.fit(Z_train, y_train);


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

In [38]:
lasso_cv.alpha_

112.88378916846884

In [39]:
# Evaluate model using R2.
print(f"LASSO Training R-Squared: {lasso_cv.score(Z_train, y_train)}")
print(f"LASSO Testing R-Squared: {lasso_cv.score(Z_test, y_test)}")

LASSO Training R-Squared: 0.9189885743939658
LASSO Testing R-Squared: 0.8797043331960875


In [49]:
lasso_cv.coef_

array([-10822.43629849, -13207.4518367 ,  -2738.21762831,  -2214.41472181,
         9184.48933399,   8534.09231841,   -548.43980374,   4350.31385116,
        15801.84826888,      0.        ,     -0.        ,     -0.        ,
        57900.99423904,  43273.57656056,      0.        ,     -0.        ,
           -0.        ,   2135.2923259 ,     -0.        , -14471.40126231,
        17559.45584801,  13213.6740828 ,   -626.32581026,  13168.73659662,
           -0.        ,     -0.        ,  -2435.05208784,   3783.62124065,
           -0.        ,      0.        ,   4144.24293962,  -9524.83865519,
       -18504.00319519,  22500.77167377,  -2158.44756735,  -7938.46064258,
           -0.        ,      0.        , -13078.65999584,     -0.        ,
        20555.20822729,      0.        ,     -0.        ,  -5209.37598288,
           -0.        ,   1869.51268198,   2166.61035838,  -9356.47605711,
           -0.        ,      0.        ,      0.        , -12416.30279438,
          790.6714733 ,  

In [50]:
list(zip(poly.get_feature_names(X.columns), lasso_cv.coef_))

[('Overall Qual', -10822.43629848916),
 ('Gr Liv Area', -13207.451836698992),
 ('Total Bsmt SF', -2738.217628310317),
 ('Garage Cars', -2214.414721809061),
 ('Year Built', 9184.489333991702),
 ('Year Remod/Add', 8534.092318413506),
 ('Full Bath', -548.4398037365063),
 ('Garage Type_Attchd', 4350.313851164071),
 ('Garage Qual_TA', 15801.84826887637),
 ('Bsmt Exposure_Gd', 0.0),
 ('BsmtFin Type 1_GLQ', -0.0),
 ('Overall Qual^2', -0.0),
 ('Overall Qual Gr Liv Area', 57900.99423904039),
 ('Overall Qual Total Bsmt SF', 43273.576560557885),
 ('Overall Qual Garage Cars', 0.0),
 ('Overall Qual Year Built', -0.0),
 ('Overall Qual Year Remod/Add', -0.0),
 ('Overall Qual Full Bath', 2135.2923259027275),
 ('Overall Qual Garage Type_Attchd', -0.0),
 ('Overall Qual Garage Qual_TA', -14471.401262314663),
 ('Overall Qual Bsmt Exposure_Gd', 17559.455848009704),
 ('Overall Qual BsmtFin Type 1_GLQ', 13213.674082804582),
 ('Gr Liv Area^2', -626.3258102573395),
 ('Gr Liv Area Total Bsmt SF', 13168.73659661

In [52]:
lasso_df = pd.DataFrame(lasso_cv.coef_, columns=['Coefficient'], 
                        index=poly.get_feature_names(X.columns))
lasso_df.sort_values(by="Coefficient", ascending=False)

Unnamed: 0,Coefficient
Overall Qual Gr Liv Area,57900.994239
Overall Qual Total Bsmt SF,43273.576561
Total Bsmt SF Garage Cars,22500.771674
Total Bsmt SF BsmtFin Type 1_GLQ,20555.208227
Overall Qual Bsmt Exposure_Gd,17559.455848
Garage Qual_TA,15801.848269
Overall Qual BsmtFin Type 1_GLQ,13213.674083
Gr Liv Area Total Bsmt SF,13168.736597
Year Built,9184.489334
Year Remod/Add,8534.092318


### Kaggle Submission

In [25]:
kaggle = pd.read_csv('./datasets/test.csv', keep_default_na=False, na_values=[''])

#### Feature engineering on Kaggle Data

In [26]:
# Dummifying:
kaggle=pd.get_dummies(columns=['Bsmt Exposure','BsmtFin Type 1','Garage Type','Garage Qual'], drop_first=True, data=kaggle)

In [27]:
Kfeatures = kaggle[['Overall Qual', 'Gr Liv Area','Total Bsmt SF',
       'Garage Cars', 'Year Built','Year Remod/Add','Full Bath',
       'Garage Type_Attchd','Garage Qual_TA',
       'Bsmt Exposure_Gd','BsmtFin Type 1_GLQ']]

In [28]:
Kfeatures.isnull().sum()

Overall Qual          0
Gr Liv Area           0
Total Bsmt SF         0
Garage Cars           0
Year Built            0
Year Remod/Add        0
Full Bath             0
Garage Type_Attchd    0
Garage Qual_TA        0
Bsmt Exposure_Gd      0
BsmtFin Type 1_GLQ    0
dtype: int64

In [29]:
#Polynomial transformation

In [30]:
X= Kfeatures

In [31]:
#instantiate & fit_transform polynomial features
pf = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
X_pf = pf.fit_transform(X)

In [32]:
#Scaling the Data
sc = StandardScaler()
Z = sc.fit_transform(X_pf)

In [None]:
#Predict using Ridge Regression model
kaggle_preds6 = rr.predict(Z)

In [33]:
#Predict using Lasso Regression model
kaggle_preds7 = l.predict(Z)

In [57]:
kaggle_preds9 = lasso_cv.predict(Z)

#### Make a new Data frame

In [58]:
preds_dict = {
    'Id': kaggle['Id'],
    'SalePrice': kaggle_preds9
}

In [59]:
kaggle_preds9_tocsv = pd.DataFrame(preds_dict)

In [60]:
kaggle_preds9_tocsv.to_csv('Kaggle_preds9.csv',index=False,)