In [10]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.style as style
style.use('fivethirtyeight')

In [11]:
# Five-colour custom palette, installed as the seaborn default for later plots.
colors = ["#FF0B04", "#F1BE48", "#B9975B", "#8B5B29", "#524727"]
palette = sns.color_palette(colors)
sns.set_palette(palette)

In [12]:
# Load train_clean.csv, using the file's first column as the DataFrame index.
train = pd.read_csv('train_clean.csv', index_col=0)

In [42]:
# Load test_clean.csv, using the file's first column as the DataFrame index.
test = pd.read_csv('test_clean.csv', index_col=0)

In [43]:
# Sanity check: the three floor-area columns should add up to GrLivArea on every row.
# The set collapses the row-wise comparison to the distinct outcomes that occur.
floor_area_total = test['1stFlrSF'] + test['2ndFlrSF'] + test['LowQualFinSF']
set(floor_area_total == test['GrLivArea'])

{True}

In [40]:
# Inspect the MSSubClass labels in train.
# NOTE(review): the displayed labels carry a repeated "MSSubClass_" prefix, unlike the
# single-prefix labels displayed for test — looks like an upstream prefixing step ran
# more than once on the train file; confirm in the cleaning code.
train['MSSubClass']

0        MSSubClass_MSSubClass_MSSubClass_MSSubClass_90
1       MSSubClass_MSSubClass_MSSubClass_MSSubClass_160
2       MSSubClass_MSSubClass_MSSubClass_MSSubClass_160
3        MSSubClass_MSSubClass_MSSubClass_MSSubClass_50
4        MSSubClass_MSSubClass_MSSubClass_MSSubClass_85
                             ...                       
1866    MSSubClass_MSSubClass_MSSubClass_MSSubClass_160
1867     MSSubClass_MSSubClass_MSSubClass_MSSubClass_60
1868     MSSubClass_MSSubClass_MSSubClass_MSSubClass_50
1869     MSSubClass_MSSubClass_MSSubClass_MSSubClass_60
1870    MSSubClass_MSSubClass_MSSubClass_MSSubClass_160
Name: MSSubClass, Length: 1871, dtype: object

In [41]:
# Inspect the MSSubClass labels in test, for comparison with the train labels.
test['MSSubClass']

0       MSSubClass_50
1       MSSubClass_30
2       MSSubClass_20
3       MSSubClass_60
4       MSSubClass_60
            ...      
619    MSSubClass_120
620     MSSubClass_60
621     MSSubClass_20
622     MSSubClass_60
623     MSSubClass_20
Name: MSSubClass, Length: 624, dtype: object

In [36]:
# Columns with object or bool dtype are the ones handed to the one-hot encoder.
cat_feats = train.select_dtypes(include=['object', 'bool']).columns.tolist()

In [30]:
# Treat BedroomAbvGr as a categorical feature as well (it is not picked up by the
# object/bool dtype selection — presumably because it is stored as a number).
# Guarded so that re-running this cell does not append the name a second time,
# which would hand the same column to the encoder twice.
if 'BedroomAbvGr' not in cat_feats:
    cat_feats = cat_feats + ['BedroomAbvGr']

In [34]:
# TotalBsmtSF is dropped together with the target; the note on why is deferred to later.
X = train.drop(columns=['SalePrice', 'TotalBsmtSF'])

# One-hot encode the categorical columns (categories not seen during fit are ignored
# at transform time); every remaining column is passed through unchanged.
transformer = ColumnTransformer(
    [("Cat", OneHotEncoder(handle_unknown='ignore'), cat_feats)],
    remainder='passthrough',
)
X = transformer.fit_transform(X)

# The regression target is log(SalePrice).
y = np.log(train['SalePrice'])

# Training-set R^2 of the fitted model is the cell's displayed value.
ols = LinearRegression().fit(X, y)
ols.score(X, y)

0.9580848121420373

In [37]:
X = train.drop(columns=['SalePrice', 'TotalBsmtSF'])

# Same pipeline as the previous fit, but the encoder drops the first category of each
# feature and is left on its default handling of unknown categories.
transformer = ColumnTransformer(
    [("Cat", OneHotEncoder(drop='first'), cat_feats)],
    remainder='passthrough',
)
X = transformer.fit_transform(X)

# The regression target is log(SalePrice).
y = np.log(train['SalePrice'])

# Training-set R^2 of the fitted model is the cell's displayed value.
ols = LinearRegression().fit(X, y)
ols.score(X, y)

0.9580850723384223

In [38]:
# Score the already-fitted transformer + OLS model on the test frame.
# NOTE(review): as recorded in this cell's output, transform() raises ValueError.
# The transformer in scope is presumably the drop='first' encoder, which is built
# without handle_unknown (scikit-learn's default is 'error'), and the MSSubClass labels
# displayed earlier differ between train ("MSSubClass_MSSubClass_..._90") and
# test ("MSSubClass_50"), so every test label is an unknown category.
X = test.drop(['SalePrice', 'TotalBsmtSF'], axis=1)
X = transformer.transform(X)
y = np.log(test['SalePrice'])
ols.score(X, y)

ValueError: Found unknown categories ['MSSubClass_190', 'MSSubClass_75', 'MSSubClass_30', 'MSSubClass_180', 'MSSubClass_50', 'MSSubClass_45', 'MSSubClass_60', 'MSSubClass_80', 'MSSubClass_20', 'MSSubClass_70', 'MSSubClass_85', 'MSSubClass_90', 'MSSubClass_160', 'MSSubClass_120', 'MSSubClass_40'] in column 0 during transform

In [20]:
# Print the YearBuilt min and max for train, then for test.
for frame in (train, test):
    print(frame['YearBuilt'].min())
    print(frame['YearBuilt'].max())

1875
2010
1872
2009


In [None]:
# Column name -> dtype for each frame.
train_dict = train.dtypes.to_dict()
test_dict = test.dtypes.to_dict()

# Columns whose dtype in train differs from the dtype of the same column in test.
rogue_types = [
    col for col, train_dtype in train_dict.items()
    if train_dtype != test_dict[col]
]
rogue_types

In [None]:
# dtypes, on the train side, of the columns whose dtype differs between the two frames.
train[rogue_types].dtypes

In [None]:
# dtypes, on the test side, of the columns whose dtype differs between the two frames.
test[rogue_types].dtypes

In [None]:
# True when train and test have the same column names, ignoring column order.
sorted(train.columns) == sorted(test.columns)

In [None]:
# True when both frames map the same column names to the same dtypes.
train.dtypes.to_dict() == test.dtypes.to_dict()

In [None]:
# Train R^2 recorded for the drop='first' OLS fit: 0.9580850723384223

In [None]:
# 0.9531732835572826

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Names of the test columns that contain at least one missing value.
cols_na = test.columns[test.isna().any(axis=0)].to_list()
cols_na

In [None]:
# Run the test frame through the already-fitted transformer and score the fitted model
# against log(SalePrice).
X_tst = transformer.transform(test.drop(columns=['SalePrice', 'TotalBsmtSF']))
y_tst = np.log(test['SalePrice'])
ols.score(X_tst, y_tst)

In [None]:
# 0.9533403078902463

In [None]:
# Single-feature baseline: regress log(SalePrice) on GrLivArea alone.
# This re-fits the shared `ols` estimator in place.
X = train.loc[:, ['GrLivArea']]
y = np.log(train['SalePrice'])
ols.fit(X, y).score(X, y)

In [None]:
# Test-set R^2 of the single-feature model currently held in `ols`.
X_tst = test.loc[:, ['GrLivArea']]
y_tst = np.log(test['SalePrice'])
ols.score(X_tst, y_tst)

In [None]:
# Numeric (float64/int64) feature columns.
# The target is removed before selecting by dtype: SalePrice is numeric, so it would
# otherwise land in the feature list and leak the target into every model fitted on
# num_cols.
num_cols = (
    train.drop(columns=['SalePrice'])
    .select_dtypes(['float64', 'int64'])
    .columns.to_list()
)

In [None]:
# Fit the shared `ols` on the columns listed in num_cols and show its training R^2.
X = train[num_cols]
y = np.log(train['SalePrice'])
ols.fit(X, y).score(X, y)

In [None]:
# Test-set R^2 of the model currently held in `ols`, on the columns listed in num_cols.
X = test[num_cols]
y = np.log(test['SalePrice'])
ols.score(X, y)

In [None]:
# Columns in num_cols plus a one-hot encoded Neighborhood.
X = train[num_cols + ['Neighborhood']]
transformer = ColumnTransformer(
    [("Cat", OneHotEncoder(handle_unknown='ignore'), ['Neighborhood'])],
    remainder='passthrough',
)
X = transformer.fit_transform(X)
y = np.log(train['SalePrice'])
ols.fit(X, y).score(X, y)

In [None]:
# Apply the fitted transformer to the same columns of test and score the fitted model.
X = transformer.transform(test[num_cols + ['Neighborhood']])
y = np.log(test['SalePrice'])
ols.score(X, y)

In [None]:
def simple_linear_model_score(train, test, cols, target):
    """Fit an OLS model on log(target) and return its train and test scores.

    Columns of ``train[cols]`` with object or bool dtype are one-hot encoded
    (categories not seen during fitting are ignored at transform time); every
    other column is passed through unchanged. The encoder and the model are
    fitted on ``train`` only and then applied to ``test``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that both contain the feature columns and ``target``.
    cols : iterable of str
        Feature column names. ``target`` and repeated names are removed before
        fitting (the first occurrence of a name is kept), so the target can
        never be used as its own predictor.
    target : str
        Column to predict; the model is fitted on ``np.log`` of it.

    Returns
    -------
    tuple
        ``(train_score, test_score)`` as returned by ``LinearRegression.score``.

    Raises
    ------
    ValueError
        If no feature column is left once ``target`` has been removed.
    """
    # dict.fromkeys de-duplicates while preserving order; dropping the target
    # prevents leakage when the caller builds `cols` by dtype selection.
    feature_cols = [col for col in dict.fromkeys(cols) if col != target]
    if not feature_cols:
        raise ValueError("cols must contain at least one feature column other than the target")

    X = train[feature_cols]
    cat_feats = X.select_dtypes(['object', 'bool']).columns.to_list()
    transformer = ColumnTransformer(
        [("Cat", OneHotEncoder(handle_unknown='ignore'), cat_feats)],
        remainder='passthrough',
    )
    X = transformer.fit_transform(X)
    y = np.log(train[target])

    ols = linear_model.LinearRegression()
    ols.fit(X, y)
    train_score = ols.score(X, y)

    # Transform only: the encoder must not be re-fitted on test data.
    X_tst = transformer.transform(test[feature_cols])
    y_tst = np.log(test[target])
    test_score = ols.score(X_tst, y_tst)

    return train_score, test_score

In [None]:
# Single-feature baseline through the helper; train score is printed first, then test score.
train_score, test_score = simple_linear_model_score(train, test, ['GrLivArea'], 'SalePrice')
print(train_score, test_score, sep='\n')

In [None]:
# Hand-picked features that are added to num_cols in the next model.
# NOTE(review): YearBuilt prints as an integer earlier in the notebook, so it is
# presumably numeric and already present in num_cols — confirm it is not duplicated.
fav_cat = ['YearBuilt', 'ExterQual', 'Neighborhood', 'KitchenQual']

In [None]:
# (train score, test score) for the model on num_cols plus the hand-picked features.
simple_linear_model_score(train, test, num_cols + fav_cat, 'SalePrice')

In [None]:
# Scaler shared by the lasso cells below; with_mean=False scales without centering.
# Presumably centering is skipped because the one-hot encoded matrix may be sparse — confirm.
scaler = StandardScaler(with_mean=False)

In [None]:
X = train.drop(columns=['SalePrice', 'TotalBsmtSF'])

transformer = ColumnTransformer(
    [("Cat", OneHotEncoder(handle_unknown='ignore'), cat_feats)],
    remainder='passthrough',
)
# One-hot encode, then scale the encoded matrix.
X = scaler.fit_transform(transformer.fit_transform(X))

y = np.log(train['SalePrice'])

# Training-set R^2 of a lasso fit with alpha=0.1.
lasso = linear_model.Lasso(alpha=0.1).fit(X, y)
lasso.score(X, y)

In [None]:
X = train.drop(columns=['SalePrice', 'TotalBsmtSF'])

transformer = ColumnTransformer(
    [("Cat", OneHotEncoder(handle_unknown='ignore'), cat_feats)],
    remainder='passthrough',
)
# One-hot encode, then scale the encoded matrix.
X = scaler.fit_transform(transformer.fit_transform(X))

y = np.log(train['SalePrice'])

# Use a lasso (alpha=0.1) to select features, keeping only the selected columns of X.
lasso = linear_model.Lasso(alpha=0.1)
selector = SelectFromModel(estimator=lasso)
X = selector.fit_transform(X, y)

# Re-fit plain OLS on the selected columns and show its training R^2.
ols.fit(X, y).score(X, y)

In [None]:
X = train.drop(['SalePrice', 'TotalBsmtSF'], axis=1)

transformer = ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), cat_feats)], remainder='passthrough')
X = transformer.fit_transform(X)

# ColumnTransformer.get_feature_names() was deprecated in scikit-learn 1.0 and removed
# in 1.2. Use get_feature_names_out() when the installed version provides it, and fall
# back to the old method on older installs.
if hasattr(transformer, 'get_feature_names_out'):
    feat_names = transformer.get_feature_names_out().tolist()
else:
    feat_names = transformer.get_feature_names()

X = scaler.fit_transform(X)

y = np.log(train['SalePrice'])

lasso = linear_model.Lasso(alpha=0.1)

selector = SelectFromModel(estimator=lasso)

X = selector.fit_transform(X, y)

# Boolean mask over the transformed columns: True where the selector kept the column.
mask = selector.get_support()

# Names of the features kept by the alpha=0.1 lasso selection.
[name for name, kept in zip(feat_names, mask) if kept]


In [None]:
X = train.drop(['SalePrice', 'TotalBsmtSF'], axis=1)

transformer = ColumnTransformer([("Cat", OneHotEncoder(handle_unknown = 'ignore'), cat_feats)], remainder='passthrough')
X = transformer.fit_transform(X)

# ColumnTransformer.get_feature_names() was deprecated in scikit-learn 1.0 and removed
# in 1.2. Use get_feature_names_out() when the installed version provides it, and fall
# back to the old method on older installs.
if hasattr(transformer, 'get_feature_names_out'):
    feat_names = transformer.get_feature_names_out().tolist()
else:
    feat_names = transformer.get_feature_names()

X = scaler.fit_transform(X)

y = np.log(train['SalePrice'])

lasso = linear_model.Lasso(alpha=0.05)

selector = SelectFromModel(estimator=lasso)

X = selector.fit_transform(X, y)

# Boolean mask over the transformed columns: True where the selector kept the column.
mask = selector.get_support()

# Names of the features kept by the alpha=0.05 lasso selection.
[name for name, kept in zip(feat_names, mask) if kept]

In [None]:
X = train.drop(columns=['SalePrice', 'TotalBsmtSF'])

transformer = ColumnTransformer(
    [("Cat", OneHotEncoder(handle_unknown='ignore'), cat_feats)],
    remainder='passthrough',
)
# One-hot encode, then scale the encoded matrix.
X = scaler.fit_transform(transformer.fit_transform(X))

y = np.log(train['SalePrice'])

# Use a lasso (alpha=0.05) to select features, keeping only the selected columns of X.
lasso = linear_model.Lasso(alpha=0.05)
selector = SelectFromModel(estimator=lasso)
X = selector.fit_transform(X, y)

# Re-fit plain OLS on the selected columns and show its training R^2.
ols.fit(X, y).score(X, y)

In [None]:
X = train.drop(columns=['SalePrice', 'TotalBsmtSF'])

transformer = ColumnTransformer(
    [("Cat", OneHotEncoder(handle_unknown='ignore'), cat_feats)],
    remainder='passthrough',
)
# One-hot encode, then scale the encoded matrix.
X = scaler.fit_transform(transformer.fit_transform(X))

y = np.log(train['SalePrice'])

# Training-set R^2 of a lasso fit with alpha=0.05.
lasso = linear_model.Lasso(alpha=0.05).fit(X, y)
lasso.score(X, y)

In [None]:
# Show the name stored at position 20 of cat_feats.
cat_feats[20]