In [28]:
import os

import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, normalize
from sklearn.ensemble import RandomForestRegressor # Model
from sklearn.metrics import mean_squared_error 
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

from scipy import stats
from scipy.stats import norm, skew 

import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
df_train = pd.read_csv('/content/drive/My Drive/data/House_Prices/train.csv',sep=',')
df_submission = pd.read_csv('/content/drive/My Drive/data/House_Prices/sample_submission.csv',sep=',')
df_test = pd.read_csv('/content/drive/My Drive/data/House_Prices/test.csv',sep=',')

In [7]:
df_train["SalePrice"] = np.log1p(df_train["SalePrice"])

In [8]:
# Save Id
train_x_id = df_train['Id']
test_x_id = df_test['Id']
# Drop Id
df_train.drop("Id", axis = 1, inplace = True)
df_test.drop("Id", axis = 1, inplace = True)
# Separate target from predictors
train_y_full = df_train.SalePrice
train_x_full = df_train.drop(['SalePrice'], axis=1)
# Divide data into training and validation subsets

train_x, valid_x, train_y, valid_y = train_test_split(train_x_full, train_y_full, train_size=0.8, test_size=0.2,
                                                                random_state=0)

In [9]:
cols_with_missing = [col for col in train_x.columns if train_x[col].isnull().any()] 
train_x[cols_with_missing]

Unnamed: 0,LotFrontage,Alley,MasVnrType,MasVnrArea,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Electrical,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageQual,GarageCond,PoolQC,Fence,MiscFeature
618,90.0,,BrkFace,452.0,Ex,TA,Av,GLQ,Unf,SBrkr,Gd,Attchd,2007.0,Unf,TA,TA,,,
870,60.0,,,0.0,TA,TA,No,Unf,Unf,SBrkr,,Detchd,1962.0,Unf,TA,TA,,,
92,80.0,Grvl,,0.0,Gd,TA,No,ALQ,Unf,SBrkr,,Detchd,1921.0,Unf,TA,TA,,,
817,,,BrkFace,148.0,Gd,TA,No,GLQ,Unf,SBrkr,Gd,Attchd,2002.0,RFn,TA,TA,,,
302,118.0,,BrkFace,150.0,Gd,TA,No,Unf,Unf,SBrkr,TA,Attchd,2001.0,RFn,TA,TA,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
763,82.0,,BrkFace,673.0,Gd,TA,Mn,GLQ,Unf,SBrkr,Gd,Attchd,1999.0,RFn,TA,TA,,,
835,60.0,,,0.0,Gd,TA,No,BLQ,Unf,SBrkr,,Attchd,1996.0,Unf,TA,TA,,,
1216,68.0,,,0.0,,,,,,SBrkr,,Attchd,1978.0,Unf,TA,TA,,,
559,,,BrkFace,18.0,Gd,TA,Gd,Unf,Unf,SBrkr,TA,Attchd,2003.0,Fin,TA,TA,,,


In [10]:
c = (train_x.dtypes == 'object')
categorical_cols = list(c[c].index)

print("Categorical variables:")
print(categorical_cols)

Categorical variables:
['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']


In [12]:
numerical_cols = [cname for cname in train_x.columns if train_x[cname].dtype in ['int64', 'float64']]

print("Numerical variables:")
print(numerical_cols)

Numerical variables:
['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']


In [31]:
# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='mean')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [24]:
rf_model = RandomForestRegressor(n_estimators=500, random_state=0)

In [29]:
trans = MinMaxScaler()
model = KNeighborsClassifier()
pipeline = Pipeline(steps=[('t', trans), ('preprocessor', preprocessor),('m', rf_model)])
# evaluate the pipeline
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
#n_scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')

# report pipeline performance
#print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

In [25]:
def rmsle(valid_y, preds):
    return np.sqrt(mean_squared_error(valid_y, preds))

In [30]:
#my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
#                              ('model', rf_model)
#                             ])

# Preprocessing of training data, fit model 
pipeline.fit(train_x, train_y)

# Preprocessing of validation data, get predictions
preds = pipeline.predict(valid_x)

# Evaluate the model
score = rmsle(valid_y, preds)
print('RMSLE:', score)

ValueError: ignored

In [27]:
sub = pd.DataFrame()
sub['Id'] = test_x_id
sub['SalePrice'] = my_pipeline.predict(df_test)
sub.to_csv("/content/drive/My Drive/data/House_Prices/submission_Home_v5_eq.csv", index=False)
#results_df.to_csv("/content/drive/My Drive/data/House_Prices/submission_Home_v4_eq.csv", index=False)