In [None]:
import numpy as np
import pandas as pd
import category_encoders as ce
from sklearn import preprocessing
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
import xgboost
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import math
import warnings
import matplotlib.pyplot as plt

# --- Configuration ---------------------------------------------------------
# Silence the "internal gelsd" warning emitted from scipy.
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")
pd.set_option('display.max_rows', 1000)

trainingFile = "./data/train.csv"
testFile = "./data/test.csv"

# --- Load data -------------------------------------------------------------
trainDf = pd.read_csv(trainingFile, header = 0)
testDf = pd.read_csv(testFile, header = 0)

target = 'SalePrice'

# Regression target, kept separate from the feature frame.
Y = trainDf[target]

# Feature frame: everything except the row identifier and the target.
training = trainDf.drop(['Id', target], axis = 1)

# Missing garage build years are replaced with 1900 so the column can be
# stored as integers.
training['GarageYrBlt'] = training['GarageYrBlt'].fillna(1900.0).astype(int)

cat_features = []
cat_features_idx = []

# --- Locate string-typed columns -------------------------------------------
# `target` has already been dropped from `training`, so every column is a
# candidate. The builtin `object` is compared against the dtype because the
# `np.object` alias is deprecated in NumPy 1.20 and removed in 1.24.
str_cols = []       # names of object-dtype columns
str_cols_idx = []   # their positions within training.columns
for pos, c in enumerate(training.columns):
    if training[c].dtype == object:
        str_cols.append(c)
        str_cols_idx.append(pos)

# Missing values in the string columns are not filled here.

# --- Transformers and baseline estimator -----------------------------------
# NOTE(review): `Imputer` was removed from sklearn.preprocessing in
# scikit-learn 0.22 (its replacement is sklearn.impute.SimpleImputer), so this
# notebook needs an older scikit-learn - confirm the pinned version.
enc = ce.OrdinalEncoder(cols = str_cols)
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
norm = Normalizer('l2')


lr = LinearRegression()




In [None]:
# Display: inspect the prepared training feature frame.

# Plain-text listing of the dtype of every column in `training`.
print(training.dtypes)
# Notebook rendering of the `training` frame itself.
display(training)


In [None]:
# Correlations: Pearson correlation of every (encoded, imputed) feature with
# the target, listed from strongest to weakest absolute value.

# Encode the string columns, then impute missing values.
t_pipe = Pipeline(steps=[('catencode', enc), ('null_handler', imp)])
transformed = t_pipe.fit_transform(training)

# Put the transformed values back under the original column names.
t_df = pd.DataFrame(data=transformed, columns=training.columns)

features = t_df.columns

# One entry per feature, keyed "<feature> vs <target>"; pearsonr returns
# (coefficient, p-value) and only the coefficient is kept.
correlations = {
    name + ' vs ' + target: pearsonr(t_df[name], Y)[0]
    for name in features
    if name != target
}

# Single-row frame transposed into one row per feature with a 'Value' column.
data_correlations = pd.DataFrame(correlations, index=['Value']).T

# Row labels ordered by descending absolute correlation.
strongest_first = data_correlations['Value'].abs().sort_values(ascending=False).index
sorted_c = data_correlations.loc[strongest_first]

# Lift the row limit so the full ranking is shown.
pd.set_option('display.max_rows', None)
display(sorted_c)



In [None]:
# Train with ridge regression: choose the regularisation strength by
# 10-fold cross-validated RMSE.

# Candidate Ridge alphas. Named `alphas` rather than `range` so the builtin
# range() is not shadowed for the rest of the kernel session.
alphas = [0.01, 0.03, 0.05, 1.0, 1.5, 2.0]

rmses = []       # mean cross-validated RMSE, one entry per alpha
pipelines = []   # the pipeline evaluated for each alpha, same order as `alphas`

transform_pipeline = Pipeline(steps = [
        ('catencode', enc),
        ('null_handler', imp)])

# The feature pipeline is fitted once here; it is fitted again on each
# training fold as a step of `pipeline` below.
fe = transform_pipeline.fit(training, Y)

# random_state only has an effect together with shuffle=True, and recent
# scikit-learn releases reject random_state when shuffle is left False.
# Shuffling with a fixed seed keeps the folds reproducible, and the same
# splitter is reused so every alpha is scored on identical folds.
kf = KFold(n_splits = 10, shuffle = True, random_state = 0)

for alpha in alphas:
    print("Train with alpha ", alpha)
    ridge_reg = Ridge(alpha=alpha, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=True, random_state=None, solver='auto', tol=0.001)

    pipeline = Pipeline(steps = [
        ('fe', fe),
        ('lr', ridge_reg)])

    iter_rmse = []
    for iteration, (train_idx, test_idx) in enumerate(kf.split(training)):
        print("KFold iteration ", iteration)
        # kf.split yields positional indices, so both the features and the
        # target are selected with .iloc.
        X_train, X_test = training.iloc[train_idx], training.iloc[test_idx]
        y_train, y_test = Y.iloc[train_idx], Y.iloc[test_idx]

        model = pipeline.fit(X_train, y_train)
        y_predict = model.predict(X_test)

        mse = mean_squared_error(y_test, y_predict)
        rmse = math.sqrt(mse)
        print(rmse)
        iter_rmse.append(rmse)

    rmses.append(np.mean(iter_rmse))
    pipelines.append(pipeline)

# Position of the alpha with the lowest mean RMSE.
min_index = np.argmin(rmses)
print('Min RMSE index: ', min_index)


best_pipeline = pipelines[min_index]
print('Best pipeline', best_pipeline)

plt.plot(alphas, rmses)
plt.xlabel('Ridge alpha')
plt.ylabel('Mean 10-fold RMSE')
plt.title('Cross-validated RMSE by alpha')
plt.show()


In [None]:
# Build the submission: refit the selected pipeline on all training rows and
# predict the target for the test file.

sp_id = testDf['Id']
sp_x = testDf.drop('Id', axis = 1)

# Give the test features the same GarageYrBlt treatment the training features
# received (missing -> 1900, stored as int), so the model is not scored on
# differently prepared data.
sp_x['GarageYrBlt'] = sp_x['GarageYrBlt'].fillna(1900.0).astype(int)

m = best_pipeline.fit(training, Y)
pred = m.predict(sp_x)


# One row per test Id with its predicted target value.
result = pd.DataFrame({'Id': sp_id, 'SalePrice': pred})

display(result)
result.to_csv('./submission.csv', index = False)
