In [None]:
# Libraries

import re
import warnings
from collections import Counter
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from bokeh.models import HoverTool
from bokeh.plotting import figure, output_file, show, ColumnDataSource

from scipy import stats
from scipy.cluster.hierarchy import fcluster
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.stats import norm

from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.manifold import TSNE
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize

from xgboost import XGBRegressor


In [None]:
# Load the raw train and test tables from the local `data/` folder.
# The two reads are independent of each other.

train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

In [None]:
# Remove missing values (NaN / None / NaT) from a list-like of scalars
def cleanan(input_list):
    """Return a new list holding the non-missing items of ``input_list``.

    Missing values are detected with ``pd.isnull`` rather than by comparing
    ``str(item)`` to ``'nan'``, so a genuine string ``'nan'`` is kept, while
    ``None`` and ``NaT`` are treated as missing alongside ``float('nan')``.

    Parameters
    ----------
    input_list : iterable of scalars
        For example the array returned by ``Series.unique()``.

    Returns
    -------
    list
        The items of ``input_list`` that are not missing, in original order.
    """
    return [item for item in input_list if not pd.isnull(item)]


# Some Overview

In [None]:
# 1. Train Data
# Preview the first three rows of the raw training frame.

train_data.head(3)

In [None]:
# Per-column count of missing values in the training frame, keeping only
# the columns that actually have at least one NaN (counts computed once).
train_data.isnull().sum().loc[lambda counts: counts != 0]

In [None]:
# 2. Test Data
# Preview the first three rows of the raw test frame.

test_data.head(3)

In [None]:
# Per-column count of missing values in the test frame, keeping only
# the columns that actually have at least one NaN (counts computed once).
test_data.isnull().sum().loc[lambda counts: counts != 0]

# Here we split numeric and non-numeric data into two separate dataframes to deal with NANs separately

In [None]:
# Split each raw frame by dtype:
#   cols_to_change_*  -> every column that is NOT float64/int64
#   cols_to_keep_*    -> the float64/int64 columns
# `.copy()` gives each half its own data, so adding the IDX column below does not
# write into a selection of the raw frame (avoids pandas' SettingWithCopyWarning).

# 1. Train Data

cols_to_change_1 = train_data.select_dtypes(exclude=['float64','int64']).copy()
cols_to_keep_1 = train_data.select_dtypes(include=['float64','int64']).copy()

# the row index is stored as an explicit IDX column; it is the key for the later merge

cols_to_change_1['IDX'] = cols_to_change_1.index
cols_to_keep_1['IDX'] = cols_to_keep_1.index

# 2. Test Data

cols_to_change_2 = test_data.select_dtypes(exclude=['float64','int64']).copy()
cols_to_keep_2 = test_data.select_dtypes(include=['float64','int64']).copy()

# the row index is stored as an explicit IDX column; it is the key for the later merge

cols_to_change_2['IDX'] = cols_to_change_2.index
cols_to_keep_2['IDX'] = cols_to_keep_2.index

# Visual inspection of non-numeric data

In [None]:
# 1.TRAIN DATA
# Summarise every non-numeric training column: name, NaN count, raw unique
# values, and unique values with the missing markers removed.
# Columns are selected by name: the helper IDX column is excluded and no real
# column is skipped (iterating from position 1 dropped the first column and
# reported IDX instead).
inspect_cols = [name for name in cols_to_change_1.columns if name != 'IDX']

non_num = [
    [name,
     cols_to_change_1[name].isnull().sum(),
     cols_to_change_1[name].unique(),
     cleanan(cols_to_change_1[name].unique())]
    for name in inspect_cols
]

df_str = pd.DataFrame(non_num, columns=['col','NANs','Unique','Unique_Clean'])

# columns with the most missing values first
df_str.sort_values(by='NANs', ascending=False)

In [None]:
# 2. TEST DATA
# Summarise every non-numeric test column: name, NaN count, raw unique
# values, and unique values with the missing markers removed.
# Columns are selected by name: the helper IDX column is excluded and no real
# column is skipped (iterating from position 1 dropped the first column and
# reported IDX instead).
inspect_cols = [name for name in cols_to_change_2.columns if name != 'IDX']

non_num = [
    [name,
     cols_to_change_2[name].isnull().sum(),
     cols_to_change_2[name].unique(),
     cleanan(cols_to_change_2[name].unique())]
    for name in inspect_cols
]

df_str = pd.DataFrame(non_num, columns=['col','NANs','Unique','Unique_Clean'])

# columns with the most missing values first
df_str.sort_values(by='NANs', ascending=False)

# The columns in this loop have a large number of NANs. 
##### Instead of replacing them with the most frequent value, each NAN will be replaced with a random value drawn from the column's non-NAN values


In [None]:
# 1. TRAIN DATA
# Replace ONLY the missing entries of the sparse columns with a random draw from
# the values observed in the same column. Existing values are left untouched
# (the previous lambda ignored its argument and overwrote every row).

rng = np.random.RandomState(42)  # fixed seed: identical fill on every Restart & Run All

for colu in ['Alley','FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']:
    observed = cleanan(cols_to_change_1[colu].unique())  # computed once per column
    is_missing = cols_to_change_1[colu].isnull()
    # nothing to draw from, or nothing to fill -> leave the column as it is
    if len(observed) > 0 and is_missing.any():
        cols_to_change_1.loc[is_missing, colu] = rng.choice(observed, size=int(is_missing.sum()))

In [None]:
# 2. TEST DATA
# Replace ONLY the missing entries of the sparse columns with a random draw from
# the values observed in the same column. Existing values are left untouched
# (the previous lambda ignored its argument and overwrote every row).

rng = np.random.RandomState(42)  # fixed seed: identical fill on every Restart & Run All

for colu in ['Alley','FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']:
    observed = cleanan(cols_to_change_2[colu].unique())  # computed once per column
    is_missing = cols_to_change_2[colu].isnull()
    # nothing to draw from, or nothing to fill -> leave the column as it is
    if len(observed) > 0 and is_missing.any():
        cols_to_change_2.loc[is_missing, colu] = rng.choice(observed, size=int(is_missing.sum()))

# For the rest of the columns, NANs will be replaced with most frequent value
#### No need to filter non-NAN columns

In [None]:
# 1. TRAIN DATA
# Fill the remaining NaNs of every column with that column's most frequent value.
# The filled column is assigned back explicitly: `df[col].fillna(..., inplace=True)`
# acts on a temporary selection and is not guaranteed to modify the frame itself.

for colu in cols_to_change_1.columns:
    modes = cols_to_change_1[colu].mode()
    if modes.empty:
        # column with no non-missing value: there is no mode to fill with
        continue
    cols_to_change_1[colu] = cols_to_change_1[colu].fillna(modes.iloc[0])
    
        

In [None]:
# 2. TEST DATA
# Fill the remaining NaNs of every column with that column's most frequent value
# (computed on the test frame itself).
# The filled column is assigned back explicitly: `df[col].fillna(..., inplace=True)`
# acts on a temporary selection and is not guaranteed to modify the frame itself.

for colu in cols_to_change_2.columns:
    modes = cols_to_change_2[colu].mode()
    if modes.empty:
        # column with no non-missing value: there is no mode to fill with
        continue
    cols_to_change_2[colu] = cols_to_change_2[colu].fillna(modes.iloc[0])


# checking for any NAN left

In [None]:
# 1. TRAIN DATA
# Columns of the non-numeric training frame that still contain NaNs
# (an empty result means the fill steps covered everything).

cols_to_change_1.isnull().sum().loc[lambda counts: counts != 0]

In [None]:
# 2. TEST DATA
# Columns of the non-numeric test frame that still contain NaNs
# (an empty result means the fill steps covered everything).

cols_to_change_2.isnull().sum().loc[lambda counts: counts != 0]

# Creating the dummies values

In [None]:
# 1. TRAIN DATA
# One-hot encode the non-numeric training frame; drop_first=True drops the first
# level of each encoded column.

cols_to_change_dum_1 = pd.get_dummies(data=cols_to_change_1, drop_first=True)

In [None]:
# 2. TEST DATA
# One-hot encode the non-numeric test frame; drop_first=True drops the first
# level of each encoded column.
# NOTE(review): train and test are encoded independently, so their dummy columns
# can differ - the notebook checks for this further down.

cols_to_change_dum_2 = pd.get_dummies(data=cols_to_change_2, drop_first=True)

# Visual inspection of numeric data

In [None]:
# 1. TRAIN DATA
# Mean, median and NaN count for every numeric training column except the one
# in first position.

numeric_cols = cols_to_keep_1.columns[1:]
num_vals = [
    [name,
     cols_to_keep_1[name].mean(),
     cols_to_keep_1[name].median(),
     cols_to_keep_1[name].isnull().sum()]
    for name in numeric_cols
]

df_num = pd.DataFrame(num_vals)
df_num.columns = ['col','mean','median','NANs']

# columns with the most missing values first
df_num.sort_values(by='NANs', ascending=False)

In [None]:
# 2. TEST DATA
# Mean, median and NaN count for every numeric test column except the one
# in first position.

numeric_cols = cols_to_keep_2.columns[1:]
num_vals = [
    [name,
     cols_to_keep_2[name].mean(),
     cols_to_keep_2[name].median(),
     cols_to_keep_2[name].isnull().sum()]
    for name in numeric_cols
]

df_num = pd.DataFrame(num_vals)
df_num.columns = ['col','mean','median','NANs']

# columns with the most missing values first
df_num.sort_values(by='NANs', ascending=False)

# Replacing the NaN values with median

In [None]:
# 1. TRAIN DATA
# Median-impute the numeric training columns (column-wise, axis=0).

imp = Imputer(missing_values='NaN', strategy='median', axis=0)

# fit and transform in one call; the result is a plain array without labels
cols_to_keep_1_nonan = imp.fit_transform(cols_to_keep_1)

# wrap the array back into a dataframe, reusing the original column headers
cols_to_keep_1_df = pd.DataFrame(cols_to_keep_1_nonan)
cols_to_keep_1_df.columns = cols_to_keep_1.columns

In [None]:
# 2. TEST DATA
# Median-impute the numeric test columns (column-wise, axis=0).
# NOTE(review): the medians are learned from the test frame itself, not from the
# training frame - confirm this is intended.

imp = Imputer(missing_values='NaN', strategy='median', axis=0)

# fit and transform in one call; the result is a plain array without labels
cols_to_keep_2_nonan = imp.fit_transform(cols_to_keep_2)

# wrap the array back into a dataframe, reusing the original column headers
cols_to_keep_2_df = pd.DataFrame(cols_to_keep_2_nonan)
cols_to_keep_2_df.columns = cols_to_keep_2.columns

# Merging the numeric and non-numeric parts

In [None]:
# 1. Train Data
# Re-join the encoded categorical part with the imputed numeric part on the IDX key
# (left join: every row of the categorical part is kept).
new_train = cols_to_change_dum_1.merge(cols_to_keep_1_df, how='left', on='IDX')

In [None]:
# 2. Test Data
# Re-join the encoded categorical part with the imputed numeric part on the IDX key
# (left join: every row of the categorical part is kept).
new_test = cols_to_change_dum_2.merge(cols_to_keep_2_df, how='left', on='IDX')

## Oooops! Some missing columns

In [None]:
# (rows, columns) of the merged training frame
new_train.shape

In [None]:
# (rows, columns) of the merged test frame - compare the column count with new_train
new_test.shape

In [None]:
# Columns present in the training frame but absent from the test frame,
# listed in training-column order.
test_columns = set(new_test.columns)
discrep = [col for col in new_train.columns if col not in test_columns]
discrep

In [None]:
# Backward check: columns present in the test frame but absent from the
# training frame, listed in test-column order.
train_columns = set(new_train.columns)
discrep2 = [col for col in new_test.columns if col not in train_columns]
discrep2

In [None]:
# Build a zero-filled dataframe holding the columns that the test frame lacks,
# PLUS column 'IDX' as the merge key.
# The frame is created directly with integer zeros on the test index, which keeps
# the filler columns numeric (an empty frame filled afterwards starts out with
# object-dtype columns) and needs no in-place fillna.
# NOTE(review): `discrep` lists every train-only column, which presumably
# includes the target 'SalePrice' - confirm that a zero-filled target column in
# the test frame is acceptable.
new_test_cols = pd.DataFrame(0, index=new_test.index, columns=discrep)
new_test_cols['IDX'] = new_test['IDX']

In [None]:
# Attach the zero-filled filler columns to the test frame via the IDX key.
new_test_adjusted = new_test.merge(new_test_cols, how='left', on='IDX')

In [None]:
# (rows, columns) of the test frame after the filler columns were added
new_test_adjusted.shape

In [None]:
# Final check: an empty list means every training column now also exists in
# the adjusted test frame.
adjusted_columns = set(new_test_adjusted.columns)
discrep = [col for col in new_train.columns if col not in adjusted_columns]
discrep

# Analysis
#### Vinay's notebook

In [None]:
# Pairwise correlation matrix of the merged training frame, drawn as a heatmap
# on an explicitly created 12x9 axes.
corrmat = new_train.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True, ax=ax);
plt.show()

In [None]:
# Correlation coefficients among the k variables most correlated with SalePrice
k = 10  # number of variables shown in the heatmap
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index

# np.corrcoef treats each row as a variable, hence the transpose
cm = np.corrcoef(new_train[cols].values.T)

sns.set(font_scale=1.25)
tick_labels = cols.values
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 8},
    yticklabels=tick_labels,
    xticklabels=tick_labels,
)
plt.show()

In [None]:
# Distinct values taken by the OverallQual column
new_train['OverallQual'].unique()

In [None]:
# Box plot: SalePrice distribution per OverallQual level (merged training frame)
var = 'OverallQual'
data = new_train[['SalePrice', var]]
f, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x=var, y="SalePrice", data=data, ax=ax)
ax.axis(ymin=0, ymax=800000);  # fixed price range on the y axis
plt.show()

In [None]:
# Box plot: SalePrice distribution per KitchenQual level (raw training frame)
var = 'KitchenQual'
data = train_data[['SalePrice', var]]
f, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x=var, y="SalePrice", data=data, ax=ax)
ax.axis(ymin=0, ymax=800000);  # fixed price range on the y axis
plt.show()

In [None]:
# Box plot: SalePrice distribution per GarageQual level (raw training frame)
var = 'GarageQual'
data = train_data[['SalePrice', var]]
f, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x=var, y="SalePrice", data=data, ax=ax)
ax.axis(ymin=0, ymax=800000);  # fixed price range on the y axis
plt.show()

In [None]:
# Box plot: SalePrice distribution per RoofMatl value (raw training frame)
var = 'RoofMatl'
data = train_data[['SalePrice', var]]
f, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x=var, y="SalePrice", data=data, ax=ax)
ax.axis(ymin=0, ymax=800000);  # fixed price range on the y axis
plt.show()

In [None]:
# Box plot: SalePrice distribution per LandSlope value (raw training frame)
var = 'LandSlope'
data = train_data[['SalePrice', var]]
f, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x=var, y="SalePrice", data=data, ax=ax)
ax.axis(ymin=0, ymax=800000);  # fixed price range on the y axis
plt.show()

In [None]:
# Box plot: SalePrice distribution per Street value (raw training frame)
var = 'Street'
data = train_data[['SalePrice', var]]
f, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x=var, y="SalePrice", data=data, ax=ax)
ax.axis(ymin=0, ymax=800000);  # fixed price range on the y axis
plt.show()

In [None]:
# Box plot: SalePrice distribution per WoodDeckSF value (raw training frame)
# NOTE(review): this cell was originally labelled "Neighborhood/saleprice" but
# plots 'WoodDeckSF' - confirm which variable was intended.
var = 'WoodDeckSF'
data = train_data[['SalePrice', var]]
f, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x=var, y="SalePrice", data=data, ax=ax)
ax.axis(ymin=0, ymax=800000);  # fixed price range on the y axis
plt.show()

In [None]:
# Pairwise scatter plots of SalePrice against a hand-picked set of features
sns.set()
cols = [
    'SalePrice',
    'YearRemodAdd',
    'OverallQual',
    '1stFlrSF',
    'GarageCars',
    'TotalBsmtSF',
    'FullBath',
]
pair_data = train_data[cols]
# NOTE(review): newer seaborn releases name this argument `height` - confirm
# against the installed version before upgrading.
sns.pairplot(pair_data, size = 3)
plt.show();

In [None]:
# Outlier check: GrLivArea against SalePrice (raw training frame)
fig, ax = plt.subplots()
ax.scatter(train_data["GrLivArea"], train_data["SalePrice"], c="blue", marker="s")
ax.set_title("Looking for outliers")
ax.set_xlabel("GrLivArea")
ax.set_ylabel("SalePrice")
plt.show()
# Observation: rows with a living area above 4000 are candidates for removal

In [None]:
# Outlier check: OverallQual against SalePrice (raw training frame)
fig, ax = plt.subplots()
ax.scatter(train_data["OverallQual"], train_data["SalePrice"], c="blue", marker="s")
ax.set_title("Looking for outliers")
ax.set_xlabel("OverallQual")
ax.set_ylabel("SalePrice")
plt.show()
# Observation: looks ok

In [None]:
# Outlier check: 1stFlrSF against SalePrice (raw training frame)
fig, ax = plt.subplots()
ax.scatter(train_data["1stFlrSF"], train_data["SalePrice"], c="blue", marker="s")
ax.set_title("Looking for outliers")
ax.set_xlabel("1stFlrSF")
ax.set_ylabel("SalePrice")
plt.show()
# Observation: anything greater than 3500 looks fishy

In [None]:
# Outlier check: TotalBsmtSF against SalePrice (raw training frame)
fig, ax = plt.subplots()
ax.scatter(train_data["TotalBsmtSF"], train_data["SalePrice"], c="blue", marker="s")
ax.set_title("Looking for outliers")
ax.set_xlabel("TotalBsmtSF")
ax.set_ylabel("SalePrice")
plt.show()
# Observation: anything greater than 3500 looks fishy

In [None]:
# Outlier check: FullBath against SalePrice (raw training frame)
fig, ax = plt.subplots()
ax.scatter(train_data["FullBath"], train_data["SalePrice"], c="blue", marker="s")
ax.set_title("Looking for outliers")
ax.set_xlabel("FullBath")
ax.set_ylabel("SalePrice")
plt.show()
# NOTE(review): the original remark here ("anything greater than 3500 looks fishy")
# appears to be copied from the square-footage cells; FullBath is presumably a
# small count, so that threshold would not apply - TODO write the real observation.

# My analysis starts here

In [None]:
# XGBoost baseline and Kaggle submission file.
# The cell builds its own inputs so it runs on a fresh kernel: it previously
# relied on X / y / X_test from cells further down and on `df_test`, which is
# not defined anywhere in this notebook, and it predicted on a slice of the
# training data instead of the test set.

feature_cols = [col for col in new_train.columns if col != 'SalePrice']

# training matrix and target
X = new_train[feature_cols].values.astype(float)
y = new_train['SalePrice'].values

# test matrix with exactly the training feature columns, in the same order;
# a feature the test frame lacks is filled with 0
X_submit = new_test_adjusted.reindex(columns=feature_cols, fill_value=0).values.astype(float)

my_model = XGBRegressor()
# verbose=False keeps the fit from printing progress updates
my_model.fit(X, y, verbose=False)

# make predictions for the real test rows
y_pred = my_model.predict(X_submit)

# assumes test_data has an 'Id' column and that new_test_adjusted kept the
# original row order - TODO confirm
submission = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': y_pred})
submission.to_csv('NSS_twisted_horses_XGB.csv', index=False)
# Score noted for an earlier version of this cell:
#Your submission scored 0.16555, which is an improvement of your previous score of 0.17572. Great job!

In [None]:
# Imputer + XGBoost pipeline and Kaggle submission file.
# NOTE(review): this cell was originally marked "DO NOT RUN UNTIL XGB SOLVED".
# It now builds its own inputs so it runs on a fresh kernel: it previously
# relied on X / y / X_test from other cells and on `df_test`, which is not
# defined anywhere in this notebook.

feature_cols = [col for col in new_train.columns if col != 'SalePrice']

# training matrix and target
X = new_train[feature_cols].values.astype(float)
y = new_train['SalePrice'].values

# test matrix with exactly the training feature columns, in the same order;
# a feature the test frame lacks is filled with 0
X_submit = new_test_adjusted.reindex(columns=feature_cols, fill_value=0).values.astype(float)

my_pipeline = make_pipeline(Imputer(), XGBRegressor())

my_pipeline.fit(X, y)
y_pred = my_pipeline.predict(X_submit)

# assumes test_data has an 'Id' column and that new_test_adjusted kept the
# original row order - TODO confirm
submission = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': y_pred})
submission.to_csv('NSS_twisted_horses_pipeline_XGB.csv', index=False)
# Score noted for an earlier version of this cell:
#Your submission scored 0.18125, which is not an improvement of your best score. Keep trying!

In [None]:
# Split the merged training frame into the response vector (y) and the
# feature matrix (X), both as plain arrays.
target_column = 'SalePrice'
y = new_train[target_column].values
X = new_train.drop(target_column, axis=1).values

In [None]:
# Hold out 30% of the training rows for evaluation (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Ordinary least squares fitted on the training part (fit returns the estimator)
reg_all = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows
y_pred = reg_all.predict(X_test)

# Hold-out metrics: R^2 from the estimator, RMSE from the predictions
r_squared = reg_all.score(X_test, y_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("R^2: {}".format(r_squared))
print("Root Mean Squared Error: {}".format(rmse))

In [None]:
# Predictions of the linear model for every row of X, shown as one 'SalePrice'
# column (wrapping the array in a list produced a single row with one column
# per house).
# NOTE(review): X is the full training matrix, so despite the name these are
# in-sample predictions, not a submission - confirm intent.
y_submit = reg_all.predict(X)
pd.DataFrame({'SalePrice': y_submit}).head()

In [None]:

# 5-fold cross-validation of the linear model on the full X / y
cv_scores = cross_val_score(reg_all, X, y, cv=5)

# One score per fold, followed by their average
print(cv_scores)
mean_cv_score = np.mean(cv_scores)
print("Average 5-Fold CV Score: {}".format(mean_cv_score))


In [None]:
# Lasso regression on the full X / y, used to look at the coefficient sizes
lasso = Lasso(alpha=0.4, normalize=True)

# Fit the regressor to the data
lasso.fit(X, y)

# Compute and print the coefficients (one per feature column of X)
lasso_coef = lasso.coef_
print(lasso_coef)

# Plot one point per coefficient. The x axis is indexed by FEATURE, so its
# length comes from the coefficient vector and the tick labels are the feature
# column names (the previous version used the number of rows of new_train,
# which raised "x and y must have same first dimension").
feature_names = new_train.drop('SalePrice', axis=1).columns
positions = range(len(lasso_coef))
plt.plot(positions, lasso_coef)
plt.xticks(positions, feature_names, rotation=60)
plt.margins(0.02)
plt.show()

In [None]:
# Sweep 50 log-spaced alphas and record, for each one, the mean and the
# standard deviation of the 10-fold CV scores of a ridge regressor.
alpha_space = np.logspace(-4, 0, 50)
ridge_scores = []
ridge_scores_std = []

# A single estimator is reused; only its alpha changes between iterations
ridge = Ridge(alpha=0.1, normalize=True)

for alpha in alpha_space:
    ridge.set_params(alpha=alpha)

    # 10-fold CV scores for the current alpha
    fold_scores = cross_val_score(ridge, X, y, cv=10)

    ridge_scores.append(fold_scores.mean())
    ridge_scores_std.append(fold_scores.std())

# Scatter of mean CV score (x) against the std of the CV scores (y)
fig, ax = plt.subplots()
ax.scatter(ridge_scores, ridge_scores_std, c="blue", marker=".")
ax.set_title("Ridge Score")
ax.set_xlabel("Ridge Score")
ax.set_ylabel("Standardized RS")
plt.show()


In [None]:
# k-nearest-neighbours model for SalePrice.
# SalePrice is a continuous target, so a k-NN REGRESSOR is used and scored with
# R^2 / RMSE, matching the linear-regression cell. (A classifier would treat
# every distinct price as its own class, which makes the confusion matrix and
# classification report meaningless.)

# Hold out 40% of the rows for evaluation (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state=42)

# Instantiate a k-NN regressor: knn
knn = KNeighborsRegressor(n_neighbors=6)

# Fit the regressor to the training data
knn.fit(X_train, y_train)

# Predict the prices of the held-out rows: y_pred
y_pred = knn.predict(X_test)

# Compute and print R^2 and RMSE on the held-out rows
print("R^2: {}".format(knn.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error: {}".format(rmse))
