In [None]:
#these libraries let us read in the data and work with dataframes and arrays
import pandas as pd
import numpy as np
import seaborn as sns

#import our plotting libraries
%matplotlib inline
import matplotlib as mpl 
import matplotlib.pyplot as plt
import scipy.stats as stats
plt.style.use('ggplot')

#read in the entire house dataset (the path is relative to the notebook's working directory)
house_df = pd.read_csv('house_prices_1/train.csv')

#outputs the no of rows and columns in the dataset
print('=> The number of rows and columns in our dataset is (rows, columns):')
print(house_df.shape,'\n')

#outputs a sample of the column headers (the columns at positions 2 to 7)
print('=> A sample of the column headers in the dataset are:')
print(house_df.iloc[:,2:8].columns.values)

#outputs the summary statistics for the columns from position 71 onwards
print('\n=> These are the summary statistics for a sample of the columns:\n')
print(house_df.iloc[:,71:].describe())

#DataFrame.info() writes its report to stdout itself and returns None, so it is called
#directly; wrapping it in print() added a stray "None" line after the report
print('\n=> These are the info for a sample of the columns:\n')
house_df.iloc[:,71:80].info()

In [None]:
#three columns to eyeball for outliers
sample_df = house_df[['SalePrice', 'PoolArea','MoSold']]

#function for plotting stripplots given a dataframe
def stripplot_these(df):
    """Draw one vertical strip plot per column of ``df``, side by side in one row.

    The panels are added to the current matplotlib figure, so create the figure
    (and choose its size) before calling this function.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are each drawn in their own panel.
    """
    #size the grid from the frame itself; a hard-coded 3 fails as soon as a
    #fourth column is passed in
    n_panels = len(df.columns)
    for panel, name in enumerate(df.columns, start=1):
        plt.subplot(1, n_panels, panel)
        #the column is passed as y so the strip is vertical on every seaborn version;
        #newer releases ignore orient='v' (with a warning) when only x is supplied
        sns.stripplot(y=name, data=df, jitter=0.15, orient='v', alpha=.4)
    plt.tight_layout()
    plt.show()

plt.figure(figsize=(15, 3))
stripplot_these(sample_df)

In [None]:
#density histogram of the sale prices; the dashed grey line marks the outlier threshold
sale_prices = house_df['SalePrice']
mu, sd = sale_prices.mean(), sale_prices.std()
li = mu + 3 * sd  #threshold sits three standard deviations above the mean

plt.figure(figsize=(10, 5))
plt.hist(sale_prices, bins=75, density=True, color='orange')
plt.axvline(li, color='grey', linestyle='dashed', linewidth=2)
plt.title('Density Histogram of Sale Price')
plt.show()

In [None]:
#list of column names to keep (the target plus four candidate features)
col_names = ['SalePrice','OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea']

#creating new filtered dataframe
new_df =  house_df[col_names]

#print the new shape of the data
print('The current number of rows and columns is:\n')
print(new_df.shape)
#the "before" count is read from house_df so the message stays correct if the
#source file changes, instead of relying on a hard-coded 81
print('\nWe have reduce the number of variables from', house_df.shape[1], 'to', new_df.shape[1],'\n')

#checking info of new dataframe; info() prints its own report and returns None,
#so it is not wrapped in print() (that appended a stray "None" line)
new_df.info()

In [None]:
#split the columns into the IV's (features) and the DV (target) with one boolean mask
is_target = new_df.columns == 'SalePrice'
features = new_df.loc[:, ~is_target]
targets = new_df.loc[:, is_target]

#pairwise Pearson's R between the feature columns
corr = features.corr(method='pearson')

#draw the coefficients as an annotated seaborn heatmap on a fixed -1..1 colour scale
heatmap_kwargs = dict(
    annot=True,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    linewidths=.5,
    vmin=-1,
    vmax=1,
    center=0,
)
fig, ax = plt.subplots(figsize=(6,5))
sns.heatmap(corr, ax=ax, **heatmap_kwargs)

plt.title('Correlation HeatMap')
plt.show()

In [None]:
#train_test_split does the splitting; cross_val_score is imported here as well
from sklearn.model_selection import train_test_split, cross_val_score

#random split that holds out 10% of the rows for testing (seeded so it is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.1, random_state=42
)

#report how many rows landed in each of the four pieces
row_counts = [
    ('Our training prediction variable contains :', y_train),
    ('Our training independent variable contains :', X_train),
    ('Our testing prediction variable contains :', y_test),
    ('Our testing independent variable contains :', X_test),
]
for label, part in row_counts:
    print(label, len(part), 'rows')

In [None]:
#scikit-learn's linear regression estimator, plus its metrics module
from sklearn.linear_model import LinearRegression
import sklearn.metrics

#create the (not yet fitted) regression model; no parameters are overridden
reg_model = LinearRegression()

In [None]:
#train the model on the training split
reg_model.fit(X_train, y_train)

#y_train is a one-column DataFrame, so intercept_ and coef_ carry one entry/row
#per target; [0] selects the values for the single target
intercept = reg_model.intercept_[0]
coef_table = pd.DataFrame({'features': X_train.columns,
                           'coeficients': reg_model.coef_[0]})

#show the intercept followed by the coefficient of each feature
print('Intercept :', intercept, '\n')
print(coef_table)

In [None]:
#outputs the training and testing scores (the value returned by reg_model.score);
#the testing score was announced here but never printed
print("Training set score: {:.2f}".format(reg_model.score(X_train, y_train)))
print("Testing set score: {:.2f}".format(reg_model.score(X_test, y_test)))

In [None]:
import math
from sklearn.model_selection import cross_val_score

#10-fold cross validation of a fresh linear model on the full feature/target set,
#scored with r2
cv_reg_model = LinearRegression()
cv_scores = cross_val_score(cv_reg_model, features, targets, scoring='r2', cv=10)

#show the score of every fold, then their mean
mean_cv_score = np.mean(cv_scores)
print('Cross Validation scores: {}'.format(cv_scores))
print("\nAverage 10-Fold CV Score: {}".format(mean_cv_score))

In [None]:
#prediction values using testing set
y_pred = reg_model.predict(X_test)

#printing sample of predictions
print('Sample predictions are:\n', y_pred[:5].flatten())

#actual prices (column 'SalePrice') next to predicted prices (column 0); the test
#index is reset so the two frames pair up row by row.
#The former sort_values(1, axis=1) only reordered these two columns according to
#the values found in row 1, which carried no meaning, so it has been dropped.
y_pred_a = pd.DataFrame(y_pred)
y_test_a = y_test.reset_index(drop=True, inplace=False)
pred_act_df = pd.concat([y_test_a, y_pred_a], axis=1)

#axis limits are taken from BOTH columns so that predictions above the highest
#actual price (or below -1) are not clipped out of the plot
plot_values = pred_act_df[['SalePrice', 0]].to_numpy()
minlimit = min(-1, plot_values.min())
maxlimit = plot_values.max()

#scatter of predicted vs actual prices with the y = x reference line
plt.figure(figsize=(6, 6))
plt.xlim(minlimit, maxlimit)
plt.ylim(minlimit, maxlimit)
plt.scatter(pred_act_df[0], pred_act_df['SalePrice'], s=10)
plt.xlabel('Predicted Sale Price')
plt.ylabel('Actual Sale Price')
plt.plot([minlimit, maxlimit], [minlimit, maxlimit], ls="--", c=".3")
plt.title('Actual vs Predicted Sale Prices')
plt.show()

In [None]:
#stats model library allows us to run OLS directly
import statsmodels.api as sm
#KFold makes the folds explicit so the size of every held-out fold is known
from sklearn.model_selection import KFold

#re-run OLS with statsmodels on the training split to read off its adjusted r2
X_train_sm = sm.add_constant(X_train)
sm_train_model = sm.OLS(y_train, X_train_sm).fit()
print('Training set adj r2: {}'.format(sm_train_model.rsquared_adj))

#run CV again; KFold(n_splits=10) is the same unshuffled split that cv=10 selects
#for a regressor, so the r2 values match the earlier integer-cv run
kfold = KFold(n_splits=10)
cv_scores_r2 = cross_val_score(cv_reg_model, features, targets, cv=kfold, scoring='r2')

#each r2 is measured on one held-out fold, so the adjustment has to use that fold's
#row count as n; using the size of the whole dataset understated the penalty
fold_sizes = [len(test_idx) for _, test_idx in kfold.split(features)]
k = len(features.columns)
cv_scores_adj = [
    1 - (((1 - r) * (n_fold - 1)) / (n_fold - k - 1))
    for r, n_fold in zip(cv_scores_r2, fold_sizes)
]
print('Average 10-Fold CV adj r2: {}'.format(np.mean(cv_scores_adj)))

In [None]:
#mean_squared_error gives the MSE; its square root is the RMSE
from sklearn.metrics import mean_squared_error

#predictions of the fitted model on both splits
y_pred_train = reg_model.predict(X_train)
y_pred_test = reg_model.predict(X_test)

#RMSE per split
mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)
rmse_train = np.sqrt(mse_train)
rmse_test = np.sqrt(mse_test)

print("Root Mean Squared Error of Training Set: {}".format(rmse_train))
print("Root Mean Squared Error of Testing Set: {}".format(rmse_test))

In [None]:
#statsmodels supplies the OLS model used below
import statsmodels.api as sm

#add the constant (intercept) column to the training features, then fit OLS on them
X_train_sm = sm.add_constant(X_train)
ols_spec = sm.OLS(y_train, X_train_sm)
sm_model = ols_spec.fit()

#full regression summary, followed by the adjusted r2 on its own
print(sm_model.summary())
print(sm_model.rsquared_adj)

In [None]:
#residual = actual - predicted. The predictions are wrapped in a DataFrame and the
#test index is reset so the subtraction lines up row by row
#(note: this rebinds y_pred and y_test for any cell that runs afterwards)
y_pred = pd.DataFrame(y_pred)
y_test = y_test.reset_index(drop=True)
actual_prices = y_test['SalePrice']
predicted_prices = y_pred[0]
residuals = actual_prices - predicted_prices

#two panels side by side: residuals by row position, and a normal probability plot
plt.figure(figsize=(18, 5))

#left panel - residuals scattered around the zero line
plt.subplot(1, 2, 1)
plt.axhline(0, color="blue")
plt.title('Plot of Residuals')
plt.scatter(residuals.index, residuals, s=20)

#right panel - residual quantiles against a normal distribution
plt.subplot(1, 2, 2)
plt.title('Probability Plot')
stats.probplot(residuals, dist='norm', plot=plt)

plt.show()