In [90]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from scipy.stats import chi2_contingency
import seaborn as sns
from sklearn.model_selection import train_test_split

# 1. Preprocessing & Exploration

In [91]:
og_df = pd.read_csv('../AmesHousingForecast/ames.csv')
og_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2930 entries, 0 to 2929
Data columns (total 81 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   MS_SubClass         2930 non-null   object 
 1   MS_Zoning           2930 non-null   object 
 2   Lot_Frontage        2930 non-null   int64  
 3   Lot_Area            2930 non-null   int64  
 4   Street              2930 non-null   object 
 5   Alley               2930 non-null   object 
 6   Lot_Shape           2930 non-null   object 
 7   Land_Contour        2930 non-null   object 
 8   Utilities           2930 non-null   object 
 9   Lot_Config          2930 non-null   object 
 10  Land_Slope          2930 non-null   object 
 11  Neighborhood        2930 non-null   object 
 12  Condition_1         2930 non-null   object 
 13  Condition_2         2930 non-null   object 
 14  Bldg_Type           2930 non-null   object 
 15  House_Style         2930 non-null   object 
 16  Overal

In [92]:
og_df.head()

Unnamed: 0,MS_SubClass,MS_Zoning,Lot_Frontage,Lot_Area,Street,Alley,Lot_Shape,Land_Contour,Utilities,Lot_Config,...,Fence,Misc_Feature,Misc_Val,Mo_Sold,Year_Sold,Sale_Type,Sale_Condition,Sale_Price,Longitude,Latitude
0,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,141,31770,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Corner,...,No_Fence,,0,5,2010,WD,Normal,215000,-93.619754,42.054035
1,One_Story_1946_and_Newer_All_Styles,Residential_High_Density,80,11622,Pave,No_Alley_Access,Regular,Lvl,AllPub,Inside,...,Minimum_Privacy,,0,6,2010,WD,Normal,105000,-93.619756,42.053014
2,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,81,14267,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Corner,...,No_Fence,Gar2,12500,6,2010,WD,Normal,172000,-93.619387,42.052659
3,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,93,11160,Pave,No_Alley_Access,Regular,Lvl,AllPub,Corner,...,No_Fence,,0,4,2010,WD,Normal,244000,-93.61732,42.051245
4,Two_Story_1946_and_Newer,Residential_Low_Density,74,13830,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Inside,...,Minimum_Privacy,,0,3,2010,WD,Normal,189900,-93.638933,42.060899


In [93]:
na_count = og_df.isnull().sum()

na_count = na_count[na_count > 0]

print(na_count)

Series([], dtype: int64)


In [94]:

# Splitting the data into training and testing sets without stratification
features_train, features_test, target_train, target_test = train_test_split(og_df.drop(columns=["Sale_Price"]), og_df["Sale_Price"], test_size=0.3333, random_state=42)

#drop column misc_feature beacuse misc_value contains the exact same information
features_train = features_train.drop(columns=["Misc_Feature"])
features_test = features_test.drop(columns=["Misc_Feature"])

features_train[['Mas_Vnr_Type', 'Mas_Vnr_Area']] = features_train[['Mas_Vnr_Type', 'Mas_Vnr_Area']].fillna('NA')
features_test[['Mas_Vnr_Type', 'Mas_Vnr_Area']] = features_test[['Mas_Vnr_Type', 'Mas_Vnr_Area']].fillna('NA')



In [95]:
# seperate to numerical and categorical columns
categorical_columns = features_train.select_dtypes(include = 'object').columns
numerical_columns = features_train.select_dtypes(include = 'number').columns

print(f'Categorical Count: {len(categorical_columns)} \n Numeric Count: {len(numerical_columns)}')

Categorical Count: 45 
 Numeric Count: 34


In [96]:
insignificant_columns = []


In [97]:
from scipy.stats import chi2_contingency

# Function to perform the Chi-Square test for categorical variables
def chi2_test(columns, features, target):
    results = {}
    for var in columns:
        contingency_table = pd.crosstab(features[var], target)
        chi2, p, dof, expected  = chi2_contingency(contingency_table)
        results[var] = {'Chi2': chi2, 'p-value': p}
        
    return results

# Store the results of the Chi-Square test for each categorical variable
results_ws = chi2_test(categorical_columns, features_train[categorical_columns], target_train)

for var, result in results_ws.items():
    print(f"Chi-Square Test for {var} and Sale_Price:\n  Chi2 = {result['Chi2']}, p-value = {result['p-value']} \n")
    
    # If the p-value is greater than 0.05, add the variable to the list of columns to remove
    if result['p-value'] > 0.05:
        insignificant_columns.append(var)


insignificant_columns


Chi-Square Test for MS_SubClass and Sale_Price:
  Chi2 = 10563.990692907704, p-value = 0.9999738059421928 

Chi-Square Test for MS_Zoning and Sale_Price:
  Chi2 = 7709.2558239148475, p-value = 2.391936563041048e-142 

Chi-Square Test for Street and Sale_Price:
  Chi2 = 1526.413815867612, p-value = 2.5626771763513798e-48 

Chi-Square Test for Alley and Sale_Price:
  Chi2 = 1490.968397544293, p-value = 0.9681146445359857 

Chi-Square Test for Lot_Shape and Sale_Price:
  Chi2 = 2851.8699667072547, p-value = 1.6038098441232264e-10 

Chi-Square Test for Land_Contour and Sale_Price:
  Chi2 = 2582.6200253261413, p-value = 0.0033658900421201887 

Chi-Square Test for Utilities and Sale_Price:
  Chi2 = 1369.9853846153846, p-value = 0.999983969727973 

Chi-Square Test for Lot_Config and Sale_Price:
  Chi2 = 3294.62361361067, p-value = 0.09191992032821766 

Chi-Square Test for Land_Slope and Sale_Price:
  Chi2 = 1775.2697997875187, p-value = 0.0009419814485799178 

Chi-Square Test for Neighborhood

['MS_SubClass',
 'Alley',
 'Utilities',
 'Lot_Config',
 'Condition_1',
 'Bldg_Type',
 'House_Style',
 'Roof_Style',
 'Roof_Matl',
 'Exterior_2nd',
 'Mas_Vnr_Type',
 'Bsmt_Cond',
 'BsmtFin_Type_1',
 'BsmtFin_Type_2',
 'Garage_Cond',
 'Paved_Drive',
 'Pool_QC',
 'Fence']

In [None]:
from scipy.stats import pearsonr

# Calculate the Pearson correlation and p-values for each numeric column
correlation_pvalues = {}
for col in numerical_columns:
    corr, p_value = pearsonr(features_train[col], target_train)
    correlation_pvalues[col] = (corr, p_value)

# Display the correlation values and p-values
for col, (corr, p_value) in correlation_pvalues.items():
    print(f"{col}: correlation = {corr}, p-value = {p_value}")

# Add columns with p-value over 0.05 to the list of insignificant columns
for col, (corr, p_value) in correlation_pvalues.items():
    if p_value > 0.05:
        insignificant_columns.append(col)


insignificant_columns



Lot_Frontage: correlation = 0.18632989078798812, p-value = 1.0240298738132427e-16
Lot_Area: correlation = 0.2603638891730973, p-value = 1.2564134838812876e-31
Year_Built: correlation = 0.5382432636436448, p-value = 4.011049385730871e-147
Year_Remod_Add: correlation = 0.5149467084584333, p-value = 1.0223848778826575e-132
Mas_Vnr_Area: correlation = 0.49745476976129654, p-value = 1.2978966892573367e-122
BsmtFin_SF_1: correlation = -0.1497123735043271, p-value = 2.941399310067601e-11
BsmtFin_SF_2: correlation = 0.038367922732731735, p-value = 0.09005209688592127
Bsmt_Unf_SF: correlation = 0.17230489925961964, p-value = 1.7583996840329718e-14
Total_Bsmt_SF: correlation = 0.6159422025790822, p-value = 2.3180027931610457e-204
First_Flr_SF: correlation = 0.6079511962524037, p-value = 9.885220386016263e-198
Second_Flr_SF: correlation = 0.290080191243597, p-value = 3.5736693212943664e-39
Low_Qual_Fin_SF: correlation = -0.013695013051843192, p-value = 0.5452711569672127
Gr_Liv_Area: correlation 

['MS_SubClass',
 'Alley',
 'Utilities',
 'Lot_Config',
 'Condition_1',
 'Bldg_Type',
 'House_Style',
 'Roof_Style',
 'Roof_Matl',
 'Exterior_2nd',
 'Mas_Vnr_Type',
 'Bsmt_Cond',
 'BsmtFin_Type_1',
 'BsmtFin_Type_2',
 'Garage_Cond',
 'Paved_Drive',
 'Pool_QC',
 'Fence',
 'BsmtFin_SF_2',
 'Low_Qual_Fin_SF',
 'Bsmt_Half_Bath',
 'Three_season_porch',
 'Misc_Val',
 'Mo_Sold']

In [99]:
#select only relevant columns

features_test = features_test.drop(insignificant_columns, axis=1)
features_train = features_train.drop(insignificant_columns, axis=1)

categorical_columns = list(set(categorical_columns)-set(insignificant_columns))
numerical_columns = list(set(numerical_columns)-set(insignificant_columns))

In [103]:
features_test

Unnamed: 0,MS_Zoning,Lot_Frontage,Lot_Area,Street,Lot_Shape,Land_Contour,Land_Slope,Neighborhood,Condition_2,Overall_Qual,...,Wood_Deck_SF,Open_Porch_SF,Enclosed_Porch,Screen_Porch,Pool_Area,Year_Sold,Sale_Type,Sale_Condition,Longitude,Latitude
1357,5,0,5100,1,2,3,0,18,2,8,...,192,63,0,0,0,2008,7,4,-93.621065,42.029038
2367,5,21,1890,1,2,3,0,2,2,0,...,0,0,0,0,0,2006,7,4,-93.627103,42.051798
2822,4,62,7162,1,2,3,0,7,2,5,...,168,57,0,0,0,2006,7,4,-93.672379,42.018990
2126,4,60,8070,1,2,3,0,5,2,2,...,0,0,0,0,0,2007,7,4,-93.692415,42.019028
1544,5,50,7000,1,2,3,0,10,2,0,...,0,0,116,0,0,2008,7,4,-93.628409,42.022607
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016,5,51,6120,1,2,3,0,3,2,1,...,25,0,0,0,0,2007,7,4,-93.621504,42.032505
2681,5,51,6120,1,2,3,0,3,2,1,...,0,0,55,0,0,2006,7,4,-93.622415,42.032418
2129,4,43,12352,1,1,3,0,5,2,5,...,0,74,0,0,0,2007,7,4,-93.688920,42.017826
969,4,75,12518,1,3,3,0,12,2,1,...,144,0,0,0,0,2009,7,4,-93.604264,41.993540


In [101]:
features_train

Unnamed: 0,MS_Zoning,Lot_Frontage,Lot_Area,Street,Lot_Shape,Land_Contour,Land_Slope,Neighborhood,Condition_2,Overall_Qual,...,Wood_Deck_SF,Open_Porch_SF,Enclosed_Porch,Screen_Porch,Pool_Area,Year_Sold,Sale_Type,Sale_Condition,Longitude,Latitude
1498,Residential_Low_Density,313,63887,Pave,Irregular,Bnk,Gtl,Edwards,Norm,Very_Excellent,...,214,292,0,0,480,2008,New,Partial,-93.674898,42.016804
1766,Residential_Low_Density,83,10790,Pave,Regular,Lvl,Gtl,Northridge,Norm,Good,...,120,38,0,0,0,2007,WD,Normal,-93.649901,42.053467
2192,Residential_Low_Density,60,8172,Pave,Regular,HLS,Gtl,Edwards,Norm,Below_Average,...,196,0,0,0,0,2007,COD,Family,-93.663455,42.018730
192,Residential_Low_Density,0,7793,Pave,Slightly_Irregular,Bnk,Gtl,Brookside,Norm,Good,...,0,0,60,0,0,2010,WD,Normal,-93.625825,42.030187
801,Residential_Low_Density,79,12327,Pave,Slightly_Irregular,Low,Mod,Sawyer_West,Norm,Very_Good,...,349,40,0,0,0,2009,WD,Normal,-93.685986,42.031845
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1638,Residential_Low_Density,195,41600,Pave,Slightly_Irregular,Lvl,Gtl,Gilbert,Norm,Average,...,144,0,0,0,0,2007,WD,Normal,-93.622874,42.060096
1095,Residential_Low_Density,0,7750,Pave,Regular,Lvl,Gtl,Gilbert,Norm,Good,...,100,35,0,0,0,2008,WD,Normal,-93.643632,42.059507
1130,Floating_Village_Residential,64,8791,Pave,Slightly_Irregular,Lvl,Gtl,Somerset,Norm,Above_Average,...,216,56,0,0,0,2008,WD,Normal,-93.639662,42.050899
1294,Residential_Medium_Density,81,7308,Pave,Regular,Lvl,Gtl,Old_Town,Norm,Average,...,0,0,236,0,0,2008,WD,Normal,-93.620219,42.030482


In [109]:
categorical_columns

['Bsmt_Exposure',
 'Bsmt_Qual',
 'Central_Air',
 'Condition_2',
 'Electrical',
 'Exter_Cond',
 'Exter_Qual',
 'Exterior_1st',
 'Fireplace_Qu',
 'Foundation',
 'Functional',
 'Garage_Finish',
 'Garage_Qual',
 'Garage_Type',
 'Heating',
 'Heating_QC',
 'Kitchen_Qual',
 'Land_Contour',
 'Land_Slope',
 'Lot_Shape',
 'MS_Zoning',
 'Neighborhood',
 'Overall_Cond',
 'Overall_Qual',
 'Sale_Condition',
 'Sale_Type',
 'Street']

In [105]:
numerical_columns

['Bsmt_Unf_SF',
 'Screen_Porch',
 'Enclosed_Porch',
 'Garage_Cars',
 'Open_Porch_SF',
 'Latitude',
 'Lot_Area',
 'Gr_Liv_Area',
 'Pool_Area',
 'Total_Bsmt_SF',
 'Bsmt_Full_Bath',
 'Year_Remod_Add',
 'TotRms_AbvGrd',
 'Bedroom_AbvGr',
 'Garage_Area',
 'Lot_Frontage',
 'Mas_Vnr_Area',
 'Half_Bath',
 'Kitchen_AbvGr',
 'Year_Sold',
 'Wood_Deck_SF',
 'Longitude',
 'Year_Built',
 'Full_Bath',
 'Fireplaces',
 'First_Flr_SF',
 'Second_Flr_SF',
 'BsmtFin_SF_1']

# 2. Transformation

In [None]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

for col in categorical_columns:
    features_train[col] = label_encoder.fit_transform(features_train[col])
    features_test[col] = label_encoder.fit_transform(features_test[col])


print(features_train[categorical_columns].head())
print(features_test[categorical_columns].head())

      Functional  Lot_Shape  Overall_Cond  Garage_Finish  Exter_Cond  \
1498           7          0             1              0           4   
1766           7          2             1              0           4   
2192           7          2             1              3           4   
192            7          3             5              3           4   
801            3          3             7              0           4   

      Neighborhood  Street  Foundation  Overall_Qual  Kitchen_Qual  ...  \
1498             7       1           2             7             0  ...   
1766            17       1           2             5             2  ...   
2192             7       1           1             2             3  ...   
192              3       1           0             5             2  ...   
801             22       1           1             8             2  ...   

      Land_Slope  Fireplace_Qu  Condition_2  Sale_Condition  Sale_Type  \
1498           0             2            