In [174]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from scipy.stats import chi2_contingency
import seaborn as sns

# 1. Preprocessing & Exploration

In [175]:
# Read the Ames housing data and print a summary of its columns,
# non-null counts and dtypes.
og_df = pd.read_csv(filepath_or_buffer='../AmesHousingForecast/ames.csv')
og_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2930 entries, 0 to 2929
Data columns (total 81 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   MS_SubClass         2930 non-null   object 
 1   MS_Zoning           2930 non-null   object 
 2   Lot_Frontage        2930 non-null   int64  
 3   Lot_Area            2930 non-null   int64  
 4   Street              2930 non-null   object 
 5   Alley               2930 non-null   object 
 6   Lot_Shape           2930 non-null   object 
 7   Land_Contour        2930 non-null   object 
 8   Utilities           2930 non-null   object 
 9   Lot_Config          2930 non-null   object 
 10  Land_Slope          2930 non-null   object 
 11  Neighborhood        2930 non-null   object 
 12  Condition_1         2930 non-null   object 
 13  Condition_2         2930 non-null   object 
 14  Bldg_Type           2930 non-null   object 
 15  House_Style         2930 non-null   object 
 16  Overal

In [176]:
# Preview the first five rows of the raw frame.
og_df.head(n=5)

Unnamed: 0,MS_SubClass,MS_Zoning,Lot_Frontage,Lot_Area,Street,Alley,Lot_Shape,Land_Contour,Utilities,Lot_Config,...,Fence,Misc_Feature,Misc_Val,Mo_Sold,Year_Sold,Sale_Type,Sale_Condition,Sale_Price,Longitude,Latitude
0,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,141,31770,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Corner,...,No_Fence,,0,5,2010,WD,Normal,215000,-93.619754,42.054035
1,One_Story_1946_and_Newer_All_Styles,Residential_High_Density,80,11622,Pave,No_Alley_Access,Regular,Lvl,AllPub,Inside,...,Minimum_Privacy,,0,6,2010,WD,Normal,105000,-93.619756,42.053014
2,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,81,14267,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Corner,...,No_Fence,Gar2,12500,6,2010,WD,Normal,172000,-93.619387,42.052659
3,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,93,11160,Pave,No_Alley_Access,Regular,Lvl,AllPub,Corner,...,No_Fence,,0,4,2010,WD,Normal,244000,-93.61732,42.051245
4,Two_Story_1946_and_Newer,Residential_Low_Density,74,13830,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Inside,...,Minimum_Privacy,,0,3,2010,WD,Normal,189900,-93.638933,42.060899


In [177]:
# Count missing values per column and keep only the columns that have any.
na_count = og_df.isna().sum().loc[lambda counts: counts > 0]

print(na_count)

Mas_Vnr_Type    1775
Misc_Feature    2824
dtype: int64


In [178]:
# Fill missing values on a copy so the raw frame (og_df) stays untouched.
df = og_df.copy()

# Only the columns reported as having nulls by the missing-value count are
# filled. Mas_Vnr_Area is deliberately left alone: it has no missing values,
# and filling a numeric column with the string 'None' would turn it into an
# object column.
# NOTE(review): presumably these NaNs are the literal category "None" that
# pandas treats as a missing-value marker when reading the CSV - confirm.
columns_with_missing = ['Mas_Vnr_Type', 'Misc_Feature']
df[columns_with_missing] = df[columns_with_missing].fillna('None')

In [179]:
# Separate the columns into numerical and categorical (object dtype) groups.
numerical_columns = df.select_dtypes(include='number').columns
categorical_columns = df.select_dtypes(include='object').columns

print(f'Ordinal Count: {categorical_columns.size} \n Numeric Count: {numerical_columns.size}')

Ordinal Count: 46 
 Numeric Count: 35


In [180]:
# Running list of column names to discard; later cells add to it.
insignificant_columns = list()

In [181]:
def chi2_test(columns, features, target):
    """Run a chi-square test of independence between each column and the target.

    For every name in ``columns`` a contingency table of ``features[name]``
    against ``target`` is built with ``pd.crosstab`` and passed to
    ``chi2_contingency``. Every distinct value of ``target`` becomes its own
    column in that table.

    Parameters
    ----------
    columns : iterable of str
        Names of the columns in ``features`` to test.
    features : pandas.DataFrame
        Frame holding the columns to test.
    target : pandas.Series
        Values to cross-tabulate each column against.

    Returns
    -------
    dict
        Maps each column name to ``{'Chi2': statistic, 'p-value': p}``.
    """
    def _summarise(column):
        # Only the statistic and the p-value are kept; the degrees of freedom
        # and the expected frequencies are not needed by the callers.
        table = pd.crosstab(features[column], target)
        statistic, p_value = chi2_contingency(table)[:2]
        return {'Chi2': statistic, 'p-value': p_value}

    return {column: _summarise(column) for column in columns}

# Store the results of the Chi-Square test for each categorical variable.
# The column list is `categorical_columns` (defined in the cell that splits
# the columns by dtype); the previously used name `ordinal_columns` is never
# defined in this notebook and fails on a fresh kernel.
results_ws = chi2_test(categorical_columns, df[categorical_columns], df['Sale_Price'])


for var, result in results_ws.items():
    print(f"Chi-Square Test for {var} and Sale_Price:\n  Chi2 = {result['Chi2']}, p-value = {result['p-value']} \n")

    # If the p-value is greater than 0.05, add the variable to the list of columns to remove.
    # The membership check keeps the list free of duplicates when this cell is re-run.
    if result['p-value'] > 0.05 and var not in insignificant_columns:
        insignificant_columns.append(var)

Chi-Square Test for MS_SubClass and Sale_Price:
  Chi2 = 16697.585326401397, p-value = 4.120176321886276e-12 

Chi-Square Test for MS_Zoning and Sale_Price:
  Chi2 = 9804.663989198862, p-value = 9.452315281895848e-170 

Chi-Square Test for Street and Sale_Price:
  Chi2 = 2050.887635040308, p-value = 5.6641780213080595e-70 

Chi-Square Test for Alley and Sale_Price:
  Chi2 = 1965.7318979066945, p-value = 0.9348140873374339 

Chi-Square Test for Lot_Shape and Sale_Price:
  Chi2 = 3576.816704870359, p-value = 2.2392220206735227e-09 

Chi-Square Test for Land_Contour and Sale_Price:
  Chi2 = 3342.175495340217, p-value = 0.0009832578467397724 

Chi-Square Test for Utilities and Sale_Price:
  Chi2 = 1802.7526399975156, p-value = 0.9999871897161289 

Chi-Square Test for Lot_Config and Sale_Price:
  Chi2 = 4133.560336488827, p-value = 0.45520290717888945 

Chi-Square Test for Land_Slope and Sale_Price:
  Chi2 = 2147.5224599116864, p-value = 0.0927329574128246 

Chi-Square Test for Neighborhood

In [182]:
# Correlation of every numerical column with Sale_Price, highest value first.
corr = (
    df[numerical_columns]
    .corr()
    .loc[:, 'Sale_Price']
    .sort_values(ascending=False)
)


In [183]:
# Columns whose absolute correlation with Sale_Price is below 0.3,
# never including Sale_Price itself.
low_corr_columns = [
    name
    for name in corr[corr.abs() < 0.3].index
    if name != 'Sale_Price'
]

# Add them to the running list of columns to discard.
insignificant_columns += low_corr_columns

In [184]:
# Display the columns collected for removal so far.
insignificant_columns

['Alley',
 'Utilities',
 'Lot_Config',
 'Land_Slope',
 'Condition_1',
 'Bldg_Type',
 'House_Style',
 'Roof_Style',
 'Roof_Matl',
 'Mas_Vnr_Type',
 'BsmtFin_Type_2',
 'Garage_Cond',
 'Paved_Drive',
 'Pool_QC',
 'Fence',
 'Misc_Feature',
 'Latitude',
 'Half_Bath',
 'Bsmt_Full_Bath',
 'Second_Flr_SF',
 'Lot_Area',
 'Lot_Frontage',
 'Bsmt_Unf_SF',
 'Bedroom_AbvGr',
 'Screen_Porch',
 'Pool_Area',
 'Mo_Sold',
 'Three_season_porch',
 'BsmtFin_SF_2',
 'Misc_Val',
 'Year_Sold',
 'Bsmt_Half_Bath',
 'Low_Qual_Fin_SF',
 'Kitchen_AbvGr',
 'Enclosed_Porch',
 'BsmtFin_SF_1',
 'Longitude']

In [185]:
# Remove the insignificant columns from both column indexes; names that are
# not present in an index are simply ignored by the mask.
numerical_columns = numerical_columns[~numerical_columns.isin(insignificant_columns)]
categorical_columns = categorical_columns[~categorical_columns.isin(insignificant_columns)]

# Drop the same columns from the working frame.
df = df.drop(columns=insignificant_columns)

# 2. Transformation

In [186]:
from sklearn.preprocessing import LabelEncoder

# One fitted encoder is kept per column so the integer codes can be mapped
# back to the original labels later (label_encoders[col].classes_ or
# label_encoders[col].inverse_transform). Refitting a single shared encoder
# would overwrite its classes_ on every column.
# NOTE(review): LabelEncoder numbers the labels in sorted order, so the codes
# do not follow any quality ranking of ordinal features - confirm that this
# is acceptable for the downstream model.
label_encoders = {}

# Kept for backward compatibility with later cells: after the loop this is
# the encoder fitted on the last encoded column, as before.
label_endcoder = LabelEncoder()

for col in categorical_columns:
    if col in df.columns:
        label_endcoder = LabelEncoder()
        df[col] = label_endcoder.fit_transform(df[col])
        label_encoders[col] = label_endcoder

df[categorical_columns].head()

Unnamed: 0,MS_SubClass,MS_Zoning,Street,Lot_Shape,Land_Contour,Neighborhood,Condition_2,Overall_Qual,Overall_Cond,Exterior_1st,...,Central_Air,Electrical,Kitchen_Qual,Functional,Fireplace_Qu,Garage_Type,Garage_Finish,Garage_Qual,Sale_Type,Sale_Condition
0,2,5,1,3,3,15,2,0,1,3,...,1,4,4,7,2,0,0,5,9,4
1,2,4,1,2,3,15,2,1,0,13,...,1,4,4,7,3,0,3,5,9,4
2,2,5,1,3,3,15,2,0,0,14,...,1,4,2,7,3,0,3,5,9,4
3,2,5,1,2,3,15,2,5,1,3,...,1,4,0,7,5,0,0,5,9,4
4,13,5,1,3,3,8,2,1,1,13,...,1,4,4,7,5,0,0,5,9,4


In [187]:
# Hold out a 0.3333 fraction of the rows for testing; the fixed random_state
# makes the split repeatable.
train, test = train_test_split(df, random_state=42, test_size=0.3333)

print(f'Train Count: {train.shape[0]} \n Test Count: {test.shape[0]}')

Train Count: 1953 
 Test Count: 977
