In [None]:
# !pip install pandas
# !pip install numpy
# !pip install matplotlib
# !pip install seaborn
# !pip install scikit-learn

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Load the dataset

In [None]:
# Load the data
df = pd.read_csv("../data/train.csv")

In [None]:
# Import dataProcessor class from data_processor.py file
from data_processor import DataProcessor

data_processor = DataProcessor() 
df_num = data_processor.numerical_data(df)
df_ord = data_processor.ordinal_data(df)

In [None]:
df_num.head()

In [56]:
from sklearn.preprocessing import OrdinalEncoder

# Encode the ordinal data
encoder = OrdinalEncoder()
encoded_data = encoder.fit_transform(df_ord)

# Convert the encoded data back to a DataFrame
encoded_ord_df = pd.DataFrame(encoded_data, columns=df_ord.columns)

# Add salesPrice column to the encoded_ord_df
encoded_ord_df['SalePrice'] = df_num['SalePrice']

encoded_ord_df.head()


Unnamed: 0,LandSlope,BsmtQual,BsmtCond,BsmtFinType1,BsmtFinType2,HeatingQC,Electrical,KitchenQual,Functional,FireplaceQu,GarageQual,GarageCond,PoolQC,SalePrice
0,0.0,2.0,3.0,2.0,5.0,0.0,4.0,2.0,6.0,,4.0,4.0,,208500
1,0.0,2.0,3.0,0.0,5.0,0.0,4.0,3.0,6.0,4.0,4.0,4.0,,181500
2,0.0,2.0,3.0,2.0,5.0,0.0,4.0,2.0,6.0,4.0,4.0,4.0,,223500
3,0.0,3.0,1.0,0.0,5.0,2.0,4.0,2.0,6.0,2.0,4.0,4.0,,140000
4,0.0,2.0,3.0,2.0,5.0,0.0,4.0,2.0,6.0,4.0,4.0,4.0,,250000


Analysis of **Numerical** data
1. Use correlation matrix to find top features.
2. Use pairplot to visualize them.1

In [None]:
# Find the correlation between the numerical data and the target
correlation = df_num.corr()
correlation = correlation['SalePrice'].sort_values(ascending=False)
correlation = correlation.drop('SalePrice')
print(correlation)


In [None]:
# top features with high correlation with the target
threshold = 0.4
top_features = correlation[correlation > threshold]
print("Features with correlation higher than 0.4: ")
print(top_features)
print("\n")

# Check for multicollinearity among the top features
corr_matrix_top_features = df_num[top_features.index].corr()
plt.figure(figsize=(10, 10))
sns.heatmap(corr_matrix_top_features, annot=True)
plt.show()

In [None]:
# Drop one of the highly correlated features with one another,correration > 0.8
high_corr_feature_pairs = []
for i in range(len(top_features)):
    for j in range(i+1, len(top_features)):
        if abs(corr_matrix_top_features.iloc[i, j]) > 0.8:
            high_corr_feature_pairs.append((top_features.index[i], top_features.index[j]))
print("Highly correlated feature pairs: ", high_corr_feature_pairs)
print("\n")

# Randomly drop one of the features in the highly correlated pairs
import random
for pair in high_corr_feature_pairs:
    drop_feature = random.choice(pair)
    if drop_feature in top_features:
        top_features = top_features.drop(drop_feature)

print("Features with correlation higher than 0.8 after dropping highly correlated features: ")
print(top_features)


In [None]:
# Now we plot pairplot for top features
sns.pairplot(df_num[top_features.index])
plt.show()

Analysis of **ordinal** data
1. Use correlation matrix to find top features.
2. Use pairplot to visualize them.

In [57]:
# Analysis of ordinal data
# Find the correlation between the ordinal data and the target
ord_correlation = encoded_ord_df.corr()
ord_correlation = ord_correlation['SalePrice'].sort_values(ascending=False)
ord_correlation = ord_correlation.drop('SalePrice')
print(ord_correlation)

Electrical      0.234990
GarageCond      0.150576
Functional      0.115328
GarageQual      0.115119
BsmtCond        0.060362
LandSlope       0.051152
BsmtFinType2    0.041161
BsmtFinType1   -0.064531
FireplaceQu    -0.166817
HeatingQC      -0.400178
KitchenQual    -0.589189
PoolQC         -0.595229
BsmtQual       -0.611179
Name: SalePrice, dtype: float64


In [None]:
# top ordinal features with high correlation with the target
top_ord_features = ord_correlation.head(5) # top 5 ordinal features
print("Top 5 ordinal features: ")
print(top_ord_features)
print("\n")

# Check for multicollinearity among the top ordinal features
ord_corr_matrix_top_features = encoded_ord_df[top_ord_features.index].corr()
plt.figure(figsize=(10, 10))
sns.heatmap(ord_corr_matrix_top_features, annot=True)
plt.show()

Ordinal features with correlation higher than 0.4: 
Series([], Name: SalePrice, dtype: float64)




ValueError: zero-size array to reduction operation fmin which has no identity

<Figure size 1000x1000 with 0 Axes>

In [None]:
# Drop one of the highly correlated ordinal features with one another,correration > 0.8
ord_high_corr_feature_pairs = []
for i in range(len(top_ord_features)):
    for j in range(i+1, len(top_ord_features)):
        if abs(ord_corr_matrix_top_features.iloc[i, j]) > 0.8:
            ord_high_corr_feature_pairs.append((top_ord_features.index[i], top_ord_features.index[j]))
print("Highly correlated ordinal feature pairs: ")
print(ord_high_corr_feature_pairs)
print("\n")

# Randomly drop one of the features in the highly correlated pairs
import random
for pair in ord_high_corr_feature_pairs:
    drop_feature = random.choice(pair)
    if drop_feature in top_ord_features:
        top_ord_features = top_ord_features.drop(drop_feature)

print("Ordinal features with correlation higher than 0.8 after dropping highly correlated features: ")
print(top_ord_features)