In [472]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [473]:
# Load the two CSV files into DataFrames.
# NOTE(review): the variable names look swapped relative to the file names --
# `houses` is read from test.csv while `houses_test` is read from train.csv
# (the recorded output below shows `houses_test` is the frame that carries
# the extra `SalePrice` column). Confirm this is intended before modelling.
# NOTE(review): these are absolute, machine-specific paths; the notebook will
# not run on another machine without editing them.
houses = pd.read_csv('/home/satire/PycharmProjects/Statistics/csv/test.csv')
houses_test = pd.read_csv('/home/satire/PycharmProjects/Statistics/csv/train.csv')

In [474]:
# Report (rows, columns) for both frames, one per line, in a single call.
print(
    f"Shape of houses dataset: {houses.shape}",
    f"Shape of houses_test dataset: {houses_test.shape}",
    sep="\n",
)

Shape of houses dataset: (1459, 80)
Shape of houses_test dataset: (1460, 81)


In [475]:
def handle_missing_values(df, threshold=0.70):
    """Drop the ``Id`` column and every column that is mostly missing.

    The caller's DataFrame is not modified; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. An ``Id`` column is removed when present.
    threshold : float, default 0.70
        A column is dropped when its fraction of null values is strictly
        greater than this value.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without ``Id`` and without the mostly-missing columns.

    Side effects
    ------------
    Prints the dropped column labels and the per-column null counts that
    remain afterwards.
    """
    # errors='ignore' keeps the function re-runnable: calling it again on a
    # frame whose 'Id' column was already removed (e.g. re-executing the cell
    # that does `houses = handle_missing_values(houses)`) must not raise.
    df = df.drop(columns=['Id'], errors='ignore')

    # Fraction of nulls per column, highest first so the printed index is
    # ordered from most to least missing.
    percent_missing = df.isnull().mean().sort_values(ascending=False)

    missing_features = percent_missing[percent_missing > threshold].index
    df = df.drop(columns=missing_features)

    print(f"Missing features dropped: {missing_features}")
    print(f"Remaining missing values:\n{df.isnull().sum()}")

    return df

In [476]:
# Clean both frames with the default threshold; `houses` is processed first,
# then `houses_test`, so the printed reports appear in that order.
houses, houses_test = (
    handle_missing_values(houses),
    handle_missing_values(houses_test),
)

Missing features dropped: Index(['PoolQC', 'MiscFeature', 'Alley', 'Fence'], dtype='object')
Remaining missing values:
MSSubClass         0
MSZoning           4
LotFrontage      227
LotArea            0
Street             0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           1
SaleCondition      0
Length: 75, dtype: int64
Missing features dropped: Index(['PoolQC', 'MiscFeature', 'Alley', 'Fence'], dtype='object')
Remaining missing values:
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
Street             0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 76, dtype: int64


In [477]:
def fill_numerical_missing(df):
    """Fill nulls in numeric columns with that column's mean.

    Works on a copy, so the caller's DataFrame is left untouched; use the
    return value (``df = fill_numerical_missing(df)``).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. Non-numeric columns are passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        New frame in which every numeric column has its nulls replaced by
        the column mean. A numeric column that is entirely null stays null,
        because its mean is itself NaN.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns

    # Copy first so re-running a cell never sees a half-modified input frame.
    filled = df.copy()
    if len(numeric_cols) == 0:
        # Nothing to impute.
        return filled

    numeric_part = filled[numeric_cols]
    filled[numeric_cols] = numeric_part.fillna(numeric_part.mean())
    return filled

In [478]:
# Impute the numeric columns of both frames and rebind the results.
houses, houses_test = (
    fill_numerical_missing(houses),
    fill_numerical_missing(houses_test),
)

In [479]:
# List the numeric column labels of each frame, one report per line.
print(
    f"Numerical Variables in houses: {houses.select_dtypes(include=[np.number]).columns}",
    f"Numerical Variables in houses_test: {houses_test.select_dtypes(include=[np.number]).columns}",
    sep="\n",
)

Numerical Variables in houses: Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold'],
      dtype='object')
Numerical Variables in houses_test: Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAb

In [480]:
# Keep only the non-numeric columns of each frame (same selection applied to
# `houses` first, then `houses_test`).
houses_categorical, houses_categorical_test = (
    frame.select_dtypes(exclude=[np.number]) for frame in (houses, houses_test)
)

In [481]:
# List the non-numeric column labels of each frame, one report per line.
print(
    f"Categorical Variables in houses: {houses_categorical.columns}",
    f"Categorical Variables in houses_test: {houses_categorical_test.columns}",
    sep="\n",
)

Categorical Variables in houses: Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition'],
      dtype='object')
Categorical Variables in houses_test: Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 