In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.api import OLS

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms

%matplotlib inline

ModuleNotFoundError: No module named 'torch'

In [None]:
# Silence every warning category for the remainder of the session.
import warnings
warnings.filterwarnings(action="ignore")

In [None]:
# Notebook-wide configuration constants.
BASE_COLOR = sns.color_palette()[0]  # first colour of the active seaborn palette, reused for single-hue plots
TEST_SIZE = 0.2  # passed as test_size to train_test_split
RANDOM_STATE = 42  # seed passed to train_test_split
BATCH_SIZE = 32  # batch size of the PyTorch DataLoader
LR = 0.01  # learning rate handed to the Adam optimizer

# Load train and test Datasets

In [None]:
# Read the train and test CSV files from the local ./data directory.
train_df = pd.read_csv("./data/train.csv")
test_df = pd.read_csv("./data/test.csv")

In [None]:
# Preview the first rows of the training frame.
train_df.head()

# Wrangle train Dataset

## Dropping NaN values

Handle NaN values as follows:
 - if a column has more than 200 NaN entries, drop that column
 - if a column has 200 or fewer NaN entries:
   - if the column is numeric, fill the NaN values with the column `mean`
   - if the column is categorical, drop it

In [None]:
# Per-column NaN counts, keeping only the columns that actually have gaps.
missing_per_column = train_df.isna().sum()
nan_cols = missing_per_column.loc[missing_per_column > 0]
nan_cols

In [None]:
# Columns whose NaN count exceeds 200 are removed outright.
nan_cols_to_drop = nan_cols.loc[nan_cols.gt(200)]
train_df = train_df.drop(columns=nan_cols_to_drop.index.values)
train_df.head()

In [None]:
# Recount the gaps, then pick out the affected columns that hold text (object) data.
remaining_missing = train_df.isna().sum()
nan_cols = remaining_missing.loc[remaining_missing > 0]
columns_with_gaps = train_df[nan_cols.index.values]
non_numeric_nan_cols = columns_with_gaps.select_dtypes('object').columns
non_numeric_nan_cols

In [None]:
# Remove the text columns that still contain NaN values.
train_df = train_df.drop(columns=non_numeric_nan_cols)
train_df.head()

## Checking any duplicates

In [None]:
# Rows that exactly repeat an earlier row; an empty result means there are no duplicates.
duplicate_mask = train_df.duplicated()
train_df.loc[duplicate_mask]

**No Duplicated** row in the dataset

In [None]:
# Which columns still contain NaN values after the drops above?
still_missing = train_df.isna().sum()
nan_cols = still_missing.loc[still_missing > 0]
nan_cols

In [None]:
# Fill the two remaining numeric columns with their own column means.
# Assign the result back instead of calling `fillna(..., inplace=True)` on a
# selected column: that is chained assignment, which pandas deprecates and
# which silently leaves `train_df` unchanged under Copy-on-Write.
mean_imputed_cols = ['MasVnrArea', 'GarageYrBlt']
train_df[mean_imputed_cols] = train_df[mean_imputed_cols].fillna(
    train_df[mean_imputed_cols].mean()
)

nan_cols = train_df.isna().sum()
nan_cols = nan_cols[nan_cols > 0]

# Every NaN must be gone before modelling.
assert len(nan_cols) == 0

## Inspecting Categorical Columns

In [None]:
# Split the column names by dtype family: numbers vs. text (object).
numeric_cols = train_df.select_dtypes(include='number').columns
categorical_cols = train_df.select_dtypes(include='object').columns

In [None]:
# Display the names of the numeric columns.
numeric_cols

In [None]:
# Display the names of the categorical (object) columns.
categorical_cols

### MSZoning Analysis

In [None]:
# Share of rows per zoning class, largest first.
zoning_share = train_df['MSZoning'].value_counts(normalize=True)
zoning_share.sort_values(ascending=False)

**MSZoning** is heavily skewed towards the **Residential Low Density** and **Residential Medium Density** zoning types.

### HouseStyle Analysis

In [None]:
# Share of rows per house style, largest first.
style_share = train_df['HouseStyle'].value_counts(normalize=True)
style_share.sort_values(ascending=False)

**HouseStyle** is dominated by **One Story**, **Two Story** and **One and one-half story: 2nd level finished** (`1.5Fin`).

### Functional Analysis

In [None]:
# Share of rows per functionality rating, largest first.
functional_share = train_df['Functional'].value_counts(normalize=True)
functional_share.sort_values(ascending=False)

The **Functional** column is dominated by **Typical Functionality**.

### SaleCondition Analysis

In [None]:
# Share of rows per sale condition, largest first.
sale_condition_share = train_df['SaleCondition'].value_counts(normalize=True)
sale_condition_share.sort_values(ascending=False)

**SaleCondition** has large proportion in being **Normal**

### Heating Analysis

In [None]:
# Share of rows per heating type, largest first.
heating_share = train_df['Heating'].value_counts(normalize=True)
heating_share.sort_values(ascending=False)

The **Heating** column is dominated by **Gas forced warm air furnace**.

## Cleaning Categorical Columns

In [None]:
# Each indicator column flags rows whose source column equals one dominant level.
# Mapping: new indicator column -> (source column, level that maps to 1).
indicator_specs = {
    'zone_low_density': ('MSZoning', 'RL'),
    'one_story_style': ('HouseStyle', '1Story'),
    'two_story_style': ('HouseStyle', '2Story'),
    'one_half_story_style': ('HouseStyle', '1.5Fin'),
    'typical_functionality': ('Functional', 'Typ'),
    'sale_normal_condition': ('SaleCondition', 'Normal'),
    'gas_heating': ('Heating', 'GasA'),
}

for indicator_col, (source_col, level) in indicator_specs.items():
    train_df[indicator_col] = train_df[source_col] == level

# Convert the boolean flags to 0/1 integers.
cols = ['zone_low_density', 'one_story_style', 'two_story_style', 'typical_functionality',
        'one_half_story_style', 'gas_heating', 'sale_normal_condition']
train_df[cols] = train_df[cols].astype('int')

train_df[cols].head()

## Inspecting Numerical Columns

In [None]:
# Display the numeric column names (computed before the indicator columns were added).
numeric_cols

### MSSubClass Analysis
 - **20	1-STORY 1946 & NEWER ALL STYLES**
 - 30	1-STORY 1945 & OLDER
 - 40	1-STORY W/FINISHED ATTIC ALL AGES
 - 45	1-1/2 STORY - UNFINISHED ALL AGES
 - 50	1-1/2 STORY FINISHED ALL AGES
 - **60	2-STORY 1946 & NEWER**
 - 70	2-STORY 1945 & OLDER
 - 75	2-1/2 STORY ALL AGES
 - 80	SPLIT OR MULTI-LEVEL
 - 85	SPLIT FOYER
 - 90	DUPLEX - ALL STYLES AND AGES
 - 120	1-STORY PUD (Planned Unit Development) - 1946 & NEWER
 - 150	1-1/2 STORY PUD - ALL AGES
 - 160	2-STORY PUD - 1946 & NEWER
 - 180	PUD - MULTILEVEL - INCL SPLIT LEV/FOYER
 - 190	2 FAMILY CONVERSION - ALL STYLES AND AGES
 
 These values are class codes rather than quantities, so this column must be converted to a **categorical column**.

In [None]:
# MSSubClass holds dwelling-type codes, so store it as a categorical (object) column.
train_df['MSSubClass'] = train_df['MSSubClass'].astype('object')

# normalize=True yields shares rather than raw counts, so the title and
# y-axis are labelled as proportions (they previously said "Count").
train_df['MSSubClass'].value_counts(normalize=True).plot(kind='bar')
plt.title("Proportion of each House Class")
plt.xlabel('Class')
plt.ylabel('Proportion');

### YearBuilt Analysis

In [None]:
# Bar chart of construction years that occur more than 20 times.
year_built_counts = train_df['YearBuilt'].value_counts()
frequent_years = year_built_counts[year_built_counts > 20]
frequent_years.plot(kind='bar')

### YearRemodAdd Analysis

In [None]:
# Years between construction and the recorded remodel.
remodel_gap = train_df['YearRemodAdd'].sub(train_df['YearBuilt'])
train_df['time_taken_to_remodel'] = remodel_gap
train_df['time_taken_to_remodel'].describe()

In [None]:
# Years between construction and sale.
built_to_sold = train_df['YrSold'].sub(train_df['YearBuilt'])
train_df['time_taken_to_sell'] = built_to_sold
train_df['time_taken_to_sell'].describe()

In [None]:
# Years between the recorded remodel and sale.
remodel_to_sold = train_df['YrSold'].sub(train_df['YearRemodAdd'])
train_df['time_taken_to_sell_after_remodel'] = remodel_to_sold
train_df['time_taken_to_sell_after_remodel'].describe()

On average, a house was sold **36 years** after it was built and **22 years** after it was last **remodeled**.

### LotArea Analysis

In [None]:
# Histogram of lot area, zoomed to 0-50,000 so the long right tail does not flatten the bulk.
lot_ax = train_df['LotArea'].plot(kind='hist', bins=50)
lot_ax.set_title("Dist. of Lot Area")
lot_ax.set_xlabel("Lot Area")
lot_ax.set_ylabel('Count')
lot_ax.set_xlim([0, 50_000]);

lot_area_summary = train_df['LotArea'].describe()
print(lot_area_summary)


Most of the houses have a lot area of around **5,000** to **15,000** square feet.

### TotalBsmtSF

In [None]:
# Summary statistics followed by a histogram of basement area, zoomed to 0-3000.
basement_summary = train_df['TotalBsmtSF'].describe()
print(basement_summary)

basement_ax = train_df['TotalBsmtSF'].plot(kind='hist', bins=50)
basement_ax.set_title("Dist. of Total Basment Area")
basement_ax.set_xlabel("Basment Area")
basement_ax.set_ylabel('Count')
basement_ax.set_xlim([0, 3000]);

Most of the houses have a basement area between **500** and **1500** square feet.

### Bedroom and Bathroom Analysis

In [None]:
# Number of houses for each BedroomAbvGr value.
train_df['BedroomAbvGr'].value_counts()

In [None]:
# Number of houses for each FullBath value.
train_df['FullBath'].value_counts()

Most of the houses have **3, 2 or 4** bedrooms above grade
and **2 or 1** full bathrooms above grade.

### OverallQual and OverallCond Analysis

In [None]:
# Bar chart: number of houses per overall-quality rating.
quality_counts = train_df['OverallQual'].value_counts()
quality_ax = quality_counts.plot(kind='bar')
quality_ax.set_title("Quality of the House")
quality_ax.set_xlabel("Quality Rating")
quality_ax.set_ylabel("Count");

In [None]:
# Bar chart: number of houses per overall-condition rating.
condition_counts = train_df['OverallCond'].value_counts()
condition_ax = condition_counts.plot(kind='bar')
condition_ax.set_title("Condition of the House")
condition_ax.set_xlabel("Condition Rating")
condition_ax.set_ylabel("Count");

Most of the houses have condition and quality ratings of around **5 or 6**.

### SalePrice Analysis

In [None]:
# Summary statistics followed by a 30-bin histogram of the target.
price_summary = train_df['SalePrice'].describe()
print(price_summary)

price_ax = train_df['SalePrice'].plot(kind='hist', bins=30)
price_ax.set_title("Price of House")
price_ax.set_xlabel("Price")
price_ax.set_ylabel("Count");

Most houses have a price between **100,000** and **250,000** US dollars.

In [None]:
# Box plot of the target to expose high-price outliers.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=train_df, x='SalePrice', ax=ax)
ax.set_title("House Price Dist");

### Correlation Matrix of the Dataset

In [None]:
# Pairwise correlation is only defined for numeric columns. Select them
# explicitly: pandas >= 2.0 raises on object columns instead of silently
# dropping them, and `select_dtypes` also works on older versions.
corr = train_df.select_dtypes('number').corr()

fig, ax = plt.subplots(nrows=1, figsize=(15, 14))
sns.heatmap(corr, ax=ax)
ax.set_title("Correlation Matrix");

### SalePrice vs OverallCond

In [None]:
# Sale price distribution for each overall-condition rating.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=train_df, x='OverallCond', y='SalePrice', color=BASE_COLOR, ax=ax)
ax.set_title("Sale Price by House Overall Condition");

### SalePrice vs TotalBsmtSF

In [None]:
# Scatter with fitted regression line: basement area against sale price.
fig, ax = plt.subplots(figsize=(8, 5))
sns.regplot(data=train_df, x='TotalBsmtSF', y='SalePrice', color=BASE_COLOR, ax=ax)
ax.set_title("Sale Price by House Basment Area");

The relationship shows an outlier at around **6000** square feet of basement area, so it is better to remove it.

In [None]:
# Remove rows with a basement area above 5000, then show that none remain.
basement_outlier_idx = train_df.index[train_df['TotalBsmtSF'] > 5000]
train_df = train_df.drop(index=basement_outlier_idx)
train_df.loc[train_df['TotalBsmtSF'] > 5000]

In [None]:
# Same basement-area plot after outlier removal, with the fit line in red.
fig, ax = plt.subplots(figsize=(8, 5))
sns.regplot(data=train_df, x='TotalBsmtSF', y='SalePrice', line_kws={'color': 'r'}, ax=ax)
ax.set_title("Sale Price by House Basment Area");

### SalePrice vs GrLivArea

In [None]:
# Scatter with fitted regression line: above-ground living area against sale price.
plt.figure(figsize=(8, 5))
sns.regplot(data=train_df,  x='GrLivArea', y='SalePrice', line_kws=dict(color='r'));
# The title was copy-pasted from the basement plot; it now names the plotted
# feature, matching the post-cleaning GrLivArea plot further down.
plt.title("Sale Price by House Ground Area");

There are outliers with GrLivArea greater than **4000**, so they are removed.

In [None]:
# Remove rows with a living area above 4000, then show that none remain.
living_area_outlier_idx = train_df.index[train_df['GrLivArea'] > 4000]
train_df = train_df.drop(index=living_area_outlier_idx)
train_df.loc[train_df['GrLivArea'] > 4000]

In [None]:
# Living-area plot after outlier removal, with the fit line in red.
fig, ax = plt.subplots(figsize=(8, 5))
sns.regplot(data=train_df, x='GrLivArea', y='SalePrice', line_kws={'color': 'r'}, ax=ax)
ax.set_title("Sale Price by House Ground Area");

### GarageArea vs SalePrice

In [None]:
# Scatter with fitted regression line: garage area against sale price.
fig, ax = plt.subplots(figsize=(8, 5))
sns.regplot(data=train_df, x='GarageArea', y='SalePrice', line_kws={'color': 'r'}, ax=ax)
ax.set_title("Sale Price by Garage Area");

positive correlation indicating an increase in price as the garage area increases.

### GarageCars vs SalePrice

In [None]:
# Sale price distribution for each garage capacity (in cars).
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=train_df, x='GarageCars', y='SalePrice', color=BASE_COLOR, ax=ax)
ax.set_title("Sale Price by Number of Cars in Garage");

There is a positive correlation: the price increases with the number of cars the garage can fit.

In [None]:
# GarageCars vs GarageArea:
# checks whether the two garage features are collinear rather than independent,
# which would make sense because a larger garage fits more cars.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=train_df, x='GarageCars', y='GarageArea', color=BASE_COLOR, ax=ax);

### SalePrice vs FullBath

In [None]:
# Sale price distribution for each full-bathroom count.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=train_df, x='FullBath', y='SalePrice', color=BASE_COLOR, ax=ax)
ax.set_title("Sale Price by Number of Bathrooms");

# Statsmodels OLS Model Analysis

In [None]:
# Only numeric columns are passed to the OLS analysis.
ols_df = train_df.select_dtypes(include='number')
ols_df.head()

In [None]:
# Drop the raw year columns and the row identifier from the regression frame.
ols_excluded_cols = ['YrSold', 'YearBuilt', 'YearRemodAdd', 'Id']
ols_df = ols_df.drop(columns=ols_excluded_cols)
ols_df

In [None]:
# Constant column of ones; it is passed to OLS below as part of the regressor matrix.
ols_df['intercept'] = 1

In [None]:
# Regress SalePrice on every other column of ols_df and show the fit summary.
ols_target = ols_df['SalePrice']
ols_regressors = ols_df.drop(columns='SalePrice')

ols_model = OLS(ols_target, ols_regressors)
result = ols_model.fit()
result.summary()

# Sklearn Models Analysis

In [None]:
# Columns used as model inputs, for both the training frame and the test frame.
features_cols = [
    "time_taken_to_sell",
    "OverallQual",
    "OverallCond",
    "MasVnrArea",
    "TotalBsmtSF",
    "GrLivArea",
    "BedroomAbvGr",
    "TotRmsAbvGrd",
    "GarageArea",
    "ScreenPorch",
    "zone_low_density",
    "typical_functionality",
    "sale_normal_condition",
]

In [None]:
# Restrict the training frame to the selected feature columns.
features_df = train_df.loc[:, features_cols]
features_df.head()

In [None]:
# Feature matrix as a NumPy array; the target stays a pandas Series.
X = features_df.to_numpy()
y = train_df.loc[:, 'SalePrice']

In [None]:
# Standardise every feature column with a StandardScaler fitted on X.
# NOTE(review): the scaler is fitted on the full X *before* the train/test split
# in the next cell, so statistics of the held-out rows leak into the scaling.
# Consider splitting first and fitting on x_train only.
sc = StandardScaler()

X = sc.fit_transform(X)
X

In [None]:
# Hold out TEST_SIZE of the rows for evaluation, seeded for a repeatable split.
split_parts = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)
x_train, x_test, y_train, y_test = split_parts

# Both partitions must have the same number of feature columns.
assert x_train.shape[1] == x_test.shape[1]

## Linear Model

In [None]:
# Fit ordinary least squares on the training split and score it on the hold-out split.
lm = LinearRegression()
lm.fit(x_train, y_train)
lm_preds = lm.predict(x_test)

lm_r2 = lm.score(x_test, y_test)
lm_rmse = np.sqrt(mean_squared_error(y_test, lm_preds))

print("R2 Score for Linear Model : ", lm_r2)
print("RMSE for Linear Model : ", lm_rmse)

## RandomForest Model

In [None]:
# Fit a random forest on the training split and score it on the hold-out split.
# The forest is seeded with RANDOM_STATE so the reported metrics (and the
# predictions exported later) are reproducible on Restart & Run All.
rf_model = RandomForestRegressor(random_state=RANDOM_STATE)
rf_model.fit(x_train, y_train)
rf_preds = rf_model.predict(x_test)

print("R2 Score for Random Forest Model : ", rf_model.score(x_test, y_test))
print("RMSE for Random Forest Model : ", np.sqrt(mean_squared_error(y_test, rf_preds)))

## DecisionTreeRegressor Model

In [None]:
# Fit a single decision tree on the training split and score it on the hold-out split.
# Seeded with RANDOM_STATE so the reported metrics are reproducible.
dt_model = DecisionTreeRegressor(random_state=RANDOM_STATE)
dt_model.fit(x_train, y_train)
dt_preds = dt_model.predict(x_test)

# Score the decision tree itself; this line previously reported rf_model's R2.
print("R2 Score for Decision Tree Model : ", dt_model.score(x_test, y_test))
print("RMSE for Decision Tree Model : ", np.sqrt(mean_squared_error(y_test, dt_preds)))

The **Random Forest Model** is the best so far, with **RMSE = 25126.752** and **R2 = 0.8797**, so I will use it to predict on the test dataframe.

## Generating Prediction on Test DataFrame

### Cleaning Test Dataset like in Train Dataset

In [None]:
# Build the same indicator columns on the test frame as on the training frame.
# Mapping: new indicator column -> (source column, level that maps to 1).
test_indicator_specs = {
    'zone_low_density': ('MSZoning', 'RL'),
    'one_story_style': ('HouseStyle', '1Story'),
    'two_story_style': ('HouseStyle', '2Story'),
    'one_half_story_style': ('HouseStyle', '1.5Fin'),
    'typical_functionality': ('Functional', 'Typ'),
    'sale_normal_condition': ('SaleCondition', 'Normal'),
    'gas_heating': ('Heating', 'GasA'),
}

for indicator_col, (source_col, level) in test_indicator_specs.items():
    test_df[indicator_col] = test_df[source_col] == level

# Convert the boolean flags to 0/1 integers.
cols = ['zone_low_density', 'one_story_style', 'two_story_style', 'typical_functionality',
        'one_half_story_style', 'gas_heating', 'sale_normal_condition']
test_df[cols] = test_df[cols].astype('int')

In [None]:
# Years between construction and sale, mirroring the training-frame feature.
test_df['time_taken_to_sell'] = test_df['YrSold'].sub(test_df['YearBuilt'])

In [None]:
# Restrict the test frame to the feature columns the models were trained on.
test_features_df = test_df.loc[:, features_cols]
test_features_df.head()

In [None]:
# NaN count per test feature column.
test_features_df.isna().sum()

In [None]:
# Fill the gaps in the three affected feature columns with their own column means.
# `fillna` returns a new frame that is rebound to the name: the previous
# `column.fillna(..., inplace=True)` calls acted on a column of a frame sliced
# from test_df, which triggers SettingWithCopyWarning and is a silent no-op
# under pandas Copy-on-Write.
test_impute_cols = ['MasVnrArea', 'TotalBsmtSF', 'GarageArea']
test_features_df = test_features_df.fillna(test_features_df[test_impute_cols].mean())

# No feature column may contain NaN before scaling and prediction.
assert len(test_features_df.isna().sum()[test_features_df.isna().sum() != 0]) == 0

In [None]:
# Scale the test features with the statistics learned from the training data.
# `transform` (not `fit_transform`) is required here: refitting the scaler on
# the test set would standardise it with different means/variances than the
# ones the models were trained on, and would overwrite the fitted scaler.
X_test = test_features_df.values
X_test = sc.transform(X_test)
X_test

### Evaluating Best Machine Learning Model.

In [None]:
# Predict sale prices for the scaled test features with the fitted random forest.
test_preds = rf_model.predict(X_test)
test_preds

In [None]:
# Attach the predictions to the test frame and preview the Id/price pairs.
test_df['SalePrice'] = test_preds
test_df.loc[:, ['Id', 'SalePrice']].head()

In [None]:
# Write the Id/SalePrice pairs to CSV without the index column.
submission_df = test_df[['Id', 'SalePrice']]
submission_df.to_csv('predictions_1.csv', index=False)

## PyTorch Neural Network Model

In [None]:
class HouseDataset(Dataset):
    """Map-style dataset pairing each feature row with its target value."""

    def __init__(self, x_data, y_data, transform=None):
        """
        Args:
            x_data (np.ndarray) : x_train data array; must expose ``.shape`` and
                support positional indexing.
            y_data (np.ndarray) : y_train data array. Any other array-like
                (e.g. a pandas Series) is converted to ``np.ndarray`` here so
                that samples are always looked up by position, never by label.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """

        self.x_data = x_data
        # Convert once, up front. Doing this lazily inside __getitem__ mutated
        # the dataset on first access and re-checked the type on every sample.
        self.y_data = y_data if isinstance(y_data, np.ndarray) else np.array(y_data)
        self.transform = transform

    def __len__(self):
        """Return the number of samples (rows of ``x_data``)."""
        return self.x_data.shape[0]

    def __getitem__(self, idx):
        """Return ``(x_sample, y_sample)`` at position ``idx``.

        ``idx`` may be a tensor, in which case it is converted with ``tolist()``.
        The transform, when given, is applied to the feature sample only.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        x_sample = self.x_data[idx]
        y_sample = self.y_data[idx]

        if self.transform is not None:
            x_sample = self.transform(x_sample)

        return x_sample, y_sample
    
    
# Wrap the training split in a dataset and a shuffling DataLoader,
# then pull one batch to inspect its features and targets.
train_ds = HouseDataset(x_data=x_train, y_data=y_train)
train_dl = DataLoader(dataset=train_ds, batch_size=BATCH_SIZE, shuffle=True)

sample = next(iter(train_dl))
(sample[0], sample[1])

In [None]:
class NNet(nn.Module):
    """Fully connected regressor: three ReLU hidden layers and a linear output layer."""

    def __init__(self, input_size: int, hidden_sizes: list, output_size: int=1) -> None:
        """Create layers ``fc1``..``fc4``; the first three entries of ``hidden_sizes`` set the hidden widths."""
        super().__init__()

        widths = (input_size, hidden_sizes[0], hidden_sizes[1], hidden_sizes[2], output_size)
        # Register fc1, fc2, fc3, fc4 in order, each mapping one width to the next.
        for position, (n_in, n_out) in enumerate(zip(widths[:-1], widths[1:]), start=1):
            setattr(self, f"fc{position}", nn.Linear(n_in, n_out))

    def forward(self, x):
        """Apply ReLU after each hidden layer; the output layer has no activation."""
        for hidden_layer in (self.fc1, self.fc2, self.fc3):
            x = F.relu(hidden_layer(x))

        return self.fc4(x)

class RMSELoss(nn.Module):
    """Root-mean-squared-error loss: the square root of ``nn.MSELoss``."""

    def __init__(self):
        super().__init__()

        self.mse = nn.MSELoss()

    def forward(self, y_pred, y_true):
        """Return ``sqrt(MSE(y_pred, y_true))``."""
        mean_squared = self.mse(y_pred, y_true)
        return mean_squared.sqrt()


In [None]:
def train(train_dl, model, optimizer, criterion, epochs=100) -> list:
    """Run a mini-batch training loop and return the mean loss of every epoch.

    Args:
        train_dl: iterable yielding ``(x_batch, y_batch)`` tensor pairs.
        model: callable mapping a float feature batch to predictions.
        optimizer: optimizer over the model's parameters.
        criterion: callable ``criterion(y_pred, y_true)`` returning a scalar loss tensor.
        epochs: number of passes over ``train_dl``.

    Returns:
        list: one entry per epoch, the mean of that epoch's batch losses.

    Raises:
        RuntimeError: if a target batch cannot be reshaped to the shape of the
            corresponding prediction batch.
    """
    losses = []

    print("{:<8}|{:>15}".format("Epoch", "Loss"))
    print("="*24)

    for epoch in range(epochs):
        epoch_losses = []
        for x_batch, y_batch in train_dl:

            y_preds = model(x_batch.float())
            # The model emits (batch, 1) while the loader yields targets of shape
            # (batch,). Without aligning them the loss broadcasts to
            # (batch, batch) and compares every prediction with every target.
            y_true = y_batch.float().reshape(y_preds.shape)
            loss = criterion(y_preds, y_true)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_losses.append(loss.item())

        losses.append(np.mean(epoch_losses))

        # Report progress every tenth epoch.
        if epoch % 10 == 0:
            print("{:<8}|{:>15}".format(epoch, round(np.mean(epoch_losses), 3)))

    return losses


In [None]:
## initializing model
# Seed torch so the weight initialisation and the DataLoader shuffle order are
# reproducible on Restart & Run All.
torch.manual_seed(RANDOM_STATE)

n_features = x_train.shape[1]
hidden_sizes = [32, 16, 8]
output_size = 1

model = NNet(n_features, hidden_sizes, output_size)

print(model)

## training model
optimizer = optim.Adam(model.parameters(), lr=LR)
criterion = RMSELoss()

losses = train(train_dl, model, optimizer, criterion, epochs=200)

In [None]:
%matplotlib inline

fig, ax = plt.subplots(nrows=1, figsize=(10, 6))

ax.plot(losses, 'r-', lw=2)
ax.set_title("Torch Model Loss")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss");
ax.set_ylim([70000, 90000])