In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
!cp /content/drive/MyDrive/2-folder/kaggle/df_utils.py /content/
import df_utils

In [15]:
# Load the train/test CSV files of the housing-prices competition from Google Drive.
DATA_DIR = '/content/drive/MyDrive/2-folder/kaggle/housing-prices-competition'
df_train_loaded = pd.read_csv(f'{DATA_DIR}/train.csv')
df_test_loaded = pd.read_csv(f'{DATA_DIR}/test.csv')

# Work on copies without the 'Id' column (presumably a row identifier, not a
# feature); the *_loaded frames stay untouched so they can be reused later.
df_train = df_train_loaded.drop(['Id'], axis=1)
df_test = df_test_loaded.drop(['Id'], axis=1)

# Drop all columns with at least 20% of missing values in the training set.
# The list of columns is derived from the data instead of being hard-coded,
# so it cannot drift from the threshold stated here.
MISSING_THRESHOLD_PERCENT = 20
percent_missing = df_train.isnull().sum() * 100 / len(df_train)
missing_value_df = pd.DataFrame({'column_name': df_train.columns,
                                 'percent_missing': percent_missing})
missing_value_df = missing_value_df.loc[missing_value_df['percent_missing'] >= MISSING_THRESHOLD_PERCENT]
sparse_columns = missing_value_df['column_name'].tolist()
df_train = df_train.drop(sparse_columns, axis=1)
# errors='ignore': a column that exists only in the training frame must not
# make the test-side drop fail.
df_test = df_test.drop(sparse_columns, axis=1, errors='ignore')

def prepare_df(df_train, df_test, max_unique = 0, skip_cols=None):
  """Impute and encode copies of the train and test frames, column by column.

  For every column of the training frame that is not listed in ``skip_cols``:

  * text columns (object or pandas string dtype) with more than ``max_unique``
    distinct training values are dropped from both frames;
  * remaining text columns are passed to ``df_utils.fill_with_mode`` when a
    missing value is present, then to ``df_utils.one_hot_encoding``;
  * all other columns are passed to ``df_utils.fill_with_mean`` when a missing
    value is present.

  Missing values are detected in the training frame OR the test frame, so a
  column that is complete in training but has gaps in test is still imputed.

  Args:
    df_train: training DataFrame; not modified (a copy is processed).
    df_test: test DataFrame; not modified (a copy is processed). Columns that
      exist only in the training frame are processed on the training side only.
    max_unique: largest number of distinct values a text column may have and
      still be one-hot encoded. With the default 0 every non-empty text column
      is dropped.
    skip_cols: column label or iterable of labels to leave untouched (for
      example the target). ``None`` means no column is skipped.

  Returns:
    Tuple ``(train, test)`` of processed DataFrames.
  """
  if skip_cols is None:
    skip_cols = set()
  elif isinstance(skip_cols, str):
    # A bare label would otherwise be treated as a collection of characters.
    skip_cols = {skip_cols}
  else:
    skip_cols = set(skip_cols)

  df_train_copy = df_train.copy()
  df_test_copy = df_test.copy()
  # Iterate over a snapshot of the labels: both frames are re-assigned in the loop.
  for column in list(df_train_copy.columns):
    if column in skip_cols:
      continue
    print(column)
    in_test = column in df_test_copy.columns
    train_values = df_train_copy[column]
    check_nan = bool(train_values.isnull().any())
    if in_test and not check_nan:
      check_nan = bool(df_test_copy[column].isnull().any())
    # Covers the classic object dtype as well as pandas' dedicated string dtypes.
    is_text = (pd.api.types.is_object_dtype(train_values)
               or pd.api.types.is_string_dtype(train_values))
    if is_text:
      if train_values.nunique() > max_unique:
        # Too many categories to one-hot encode: remove the column everywhere.
        df_train_copy = df_train_copy.drop([column], axis=1)
        if in_test:
          df_test_copy = df_test_copy.drop([column], axis=1)
        continue
      if check_nan:
        # NOTE(review): the fill helpers are called for their side effect only
        # (return value unused), so they presumably modify the frame in place —
        # confirm in df_utils.
        df_utils.fill_with_mode(column, df_train_copy)
        if in_test:
          df_utils.fill_with_mode(column, df_test_copy)
      # NOTE(review): train and test are encoded independently; a category that
      # is missing from one frame may yield differing dummy columns — verify
      # against df_utils.one_hot_encoding.
      df_train_copy = df_utils.one_hot_encoding(df_train_copy, column)
      if in_test:
        df_test_copy = df_utils.one_hot_encoding(df_test_copy, column)
    elif check_nan:
      df_utils.fill_with_mean(column, df_train_copy)
      if in_test:
        df_utils.fill_with_mean(column, df_test_copy)
  return df_train_copy, df_test_copy

# Impute/encode both frames; text columns with more than 5 distinct values are
# dropped and the target column is left untouched.
df_train, df_test = prepare_df(df_train, df_test, 5, ['SalePrice'])
print(df_train.head())

# Calculate the correlation of features with the target and keep the features
# whose correlation is above the threshold (positive correlations only).
CORRELATION_THRESHOLD = 0.2
correlation = df_train.corr()
sorted_corr = correlation['SalePrice'].sort_values(ascending=False)
columns = []
for i, v in sorted_corr.items():
  if v > CORRELATION_THRESHOLD and i != 'SalePrice':
    print('index: ', i, 'value: ', v)
    columns.append(i)

# Remove target label from training set
y_train = df_train['SalePrice'].values
df_train = df_train.drop(['SalePrice'], axis=1)

# Extract columns with high correlation
df_train = df_train[columns]
# reindex instead of plain selection: a one-hot column whose category never
# occurs in the test set is created as all zeros instead of raising KeyError.
df_test = df_test.reindex(columns=columns, fill_value=0)
print(len(df_train.columns))
print(len(df_test.columns))
print(df_train.head(5))

# Normalize both frames with the TRAINING mean and standard deviation, so the
# test features are expressed on exactly the same scale the model is fitted on
# (and a column that is constant in the test set cannot divide by zero).
train_mean = df_train.mean()
train_std = df_train.std()
df_train = (df_train - train_mean) / train_std
df_test = (df_test - train_mean) / train_std

# create X and y for training
X = df_train.values
y = y_train

# No train/validation split is performed in this cell; the earlier
# train_test_split experiments were left disabled:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

MSSubClass
MSZoning
LotFrontage
LotArea
Street
LotShape
LandContour
Utilities
LotConfig
LandSlope
Neighborhood
Condition1
Condition2
BldgType
HouseStyle
OverallQual
OverallCond
YearBuilt
YearRemodAdd
RoofStyle
RoofMatl
Exterior1st
Exterior2nd
MasVnrArea
ExterQual
ExterCond
Foundation
BsmtQual
BsmtCond
BsmtExposure
BsmtFinType1
BsmtFinSF1
BsmtFinType2
BsmtFinSF2
BsmtUnfSF
TotalBsmtSF
Heating
HeatingQC
CentralAir
Electrical
1stFlrSF
2ndFlrSF
LowQualFinSF
GrLivArea
BsmtFullBath
BsmtHalfBath
FullBath
HalfBath
BedroomAbvGr
KitchenAbvGr
KitchenQual
TotRmsAbvGrd
Functional
Fireplaces
GarageType
GarageYrBlt
GarageFinish
GarageCars
GarageArea
GarageQual
GarageCond
PavedDrive
WoodDeckSF
OpenPorchSF
EnclosedPorch
3SsnPorch
ScreenPorch
PoolArea
MiscVal
MoSold
YrSold
SaleType
SaleCondition
      MSSubClass  LotFrontage  LotArea  OverallQual  OverallCond  YearBuilt  \
0             60         65.0     8450            7            5       2003   
1             20         80.0     9600            6   