In [77]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import ConcatDataset, DataLoader, Sampler, Dataset
from src.data_loader import *
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
import random
import os
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse

### 0. Prepare Dataset

In [24]:
f_path = 'data/selected_statslib.csv'
statslib_df = pd.read_csv(f_path)
statslib_df.drop(columns='Unnamed: 0', axis=1, inplace=True)

f_path = 'data/selected_GBREB.csv'
GBREB_df = pd.read_csv(f_path)
GBREB_df.drop(columns='Unnamed: 0', axis=1, inplace=True)

In [25]:
total = pd.concat([statslib_df, GBREB_df], axis=0)
total.fillna(0, inplace=True)
# total.fillna(0, inplace=True)
total.head()

Unnamed: 0,RM,PTRATIO,LSTAT,MEDV,year,Units Sold,Active Listings,Months Supply of Inventory,New Listings,Pending Sales,Days to Off Market,Sold to Original Price Ratio,Price per Square Foot,month
0,6.575,15.3,4.98,24.0,1993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,6.421,17.8,9.14,21.6,1993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,7.185,17.8,4.03,34.7,1993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6.998,18.7,2.94,33.4,1993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7.147,18.7,5.33,36.2,1993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
## list of features to feed
feature_list = list(total.columns)
feature_list.remove('MEDV')
feature_list

['RM',
 'PTRATIO',
 'LSTAT',
 'year',
 'Units Sold',
 'Active Listings',
 'Months Supply of Inventory',
 'New Listings',
 'Pending Sales',
 'Days to Off Market',
 'Sold to Original Price Ratio',
 'Price per Square Foot',
 'month']

In [27]:
gbreb_train_size = 0.4
gbreb_val_size = 0.5
gbreb_test_size = 0.5

X_gbreb_train, X_gbreb, y_gbreb_train, y_gbreb = train_test_split(total[total['year'] != 1993][feature_list], 
                                                                  total[total['year']!=1993]['MEDV'],
                                                                  train_size=gbreb_train_size, 
                                                                 shuffle=True)
X_gbreb_val, X_gbreb_test, y_gbreb_val, y_gbreb_test = train_test_split(X_gbreb, 
                                                                        y_gbreb, 
                                                                        train_size=gbreb_val_size, 
                                                                        shuffle=True)

In [53]:
X_train_set = pd.concat([total[total['year'] == 1993][feature_list], X_gbreb_train], axis=0)
y_train_set = pd.concat([total[total['year'] == 1993]['MEDV'], y_gbreb_train], axis=0)
X_val_set, y_val_set = X_gbreb_val, y_gbreb_val
X_test_set, y_test_set = X_gbreb_test, y_gbreb_test

X_train_set.reset_index(inplace=True, drop=True)
y_train_set.reset_index(inplace=True, drop=True)
X_val_set.reset_index(inplace=True, drop=True)
y_val_set.reset_index(inplace=True, drop=True)
X_test_set.reset_index(inplace=True, drop=True)
y_test_set.reset_index(inplace=True, drop=True)
y_train_set

0       24.0
1       21.6
2       34.7
3       33.4
4       36.2
       ...  
586    380.0
587    900.0
588    358.0
589    692.5
590    760.0
Name: MEDV, Length: 591, dtype: float64

In [54]:
X_train_set = X_train_set.to_numpy()
y_train_set = y_train_set.to_numpy()

In [55]:
X_train_set.shape, y_train_set.shape

((591, 13), (591,))

In [40]:
y_train_set = y_train_set.reshape(-1, 1)
y_train_set.shape

(591, 1)

In [57]:
log_y_train_set = np.log(y_train_set)

In [66]:
model = LinearRegression().fit(X_train_set, log_y_train_set)

### 1. Check Linearity


In [68]:
model.score(X_train_set, log_y_train_set)

0.9212071015740662

In [80]:
model.coef_, model.intercept_

(array([ 9.79188573e-02, -4.04227556e-02, -3.56438533e-02,  9.56108928e-02,
         1.03302540e-03, -4.81175364e-04,  1.18171922e-01, -1.12834027e-03,
         1.08409846e-03,  2.27248958e-03, -6.54184708e-01,  1.75615111e-03,
        -1.34811398e-02]),
 -186.93610808863028)

In [81]:
np.exp(model.coef_), np.exp(model.intercept_)

(array([1.10287329, 0.96038335, 0.96498391, 1.10033083, 1.00103356,
        0.99951894, 1.12543758, 0.9988723 , 1.00108469, 1.00227507,
        0.51986573, 1.00175769, 0.98660932]),
 6.526491678944675e-82)

### 2. Compute L1

In [69]:
# X_val_set
log_y_val_set = np.log(y_val_set)

In [78]:
log_y_pred = model.predict(X_val_set)
mse(log_y_val_set, log_y_pred)

0.11829957234544398