In [1]:
%load_ext autoreload
%autoreload 2
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pprint import pprint
from feature_engineering import numericalise, date_feature_extracter

## Data Loading

In [3]:
# Load the training data. `low_memory=False` stops pandas from inferring dtypes
# chunk-by-chunk (which can yield mixed-type columns); `saledate` is parsed to
# datetime64.
df = pd.read_csv(
    "data/Train.csv",
    parse_dates=["saledate"],
    low_memory=False,
)
# Show the first two records transposed, so each column is displayed as a row.
df.iloc[:2].T

Unnamed: 0,0,1
SalesID,1139246,1139248
SalePrice,66000,57000
MachineID,999089,117657
ModelID,3157,77
datasource,121,121
auctioneerID,3,3
YearMade,2004,1996
MachineHoursCurrentMeter,68,4640
UsageBand,Low,Low
saledate,2006-11-16 00:00:00,2004-03-26 00:00:00


## Modelling on only numerical features

In [3]:
# Keep every column whose dtype is not `object`; `.copy()` detaches the result
# from `df` so later column assignments do not touch the raw frame.
df_numerical = df.select_dtypes(exclude=["object"]).copy()

In [4]:
# Number of missing values in each remaining column.
df_numerical.isna().sum()

SalesID                          0
SalePrice                        0
MachineID                        0
ModelID                          0
datasource                       0
auctioneerID                 20136
YearMade                         0
MachineHoursCurrentMeter    258360
saledate                         0
dtype: int64

In [5]:
# Summary statistics for the numeric columns.
# NOTE(review): in the recorded output YearMade has a minimum of 1000 and
# MachineHoursCurrentMeter a median of 0 — these look like placeholder values
# rather than real readings; confirm before trusting them as features.
df_numerical.describe()

Unnamed: 0,SalesID,SalePrice,MachineID,ModelID,datasource,auctioneerID,YearMade,MachineHoursCurrentMeter
count,401125.0,401125.0,401125.0,401125.0,401125.0,380989.0,401125.0,142765.0
mean,1919713.0,31099.712848,1217903.0,6889.70298,134.66581,6.55604,1899.156901,3457.955
std,909021.5,23036.898502,440992.0,6221.777842,8.962237,16.976779,291.797469,27590.26
min,1139246.0,4750.0,0.0,28.0,121.0,0.0,1000.0,0.0
25%,1418371.0,14500.0,1088697.0,3259.0,132.0,1.0,1985.0,0.0
50%,1639422.0,24000.0,1279490.0,4604.0,132.0,2.0,1995.0,0.0
75%,2242707.0,40000.0,1468067.0,8724.0,136.0,4.0,2000.0,3025.0
max,6333342.0,142000.0,2486330.0,37198.0,172.0,99.0,2013.0,2483300.0


In [6]:
# Missing entries are replaced by the sentinel -100, which cannot collide with
# real data because none of the observed values are negative.
# Note that missingness is itself a feature.
cols_with_gaps = ["auctioneerID", "MachineHoursCurrentMeter"]
df_numerical[cols_with_gaps] = df_numerical[cols_with_gaps].fillna(-100)

In [7]:
# Re-count missing values to confirm the sentinel fill left none behind.
df_numerical.isna().sum()

SalesID                     0
SalePrice                   0
MachineID                   0
ModelID                     0
datasource                  0
auctioneerID                0
YearMade                    0
MachineHoursCurrentMeter    0
saledate                    0
dtype: int64

In [8]:
# Inspect the column dtypes before building further features.
df_numerical.dtypes

SalesID                              int64
SalePrice                            int64
MachineID                            int64
ModelID                              int64
datasource                           int64
auctioneerID                       float64
YearMade                             int64
MachineHoursCurrentMeter           float64
saledate                    datetime64[ns]
dtype: object

## Features from date

In [9]:
# Derive calendar features from `saledate`. The return value is discarded, so the
# helper is relied on to modify `df_numerical` in place (the recorded output
# shows new saleYear / saleMonth columns).
# NOTE(review): presumably the helper also removes the raw `saledate` column,
# which would make this cell fail when re-run — confirm in feature_engineering.
date_feature_extracter(df_numerical, 'saledate')
# First five rows, transposed so that columns read as rows.
df_numerical.head().T

Unnamed: 0,0,1,2,3,4
SalesID,1139246,1139248,1139249,1139251,1139253
SalePrice,66000,57000,10000,38500,11000
MachineID,999089,117657,434808,1026470,1057373
ModelID,3157,77,7009,332,17311
datasource,121,121,121,121,121
auctioneerID,3,3,3,3,3
YearMade,2004,1996,2001,2001,2007
MachineHoursCurrentMeter,68,4640,2838,3486,722
saleYear,2006,2004,2004,2011,2009
saleMonth,11,3,2,5,7


## Data Splitting

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Design matrix: every column except the label, as a plain NumPy array.
feature = df_numerical.drop(columns=["SalePrice"]).copy().values
# Target: natural log of the sale price.
target = np.log(df_numerical["SalePrice"].copy().values)

In [12]:
# Random train/test split with a fixed seed for reproducibility. No test_size is
# passed, so scikit-learn's default split proportions apply.
X_train, X_test, y_train, y_test = train_test_split(feature, target, random_state=1)

## Linear Regression on just the numerical columns.

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [14]:
# Ordinary least squares fitted on the numeric-only training features.
model = LinearRegression().fit(X=X_train, y=y_train)

In [15]:
# In-sample predictions: the model is applied to the same rows it was fitted on.
predictions = model.predict(X_train)

In [16]:
# Root-mean-squared error on the training rows. The target is log(SalePrice),
# so the error is expressed in log-price units.
np.sqrt(mean_squared_error(y_train, predictions))

0.6589800610647699

In [17]:
# Coefficient of determination (R^2) of the fit, measured on the training rows.
model.score(X_train, y_train)

0.09598698124189564

These are very poor scores. Let's try to categorise and numericalise the whole dataframe and run a linear regression model.

## Linear Regression Baseline on full data

In [18]:
def numericalise_df(df):
    '''Return a copy of ``df`` with categorical columns integer-encoded and
    missing values imputed.

    The work is done on a copy, so the caller's frame is left untouched.

    1. Every ``object``, ``category`` or pandas string-dtype column is replaced
       by its integer category codes. Missing entries in these columns receive
       the code ``-1`` (they are not mean-imputed).
    2. Every other column except ``"saledate"`` has its missing values replaced
       by the column mean. A numeric column with no observed values has no
       mean, so it is filled with ``0`` rather than being left as NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns as ``df``.
    '''
    df = df.copy()

    # Encode text-like columns. ``category`` and string dtypes are included so
    # that they do not reach the mean-imputation step, where ``.mean()`` would
    # raise on them.
    for col in df.columns:
        dtype = df[col].dtype
        is_text_like = (
            pd.api.types.is_object_dtype(dtype)
            or isinstance(dtype, (pd.CategoricalDtype, pd.StringDtype))
        )
        if is_text_like:
            df[col] = df[col].astype("category").cat.codes

    for col in df.columns:
        if col == "saledate":
            continue
        column = df[col]
        # Nothing to impute: skip the mean computation entirely.
        if not column.isna().any():
            continue
        fill_value = column.mean()
        if pd.isna(fill_value):
            # The column has no observed values. Only numeric columns get a
            # constant fallback; anything else is left as it is.
            if not pd.api.types.is_numeric_dtype(column.dtype):
                continue
            fill_value = 0
        df[col] = column.fillna(fill_value)
    return df

In [19]:
# Encode and impute the full training frame.
# NOTE(review): this runs before the train/test split, so the column means used
# for imputation are computed from rows that later land in the test set.
df_numericalised = numericalise_df(df)

In [20]:
# Features: every encoded column except the label and the raw datetime column.
feature = df_numericalised.drop(columns=["SalePrice", "saledate"]).copy().values
# Target: natural log of the sale price.
target = np.log(df_numericalised["SalePrice"].copy().values)
# Seeded random train/test split.
X_train, X_test, y_train, y_test = train_test_split(feature, target, random_state=1)

In [21]:
# Fit ordinary least squares on the fully encoded feature matrix.
# `normalize` is deliberately not passed: scikit-learn deprecated the argument
# in 1.0 and removed it in 1.2, where `LinearRegression(normalize=True)` raises
# a TypeError. For an unregularised least-squares fit, rescaling the columns
# does not change the fitted predictions (beyond floating-point error), so the
# plain estimator is a drop-in replacement. If feature scaling is wanted for
# other estimators, use `sklearn.preprocessing.StandardScaler` in a pipeline.
model_all_df = LinearRegression().fit(X_train, y_train)

In [22]:
# Shapes of the four split arrays, in the order the split returned them.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((300843, 51), (300843,), (100282, 51), (100282,))

In [23]:
# In-sample predictions from the full-data model (applied to its own training rows).
predictions = model_all_df.predict(X_train)

In [24]:
# Root-mean-squared error of the full-data model on the training rows, in
# log-price units because the target is log(SalePrice).
np.sqrt(mean_squared_error(y_train, predictions))

0.49522958149206386

In [26]:
# Coefficient of determination (R^2) of the full-data model on the training rows.
model_all_df.score(X_train, y_train)

0.48944410706470043

## It seems that Linear Regression is a very weak algorithm for this specific dataset. Abandoning this algorithm in favour of other ones.