In [174]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as plticker
%matplotlib inline

In [175]:
# Reading in the data
shoe_data = pd.read_csv('/Users/logno/Documents/Home/BAF1/ds_shoe_proj/Clean_Shoe_Data.csv', parse_dates = True)
df = shoe_data.copy()
df

Unnamed: 0,Order Date,Brand,Sneaker Name,Sale Price,Retail Price,Release Date,Shoe Size,Buyer Region
0,2017-09-01,Yeezy,Adidas Yeezy Boost 350 Low V2 Beluga,1097,220,2016-09-24,11.0,California
1,2017-09-01,Yeezy,Adidas Yeezy Boost 350 V2 Core Black Copper,685,220,2016-11-23,11.0,California
2,2017-09-01,Yeezy,Adidas Yeezy Boost 350 V2 Core Black Green,690,220,2016-11-23,11.0,California
3,2017-09-01,Yeezy,Adidas Yeezy Boost 350 V2 Core Black Red,1075,220,2016-11-23,11.5,Kentucky
4,2017-09-01,Yeezy,Adidas Yeezy Boost 350 V2 Core Black Red 2017,828,220,2017-02-11,11.0,Rhode Island
...,...,...,...,...,...,...,...,...
99951,2019-02-13,Yeezy,adidas Yeezy Boost 350 V2 Static Reflective,565,220,2018-12-26,8.0,Oregon
99952,2019-02-13,Yeezy,adidas Yeezy Boost 350 V2 Static Reflective,598,220,2018-12-26,8.5,California
99953,2019-02-13,Yeezy,adidas Yeezy Boost 350 V2 Static Reflective,605,220,2018-12-26,5.5,New York
99954,2019-02-13,Yeezy,adidas Yeezy Boost 350 V2 Static Reflective,650,220,2018-12-26,11.0,California


In [176]:
# Checking for missing values in the dataset
nulls = pd.concat([df.isnull().sum()], axis=1)
nulls[nulls.sum(axis=1) > 0]

Unnamed: 0,0


In [177]:
df.shape

(99956, 8)

In [178]:
df.info

<bound method DataFrame.info of        Order Date   Brand                                   Sneaker Name  \
0      2017-09-01   Yeezy           Adidas Yeezy Boost 350 Low V2 Beluga   
1      2017-09-01   Yeezy    Adidas Yeezy Boost 350 V2 Core Black Copper   
2      2017-09-01   Yeezy     Adidas Yeezy Boost 350 V2 Core Black Green   
3      2017-09-01   Yeezy       Adidas Yeezy Boost 350 V2 Core Black Red   
4      2017-09-01   Yeezy  Adidas Yeezy Boost 350 V2 Core Black Red 2017   
...           ...     ...                                            ...   
99951  2019-02-13   Yeezy    adidas Yeezy Boost 350 V2 Static Reflective   
99952  2019-02-13   Yeezy    adidas Yeezy Boost 350 V2 Static Reflective   
99953  2019-02-13   Yeezy    adidas Yeezy Boost 350 V2 Static Reflective   
99954  2019-02-13   Yeezy    adidas Yeezy Boost 350 V2 Static Reflective   
99955  2019-02-13   Yeezy    adidas Yeezy Boost 350 V2 Static Reflective   

       Sale Price  Retail Price Release Date  Shoe Size

In [179]:
df.describe()

Unnamed: 0,Sale Price,Retail Price,Shoe Size
count,99956.0,99956.0,99956.0
mean,446.634719,208.61359,9.344181
std,255.982969,25.20001,2.329588
min,186.0,130.0,3.5
25%,275.0,220.0,8.0
50%,370.0,220.0,9.5
75%,540.0,220.0,11.0
max,4050.0,250.0,17.0


In [180]:
df.columns

Index(['Order Date', 'Brand', 'Sneaker Name', 'Sale Price', 'Retail Price',
       'Release Date', 'Shoe Size', 'Buyer Region'],
      dtype='object')

In [181]:
# Renaming columns to get rid of spaces 
df = df.rename(columns={
    "Order Date": "Order_date",
    "Sneaker Name": "Sneaker_Name",
    "Sale Price": "Sale_Price",
    "Retail Price": "Retail_Price",
    "Release Date": "Release_Date",
    "Shoe Size": "Shoe_Size",
    "Buyer Region": "Buyer_Region"
    })

In [182]:
# Converting dates into numericals
import datetime as dt

df['Order_date'] = pd.to_datetime(df['Order_date'])
df['Order_date']=df['Order_date'].map(dt.datetime.toordinal)

df['Release_Date'] = pd.to_datetime(df['Release_Date'])
df['Release_Date']=df['Release_Date'].map(dt.datetime.toordinal)

In [183]:
# Starting the linear regression
from sklearn import preprocessing, metrics
from sklearn.model_selection import train_test_split

X = df.drop(['Sale_Price'], axis=1)
y = df.Sale_Price
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)

In [184]:
# Converting categorical data to numerical
from sklearn.preprocessing import OneHotEncoder

object_cols = ['Sneaker_Name', 'Buyer_Region', 'Brand']
# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))

# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

# Adding the column names after one hot encoding
OH_cols_train.columns = OH_encoder.get_feature_names(object_cols)
OH_cols_valid.columns = OH_encoder.get_feature_names(object_cols)

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)


## Model 1

In [185]:
from sklearn.linear_model import LinearRegression

lm = LinearRegression()
lm.fit(OH_X_train,y_train)

LinearRegression()

In [186]:
print(lm.intercept_)

2959694944013.871


In [187]:
coeff_df = pd.DataFrame(lm.coef_, OH_X_train.columns,columns=['Coefficient'])
ranked_suburbs = coeff_df.sort_values("Coefficient", ascending = False)
ranked_suburbs

Unnamed: 0,Coefficient
Sneaker_Name_Nike Air VaporMax Off White 2018,2.064670e+11
Sneaker_Name_Nike Air VaporMax Off White Black,2.064175e+11
Sneaker_Name_Nike Air VaporMax Off White,2.057499e+11
Sneaker_Name_adidas Yeezy Boost 350 V2 Static,8.883017e+10
Sneaker_Name_adidas Yeezy Boost 350 V2 Static Reflective,8.882686e+10
...,...
Sneaker_Name_Nike Air Force 1 Low Virgil Abloh Off White AF100,-7.413324e+10
Sneaker_Name_Nike Blazer Mid Off White All Hallows Eve,-1.290464e+11
Sneaker_Name_Nike Blazer Mid Off White Grim Reaper,-1.290464e+11
Sneaker_Name_Nike Blazer Mid Off White Wolf Grey,-1.291753e+11


In [188]:
predictions = lm.predict(OH_X_valid)

In [190]:
from sklearn import metrics
print("MAE:", metrics.mean_absolute_error(y_valid, predictions))
print('MSE:', metrics.mean_squared_error(y_valid, predictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_valid, predictions)))

MAE: 58.95042487209728
MSE: 10049.891153092109
RMSE: 100.24914539831305


## Model 2

In [191]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm 
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [192]:
lm = LinearRegression()
lm.fit(OH_X_train,y_train)
rfe = RFE(lm, 10)
rfe = rfe.fit(OH_X_train, y_train)

In [193]:
list(zip(OH_X_train.columns,rfe.support_,rfe.ranking_))

[('Order_date', False, 98),
 ('Retail_Price', False, 96),
 ('Release_Date', False, 95),
 ('Shoe_Size', False, 97),
 ('Sneaker_Name_Adidas Yeezy Boost 350 Low Moonrock', True, 1),
 ('Sneaker_Name_Adidas Yeezy Boost 350 Low Oxford Tan', True, 1),
 ('Sneaker_Name_Adidas Yeezy Boost 350 Low Pirate Black 2015', True, 1),
 ('Sneaker_Name_Adidas Yeezy Boost 350 Low Pirate Black 2016', True, 1),
 ('Sneaker_Name_Adidas Yeezy Boost 350 Low Turtledove', True, 1),
 ('Sneaker_Name_Adidas Yeezy Boost 350 Low V2 Beluga', True, 1),
 ('Sneaker_Name_Adidas Yeezy Boost 350 V2 Beluga 2pt0', False, 5),
 ('Sneaker_Name_Adidas Yeezy Boost 350 V2 Blue Tint', False, 7),
 ('Sneaker_Name_Adidas Yeezy Boost 350 V2 Core Black Copper', False, 2),
 ('Sneaker_Name_Adidas Yeezy Boost 350 V2 Core Black Green', False, 3),
 ('Sneaker_Name_Adidas Yeezy Boost 350 V2 Core Black Red', True, 1),
 ('Sneaker_Name_Adidas Yeezy Boost 350 V2 Core Black Red 2017', True, 1),
 ('Sneaker_Name_Adidas Yeezy Boost 350 V2 Core Black White

In [198]:
X_train_rfe = OH_X_train[OH_X_train.columns[rfe.support_]]
X_train_rfe

Unnamed: 0,Sneaker_Name_Adidas Yeezy Boost 350 Low Moonrock,Sneaker_Name_Adidas Yeezy Boost 350 Low Oxford Tan,Sneaker_Name_Adidas Yeezy Boost 350 Low Pirate Black 2015,Sneaker_Name_Adidas Yeezy Boost 350 Low Pirate Black 2016,Sneaker_Name_Adidas Yeezy Boost 350 Low Turtledove,Sneaker_Name_Adidas Yeezy Boost 350 Low V2 Beluga,Sneaker_Name_Adidas Yeezy Boost 350 V2 Core Black Red,Sneaker_Name_Adidas Yeezy Boost 350 V2 Core Black Red 2017,Sneaker_Name_Adidas Yeezy Boost 350 V2 Core Black White,Brand_Off-White
61933,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
89291,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
66195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
79044,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
89407,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15246,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13114,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [219]:
def build_model(X,y):
    X = sm.add_constant(X) #Adding the constant
    model = sm.OLS(y, X)
    results = model.fit() # fitting the model
    print(results.summary()) # model summary
    dir(results)
    return X
    
def checkVIF(X):
    vif = pd.DataFrame()
    vif['Features'] = X.columns
    vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    vif['VIF'] = round(vif['VIF'], 2)
    vif = vif.sort_values(by = "VIF", ascending = False)
    return(vif)

In [220]:
X_train_new = build_model(OH_X_train,y_train)

OLS Regression Results                            
Dep. Variable:             Sale_Price   R-squared:                       0.833
Model:                            OLS   Adj. R-squared:                  0.833
Method:                 Least Squares   F-statistic:                     3951.
Date:                Sat, 05 Sep 2020   Prob (F-statistic):               0.00
Time:                        17:57:19   Log-Likelihood:            -4.8540e+05
No. Observations:               79964   AIC:                         9.710e+05
Df Residuals:                   79862   BIC:                         9.719e+05
Df Model:                         101                                         
Covariance Type:            nonrobust                                         
                                                                     coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------------------

In [218]:
results

NameError: name 'results' is not defined