# Data Exploration

### Import necessary packages

In [144]:
import pandas as pd
import numpy as np
%matplotlib notebook
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import classification_report, confusion_matrix
# plt.rcParams['figure.figsize'] = [15, 10]

In [145]:
df = pd.read_csv('data/in-vehicle-coupon-recommendation.csv')
df.head()

Unnamed: 0,destination,passanger,weather,temperature,time,coupon,expiration,gender,age,maritalStatus,...,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Y
0,No Urgent Place,Alone,Sunny,55,2PM,Restaurant(<20),1d,Female,21,Unmarried partner,...,never,,4~8,1~3,1,0,0,0,1,1
1,No Urgent Place,Friend(s),Sunny,80,10AM,Coffee House,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,0,0,0,1,0
2,No Urgent Place,Friend(s),Sunny,80,10AM,Carry out & Take away,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,1
3,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,2h,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,0
4,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,1d,Female,21,Unmarried partner,...,never,,4~8,1~3,1,1,0,0,1,0


In [146]:
df.shape

(12684, 26)

### Clean the datasetup
Looking at the [data description](data_description.txt), we will convert several columns to more analysis friends form. Since many columns have categorical data, we will use label encoder.

In [147]:
obj_df = df.select_dtypes(include=['object']).copy()
obj_df.head()

Unnamed: 0,destination,passanger,weather,time,coupon,expiration,gender,age,maritalStatus,education,occupation,income,car,Bar,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50
0,No Urgent Place,Alone,Sunny,2PM,Restaurant(<20),1d,Female,21,Unmarried partner,Some college - no degree,Unemployed,$37500 - $49999,,never,never,,4~8,1~3
1,No Urgent Place,Friend(s),Sunny,10AM,Coffee House,2h,Female,21,Unmarried partner,Some college - no degree,Unemployed,$37500 - $49999,,never,never,,4~8,1~3
2,No Urgent Place,Friend(s),Sunny,10AM,Carry out & Take away,2h,Female,21,Unmarried partner,Some college - no degree,Unemployed,$37500 - $49999,,never,never,,4~8,1~3
3,No Urgent Place,Friend(s),Sunny,2PM,Coffee House,2h,Female,21,Unmarried partner,Some college - no degree,Unemployed,$37500 - $49999,,never,never,,4~8,1~3
4,No Urgent Place,Friend(s),Sunny,2PM,Coffee House,1d,Female,21,Unmarried partner,Some college - no degree,Unemployed,$37500 - $49999,,never,never,,4~8,1~3


In [148]:
obj_df.isna().sum()

destination                 0
passanger                   0
weather                     0
time                        0
coupon                      0
expiration                  0
gender                      0
age                         0
maritalStatus               0
education                   0
occupation                  0
income                      0
car                     12576
Bar                       107
CoffeeHouse               217
CarryAway                 151
RestaurantLessThan20      130
Restaurant20To50          189
dtype: int64

In [149]:
# Of 12684 records, column 'car' contains NaN for 12576 records, lets drop it.
obj_df.drop('car',axis=1,inplace=True)

In [150]:
# For each column which has NaN counts greater than 0, check what is the most common values in the column
# for simplicity sake, also a little meaningful to replace the NaN with the most common values.
obj_df["Restaurant20To50"].value_counts()

less1    6077
1~3      3290
never    2136
4~8       728
gt8       264
Name: Restaurant20To50, dtype: int64

In [151]:
obj_df = obj_df.fillna({"Bar": "never",
                        "CoffeeHouse":"less1",
                        "CarryAway":"1~3",
                        "RestaurantLessThan20":"1~3",
                        "Restaurant20To50":"less1"
                       })

In [152]:
# Verify no NaN values are present in the dataframe
obj_df.isna().sum()

destination             0
passanger               0
weather                 0
time                    0
coupon                  0
expiration              0
gender                  0
age                     0
maritalStatus           0
education               0
occupation              0
income                  0
Bar                     0
CoffeeHouse             0
CarryAway               0
RestaurantLessThan20    0
Restaurant20To50        0
dtype: int64

In [153]:
# Let's perform on-hot Encoding using the pandas get_dummies method.
obj_encoded=pd.get_dummies(obj_df)

In [154]:
# Removed from original dataframe, those columns which are already encoded
df_without_enc = df.drop(df.select_dtypes(include=['object']).columns,axis=1)

# And join the encoded dataframe to the df_without_enc
df_encoded = df_without_enc.join(obj_encoded)
df_encoded.head()

Unnamed: 0,temperature,has_children,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Y,destination_Home,destination_No Urgent Place,...,RestaurantLessThan20_1~3,RestaurantLessThan20_4~8,RestaurantLessThan20_gt8,RestaurantLessThan20_less1,RestaurantLessThan20_never,Restaurant20To50_1~3,Restaurant20To50_4~8,Restaurant20To50_gt8,Restaurant20To50_less1,Restaurant20To50_never
0,55,1,1,0,0,0,1,1,0,1,...,0,1,0,0,0,1,0,0,0,0
1,80,1,1,0,0,0,1,0,0,1,...,0,1,0,0,0,1,0,0,0,0
2,80,1,1,1,0,0,1,1,0,1,...,0,1,0,0,0,1,0,0,0,0
3,80,1,1,1,0,0,1,0,0,1,...,0,1,0,0,0,1,0,0,0,0
4,80,1,1,1,0,0,1,0,0,1,...,0,1,0,0,0,1,0,0,0,0


In [162]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

# Loop Function to identify number of principal components that explain at least 85% of the variance
for comp in range(3, df_encoded.shape[1]):
    pca = PCA(n_components= comp, random_state=42)
    pca.fit(scale(df_encoded))
    comp_check = pca.explained_variance_ratio_
    final_comp = comp
    if comp_check.sum() > 0.85:
        break
        
Final_PCA = PCA(n_components= final_comp,random_state=42)
Final_PCA.fit(df_encoded)
cluster_df=Final_PCA.transform(df_encoded)
num_comps = comp_check.shape[0]
print("Using {} components, we can explain {}% of the variability in the original data.".format(final_comp,comp_check.sum()))

Using 62 components, we can explain 0.8505663673596537% of the variability in the original data.


In [164]:


pca = PCA(n_components=final_comp, random_state=42)
pca.fit(scale(df_encoded))

PCA(n_components=62, random_state=42)

In [165]:
bar_range = pca.n_components_+1
from plotly.subplots import make_subplots
import plotly.graph_objects as go
fig = make_subplots(rows=2, cols=1)

fig.add_trace(go.Bar(x=list(range(1, bar_range)), y=pca.explained_variance_ratio_, name="% of variance explained"),row=1, col=1)

fig.add_trace(go.Scatter(x=list(range(1, bar_range)), y=np.cumsum(pca.explained_variance_ratio_), name='% of cumulative variance explained'),row=2, col=1)

fig.update_xaxes(title_text="Principle Components", range=[1,pca.n_components], row=1, col=1)
fig.update_yaxes(title_text="Explained Variance",range=[0,1], row=1, col=1)
fig.update_layout(height=600,
                  title_text="Principle Component Analysis",
                  yaxis1_tickformat = '%',
                  yaxis2_tickformat = '%',
                  xaxis1=dict(tickmode='linear'),
                  xaxis2=dict(tickmode='linear'))
fig.show()

In [170]:
pca_df = pd.DataFrame(cluster_df,columns=[f'PC{i}' for i in range(1,final_comp+1)])

In [171]:
pca_df.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC53,PC54,PC55,PC56,PC57,PC58,PC59,PC60,PC61,PC62
0,8.292739,-0.430505,-0.160126,-1.133297,-0.160529,-0.936471,0.268586,0.249313,1.238796,0.333496,...,-0.073887,-0.23718,0.105743,0.10594,-0.084164,0.016923,0.138822,-0.157626,-0.023788,-0.145308
1,-16.717296,-1.09357,-0.293476,0.225071,-1.207304,-0.780075,0.252659,0.216191,1.318399,0.22106,...,-0.027289,-0.434818,-0.019227,0.180137,-0.02597,0.171551,-0.159104,0.107553,-0.044074,-0.173438
2,-16.708211,-1.184369,-0.242039,-0.128868,-1.106903,-0.586443,0.353274,0.182634,1.241496,0.306975,...,-0.058449,-0.348146,0.039067,0.133616,-0.048976,0.100233,-0.011078,-0.041388,-0.034266,-0.145779
3,-16.711395,-1.192299,-0.3091,0.087635,-1.346613,-0.591633,0.239542,0.208208,1.332877,0.11794,...,-0.043743,-0.326535,0.056828,0.139752,-0.093759,0.074428,0.080536,-0.111432,-0.013242,-0.134167
4,-16.704953,-1.05837,-0.20448,-1.068003,-0.700253,-0.5883,0.095846,0.251807,1.354389,0.040553,...,-0.027118,-0.385734,0.045214,0.170414,-0.08748,0.088134,0.044342,-0.095255,-0.010047,-0.148952
