In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score
%matplotlib inline

In [2]:
raw_data = pd.read_csv(r"C:\Users\nstow\Desktop\Python\epi_r.csv")

In [3]:
## Remove Null data and check how much data loss there was
df = raw_data
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15864 entries, 0 to 20051
Columns: 680 entries, title to turkey
dtypes: float64(5), int64(674), object(1)
memory usage: 82.4+ MB


In [4]:
## Count the number of rows remaining, as info returns too much data to display
row_count = df.shape[0]
column_count = df.shape[1]
print(row_count)
print(column_count)

15864
680


### We still have nearly 16000 rows of data, so we will continue with the null data simply removed.

In [5]:
## create binary column for use with a binary classifier
df['five_stars'] = np.where(df['rating'] == 5, 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [6]:
## perform feature selection with l1 regression
X = df.drop(columns =  ['rating', 'title', 'five_stars'])
y = df['five_stars']

lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
model = SelectFromModel(lsvc, prefit=True)

df_features = model.transform(X)
df_features.shape



(15864, 36)

In [7]:
## create dataframe to be able to extract column names from the feature selection without different sized dataframes
df_hold = df.drop(columns = ['rating', 'title', 'five_stars'])

## determine column names from feature selection 
model = SelectFromModel(lsvc, prefit=True)
features = model.get_support()
features = df_hold.columns[features]
print(features)

Index(['protein', 'fat', 'alcoholic', 'appetizer', 'backyard bbq', 'bake',
       'bon appétit', 'cheese', 'chicken', 'christmas', 'dessert', 'dinner',
       'drink', 'fall', 'freeze/chill', 'gourmet', 'house & garden',
       'kidney friendly', 'low cholesterol', 'no sugar added', 'pasta',
       'peanut free', 'potato', 'quick & easy', 'rice', 'sauce', 'sauté',
       'side', 'soy free', 'spring', 'summer', 'thanksgiving', 'tomato',
       'vegetable', 'winter', 'turkey'],
      dtype='object')


In [8]:
## create new dataframe with only the features from above and check accuracy with info()
df_features = df[features]
df_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15864 entries, 0 to 20051
Data columns (total 36 columns):
protein            15864 non-null float64
fat                15864 non-null float64
alcoholic          15864 non-null int64
appetizer          15864 non-null int64
backyard bbq       15864 non-null int64
bake               15864 non-null int64
bon appétit        15864 non-null int64
cheese             15864 non-null int64
chicken            15864 non-null int64
christmas          15864 non-null int64
dessert            15864 non-null int64
dinner             15864 non-null int64
drink              15864 non-null int64
fall               15864 non-null int64
freeze/chill       15864 non-null int64
gourmet            15864 non-null int64
house & garden     15864 non-null int64
kidney friendly    15864 non-null int64
low cholesterol    15864 non-null int64
no sugar added     15864 non-null int64
pasta              15864 non-null int64
peanut free        15864 non-null int64
potato 

In [9]:
##create correlation matrix and print by absolute value to remove 4 most highly correlated values
corr_matrix = df_features.corr().abs()

corr_abs = (corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
                 .stack()
                 .sort_values(ascending=False))
print(corr_abs)

peanut free      soy free           0.941653
alcoholic        drink              0.851959
protein          fat                0.712194
bon appétit      gourmet            0.697086
no sugar added   peanut free        0.476860
                 soy free           0.470340
                 side               0.424874
dessert          kidney friendly    0.420922
kidney friendly  soy free           0.407175
                 peanut free        0.400350
bake             dessert            0.395056
alcoholic        house & garden     0.392880
fall             thanksgiving       0.371338
drink            house & garden     0.364912
peanut free      side               0.343739
side             soy free           0.343058
thanksgiving     turkey             0.289833
dinner           peanut free        0.289253
dessert          soy free           0.278657
                 peanut free        0.263001
dinner           soy free           0.248051
christmas        winter             0.245813
dessert   

In [16]:
## Remove highly correlated features to get to 30 features for the dataset as described by the instructions
df_features = df_features.drop(columns = ['soy free', 'fat', 'gourmet', 'peanut free', 'soy free', 'side','kidney friendly'])
df_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15864 entries, 0 to 20051
Data columns (total 30 columns):
protein            15864 non-null float64
alcoholic          15864 non-null int64
appetizer          15864 non-null int64
backyard bbq       15864 non-null int64
bake               15864 non-null int64
bon appétit        15864 non-null int64
cheese             15864 non-null int64
chicken            15864 non-null int64
christmas          15864 non-null int64
dessert            15864 non-null int64
dinner             15864 non-null int64
drink              15864 non-null int64
fall               15864 non-null int64
freeze/chill       15864 non-null int64
house & garden     15864 non-null int64
low cholesterol    15864 non-null int64
no sugar added     15864 non-null int64
pasta              15864 non-null int64
potato             15864 non-null int64
quick & easy       15864 non-null int64
rice               15864 non-null int64
sauce              15864 non-null int64
sauté    

In [17]:
svr = SVR()
X = df_features.sample(frac=0.3, replace=True, random_state=1)
y = df.five_stars.sample(frac=0.3, replace=True, random_state=1)
svr.fit(X,y)



SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)

In [18]:
svr.score(X, y)

0.07170692150320701

In [19]:
cross_val_score(svr, X, y, cv=5)



array([0.01533726, 0.00649522, 0.02055309, 0.006601  , 0.00874232])

### The R-Squared value is not good for this model and neither are the cross-validation scores. This is a poor model, possibly because we removed too many features.