In [1]:
# --- Imports -----------------------------------------------------------------
# Data handling and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Text processing (NLTK)
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Machine learning (scikit-learn)
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import resample

# --- NLTK resources ----------------------------------------------------------
# Fetch the tokenizer model and the stop-word corpus (same order as before).
nltk.download('punkt')
nltk.download('stopwords')

# English stop words held as a set, plus one shared Porter stemmer instance.
stop_words_nltk = set(stopwords.words('english'))
stemmer = PorterStemmer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Boss\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Boss\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the training CSV and preview its first rows
DATA_PATH = "data_to_train.csv"  # relative path, resolved against the working directory
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,cuisine,full_ingredients,ingredients_processed
0,African,"['larg', 'red', 'orang', 'pepper', 'deseed', '...",larg red orang pepper deseed cut bites chunk s...
1,African,"['dri', 'blackey', 'bean', 'sirloin', 'steak',...",dri blackey bean sirloin steak cut cube oz veg...
2,African,"['oz', 'oliv', 'oil', 'onion', 'chop', 'garlic...",oz oliv oil onion chop garlic clove crush lamb...
3,African,"['butter', 'greas', 'oz', 'fullfat', 'milk', '...",butter greas oz fullfat milk fresh white bread...
4,African,"['tbsp', 'oliv', 'oil', 'onion', 'thinli', 'sl...",tbsp oliv oil onion thinli slice garlic clove ...


### Split the data into training and test sets (75% train, 25% test)

In [3]:
# Features: the processed ingredient text. Target: the cuisine label.
X = df["ingredients_processed"]
y = df["cuisine"]
print(X.shape, y.shape)

(4724,) (4724,)


In [4]:
# Hold out a test set. test_size=0.25 is train_test_split's default, written out
# here so the 75% / 25% ratio is visible in the code.
# NOTE(review): the split is not stratified; given the class imbalance handled
# below, consider whether stratify=y is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(3543,) (3543,)
(1181,) (1181,)


In [5]:
# Put training features and labels side by side so rows can be filtered by cuisine
X_y_train = pd.concat((X_train, y_train), axis="columns")
X_y_train.shape

(3543, 2)

In [6]:
# Preview the combined training frame (first five rows)
X_y_train.head(n=5)

Unnamed: 0,ingredients_processed,cuisine
1564,stew steak cut larg chunk beef kidney cut larg...,British
3569,whole squid tube remov wing tentacl remov rese...,Italian
177,larg freerang egg tbsp milk tbsp plain flour f...,American
2018,beef short rib bone tbsp neutral oil groundnut...,Chinese
2772,larg freerang egg tbsp light oliv oil tsp cumi...,Indian


In [7]:
# Partition the training rows into British recipes and all remaining cuisines
is_british = X_y_train["cuisine"] == "British"
british_cuisines_df = X_y_train[is_british]
other_cuisines_df = X_y_train[~is_british]

In [9]:
# Preview the non-British rows (first five)
other_cuisines_df.head(n=5)

Unnamed: 0,ingredients_processed,cuisine
3569,whole squid tube remov wing tentacl remov rese...,Italian
177,larg freerang egg tbsp milk tbsp plain flour f...,American
2018,beef short rib bone tbsp neutral oil groundnut...,Chinese
2772,larg freerang egg tbsp light oliv oil tsp cumi...,Indian
3732,wholem penn tbsp veget oil onion fine chop gar...,Italian


In [10]:
# Distinct non-British cuisine labels as a plain Python list
other_cuisines = other_cuisines_df["cuisine"].unique().tolist()
other_cuisines

['Italian',
 'American',
 'Chinese',
 'Indian',
 'Thai and South-East Asian',
 'Mexican',
 'Japanese',
 'Spanish',
 'French',
 'East European',
 'Caribbean',
 'North African',
 'Turkish and Middle Eastern',
 'Irish',
 'African',
 'Portuguese',
 'Nordic',
 'Pakistani',
 'South American',
 'Korean',
 'Greek']

In [21]:
# Number of British training rows; this is the per-cuisine upsampling target
british_cuisines_df.shape[0]

1099

In [14]:
# Upsample every non-British cuisine, drawing with replacement, until each has
# as many rows as the British subset.
target_rows = len(british_cuisines_df)
other_cuisines_upsampled = []

for cuisine in other_cuisines:
    is_cuisine = X_y_train["cuisine"] == cuisine
    other_cuisines_upsampled.append(
        resample(
            X_y_train[is_cuisine],
            replace=True,            # sample with replacement
            n_samples=target_rows,   # match the British row count
            random_state=1,          # fixed seed for reproducible draws
        )
    )

In [18]:
# Stack the per-cuisine samples into a single frame.
# NOTE: this rebinds the name from a list of frames to one DataFrame, so the cell
# fails if re-run without first re-running the upsampling loop that builds the list.
other_cuisines_upsampled = pd.concat(other_cuisines_upsampled, axis="index")
other_cuisines_upsampled.head(n=5)

Unnamed: 0,ingredients_processed,cuisine
3347,aubergin slice lengthway grill oliv oil golden...,Italian
3540,white bread flour tbsp oliv oil pinch fine sal...,Italian
3764,small globe artichok tbsp oliv oil garlic clov...,Italian
3638,cure salt cube lardo garlic clove fine chop dr...,Italian
3436,frozen mix berri caster sugar caster sugar med...,Italian


In [19]:
# Append the upsampled cuisines to the untouched British rows
upsampled = pd.concat((british_cuisines_df, other_cuisines_upsampled), axis="index")

In [20]:
# Rows per cuisine after upsampling
upsampled["cuisine"].value_counts()

Pakistani                     1099
African                       1099
Greek                         1099
Thai and South-East Asian     1099
Mexican                       1099
French                        1099
South American                1099
Turkish and Middle Eastern    1099
Caribbean                     1099
Korean                        1099
Spanish                       1099
British                       1099
Portuguese                    1099
Indian                        1099
North African                 1099
Italian                       1099
Irish                         1099
East European                 1099
Nordic                        1099
Chinese                       1099
American                      1099
Japanese                      1099
Name: cuisine, dtype: int64

In [None]:
from sklearn.utils import resample

# NOTE(review): this cell looks like a pasted reference template for two-class
# upsampling. It reads a `Class` column, which is not among the columns of the
# data previewed after loading (cuisine, full_ingredients, ingredients_processed),
# and it rebinds X, y, X_train, X_test, y_train, y_test and upsampled.
# Adapt it to this dataset or remove it before running the notebook top to bottom.

# Separate input features and target
y = df.Class
X = df.drop('Class', axis=1)

# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=27)

# concatenate our training data back together
X = pd.concat([X_train, y_train], axis=1)

# separate minority and majority classes
not_fraud = X[X.Class==0]
fraud = X[X.Class==1]

# upsample minority
fraud_upsampled = resample(fraud,
                          replace=True, # sample with replacement
                          n_samples=len(not_fraud), # match number in majority class
                          random_state=27) # reproducible results

# combine majority and upsampled minority
upsampled = pd.concat([not_fraud, fraud_upsampled])

# check new class counts
upsampled.Class.value_counts()
# Output text that was pasted along with the snippet (kept as a comment so the
# cell parses; as bare indented lines it raised an IndentationError):
#     1    213245
#     0    213245

In [None]:
# TF-IDF vectorizer, constructed with no arguments (library defaults)
tfidf = TfidfVectorizer()

In [None]:
# Vectorize train and test data: fit the vectorizer on the training text only,
# then apply the already-fitted vectorizer to the test text.
# NOTE(review): this uses X_train, not the `upsampled` frame built earlier in the
# notebook — confirm that is intended.
X_train_transformed = tfidf.fit_transform(X_train)
X_test_transformed = tfidf.transform(X_test)
print(X_train_transformed.shape, X_test_transformed.shape)

In [None]:
# Fit the encoder on the training labels, then map those labels to integer codes
label_encoder = LabelEncoder().fit(y_train)
y_train_transformed = label_encoder.transform(y_train)

In [None]:
# NOTE(review): pasted two-class upsampling snippet. `fraud` and `not_fraud` are
# only assigned in the earlier template cell that reads a `Class` column, and this
# cell rebinds `upsampled`. Adapt it to this dataset or remove it.

# upsample minority
fraud_upsampled = resample(fraud,
                          replace=True, # sample with replacement
                          n_samples=len(not_fraud), # match number in majority class
                          random_state=27) # reproducible results

# combine majority and upsampled minority
upsampled = pd.concat([not_fraud, fraud_upsampled])

# check new class counts
upsampled.Class.value_counts()
# Output text that was pasted along with the snippet (kept as a comment so the
# cell parses; as bare indented lines it raised an IndentationError):
#     1    213245
#     0    213245