In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the training and test recipe sets from their JSON files.
train, test = (pd.read_json(path) for path in ('train.json', 'test.json'))

# Distinct cuisine labels found in the training data.
class_names = train['cuisine'].unique().tolist()
class_names

['greek',
 'southern_us',
 'filipino',
 'indian',
 'jamaican',
 'spanish',
 'italian',
 'mexican',
 'chinese',
 'british',
 'thai',
 'vietnamese',
 'cajun_creole',
 'brazilian',
 'french',
 'japanese',
 'irish',
 'korean',
 'moroccan',
 'russian']

In [3]:
# Join each recipe's ingredient list into one string and collapse any run of
# whitespace into a single space.
train_texts = train['ingredients'].apply(
    lambda ings: ' '.join(' '.join(ings).split())
)
train_texts[0]

# Encode each cuisine as its position in class_names.
train_labels = train['cuisine'].apply(lambda name: class_names.index(name))
train_labels[100]

6

In [4]:
# Add the number of listed ingredients as a column on both frames.
for frame in (train, test):
    frame['num_ings'] = frame['ingredients'].apply(len)

In [5]:
# Preview the first rows of the training frame, now including num_ings.
train.head()

Unnamed: 0,cuisine,id,ingredients,num_ings
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...",9
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g...",11
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...",12
3,indian,22213,"[water, vegetable oil, wheat, salt]",4
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe...",20


In [6]:
# Number of training rows at this point in the notebook.
len(train)

39774

In [7]:
# Keep only recipes that list more than two ingredients.
# The explicit .copy() detaches the filtered frame from the original, so the
# column assignments made in later cells (train['ingredients'] = ...,
# train['x'] = ...) write to a real frame instead of a slice and do not raise
# SettingWithCopyWarning.
train = train[train['num_ings'] > 2].copy()
len(train)

39559

In [8]:
from nltk.stem import WordNetLemmatizer

In [9]:
def get_replacements():
    """Return the substring -> replacement table used to normalise ingredient text.

    The first entries fix a spelling, strip hyphens and expand 'sauc'; the
    remaining entries fuse selected multi-word ingredients into a single token
    by removing their spaces. Entries are returned in a fixed insertion order.

    NOTE(review): 'sauc' is itself a substring of 'sauce', so a caller that
    applies this table with plain str.replace will also rewrite 'sauce'.
    """
    replacements = {'wasabe': 'wasabi', '-': '', 'sauc': 'sauce'}

    fused_phrases = (
        'baby spinach', 'coconut cream', 'coriander seeds', 'corn tortillas',
        'cream cheese', 'fish sauce', 'purple onion', 'refried beans',
        'rice cakes', 'rice syrup', 'sour cream', 'toasted sesame seeds',
        'toasted sesame oil', 'yellow onion',
    )
    for phrase in fused_phrases:
        replacements[phrase] = phrase.replace(' ', '')

    return replacements

In [10]:
# Shared helpers for the text-cleaning step: the replacement table and the
# WordNet lemmatizer instance.
replacements = get_replacements()
lemmatizer = WordNetLemmatizer()

In [11]:
# Lower-case every ingredient string in the training recipes.
train['ingredients'] = train['ingredients'].apply(
    lambda ings: [ing.lower() for ing in ings]
)
train.head()

Unnamed: 0,cuisine,id,ingredients,num_ings
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...",9
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g...",11
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...",12
3,indian,22213,"[water, vegetable oil, wheat, salt]",4
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe...",20


In [12]:
# Lower-case every ingredient string in the test recipes (same step as for train).
test['ingredients'] = test['ingredients'].apply(
    lambda ings: [ing.lower() for ing in ings]
)

In [13]:
import re
# Matches a single digit, a right single quotation mark (’) or a percent sign.
# Written as a raw string so that `\d` reaches the regex engine untouched and
# Python does not warn about an invalid string escape sequence.
stop_pattern = re.compile(r'[\d’%]')

In [14]:
def _replace_preserving_existing(text, key, value):
    """Replace `key` with `value` in `text` without re-expanding existing `value`s."""
    # Plain substring replacement is only unsafe when the key occurs inside its
    # own replacement: 'sauc' -> 'sauce' would turn every 'sauce' into 'saucee'.
    if not key or key not in value:
        return text.replace(key, value)

    offset = value.index(key)
    head, tail = value[:offset], value[offset + len(key):]

    def expand(match):
        # An occurrence already surrounded by the rest of `value` is left alone.
        already_expanded = (
            text.endswith(head, 0, match.start())
            and text.startswith(tail, match.end())
        )
        return key if already_expanded else value

    return re.sub(re.escape(key), expand, text)


def tranform_to_single_string(ingredients, lemmatizer, replacements, stop_pattern):
    """Collapse a recipe's ingredient list into one cleaned, lemmatized string.

    Args:
        ingredients: iterable of ingredient strings.
        lemmatizer: object exposing ``lemmatize(word)``; applied to every kept word.
        replacements: mapping of substring -> replacement, applied in iteration
            order to the space-joined ingredient text.
        stop_pattern: compiled regex; a word is dropped when
            ``stop_pattern.match(word)`` succeeds.

    Returns:
        The kept, lemmatized words joined by single spaces ('' if none remain).

    Words of two characters or fewer are dropped as well.
    """
    ingredients_text = ' '.join(ingredients)

    for key, value in replacements.items():
        ingredients_text = _replace_preserving_existing(ingredients_text, key, value)

    # NOTE(review): `match` only tests the start of each word, so a word with a
    # digit/’/% in the middle is kept — confirm whether `search` was intended.
    words = [
        lemmatizer.lemmatize(word)
        for word in ingredients_text.split()
        if len(word) > 2 and not stop_pattern.match(word)
    ]
    return ' '.join(words)

In [15]:
def transform(ingredients):
    """Clean one ingredient list using the notebook-level lemmatizer,
    replacements table and stop_pattern."""
    return tranform_to_single_string(ingredients, lemmatizer, replacements, stop_pattern)


# Store the cleaned text for every training recipe in column 'x'.
train['x'] = train['ingredients'].apply(transform)
train.head()

Unnamed: 0,cuisine,id,ingredients,num_ings,x
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...",9,romaine lettuce black olive grape tomato garli...
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g...",11,plain flour ground pepper salt tomato ground b...
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...",12,egg pepper salt mayonaise cooking oil green ch...
3,indian,22213,"[water, vegetable oil, wheat, salt]",4,water vegetable oil wheat salt
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe...",20,black pepper shallot cornflour cayenne pepper ...


In [16]:
# Store the cleaned text for every test recipe in column 'x'.
test['x'] = [transform(ings) for ings in test['ingredients']]

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer

In [18]:
# Two-step feature pipeline: TF-IDF with sublinear term frequency, followed by
# a cast of the resulting matrix to float.
tfidf_step = TfidfVectorizer(sublinear_tf=True)
to_float_step = FunctionTransformer(lambda matrix: matrix.astype('float'), validate=False)
vectorizer = make_pipeline(tfidf_step, to_float_step)

In [19]:
# train['x'] = train['ingredients'].apply(lambda x:  ' '.join(x))
# test['x'] = test['ingredients'].apply(lambda x: ' '.join(x))
# train.head()

In [20]:
# Number of whitespace-separated tokens in the first cleaned training string.
len(train['x'].values[0].split())

15

In [21]:
# Fit the pipeline on the training text, then reuse the fitted pipeline to
# vectorize the test text.
x_train = vectorizer.fit_transform(train['x'].values)
x_test = vectorizer.transform(test['x'].values)

# Sort the training matrix's indices in place; x_test is left as returned.
x_train.sort_indices()
x_train

<39559x2877 sparse matrix of type '<class 'numpy.float64'>'
	with 735206 stored elements in Compressed Sparse Row format>

In [22]:
# Print the stored entries of the first row of the training matrix.
print(x_train[0])

  (0, 164)	0.2136853139081756
  (0, 225)	0.1470338025940124
  (0, 491)	0.15557274400877014
  (0, 704)	0.34993740346216723
  (0, 909)	0.31820801786364306
  (0, 1031)	0.4063283725642091
  (0, 1036)	0.11001351466168802
  (0, 1113)	0.3579355070161128
  (0, 1457)	0.27871073086742515
  (0, 1791)	0.1399970657392681
  (0, 1911)	0.10425384282532606
  (0, 2078)	0.25027231290252794
  (0, 2193)	0.3586823384903479
  (0, 2295)	0.2407497729524336
  (0, 2649)	0.14860442635641782


In [23]:
def get_estimator():
    """Build a fresh RBF-kernel SVC with this notebook's fixed hyper-parameters."""
    svc_params = dict(
        C=300,
        kernel='rbf',
        gamma=1.5,
        shrinking=True,
        tol=0.001,
        cache_size=1000,
        class_weight=None,
        max_iter=-1,
        decision_function_shape='ovr',
        random_state=42,
    )
    return SVC(**svc_params)

In [24]:
# Base SVC instance; it is wrapped in OneVsRestClassifier in the next cell.
estimator = get_estimator()

In [25]:
# Wrap the base SVC in a one-vs-rest classifier (n_jobs=-1) and fit it on the
# vectorized training text against the cuisine labels.
classifier = OneVsRestClassifier(estimator=estimator, n_jobs=-1)
y_train = train['cuisine'].values
classifier.fit(x_train, y_train)

OneVsRestClassifier(estimator=SVC(C=300, cache_size=1000, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1.5, kernel='rbf',
  max_iter=-1, probability=False, random_state=42, shrinking=True,
  tol=0.001, verbose=False),
          n_jobs=-1)

In [40]:
# Predict a cuisine for every test recipe and write the id/cuisine pairs to
# the submission file, without the index column.
test['cuisine'] = classifier.predict(x_test)
test.to_csv('submission.csv', columns=['id', 'cuisine'], index=False)