<a href="https://colab.research.google.com/github/nandhukumar86/mykagglesubmissions/blob/master/whats_cooking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# install kaggle
! pip install -q kaggle

# upload file to Colab
from google.colab import files
files.upload()

# create kaggle folder in root and copy the file.
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/

# provide permissions
! chmod 600 ~/.kaggle/kaggle.json

# download competition files by providing competition name
! kaggle competitions download -c 'whats-cooking'

# unzip the files and folders
#! unzip train.csv.zip -d train
#! unzip test.csv.zip -d test


Saving kaggle.json to kaggle.json
Downloading sample_submission.csv.zip to /content
  0% 0.00/25.8k [00:00<?, ?B/s]
100% 25.8k/25.8k [00:00<00:00, 23.9MB/s]
Downloading train.json.zip to /content
  0% 0.00/1.76M [00:00<?, ?B/s]
100% 1.76M/1.76M [00:00<00:00, 120MB/s]
Downloading test.json.zip to /content
  0% 0.00/426k [00:00<?, ?B/s]
100% 426k/426k [00:00<00:00, 60.2MB/s]


In [2]:
! unzip /content/train.json.zip -d train
! unzip /content/test.json.zip -d test

Archive:  /content/train.json.zip
  inflating: train/train.json        
   creating: train/__MACOSX/
  inflating: train/__MACOSX/._train.json  
Archive:  /content/test.json.zip
  inflating: test/test.json          
   creating: test/__MACOSX/
  inflating: test/__MACOSX/._test.json  


In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(color_codes=True)
%matplotlib inline


In [4]:
test_raw = pd.read_json('/content/test/test.json')
train_raw = pd.read_json('/content/train/train.json')

In [8]:
train_raw.columns

Index(['id', 'cuisine', 'ingredients'], dtype='object')

In [9]:
train_raw.dtypes

id              int64
cuisine        object
ingredients    object
dtype: object

In [10]:
train_raw.head()

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."


In [11]:
test_raw.head()

Unnamed: 0,id,ingredients
0,18009,"[baking powder, eggs, all-purpose flour, raisi..."
1,28583,"[sugar, egg yolks, corn starch, cream of tarta..."
2,41580,"[sausage links, fennel bulb, fronds, olive oil..."
3,29752,"[meat cuts, file powder, smoked sausage, okra,..."
4,35687,"[ground black pepper, salt, sausage casings, l..."


In [12]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

x = mlb.fit_transform(train_raw.ingredients)

In [13]:
mlb.classes_

array(['(    oz.) tomato sauce', '(   oz.) tomato paste',
       '(10 oz.) frozen chopped spinach', ..., 'ziti', 'zucchini',
       'zucchini blossoms'], dtype=object)

In [14]:
x

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [15]:
train_df_counts = pd.DataFrame(x, columns = mlb.classes_)

In [16]:
train_df_counts.head()

Unnamed: 0,( oz.) tomato sauce,( oz.) tomato paste,(10 oz.) frozen chopped spinach,"(10 oz.) frozen chopped spinach, thawed and squeezed dry",(14 oz.) sweetened condensed milk,(14.5 oz.) diced tomatoes,(15 oz.) refried beans,1% low-fat buttermilk,1% low-fat chocolate milk,1% low-fat cottage cheese,1% low-fat milk,"2 1/2 to 3 lb. chicken, cut into serving pieces",2% low fat cheddar chees,2% low-fat cottage cheese,2% lowfat greek yogurt,2% milk shredded mozzarella cheese,2% reduced-fat milk,25% less sodium chicken broth,33% less sodium cooked deli ham,33% less sodium cooked ham,33% less sodium ham,33% less sodium smoked fully cooked ham,40% less sodium taco seasoning,40% less sodium taco seasoning mix,7 Up,"8 ounc ziti pasta, cook and drain",95% lean ground beef,A Taste of Thai Rice Noodles,Accent Seasoning,Adobo All Purpose Seasoning,Alaskan king crab legs,Alexia Waffle Fries,Alfredo sauce,Amarena cherries,Amaretti Cookies,American cheese,Anaheim chile,Angostura bitters,Argo Corn Starch,Asian chili sauce,...,yellow mustard,yellow mustard seeds,yellow onion,yellow peas,yellow peppers,yellow rice,yellow rock sugar,yellow split peas,yellow squash,yellow summer squash,yellow tomato,yellowfin,yellowfin tuna,yellowtail,yellowtail snapper fillets,yoghurt,yoghurt natural low fat,yogurt cheese,yogurt dressing,yogurt low fat,yolk,yoplait,young coconut meat,young leeks,young nettle,yu choy,yuca,yucca,yucca root,yukon gold,yukon gold potatoes,yuzu,yuzu juice,za'atar,zest,zesty italian dressing,zinfandel,ziti,zucchini,zucchini blossoms
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [17]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

x = mlb.fit_transform(test_raw.ingredients)

test_df_counts = pd.DataFrame(x, columns = mlb.classes_)

In [18]:
test_df_counts.head()

Unnamed: 0,( oz.) tomato sauce,(14.5 oz.) diced tomatoes,1% low-fat buttermilk,1% low-fat cottage cheese,1% low-fat milk,2% low-fat cottage cheese,2% lowfat greek yogurt,2% reduced fat chocolate milk,2% reduced-fat milk,33% less sodium cooked deli ham,33% less sodium smoked ham,40% less sodium taco seasoning,50% less sodium black beans,7 Up,"8 ounc ziti pasta, cook and drain",95% lean ground beef,Alfredo sauce,Amaretti Cookies,American cheese,Anaheim chile,Angostura bitters,Asian chili sauce,Asian herb,Baileys Irish Cream Liqueur,Belgian endive,Bengali 5 Spice,Bertolli Tomato & Basil Sauce,Bertolli® Alfredo Sauce,Bertolli® Classico Olive Oil,Best Foods Mayonnaise Dressing with Extra Virgin Olive Oil,Biryani Masala,Bisquick Original All-Purpose Baking Mix,Boston lettuce,Boursin,Boursin Cheese with Garlic and Herbs,Bragg Liquid Aminos,Bramley apples,Burgundy wine,CURRY GUY Smoked Garam Masala,CURRY GUY Smoked Spicy Salt,...,yellow chives,yellow corn,yellow corn meal,yellow crookneck squash,yellow curry paste,yellow food coloring,yellow hominy,yellow lentils,yellow miso,yellow mustard,yellow mustard seeds,yellow onion,yellow peas,yellow peppers,yellow rice,yellow rice seasoning mix,yellow rock sugar,yellow split peas,yellow squash,yellow summer squash,yellow tomato,yellowfin,yellowtail snapper fillets,yodel,yoghurt,yoghurt natural low fat,yolk,young ginger,yuca,yucca root,yukon gold,yukon gold potatoes,yuzu,za'atar,zabaglione,zest,zesty italian dressing,zinfandel,ziti,zucchini
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [19]:
train_raw.cuisine.unique()

array(['greek', 'southern_us', 'filipino', 'indian', 'jamaican',
       'spanish', 'italian', 'mexican', 'chinese', 'british', 'thai',
       'vietnamese', 'cajun_creole', 'brazilian', 'french', 'japanese',
       'irish', 'korean', 'moroccan', 'russian'], dtype=object)

In [20]:
train_df_counts.columns

Index(['(    oz.) tomato sauce', '(   oz.) tomato paste',
       '(10 oz.) frozen chopped spinach',
       '(10 oz.) frozen chopped spinach, thawed and squeezed dry',
       '(14 oz.) sweetened condensed milk', '(14.5 oz.) diced tomatoes',
       '(15 oz.) refried beans', '1% low-fat buttermilk',
       '1% low-fat chocolate milk', '1% low-fat cottage cheese',
       ...
       'yukon gold potatoes', 'yuzu', 'yuzu juice', 'za'atar', 'zest',
       'zesty italian dressing', 'zinfandel', 'ziti', 'zucchini',
       'zucchini blossoms'],
      dtype='object', length=6714)

In [21]:
train_count_columns = train_df_counts.columns
test_count_columns = test_df_counts.columns

In [22]:
train_count_columns.difference(test_count_columns)

Index(['(   oz.) tomato paste', '(10 oz.) frozen chopped spinach',
       '(10 oz.) frozen chopped spinach, thawed and squeezed dry',
       '(14 oz.) sweetened condensed milk', '(15 oz.) refried beans',
       '1% low-fat chocolate milk',
       '2 1/2 to 3 lb. chicken, cut into serving pieces',
       '2% low fat cheddar chees', '2% milk shredded mozzarella cheese',
       '25% less sodium chicken broth',
       ...
       'yogurt dressing', 'yogurt low fat', 'yoplait', 'young coconut meat',
       'young leeks', 'young nettle', 'yu choy', 'yucca', 'yuzu juice',
       'zucchini blossoms'],
      dtype='object', length=2653)

In [23]:
test_count_columns.difference(train_count_columns)

Index(['2% reduced fat chocolate milk', '33% less sodium smoked ham',
       '50% less sodium black beans', 'Asian herb',
       'Best Foods Mayonnaise Dressing with Extra Virgin Olive Oil',
       'Boursin Cheese with Garlic and Herbs', 'Bramley apples',
       'CURRY GUY Smoked Tandoori Masala', 'Chobani Yogurt',
       'Crisco Canola Oil',
       ...
       'whole wheat cheese tortellini', 'whole wheat spiral pasta',
       'whole wheat white flour', 'whole wheat wraps',
       'wish-bone deluxe french dressing', 'yellow chile',
       'yellow rice seasoning mix', 'yodel', 'young ginger', 'zabaglione'],
      dtype='object', length=423)

In [24]:
from sklearn.decomposition import PCA

train_reduced_array = PCA(n_components=10).fit_transform(train_df_counts)


In [78]:
train_reduced_array

array([[-0.15916899, -0.40712664,  0.22080654, ..., -0.07560989,
         0.08219195, -0.07722984],
       [ 0.4207815 ,  0.44485887, -0.06356569, ...,  0.61650523,
         0.16643922,  0.10205237],
       [ 0.5139314 ,  0.59124356,  0.18177693, ..., -0.2996745 ,
         1.01835043,  0.02870106],
       ...,
       [ 0.23111211,  1.22996961,  0.02243507, ..., -0.43786601,
         0.31894883, -0.25959422],
       [-0.82701717,  0.4823334 ,  0.55927213, ..., -0.35617708,
        -0.28982169, -0.13194003],
       [ 0.91986681, -0.56477255,  0.36049554, ...,  0.58266065,
        -0.22124315, -0.31705663]])

In [122]:
train_reduced_array.shape

(39774, 10)

In [79]:
from sklearn.decomposition import PCA

test_reduced_array = PCA(n_components=10).fit_transform(test_df_counts)

In [80]:
X_train = pd.DataFrame(train_reduced_array)
X_test = pd.DataFrame(test_reduced_array)


In [81]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.159169,-0.407127,0.220807,-0.649031,0.359827,-0.290182,-0.28773,-0.07561,0.082192,-0.07723
1,0.420781,0.444859,-0.063566,-0.141696,-0.145737,-0.38748,0.403184,0.616505,0.166439,0.102052
2,0.513931,0.591244,0.181777,-0.367889,0.153662,-0.480926,-0.238117,-0.299674,1.01835,0.028701
3,0.290998,0.377953,0.400536,0.554406,-0.031876,-0.155687,-0.455152,0.470705,-0.026971,0.34451
4,1.0628,0.084225,0.51852,0.221203,-1.038499,0.108941,-0.561349,0.172433,-0.149872,0.111853


In [37]:
from sklearn.preprocessing import LabelEncoder

df1 = pd.DataFrame(train_raw.cuisine.copy(deep=True),columns = ['cuisine'])
le = LabelEncoder()
# Categorical to Continuous variables conversion 
df2 = df1.apply(le.fit_transform)


In [38]:
df2

Unnamed: 0,cuisine
0,6
1,16
2,4
3,7
4,7
...,...
39769,8
39770,9
39771,8
39772,3


In [39]:
y_train = df2.cuisine

In [101]:
mapping_temp = df2
mapping_temp

Unnamed: 0,cuisine
0,6
1,16
2,4
3,7
4,7
...,...
39769,8
39770,9
39771,8
39772,3


In [102]:
mapping_temp['cuisine_actual'] = train_raw.cuisine

In [107]:
mapping_temp.drop_duplicates(inplace=True)

In [108]:
mapping_temp

Unnamed: 0,cuisine,cuisine_actual
0,6,greek
1,16,southern_us
2,4,filipino
3,7,indian
5,10,jamaican
6,17,spanish
7,9,italian
8,13,mexican
11,3,chinese
16,1,british


In [109]:
mapping = mapping_temp

In [110]:
mapping

Unnamed: 0,cuisine,cuisine_actual
0,6,greek
1,16,southern_us
2,4,filipino
3,7,indian
5,10,jamaican
6,17,spanish
7,9,italian
8,13,mexican
11,3,chinese
16,1,british


In [113]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(criterion = 'gini', random_state=1)

model.fit(X_train, y_train)

y_test_predict = model.predict(X_test)
y_train_predict = model.predict(X_train)


In [115]:
print('Training Score: ', model.score(X_train, y_train))

# from sklearn.metrics import accuracy_score
# print('Accuracy: ',accuracy_score(y_test, y_test_predict))


Training Score:  0.9996480112636396


In [120]:
X_train.shape

(39774, 10)

In [124]:
y_test_predict.shape

(9944,)

In [125]:
y_train_predict

array([ 6, 16,  4, ...,  8,  3, 13])

In [128]:
y_test_predict = le.inverse_transform(y_test_predict)

In [129]:
y_test_predict.size

9944

In [None]:
id_column = test_raw.id

In [133]:
# df = pd.concat([pd.DataFrame(id_column),pd.DataFrame(y_test_predict)], axis=1, ignore_index=True)
# df.columns = ['id','cuisine']
# df.to_csv('Submission.csv', index=False)


In [27]:
train_df_counts.head()

Unnamed: 0,( oz.) tomato sauce,( oz.) tomato paste,(10 oz.) frozen chopped spinach,"(10 oz.) frozen chopped spinach, thawed and squeezed dry",(14 oz.) sweetened condensed milk,(14.5 oz.) diced tomatoes,(15 oz.) refried beans,1% low-fat buttermilk,1% low-fat chocolate milk,1% low-fat cottage cheese,1% low-fat milk,"2 1/2 to 3 lb. chicken, cut into serving pieces",2% low fat cheddar chees,2% low-fat cottage cheese,2% lowfat greek yogurt,2% milk shredded mozzarella cheese,2% reduced-fat milk,25% less sodium chicken broth,33% less sodium cooked deli ham,33% less sodium cooked ham,33% less sodium ham,33% less sodium smoked fully cooked ham,40% less sodium taco seasoning,40% less sodium taco seasoning mix,7 Up,"8 ounc ziti pasta, cook and drain",95% lean ground beef,A Taste of Thai Rice Noodles,Accent Seasoning,Adobo All Purpose Seasoning,Alaskan king crab legs,Alexia Waffle Fries,Alfredo sauce,Amarena cherries,Amaretti Cookies,American cheese,Anaheim chile,Angostura bitters,Argo Corn Starch,Asian chili sauce,...,yellow mustard,yellow mustard seeds,yellow onion,yellow peas,yellow peppers,yellow rice,yellow rock sugar,yellow split peas,yellow squash,yellow summer squash,yellow tomato,yellowfin,yellowfin tuna,yellowtail,yellowtail snapper fillets,yoghurt,yoghurt natural low fat,yogurt cheese,yogurt dressing,yogurt low fat,yolk,yoplait,young coconut meat,young leeks,young nettle,yu choy,yuca,yucca,yucca root,yukon gold,yukon gold potatoes,yuzu,yuzu juice,za'atar,zest,zesty italian dressing,zinfandel,ziti,zucchini,zucchini blossoms
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [28]:
test_df_counts.head()

Unnamed: 0,( oz.) tomato sauce,(14.5 oz.) diced tomatoes,1% low-fat buttermilk,1% low-fat cottage cheese,1% low-fat milk,2% low-fat cottage cheese,2% lowfat greek yogurt,2% reduced fat chocolate milk,2% reduced-fat milk,33% less sodium cooked deli ham,33% less sodium smoked ham,40% less sodium taco seasoning,50% less sodium black beans,7 Up,"8 ounc ziti pasta, cook and drain",95% lean ground beef,Alfredo sauce,Amaretti Cookies,American cheese,Anaheim chile,Angostura bitters,Asian chili sauce,Asian herb,Baileys Irish Cream Liqueur,Belgian endive,Bengali 5 Spice,Bertolli Tomato & Basil Sauce,Bertolli® Alfredo Sauce,Bertolli® Classico Olive Oil,Best Foods Mayonnaise Dressing with Extra Virgin Olive Oil,Biryani Masala,Bisquick Original All-Purpose Baking Mix,Boston lettuce,Boursin,Boursin Cheese with Garlic and Herbs,Bragg Liquid Aminos,Bramley apples,Burgundy wine,CURRY GUY Smoked Garam Masala,CURRY GUY Smoked Spicy Salt,...,yellow chives,yellow corn,yellow corn meal,yellow crookneck squash,yellow curry paste,yellow food coloring,yellow hominy,yellow lentils,yellow miso,yellow mustard,yellow mustard seeds,yellow onion,yellow peas,yellow peppers,yellow rice,yellow rice seasoning mix,yellow rock sugar,yellow split peas,yellow squash,yellow summer squash,yellow tomato,yellowfin,yellowtail snapper fillets,yodel,yoghurt,yoghurt natural low fat,yolk,young ginger,yuca,yucca root,yukon gold,yukon gold potatoes,yuzu,za'atar,zabaglione,zest,zesty italian dressing,zinfandel,ziti,zucchini
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [35]:
test_df_counts.sum().sort_values(ascending=False)

# df.sort_values(['col1','col2'], ascending=[1, 1])

salt                     4485
onions                   2036
olive oil                1917
water                    1836
garlic                   1791
                         ... 
dinner rolls                1
disco empanada frozen       1
distilled vinegar           1
doenzang                    1
shark fillets               1
Length: 4484, dtype: int64

In [40]:
y_train

0         6
1        16
2         4
3         7
4         7
         ..
39769     8
39770     9
39771     8
39772     3
39773    13
Name: cuisine, Length: 39774, dtype: int64

In [43]:
from sklearn.tree import ExtraTreeClassifier

model = ExtraTreeClassifier()

model.fit(train_df_counts, y_train)


ExtraTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                    max_depth=None, max_features='auto', max_leaf_nodes=None,
                    min_impurity_decrease=0.0, min_impurity_split=None,
                    min_samples_leaf=1, min_samples_split=2,
                    min_weight_fraction_leaf=0.0, random_state=None,
                    splitter='random')

In [49]:
model.feature_importances_

array([2.21816552e-07, 5.22552901e-05, 2.90238427e-06, ...,
       5.05084027e-05, 1.20902581e-03, 2.60792389e-06])

ValueError: ignored