In [32]:
from nltk.corpus import stopwords
import re
import pandas as pd
df = pd.read_csv('cuisine_data.csv')
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score

In [33]:
df.head()

Unnamed: 0,cuisine_description,cuisine
0,romaine lettuce black olives grape tomatoes ga...,greek
1,plain flour ground pepper salt tomatoes ground...,southern_us
2,eggs pepper salt mayonaise cooking oil green c...,filipino
3,water vegetable oil wheat salt,indian
4,black pepper shallots cornflour cayenne pepper...,indian


In [34]:
# Number of distinct cuisine labels (i.e. target classes).
df['cuisine'].nunique()

20

In [35]:
df.cuisine.unique()

array(['greek', 'southern_us', 'filipino', 'indian', 'jamaican',
       'spanish', 'italian', 'mexican', 'chinese', 'british', 'thai',
       'vietnamese', 'cajun_creole', 'brazilian', 'french', 'japanese',
       'irish', 'korean', 'moroccan', 'russian'], dtype=object)

# Checking for NULLs (none are found, so nothing needs to be removed)

In [36]:
df.isnull().sum()

cuisine_description    0
cuisine                0
dtype: int64

# Dropping duplicates

In [37]:
df.shape

(39774, 2)

In [38]:
# Keep only the first occurrence of each fully identical row.
df = df.drop_duplicates()

In [39]:
df.shape

(39677, 2)

# Preprocessing the text

In [40]:
# Total number of single-space-separated tokens across all descriptions.
print(sum(len(description.split(' ')) for description in df['cuisine_description']))

806112


In [41]:
# Fetch the NLTK 'stopwords' corpus; `stopwords.words('english')` is read from
# it when STOPWORDS is built in a later cell.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [42]:
# Raw strings keep the regex escapes (\[ \] \|) away from Python's own string
# escape processing; the non-raw form triggers an invalid-escape warning on
# recent Python versions while compiling to the very same pattern.

# Punctuation that separates words: each match is replaced by a space.
special_character_remover = re.compile(r'[/(){}\[\]\|@,;]')
# Anything that is not a digit, lowercase ASCII letter, space, '#', '+' or '_':
# each match is deleted outright.
extra_symbol_remover = re.compile(r'[^0-9a-z #+_]')
# English stopword list as a set; clean_text checks every word with
# `not in STOPWORDS`.
STOPWORDS = set(stopwords.words('english'))

In [43]:
def clean_text(text):
    """Normalise one description string before vectorisation.

    Steps, in order: lowercase the text, replace every character matched by
    ``special_character_remover`` with a space, delete every character matched
    by ``extra_symbol_remover``, then drop the whitespace-separated words that
    appear in ``STOPWORDS``.

    Parameters
    ----------
    text : str
        Raw description text.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    lowered = text.lower()
    spaced = special_character_remover.sub(' ', lowered)
    filtered = extra_symbol_remover.sub('', spaced)
    kept_words = [token for token in filtered.split() if token not in STOPWORDS]
    return ' '.join(kept_words)

# Overwrite the description column with its cleaned version.
df['cuisine_description'] = df['cuisine_description'].apply(clean_text)

In [44]:
# Total number of single-space-separated tokens left after cleaning.
print(sum(len(description.split(' ')) for description in df['cuisine_description']))

803337


# Train Test split

In [45]:
from sklearn.model_selection import train_test_split

# Features are the cleaned description strings; the target is the cuisine label.
X = df['cuisine_description']
y = df['cuisine']

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [46]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((27773,), (11904,), (27773,), (11904,))

# Applying Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

# Token counts -> TF-IDF weights -> logistic regression, all with default settings.
lr = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression()),
])

# Fit on the training split, then predict labels for the held-out descriptions.
y_pred1 = lr.fit(X_train, y_train).predict(X_test)



In [48]:
# Share of held-out descriptions whose cuisine the logistic regression got right.
accuracy_score(y_true=y_test, y_pred=y_pred1)

0.7819220430107527

# Applying Naive Bayes Classifier

In [51]:
from sklearn.naive_bayes import MultinomialNB

# Same token-count -> TF-IDF front end as the logistic-regression pipeline,
# with a multinomial Naive Bayes classifier on top (default settings).
naivebayes = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
naivebayes.fit(X_train, y_train)

y_pred = naivebayes.predict(X_test)

# accuracy_score is declared as (y_true, y_pred): pass the ground truth first,
# matching the logistic-regression evaluation. Accuracy is symmetric, so the
# reported number does not change.
print(f'accuracy {accuracy_score(y_test, y_pred)}')


accuracy 0.6682627688172043
