In [9]:
import pandas as pd
import numpy as np
import time
import re
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


#Classification and Metrics
from sklearn.model_selection import GridSearchCV, train_test_split, PredefinedSplit
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import ClassifierChain
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, f1_score

from sklearn import metrics

%load_ext autoreload
%autoreload 2
import warnings
warnings.filterwarnings('ignore')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
clean_reviews = pd.read_csv('/content/drive/MyDrive/modelling/clean_reviews_dataset.csv')

In [11]:
clean_reviews = clean_reviews.dropna(subset=['cleaned_reviews_reduced_words'])
clean_reviews = clean_reviews[clean_reviews['cleaned_reviews_reduced_words'].apply(lambda x: len(str(x)) > 0)]

In [12]:
mlb = MultiLabelBinarizer()
binarised_labels = mlb.fit_transform(clean_reviews['Cuisines'])
clean_reviews['labels'] = binarised_labels.tolist()

In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    clean_reviews['cleaned_reviews_reduced_words'], binarised_labels, test_size=0.2, random_state=42, stratify=binarised_labels)

X_train = np.array([str(x) for x in X_train])

In [14]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42 )

In [16]:
X_train_full = np.concatenate((X_train, X_val), axis = 0)
y_train_full = np.concatenate((y_train, y_val), axis = 0)

split_index = [-1]* len(X_train) + [0] * len(X_val)
predefined_split = PredefinedSplit(test_fold = split_index)

In [8]:
bigram_tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,1), token_pattern=r"(?u)\b\w+\b")
bigram_bow_vectorizer = CountVectorizer(ngram_range=(1,1), token_pattern=r"(?u)\b\w+\b")

In [17]:
scores = ["accuracy", "precision_weighted", "recall_weighted", "f1_weighted"]

#### One Vs Rest

In [20]:
from sklearn.tree import DecisionTreeClassifier

pipeline_dt = Pipeline([
    ('vectorizer', bigram_tfidf_vectorizer),
    ('classifier', OneVsRestClassifier(DecisionTreeClassifier(random_state=42)))])

param_grid_dt = {
    'vectorizer': [bigram_tfidf_vectorizer, bigram_bow_vectorizer],
    'classifier__estimator__max_depth': [None, 10, 20, 30],
    'classifier__estimator__min_samples_split': [2, 5, 10],
    'classifier__estimator__min_samples_leaf': [1, 2, 4]}

grid_search_dt = GridSearchCV(
    estimator=pipeline_dt,
    param_grid=param_grid_dt,
    scoring=scores,
    cv = predefined_split,
    refit="f1_weighted",
    verbose=4)

grid_result_dt = grid_search_dt.fit(X_train_full, y_train_full)


Fitting 1 folds for each of 72 candidates, totalling 72 fits
[CV 1/1] END classifier__estimator__max_depth=None, classifier__estimator__min_samples_leaf=1, classifier__estimator__min_samples_split=2, vectorizer=TfidfVectorizer(token_pattern='(?u)\\b\\w+\\b'); accuracy: (test=0.048) f1_weighted: (test=0.840) precision_weighted: (test=0.841) recall_weighted: (test=0.840) total time= 2.7min
[CV 1/1] END classifier__estimator__max_depth=None, classifier__estimator__min_samples_leaf=1, classifier__estimator__min_samples_split=2, vectorizer=CountVectorizer(token_pattern='(?u)\\b\\w+\\b'); accuracy: (test=0.045) f1_weighted: (test=0.830) precision_weighted: (test=0.837) recall_weighted: (test=0.824) total time= 1.0min
[CV 1/1] END classifier__estimator__max_depth=None, classifier__estimator__min_samples_leaf=1, classifier__estimator__min_samples_split=5, vectorizer=TfidfVectorizer(token_pattern='(?u)\\b\\w+\\b'); accuracy: (test=0.048) f1_weighted: (test=0.839) precision_weighted: (test=0.841

In [21]:
print("Best Parameters (Decision Trees One Vs Rest): ", grid_result_dt.best_params_)

Best Parameters (Decision Trees One Vs Rest):  {'classifier__estimator__max_depth': 30, 'classifier__estimator__min_samples_leaf': 1, 'classifier__estimator__min_samples_split': 10, 'vectorizer': TfidfVectorizer(token_pattern='(?u)\\b\\w+\\b')}


In [22]:
for score in scores:
    mean_score = grid_result_dt.cv_results_[f'mean_test_{score}'][grid_result_dt.best_index_]
    print(f"Decision Tree One Vs Rest: \n{score} = {round(mean_score, 3)}")

Decision Tree One Vs Rest: 
accuracy = 0.102
Decision Tree One Vs Rest: 
precision_weighted = 0.861
Decision Tree One Vs Rest: 
recall_weighted = 0.867
Decision Tree One Vs Rest: 
f1_weighted = 0.851


Classifier Chain

In [23]:
pipeline_dt_cchain = Pipeline([
    ('vectorizer', bigram_tfidf_vectorizer),
    ('classifier', ClassifierChain(DecisionTreeClassifier(random_state=42)))])

param_grid_dt_cchain = {
    'vectorizer': [bigram_tfidf_vectorizer],
    'classifier__order' : [None, 'random'],
    'classifier__base_estimator__max_depth': [30],
    'classifier__base_estimator__min_samples_split': [10, 5],
    'classifier__base_estimator__min_samples_leaf': [1],
    'classifier__base_estimator__class_weight': [None, 'balanced']}

grid_search_dt_cchain = GridSearchCV(
    estimator=pipeline_dt_cchain,
    param_grid=param_grid_dt_cchain,
    scoring=scores,
    refit="f1_weighted",
    verbose=4,
    cv = predefined_split)

grid_result_dt_cchain = grid_search_dt_cchain.fit(X_train_full, y_train_full)


Fitting 1 folds for each of 8 candidates, totalling 8 fits
[CV 1/1] END classifier__base_estimator__class_weight=None, classifier__base_estimator__max_depth=30, classifier__base_estimator__min_samples_leaf=1, classifier__base_estimator__min_samples_split=10, classifier__order=None, vectorizer=TfidfVectorizer(token_pattern='(?u)\\b\\w+\\b'); accuracy: (test=0.219) f1_weighted: (test=0.843) precision_weighted: (test=0.841) recall_weighted: (test=0.858) total time=  18.2s
[CV 1/1] END classifier__base_estimator__class_weight=None, classifier__base_estimator__max_depth=30, classifier__base_estimator__min_samples_leaf=1, classifier__base_estimator__min_samples_split=10, classifier__order=random, vectorizer=TfidfVectorizer(token_pattern='(?u)\\b\\w+\\b'); accuracy: (test=0.260) f1_weighted: (test=0.849) precision_weighted: (test=0.850) recall_weighted: (test=0.863) total time=  16.8s
[CV 1/1] END classifier__base_estimator__class_weight=None, classifier__base_estimator__max_depth=30, classif

In [24]:
print("Best Parameters (Decision Trees Classifier Chain): ", grid_result_dt_cchain.best_params_)

Best Parameters (Decision Trees Classifier Chain):  {'classifier__base_estimator__class_weight': None, 'classifier__base_estimator__max_depth': 30, 'classifier__base_estimator__min_samples_leaf': 1, 'classifier__base_estimator__min_samples_split': 10, 'classifier__order': 'random', 'vectorizer': TfidfVectorizer(token_pattern='(?u)\\b\\w+\\b')}


In [25]:
for score in scores:
    mean_score = grid_result_dt_cchain.cv_results_[f'mean_test_{score}'][grid_result_dt_cchain.best_index_]
    print(f"Decision Tree Classifier Chain: \n{score} = {round(mean_score, 3)}")

Decision Tree Classifier Chain: 
accuracy = 0.26
Decision Tree Classifier Chain: 
precision_weighted = 0.85
Decision Tree Classifier Chain: 
recall_weighted = 0.863
Decision Tree Classifier Chain: 
f1_weighted = 0.849
