In [13]:
import os
import time
import joblib
import mlflow

import numpy as np
import pandas as pd

#from dotenv import load_dotenv
from sklearn.model_selection import train_test_split
from mlflow.models.signature import infer_signature

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

In [14]:
!python -m spacy download fr_core_news_sm -q

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')


In [15]:
# Import spacy and initialize french language
import fr_core_news_sm
nlp = fr_core_news_sm.load()

In [18]:
from spacy.lang.fr.stop_words import STOP_WORDS

df = pd.read_csv("./data_final.csv")

df['plat'] = df['plat'].apply(lambda x : x + ', ')

nlp.Defaults.stop_words.add("facile")
nlp.Defaults.stop_words.add("rapide")
nlp.Defaults.stop_words.add("réussie")
nlp.Defaults.stop_words.add("maman")

df["plat"] = df["plat"].apply(lambda x: x.replace("d'",""))
df["plat"] = df["plat"].apply(lambda x: x.replace("l'",""))
df["recettes"] = df["recettes"].apply(lambda x: x.replace("d'",""))
df["recettes"] = df["recettes"].apply(lambda x: x.replace("l'",""))

df["plat_clean"] = df["plat"].apply(lambda x:''.join(ch for ch in x if ch.isalnum() or ch==" "))
df["plat_clean"] = df["plat_clean"].apply(lambda x: x.replace(" +"," ").lower().strip())
df["plat_clean"] = df["plat_clean"].apply(lambda x: " ".join(token.text for token in nlp(x) if token.text not in STOP_WORDS))

df["recettes_clean"] = df["recettes"].apply(lambda x:''.join(ch for ch in x if ch.isalnum() or ch==" "))
df["recettes_clean"] = df["recettes_clean"].apply(lambda x: x.replace(" +"," ").lower().strip())
df["recettes_clean"] = df["recettes_clean"].apply(lambda x: " ".join(token.text for token in nlp(x) if token.text not in STOP_WORDS))

df.head()

Unnamed: 0,target,plat,recettes,plat_recette,plat_clean,recettes_clean
0,Desserts,"crème anglaise réussie, onctueuse à souhait,","sucre, vanille, jaune oeuf, lait demi-écrémé, ...","crème anglaise réussie, onctueuse à souhait su...",crème anglaise onctueuse souhait,sucre vanille jaune oeuf lait demiécrémé maïzena
1,Desserts,"crème pâtissière,","sucre, vanille, jaune oeuf, lait demi-écrémé, ...","crème pâtissière sucre, vanille, jaune d'oeuf,...",crème pâtissière,sucre vanille jaune oeuf lait demiécrémé maïzena
2,Desserts,"panna cotta,","sucre, vanille, jaune oeuf, lait demi-écrémé, ...","panna cotta sucre, vanille, jaune d'oeuf, lait...",panna cotta,sucre vanille jaune oeuf lait demiécrémé maïzena
3,Desserts,"crème brûlée,","sucre, vanille, jaune oeuf, lait demi-écrémé, ...","crème brûlée sucre, vanille, jaune d'oeuf, lai...",crème brûlée,sucre vanille jaune oeuf lait demiécrémé maïzena
4,Desserts,"riz au lait de ma maman,","sucre, vanille, jaune oeuf, lait demi-écrémé, ...","riz au lait de ma maman sucre, vanille, jaune ...",riz lait,sucre vanille jaune oeuf lait demiécrémé maïzena


In [41]:
X = df[['plat_clean', 'recettes_clean']]
Y = df['target']

In [42]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0, stratify = Y)

In [43]:
X_train

Unnamed: 0,plat_clean,recettes_clean
7622,tarte carrés chocolat,sucre vanille jaune oeuf lait demiécrémé maïzena
15433,soupe crocodiles,sucre vanille jaune oeuf lait demiécrémé maïzena
14960,tarte vanillée brousse poires,sucre vanille jaune oeuf lait demiécrémé maïzena
11849,potage endives fenouil,sucre vanille jaune oeuf lait demiécrémé maïzena
3157,tarte chocolat noisettes,sucre vanille jaune oeuf lait demiécrémé maïzena
...,...,...
8864,bouchées artichaut petit violet foie gras,sucre vanille jaune oeuf lait demiécrémé maïzena
16131,velouté crémeux asperges vertes,sucre vanille jaune oeuf lait demiécrémé maïzena
1018,cake légumes soleil,sucre vanille jaune oeuf lait demiécrémé maïzena
15188,gratin crabe st jacques whisky,sucre vanille jaune oeuf lait demiécrémé maïzena


In [44]:
X_test

Unnamed: 0,plat_clean,recettes_clean
9900,tarte fraises speculoos petits lu,sucre vanille jaune oeuf lait demiécrémé maïzena
2840,tourin lail,sucre vanille jaune oeuf lait demiécrémé maïzena
374,filets saintpierre beurre orange,sucre vanille jaune oeuf lait demiécrémé maïzena
11357,tarte ananas séchés chocolatcaramel,sucre vanille jaune oeuf lait demiécrémé maïzena
18852,escalopes échalotes vin blanc,sucre vanille jaune oeuf lait demiécrémé maïzena
...,...,...
11111,cake chèvre menthe pignons,sucre vanille jaune oeuf lait demiécrémé maïzena
19832,pizza orient express saucisses poivrons curry,sucre vanille jaune oeuf lait demiécrémé maïzena
6033,pâte feuilletée ultra simple petits suisses,sucre vanille jaune oeuf lait demiécrémé maïzena
13571,verrines tomate fromages,sucre vanille jaune oeuf lait demiécrémé maïzena


In [45]:
encoder = LabelEncoder()
oneencoder = OneHotEncoder(drop="first", handle_unknown = "ignore")

X_train = oneencoder.fit_transform(X_train)
Y_train = encoder.fit_transform(Y_train)
X_test = oneencoder.transform(X_test)
Y_test = encoder.transform(Y_test)



# Linear SVM

In [46]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, ConfusionMatrixDisplay, RocCurveDisplay

In [57]:
svc = SVC(kernel = 'poly', probability = True) # set probability to True if you plan to use predict_proba()

svc.fit(X_train, Y_train)

SVC(kernel='poly', probability=True)

In [58]:
Y_train_pred = svc.predict(X_train)
Y_train_proba = svc.predict_proba(X_train)

Y_test_pred = svc.predict(X_test)
Y_test_proba = svc.predict_proba(X_test)

In [59]:
# Print scores
print("accuracy on training set : ", accuracy_score(Y_train, Y_train_pred))
print("accuracy on test set : ", accuracy_score(Y_test, Y_test_pred))
print()

accuracy on training set :  0.9871096267651023
accuracy on test set :  0.407311928755566



# XGBOOST

In [62]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.7.3-py3-none-macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m925.9 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-1.7.3


In [63]:
from xgboost import XGBClassifier

# Perform grid search
print("Grid search...")
xgboost = XGBClassifier()

# Grid of values to be tested
params = {
    'max_depth': [2, 4, 6], # exactly the same role as in scikit-learn
    'min_child_weight': [1, 2, 3], # effect is more or less similar to min_samples_leaf and min_samples_split
    'n_estimators': [2, 4, 6, 8,] # exactly the same role as in scikit-learn
}
print(params)
gridsearch = GridSearchCV(xgboost, param_grid = params, cv = 3) # cv : the number of folds to be used for CV
gridsearch.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch.best_params_)
print("Best validation accuracy : ", gridsearch.best_score_)
print()
print("Accuracy on training set : ", gridsearch.score(X_train, Y_train))
print("Accuracy on test set : ", gridsearch.score(X_test, Y_test))


Grid search...
{'max_depth': [2, 4, 6], 'min_child_weight': [1, 2, 3], 'n_estimators': [2, 4, 6, 8]}
...Done.
Best hyperparameters :  {'max_depth': 2, 'min_child_weight': 1, 'n_estimators': 8}
Best validation accuracy :  0.24878420343352667

Accuracy on training set :  0.24925294427843206
Accuracy on test set :  0.25263651277243965


In [80]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5, weights='distance', algorithm='brute', leaf_size=50, p=1, metric='minkowski', metric_params=None, n_jobs=-1)

knn.fit(X_train, Y_train)

Y_train_pred = knn.predict(X_train)
Y_train_proba = knn.predict_proba(X_train)

Y_test_pred = knn.predict(X_test)
Y_test_proba = knn.predict_proba(X_test)

In [81]:
# Print scores
print("accuracy on training set : ", accuracy_score(Y_train, Y_train_pred))
print("accuracy on test set : ", accuracy_score(Y_test, Y_test_pred))
print()

accuracy on training set :  0.9898048866233081
accuracy on test set :  0.4548863370049215

