In [7]:
import pandas as pd
import re
# import spacy
import numpy as np
import pickle
from pathlib import Path
import sys
import os

In [12]:
# Get the current notebook directory
CURRENT_DIR = Path(os.getcwd()).resolve()

# Automatically find the project root (go up 3 levels)
PROJECT_ROOT = CURRENT_DIR.parents[2]

# Add project root to sys.path
sys.path.append(str(PROJECT_ROOT))

# Function to get relative paths from project root
def get_relative_path(absolute_path):
    return str(Path(absolute_path).relative_to(PROJECT_ROOT))

# Print project root directory
print(f"Project Root Directory: {PROJECT_ROOT.name}")  # Display only the root folder name

import config  # Now Python can find config.py

# Paths to load
# tfidf_path = Path(config.XTRAIN_MATRIX_PATH)
tfidf_path = './notebooks\modeling\text'
labels_path = Path(config.YTRAIN_ENCODED_PATH)

# Print paths being used (relative to project root)
print(f"Using Config File from: {get_relative_path(config.__file__)}")
print(f"Loading TF-IDF matrix from: {get_relative_path(tfidf_path)}")
print(f"Loading encoded labels from: {get_relative_path(labels_path)}")

# Check if files exist before loading
if not tfidf_path.exists():
    raise FileNotFoundError(f"Error: TF-IDF matrix file not found at {get_relative_path(tfidf_path)}")

if not labels_path.exists():
    raise FileNotFoundError(f"Error: Encoded labels file not found at {get_relative_path(labels_path)}")

# Load the TF-IDF matrix
X = pickle.load(open(tfidf_path, "rb"))

# Load the classification labels
y = pd.read_pickle(labels_path)

# Print confirmation
print("Data Successfully Loaded!")
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")


Project Root Directory: Data_Scientist_Rakuten_Project-main
Using Config File from: config.py


ValueError: 'notebooks\\modeling\text' does not start with 'D:\\Data_Science\\Append_Data_Engineer_AWS_MLOPS\\Data_Scientist_Rakuten_Project-main'

In [3]:
X = pickle.load(open("Xtrain_matrix.pkl", 'rb'))

In [4]:
y = pd.read_pickle('ytrain.pkl')

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# LOGISTIC REGRESSION

In [7]:
from sklearn import linear_model

In [8]:
clf_lr = linear_model.LogisticRegression(multi_class='multinomial',class_weight= "balanced", max_iter=1000)

In [9]:
clf_lr.fit(X_train, y_train.values.ravel())

LogisticRegression(class_weight='balanced', max_iter=1000,
                   multi_class='multinomial')

In [11]:
y_pred = clf_lr.predict(X_test)

In [12]:
#  Importer la classe classification_report
from sklearn.metrics import classification_report

# Calcul et affichage de classification_report
print( classification_report(y_test, y_pred) )

              precision    recall  f1-score   support

           0       0.35      0.67      0.46       629
           1       0.66      0.76      0.70       941
           2       0.78      0.84      0.81       377
           3       0.74      0.48      0.59       975
           4       0.67      0.69      0.68       532
           5       0.93      0.88      0.91       998
           6       0.74      0.77      0.76       514
           7       0.83      0.81      0.82      1036
           8       0.58      0.57      0.57       403
           9       0.89      0.90      0.90       851
          10       0.85      0.63      0.73       931
          11       0.74      0.81      0.77       547
          12       0.98      0.93      0.95      2008
          13       0.56      0.65      0.60       144
          14       0.85      0.91      0.88      1052
          15       0.76      0.73      0.74       270
          16       0.90      0.85      0.87       806
          17       0.80    

In [13]:
clf_lr.score(X_test, y_test)

0.7853273669335845

# SVM (SUPPORT VECTOR MACHINE)

In [14]:
from sklearn import svm
from sklearn import model_selection

In [15]:
# Insérez votre code ici

clf_svm = svm.SVC( gamma = 0.01, kernel = 'poly')

In [16]:
clf_svm.fit(X_train, y_train.values.ravel())


SVC(gamma=0.01, kernel='poly')

In [17]:
y_pred = clf_svm.predict(X_test)

In [18]:
# Calcul et affichage de classification_report
print( classification_report(y_test, y_pred) )

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       629
           1       0.00      0.00      0.00       941
           2       0.00      0.00      0.00       377
           3       0.00      0.00      0.00       975
           4       0.00      0.00      0.00       532
           5       0.00      0.00      0.00       998
           6       0.00      0.00      0.00       514
           7       0.00      0.00      0.00      1036
           8       0.00      0.00      0.00       403
           9       0.00      0.00      0.00       851
          10       0.00      0.00      0.00       931
          11       0.00      0.00      0.00       547
          12       0.12      1.00      0.21      2008
          13       0.00      0.00      0.00       144
          14       0.00      0.00      0.00      1052
          15       0.00      0.00      0.00       270
          16       0.00      0.00      0.00       806
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
clf_svm.score(X_test, y_test)

0.11822892133772962

# RANDOM FOREST CLASSIFIER

In [8]:
from sklearn import ensemble

In [11]:
clf_rf = ensemble.RandomForestClassifier(n_jobs = -1, random_state=321)

In [12]:
clf_rf.fit(X_train, y_train.values.ravel())

RandomForestClassifier(n_jobs=-1, random_state=321)

In [14]:
y_pred = clf_rf.predict(X_test)

In [16]:
from sklearn.metrics import classification_report
print( classification_report(y_test, y_pred) )

              precision    recall  f1-score   support

           0       0.47      0.41      0.44       612
           1       0.66      0.76      0.71       958
           2       0.86      0.75      0.80       351
           3       0.69      0.59      0.64      1012
           4       0.44      0.72      0.55       590
           5       0.86      0.89      0.88       990
           6       0.81      0.69      0.75       513
           7       0.73      0.80      0.77      1022
           8       0.65      0.53      0.58       399
           9       0.90      0.90      0.90       887
          10       0.75      0.72      0.74       971
          11       0.74      0.77      0.76       509
          12       0.91      0.99      0.95      2062
          13       0.83      0.53      0.65       153
          14       0.78      0.91      0.84       954
          15       0.74      0.77      0.76       250
          16       0.84      0.85      0.85       794
          17       0.74    

In [17]:
clf_rf.score(X_test, y_test)

0.77296278850683

In [8]:
from sklearn import ensemble
clf_rf = ensemble.RandomForestClassifier(n_jobs = -1, max_features= 'sqrt', min_samples_split= 4)

In [9]:
clf_rf.fit(X_train, y_train.values.ravel())

RandomForestClassifier(max_features='sqrt', min_samples_split=4, n_jobs=-1)

In [10]:
y_pred = clf_rf.predict(X_test)

In [11]:
from sklearn.metrics import classification_report
print( classification_report(y_test, y_pred) )

              precision    recall  f1-score   support

           0       0.54      0.41      0.47       621
           1       0.51      0.81      0.63       995
           2       0.83      0.72      0.77       350
           3       0.69      0.61      0.64       969
           4       0.73      0.65      0.69       547
           5       0.87      0.91      0.89      1005
           6       0.81      0.66      0.73       521
           7       0.73      0.82      0.77       989
           8       0.66      0.54      0.59       410
           9       0.90      0.90      0.90       873
          10       0.75      0.76      0.75       969
          11       0.79      0.81      0.80       550
          12       0.90      0.99      0.94      2013
          13       0.79      0.58      0.67       144
          14       0.80      0.93      0.86       990
          15       0.78      0.79      0.79       296
          16       0.85      0.89      0.87       769
          17       0.72    

In [12]:
clf_rf.score(X_test, y_test)

0.7799105040037683

In [70]:
from sklearn import ensemble
clf_rf = ensemble.RandomForestClassifier(n_jobs = -1, max_features= 'log2', min_samples_split= 27)

In [71]:
clf_rf.fit(X_train, y_train.values.ravel())

RandomForestClassifier(max_features='log2', min_samples_split=27, n_jobs=-1)

In [72]:
y_pred = clf_rf.predict(X_test)

In [73]:
print( classification_report(y_test, y_pred) )

              precision    recall  f1-score   support

           0       0.63      0.44      0.52       621
           1       0.60      0.82      0.69       995
           2       0.85      0.71      0.77       350
           3       0.69      0.60      0.64       969
           4       0.77      0.65      0.70       547
           5       0.89      0.93      0.91      1005
           6       0.83      0.60      0.70       521
           7       0.71      0.83      0.76       989
           8       0.75      0.47      0.58       410
           9       0.90      0.89      0.90       873
          10       0.70      0.78      0.74       969
          11       0.77      0.81      0.79       550
          12       0.82      1.00      0.90      2013
          13       0.83      0.59      0.69       144
          14       0.77      0.94      0.85       990
          15       0.75      0.80      0.77       296
          16       0.84      0.94      0.89       769
          17       0.73    

In [74]:
clf_rf.score(X_test, y_test)

0.7827955723033443

# K NEIGHBORS CLASSIFIER

In [13]:
from sklearn import neighbors

In [14]:
knn = neighbors.KNeighborsClassifier(n_neighbors=2)

In [15]:
knn.fit(X_train, y_train.values.ravel())

KNeighborsClassifier(n_neighbors=2)

In [16]:
y_pred = knn.predict(X_test)

In [17]:
print( classification_report(y_test, y_pred) )

              precision    recall  f1-score   support

           0       0.06      0.99      0.11       621
           1       0.51      0.09      0.16       995
           2       0.70      0.27      0.39       350
           3       0.55      0.21      0.31       969
           4       0.57      0.02      0.04       547
           5       0.98      0.47      0.63      1005
           6       0.85      0.29      0.43       521
           7       0.79      0.32      0.46       989
           8       0.59      0.10      0.17       410
           9       0.93      0.72      0.81       873
          10       0.78      0.18      0.30       969
          11       0.86      0.18      0.30       550
          12       1.00      0.66      0.80      2013
          13       0.90      0.25      0.39       144
          14       0.88      0.34      0.49       990
          15       0.72      0.49      0.58       296
          16       0.92      0.25      0.40       769
          17       0.91    

In [19]:
knn.score(X_test, y_test)

0.355864342910975

In [20]:
parametres = {'n_neighbors': np.arange(2,28)}

In [22]:
from sklearn import model_selection

In [23]:
grid_knn = model_selection.GridSearchCV(estimator = knn, param_grid = parametres )

In [25]:
grille = grid_knn.fit(X_train, y_train.values.ravel())

In [26]:
print(grid_knn.best_params_)

{'n_neighbors': 27}


In [27]:
knn = neighbors.KNeighborsClassifier(n_neighbors=27)

In [29]:
knn.fit(X_train, y_train.values.ravel())

KNeighborsClassifier(n_neighbors=27)

In [30]:
y_pred = knn.predict(X_test)

In [31]:
print( classification_report(y_test, y_pred) )

              precision    recall  f1-score   support

           0       0.56      0.31      0.40       621
           1       0.35      0.84      0.50       995
           2       0.75      0.60      0.67       350
           3       0.71      0.51      0.60       969
           4       0.56      0.22      0.32       547
           5       0.89      0.88      0.88      1005
           6       0.80      0.64      0.71       521
           7       0.76      0.77      0.77       989
           8       0.69      0.34      0.46       410
           9       0.83      0.91      0.87       873
          10       0.64      0.69      0.66       969
          11       0.73      0.62      0.67       550
          12       0.91      0.97      0.94      2013
          13       0.70      0.53      0.60       144
          14       0.78      0.93      0.85       990
          15       0.50      0.81      0.62       296
          16       0.82      0.86      0.84       769
          17       0.76    

In [32]:
knn.score(X_test, y_test)

0.7259185115402732

# DECISION TREE CLASSIFIER

In [33]:
from sklearn.tree import DecisionTreeClassifier

In [34]:
dt_clf = DecisionTreeClassifier(criterion='entropy', max_depth=4, random_state=123)

In [35]:
dt_clf.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=4, random_state=123)

In [36]:
y_pred = dt_clf.predict(X_test)

In [37]:
print( classification_report(y_test, y_pred) )

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       621
           1       0.09      0.99      0.16       995
           2       0.00      0.00      0.00       350
           3       0.13      0.27      0.18       969
           4       0.00      0.00      0.00       547
           5       0.00      0.00      0.00      1005
           6       0.38      0.01      0.01       521
           7       0.00      0.00      0.00       989
           8       0.00      0.00      0.00       410
           9       0.96      0.57      0.71       873
          10       0.47      0.40      0.43       969
          11       0.00      0.00      0.00       550
          12       0.98      0.59      0.74      2013
          13       0.00      0.00      0.00       144
          14       0.00      0.00      0.00       990
          15       0.00      0.00      0.00       296
          16       0.00      0.00      0.00       769
          17       0.64    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [38]:
dt_clf.score(X_test, y_test)

0.22562411681582667

In [39]:
dt_clf_gini = DecisionTreeClassifier(criterion = 'gini', max_depth=4, random_state=321)

In [40]:
dt_clf_gini.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=4, random_state=321)

In [41]:
y_pred = dt_clf_gini.predict(X_test)

In [None]:
#import warnings
#warnings.filterwarnings('ignore')

In [42]:
print( classification_report(y_test, y_pred) )

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       621
           1       0.00      0.00      0.00       995
           2       0.00      0.00      0.00       350
           3       0.51      0.02      0.05       969
           4       0.00      0.00      0.00       547
           5       0.07      0.99      0.13      1005
           6       0.38      0.01      0.01       521
           7       0.00      0.00      0.00       989
           8       0.00      0.00      0.00       410
           9       0.95      0.61      0.74       873
          10       0.00      0.00      0.00       969
          11       0.00      0.00      0.00       550
          12       0.98      0.59      0.74      2013
          13       0.00      0.00      0.00       144
          14       0.88      0.36      0.51       990
          15       0.00      0.00      0.00       296
          16       0.00      0.00      0.00       769
          17       0.52    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [43]:
dt_clf_gini.score(X_test, y_test)

0.20548751766368348

# VOTING CLASSIFIER

In [44]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [45]:
clf1 = KNeighborsClassifier(n_neighbors=27)
clf2 = RandomForestClassifier(n_jobs = -1,max_features= 'sqrt', min_samples_split= 4)
clf3 = LogisticRegression(multi_class='multinomial',class_weight= "balanced", max_iter=1000)
vc = VotingClassifier(estimators=[('knn', clf1), ('rf', clf2), ('lr', clf3)], voting='hard')

In [47]:
vc.fit(X_train, y_train.values.ravel())

VotingClassifier(estimators=[('knn', KNeighborsClassifier(n_neighbors=27)),
                             ('rf',
                              RandomForestClassifier(max_features='sqrt',
                                                     min_samples_split=4,
                                                     n_jobs=-1)),
                             ('lr',
                              LogisticRegression(class_weight='balanced',
                                                 max_iter=1000,
                                                 multi_class='multinomial'))])

In [48]:
y_pred = vc.predict(X_test)

In [50]:
print( classification_report(y_test, y_pred) )

              precision    recall  f1-score   support

           0       0.39      0.56      0.46       621
           1       0.50      0.85      0.63       995
           2       0.81      0.79      0.80       350
           3       0.72      0.57      0.64       969
           4       0.76      0.63      0.68       547
           5       0.91      0.91      0.91      1005
           6       0.78      0.71      0.74       521
           7       0.78      0.80      0.79       989
           8       0.72      0.55      0.62       410
           9       0.89      0.90      0.90       873
          10       0.81      0.71      0.76       969
          11       0.78      0.81      0.79       550
          12       0.95      0.98      0.96      2013
          13       0.74      0.62      0.68       144
          14       0.82      0.94      0.88       990
          15       0.73      0.79      0.76       296
          16       0.89      0.88      0.89       769
          17       0.81    

In [51]:
vc.score(X_test, y_test)

0.7910975035327367

#  XGBOOST CLASSIFIER

In [63]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.5.0-py3-none-win_amd64.whl (106.6 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.5.0


In [64]:
import xgboost

In [65]:
from xgboost import XGBClassifier

xgb_clf = XGBClassifier(use_label_encoder=False)

In [66]:
xgb_clf.fit(X_train, y_train.values.ravel())



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [67]:
y_pred = xgb_clf.predict(X_test)

In [68]:
print( classification_report(y_test, y_pred) )

              precision    recall  f1-score   support

           0       0.30      0.72      0.43       621
           1       0.64      0.71      0.67       995
           2       0.79      0.75      0.77       350
           3       0.71      0.66      0.68       969
           4       0.79      0.60      0.68       547
           5       0.89      0.89      0.89      1005
           6       0.75      0.69      0.72       521
           7       0.82      0.83      0.83       989
           8       0.67      0.56      0.61       410
           9       0.89      0.91      0.90       873
          10       0.78      0.71      0.74       969
          11       0.80      0.75      0.78       550
          12       0.96      0.96      0.96      2013
          13       0.78      0.58      0.67       144
          14       0.88      0.88      0.88       990
          15       0.79      0.75      0.77       296
          16       0.89      0.82      0.85       769
          17       0.79    

In [69]:
xgb_clf.score(X_test, y_test)

0.7826778144135657