# Read data

In [1]:
from google.colab import drive

In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd

df = pd.read_csv("/content/drive/MyDrive/00_ML/ML_Algos/000-STAGE/DATA_OK.csv", sep =";")

In [4]:
df

Unnamed: 0,N_LST,isTrunc
0,abadgarcia,0
1,abbondante,0
2,abdelgha,1
3,abdelsa,1
4,abdulqa,1
...,...,...
72053,thurston,0
72054,draughty,0
72055,henderson,0
72056,espinoza,0


# Data preprocesing

In [5]:
#separate names by space
def separate_string(name):

    name =[" ".join(letter.lower()) for letter in name if letter!=" "]
    return(' '.join(name))


In [6]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

In [7]:
words = df['N_LST']
words

Unnamed: 0,N_LST
0,abadgarcia
1,abbondante
2,abdelgha
3,abdelsa
4,abdulqa
...,...
72053,thurston
72054,draughty
72055,henderson
72056,espinoza


In [8]:
ngram_range= (2,3)
cv = CountVectorizer(analyzer='char', ngram_range=ngram_range, lowercase=False)
X = cv.fit_transform(words)

In [9]:
X.toarray()[:5]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [10]:
df_X = pd.DataFrame(X.toarray(), columns=cv.get_feature_names_out(), index=words)

In [11]:
#X=X.toarray()

In [12]:
y=df["isTrunc"]

# Model Training

In [13]:
from sklearn.model_selection import train_test_split,GridSearchCV
X_train, X_test, y_train , y_test = train_test_split(X,y, test_size=0.2, random_state=12)

In [14]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(57646, 9124) (57646,) (14412, 9124) (14412,)


### 1- Naive Bayes

In [15]:
from sklearn.naive_bayes import MultinomialNB
nb_classifier = MultinomialNB()

In [16]:
from sklearn import metrics

In [17]:
nb_classifier.fit(X_train, y_train)
y_pred = nb_classifier.predict(X_test)
score = metrics.accuracy_score(y_test, y_pred)
print("accuracy: %0.3f" % score)
cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)

accuracy: 0.622
[[5771 2649]
 [2795 3197]]


#### Naive Bayes - Multinomial classifier with hyperparameter

same as MultinomialNB but with alpha, etc...

In [18]:
import numpy as np
nb_classifier_h = MultinomialNB(alpha = 0.1)
previous_score = 0
for alpha in np.arange(0,1,0.05):
  sub_classifier = MultinomialNB(alpha=alpha)
  sub_classifier.fit(X_train, y_train)
  y_pred = sub_classifier.predict(X_test)
  score = metrics.accuracy_score(y_test, y_pred)
  if score > previous_score:
    nb_classifier_h = sub_classifier
    previous_score=score
  print('Alpha:{}, Score:{}'.format(alpha,score))

Alpha:0.0, Score:0.6222592284207604
Alpha:0.05, Score:0.6234388009991674
Alpha:0.1, Score:0.6233000277546489
Alpha:0.15000000000000002, Score:0.623022481265612




Alpha:0.2, Score:0.6235081876214266
Alpha:0.25, Score:0.6246877601998335
Alpha:0.30000000000000004, Score:0.6239938939772411
Alpha:0.35000000000000003, Score:0.6238551207327228
Alpha:0.4, Score:0.623924507354982
Alpha:0.45, Score:0.6237857341104635
Alpha:0.5, Score:0.6231612545101305
Alpha:0.55, Score:0.6232306411323897
Alpha:0.6000000000000001, Score:0.6228143213988343
Alpha:0.65, Score:0.6226061615320566
Alpha:0.7000000000000001, Score:0.6225367749097974
Alpha:0.75, Score:0.6220510685539828
Alpha:0.8, Score:0.6219122953094643
Alpha:0.8500000000000001, Score:0.6224673882875381
Alpha:0.9, Score:0.6224673882875381
Alpha:0.9500000000000001, Score:0.623022481265612


### 2- Passive agressive

In [19]:
from sklearn.linear_model import PassiveAggressiveClassifier
pa_classifier = PassiveAggressiveClassifier()

In [20]:
pa_classifier.fit(X_train, y_train)
y_pred = pa_classifier.predict(X_test)
score = metrics.accuracy_score(y_test, y_pred)
print("Passive Aggressive algo accuracy: %0.3f" % score)
cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)

Passive Aggressive algo accuracy: 0.927
[[7971  449]
 [ 603 5389]]


#### Passive agressive classifier with hyperparameter

same as PassiveAggressiveClassifier but with C (the bigger is C, the more agressive the model is)

In [21]:
import numpy as np
pac = PassiveAggressiveClassifier(random_state=12)

# Définition de la grille de paramètres
param_grid = {'C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]}

# Initialisation de GridSearchCV
grid_search = GridSearchCV(pac, param_grid, cv=5, scoring='accuracy')

# Entraînement du modèle avec recherche par grille
grid_search.fit(X_train, y_train)

# Affichage des meilleurs paramètres
print(f'Best parameters: {grid_search.best_params_}')

# Prédiction sur l'ensemble de test avec les meilleurs paramètres
pa_classifier_h = grid_search.best_estimator_
y_pred = pa_classifier_h.predict(X_test)

# Calcul de l'exactitude
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Best parameters: {'C': 0.01}
Accuracy: 0.94


### 3- Random Forest

In [22]:
from sklearn.ensemble import RandomForestClassifier

In [23]:

rf_classifier = RandomForestClassifier(n_estimators=100,random_state=12)
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)
score = metrics.accuracy_score(y_test, y_pred)
print("Random forest accuracy: %0.3f" % score)
cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)

Random forest accuracy: 0.789
[[5890 2530]
 [ 510 5482]]


#### Random Forest classifier with hyperparameter

same as RandomForestClassifier but with C (the bigger is C, the more agressive the model is)

In [24]:
# Initialisation du classifieur Random Forest
# rf_classifier_h = RandomForestClassifier(random_state=12)

# # Définition de la grille de paramètres
# param_grid = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'bootstrap': [True, False]
# }

# # Initialisation de GridSearchCV
# grid_search = GridSearchCV(estimator=rf_classifier_h, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

# # Entraînement du modèle avec recherche par grille
# grid_search.fit(X_train, y_train)

# # Affichage des meilleurs paramètres
# print(f'Best parameters: {grid_search.best_params_}')

# # Utilisation des meilleurs paramètres pour prédire sur l'ensemble de test
# rf_classifier_h = grid_search.best_estimator_
# y_pred = rf_classifier_h.predict(X_test)

# # Évaluation de la performance
# accuracy = metrics.accuracy_score(y_test, y_pred)
# print(f'Accuracy: {accuracy:.2f}')

In [25]:
print(2)

2
