In [23]:
import pandas as pd
from sklearn.metrics import confusion_matrix
import evaluate
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, log_loss, roc_curve

# Bundle accuracy, F1, precision and recall into one evaluator so a single
# compute() call reports all four.
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])


def compute_metrics(eval_pred):
    """Score model outputs with the combined classification metrics.

    Args:
        eval_pred: Two-item iterable unpacked as ``(predictions, labels)``.
            ``predictions`` is reduced with ``np.argmax(..., axis=1)``, so it
            is presumably an array of per-class scores with one row per
            sample -- TODO confirm against the caller.

    Returns:
        The result of ``clf_metrics.compute`` for the arg-max class ids
        compared against ``labels``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return clf_metrics.compute(predictions=predicted_classes, references=references)


# Load the entity table, rename the 'chef' and 'texte' columns to 'label' and
# 'text', and store the label as an integer.
df = (
    pd.read_csv('entities.csv', sep=',')
    .rename(columns={'chef': 'label', 'texte': 'text'})
    .assign(label=lambda frame: frame.label.astype('int'))
)
df

Unnamed: 0,text,label
0,Breton Cyrille menuisier 25 Garçon française,0
1,Ferazzi Auguste vitrier 30 Garçon Piémontaise,1
2,Machol Pierre vitrier 24 Garçon Piémontaise,1
3,Desbois Alexandre prop re 48 Homme marié franç...,1
4,Vignat Zélie prop re sa fe 30 française,0
...,...,...
25075,Chameton-Dideron Marie chef 1869 idem Pailharès,1
25076,Ode Marie ouv chaus res chef Cara 1863 idem St...,1
25077,Berni Nello manoeuvre chef Baretto 1886 italie...,1
25078,Berni-Laureti Annunziata épouse 1887 idem idem,0


In [10]:
# Hold out 20% of the rows for testing. The split is stratified on the label
# column and uses a fixed seed so it is repeatable.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    stratify=df.label,
    random_state=42,
)

In [11]:
# Fit the token counter and the TF-IDF weighting on the training text only,
# then reuse the fitted transformers on the test text.
count_vec = CountVectorizer()
tfidf = TfidfTransformer()

X_train = tfidf.fit_transform(count_vec.fit_transform(df_train.text))
X_test = tfidf.transform(count_vec.transform(df_test.text))

y_train, y_test = df_train.label, df_test.label

# Sanity check: print the train/test shapes of the features, then of the labels.
for train_part, test_part in ((X_train, X_test), (y_train, y_test)):
    print(train_part.shape, test_part.shape)

(20064, 13249) (5016, 13249)
(20064,) (5016,)


In [22]:


# Fit a logistic-regression classifier on the training features.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Take column 1 of predict_proba and predict positive when it exceeds 0.5.
positive_proba = clf.predict_proba(X_test)[:, 1]
y_pred = positive_proba > 0.5

# Report each hold-out metric on its own line.
for caption, scorer in (
    ("Accuracy: ", accuracy_score),
    ("F1: ", f1_score),
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
):
    print(caption, scorer(y_test, y_pred))

Accuracy:  0.9224481658692185
F1:  0.8334047109207708
Precision:  0.8061309030654515
Recall:  0.8625886524822695


In [13]:
# Most important features: rank vocabulary terms by their logistic-regression
# coefficient (most negative first, most positive last).

feature_names = count_vec.get_feature_names_out()
coefs = clf.coef_[0]

# Compute the ordering once and keep it under its own name, so `coefs` always
# means coefficient values and is never rebound to hold indices.
sorted_idx = np.argsort(coefs)
print(coefs[sorted_idx])

n_top = 10  # number of terms to show at each end of the ranking
print("Most negative features: ", [feature_names[i] for i in sorted_idx[:n_top]])
print("Most positive features: ", [feature_names[i] for i in sorted_idx[-n_top:]])

[-8.75691522 -8.20396608 -7.00227289 ...  3.20553941  3.71475742
 13.82410982]
Most negative features:  ['fils', 'femme', 'fille', 'épouse', 'domestique', 'idem', 'sa', 'garçon', 'mère', 'ans']
Most positive features:  ['journalière', 'rentier', 'vve', 'propriétaire', 'veuve', 'homme', 'veuf', 'marié', 'ch', 'chef']


In [8]:
def _evaluate_fold(train_texts, train_labels, test_texts, test_labels):
    """Fit the count -> TF-IDF -> logistic-regression chain on one fold and score it.

    All fitted objects stay local to this function, so running the
    cross-validation does not overwrite the notebook-level `count_vec`,
    `tfidf`, `clf`, `X_train`, `X_test`, `y_train`, `y_test` or `y_pred`
    used by the hold-out cells.

    Args:
        train_texts: Texts used to fit the vectoriser, TF-IDF weights and classifier.
        train_labels: Labels aligned with `train_texts`.
        test_texts: Texts the fitted chain is evaluated on.
        test_labels: Labels aligned with `test_texts`.

    Returns:
        dict mapping "Accuracy", "F1", "Precision", "Recall" and "BCE Loss"
        to the corresponding score on the test part of the fold.
    """
    fold_count_vec = CountVectorizer()
    fold_tfidf = TfidfTransformer()
    # Fit on the training part only; the test part is transformed with the fitted objects.
    fold_X_train = fold_tfidf.fit_transform(fold_count_vec.fit_transform(train_texts))
    fold_X_test = fold_tfidf.transform(fold_count_vec.transform(test_texts))

    fold_clf = LogisticRegression(max_iter=1000)
    fold_clf.fit(fold_X_train, train_labels)
    fold_pred = fold_clf.predict(fold_X_test)

    # Each metric is computed exactly once and reused for printing and storage.
    return {
        "Accuracy": accuracy_score(test_labels, fold_pred),
        "F1": f1_score(test_labels, fold_pred),
        "Precision": precision_score(test_labels, fold_pred),
        "Recall": recall_score(test_labels, fold_pred),
        "BCE Loss": log_loss(test_labels, fold_clf.predict_proba(fold_X_test)),
    }


skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

metric_columns = ["Accuracy", "F1", "Precision", "Recall", "BCE Loss"]
fold_scores = []
for i, (train_index, test_index) in enumerate(skf.split(df.text, df.label)):
    # skf.split yields positional indices, so select rows with .iloc; plain
    # [] is label-based and only matches while df keeps a default RangeIndex.
    scores = _evaluate_fold(
        df.text.iloc[train_index],
        df.label.iloc[train_index],
        df.text.iloc[test_index],
        df.label.iloc[test_index],
    )

    print(f"Fold {i+1}")
    for metric_name in metric_columns:
        print(f"{metric_name}: ", scores[metric_name])

    fold_scores.append(scores)

# One row per fold, built in a single step instead of growing the frame row by row.
res_df = pd.DataFrame(fold_scores, columns=metric_columns)

# Row 0: mean over folds; row 1: sample standard deviation over folds.
final_res_df = res_df.agg(["mean", "std"]).reset_index(drop=True)

    







Fold 1
Accuracy:  0.9200558213716108
F1:  0.8308730493462674
Precision:  0.7924376508447305
Recall:  0.87322695035461
BCE Loss:  0.21013681113622015
Fold 2
Accuracy:  0.9254385964912281
F1:  0.8378143972246314
Precision:  0.8200339558573854
Recall:  0.8563829787234043
BCE Loss:  0.2041974451619266
Fold 3
Accuracy:  0.9234449760765551
F1:  0.8372881355932204
Precision:  0.801948051948052
Recall:  0.875886524822695
BCE Loss:  0.20168107751471848
Fold 4
Accuracy:  0.9250398724082934
F1:  0.8413502109704641
Precision:  0.8027375201288245
Recall:  0.8838652482269503
BCE Loss:  0.19819007301492453
Fold 5
Accuracy:  0.92603668261563
F1:  0.8411134903640257
Precision:  0.8135874067937034
Recall:  0.8705673758865248
BCE Loss:  0.20741880969734167


In [9]:
# Cross-validation summary: row 0 is the mean of each metric over the folds,
# row 1 is the standard deviation over the folds.
final_res_df

Unnamed: 0,Accuracy,F1,Precision,Recall,BCE Loss
0,0.924003,0.837688,0.806149,0.871986,0.204325
1,0.002406,0.004236,0.010787,0.010042,0.00469
