In [2]:
import pandas as pd
import numpy as np
#for text pre-processing
import re, string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
#for model-building
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score
# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adeliakhasanova/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
import spacy

In [4]:
from sklearn.svm import SVC

In [5]:
#https://dataverse.cirad.fr/dataset.xhtml?persistentId=doi:10.18167/DVN1/MSLEFC

In [43]:
data = pd.read_csv('preprocessed_data.csv')

In [41]:
data_sum = pd.read_csv('processed_summaries')

In [7]:
data_pos = data[data['relevance']==1]
data_neg = data[data['relevance']==0]
data_pos_split = data_pos.sample(n=len(data_pos)//2)

In [64]:
data_pos_split = data_pos.sample(n=len(data_pos)//2)

In [65]:
data_pos_split

Unnamed: 0.1,Unnamed: 0,clean_text,relevance
949,949,depuis lundi vaccination ouvert tout adulte fr...,1.0
989,989,heure fin live merci avoir suivre rendre demai...,1.0
663,663,plus personne manifester mercredi pays nom lib...,1.0
302,302,avoir quelque jour rentrer scolaire alors mini...,1.0
978,978,désormais inférieur habitants lincidence lépid...,1.0
...,...,...,...
510,510,voilà donnée attendre depuis longtemps parmi p...,1.0
1152,1152,vigueur jusqu novembre pas sanitaire test covi...,1.0
247,247,créneal dose calcul vite faire semaine suivant...,1.0
993,993,si tout bien finir bien élève second terminal ...,1.0


In [49]:
resampled = data_neg.append(data_pos_split, ignore_index = True)

In [18]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn import metrics

In [60]:
def run_model(data, test_size = 0.3):
    """this function splits data into train and test
    vectorizes, calculates tfidf, and fits into MNB"""
    
    df_X, df_y = data['clean_text'], data['relevance']
    x_train, x_test, y_train, y_test = train_test_split(df_X, df_y, test_size=test_size, random_state=0)
    
    tf_idf = Pipeline([('cv',CountVectorizer()), ('tfidf_transformer',TfidfTransformer(smooth_idf=True,use_idf=True))])
    
    x_train_CV  = tf_idf.fit_transform(x_train)
    x_test_CV = tf_idf.transform(x_test)
    
    mnb = MultinomialNB()
    y_train=y_train.astype('int')
    
    mnb.fit(x_train_CV,y_train)
    
    predictions = mnb.predict(x_test_CV)
    
    return predictions, y_test
    
    

In [72]:
def printing_metrics(pred,test):
    print('accuracy:', metrics.accuracy_score(pred,test),'precision:', metrics.precision_score(pred,test), 'recall:', metrics.recall_score(pred,test), 'f1', metrics.f1_score(pred,test))

In [78]:
from sklearn.metrics import confusion_matrix

In [84]:
confusion_matrix(run_model(data_sum)[0], run_model(data_sum)[1])

array([[ 53,  12],
       [ 75, 243]])

In [87]:
printing_metrics(run_model(data_sum)[0], run_model(data_sum)[1])

accuracy: 0.7728459530026109 precision: 0.9529411764705882 recall: 0.7641509433962265 f1 0.8481675392670157


In [68]:
print(classification_report(run_model(data)[0], run_model(data)[1]))

              precision    recall  f1-score   support

           0       0.30      0.90      0.45        42
           1       0.98      0.74      0.84       341

    accuracy                           0.75       383
   macro avg       0.64      0.82      0.64       383
weighted avg       0.91      0.75      0.80       383



In [66]:
print(classification_report(run_model(resampled)[0], run_model(resampled)[1]))

              precision    recall  f1-score   support

           0       0.98      0.67      0.79       199
           1       0.45      0.95      0.61        58

    accuracy                           0.73       257
   macro avg       0.72      0.81      0.70       257
weighted avg       0.86      0.73      0.75       257



In [63]:
print(classification_report(run_model(data_sum)[0], run_model(data_sum)[1]))

              precision    recall  f1-score   support

           0       0.41      0.82      0.55        65
           1       0.95      0.76      0.85       318

    accuracy                           0.77       383
   macro avg       0.68      0.79      0.70       383
weighted avg       0.86      0.77      0.80       383



In [26]:
from sklearn.metrics import confusion_matrix
confusion_matrix(predictions, y_test)

array([[133,  77],
       [  3,  44]])