## Twitter sentiment analysis with BART Comparaison - Machine Learning models notebook

### Import librairies

In [1]:
import pandas as pd
import numpy as np
import time

import matplotlib.pyplot as plt
import seaborn as sns

# import plotly.express as px
# import plotly.graph_objects as go
# from plotly.subplots import make_subplots

import re
import nltk
import contractions
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from spellchecker import SpellChecker

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix, roc_auc_score

import warnings
warnings.filterwarnings('ignore')

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

# Set style for visualizations
plt.style.use('seaborn-v0_8')
sns.set_palette("deep")

[nltk_data] Downloading package punkt to /Users/photoli93/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/photoli93/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Loading data

In [2]:
file_path = "../data/dataset_twitter_classification.csv"

df = pd.read_csv(file_path)

display(df.sample(5))
print(df.info())
display(df.describe())

Unnamed: 0,is_positive,id,datetime,user,message,bart_is_positive,text_length,word_count,sentence_count,avg_word_length,punctuation_count,capital_letters_count,exclamation_count,question_count,url_count,mention_count,hashtag_count,bart_pred,clean_text
5307,1,1827937592,Sun May 17 11:45:24 PDT 2009,PansyMariee,For my 15th Birthday the one thing i want to d...,0.423072,127,28,1,3.535714,0,2,0,0,0,0,0,0,15th birthday one thing want give money earned...
2221,1,2177052206,Mon Jun 15 05:20:50 PDT 2009,annisatadiyana,#musicmonday a lot of songs for today,0.228707,38,7,1,4.428571,1,0,0,0,0,0,1,0,musicmonday lot songs today
17479,1,2011971974,Tue Jun 02 19:51:46 PDT 2009,Vicki_McGuire,NYC here we come,0.74684,17,4,1,3.25,0,3,0,0,0,0,0,1,nyc come
417,1,1992708729,Mon Jun 01 09:11:41 PDT 2009,zsoczi02,"MTV Movie Awards yesterday: congrats Robert, K...",0.839815,93,13,2,6.153846,7,9,0,0,0,0,0,1,mtv movie awards yesterday congrats robert kri...
5428,1,2063001310,Sun Jun 07 01:01:26 PDT 2009,BrandyWandLover,@ScruffyPanther cool i was never good at lang...,0.602748,81,16,2,4.0625,3,7,1,0,0,1,0,1,scruffypanther cool never good languages got c...


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   is_positive            20000 non-null  int64  
 1   id                     20000 non-null  int64  
 2   datetime               20000 non-null  object 
 3   user                   20000 non-null  object 
 4   message                20000 non-null  object 
 5   bart_is_positive       20000 non-null  float64
 6   text_length            20000 non-null  int64  
 7   word_count             20000 non-null  int64  
 8   sentence_count         20000 non-null  int64  
 9   avg_word_length        20000 non-null  float64
 10  punctuation_count      20000 non-null  int64  
 11  capital_letters_count  20000 non-null  int64  
 12  exclamation_count      20000 non-null  int64  
 13  question_count         20000 non-null  int64  
 14  url_count              20000 non-null  int64  
 15  me

Unnamed: 0,is_positive,id,bart_is_positive,text_length,word_count,sentence_count,avg_word_length,punctuation_count,capital_letters_count,exclamation_count,question_count,url_count,mention_count,hashtag_count,bart_pred
count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
mean,0.5029,1999831000.0,0.308301,74.3218,13.2095,2.35415,4.868268,4.0065,3.26695,0.57375,0.1577,0.0441,0.49395,0.02625,0.3055
std,0.500004,193842600.0,0.353417,36.389079,6.952138,1.232357,1.350713,3.487702,5.251816,1.405119,1.08724,0.210374,0.595215,0.182654,0.46063
min,0.0,1467816000.0,8.9e-05,7.0,1.0,1.0,1.428571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1957074000.0,0.004287,44.0,7.0,1.0,4.090909,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,2002057000.0,0.110693,70.0,12.0,2.0,4.6,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,2177694000.0,0.638163,104.0,19.0,3.0,5.285714,6.0,4.0,1.0,0.0,0.0,1.0,0.0,1.0
max,1.0,2329179000.0,0.998371,222.0,41.0,21.0,45.333333,89.0,105.0,74.0,88.0,4.0,9.0,5.0,1.0


### Data Preprocessing

In [3]:
# Init
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
spell = SpellChecker()

def remove_html_artefacts(text):
    """Remove HTML artefacts amp, lt, gt"""
    # On peut ajouter d'autres artefacts si besoin
    artifacts = ["amp", "lt", "gt"]
    pattern = r'\b(' + '|'.join(artifacts) + r')\b'
    return re.sub(pattern, '', text)

def correct_spelling(tokens):
    """Correct spelling for each token"""
    corrected_tokens = []
    for token in tokens:
        # Skip Twitter mentions and <URL>
        if token.startswith('@') or token == '':
            corrected_tokens.append(token)
        else:
            corrected_tokens.append(spell.correction(token))
    return corrected_tokens

def tokenize_and_process(text, use_stemming=True, remove_stopwords=True):
    """Tokenize text and apply stemming or lemmatization"""
    if not text:
        return ""

    # Remove contractions
    text = contractions.fix(text)
    
    # Remove HTML artefacts
    text = remove_html_artefacts(text)

    # Temporarily protect <URL> and <EMAIL>
    text = text.replace("<URL>", "URLTOKEN").replace("<EMAIL>", "EMAILTOKEN")

    # Tokenize texts
    tokens = word_tokenize(text)

    # Restore <URL> and <EMAIL> in tokens
    tokens = ["<URL>" if t == "URLTOKEN" else ("<EMAIL>" if t == "EMAILTOKEN" else t) for t in tokens]
    
    if remove_stopwords:
        tokens = [token for token in tokens if token not in stop_words]
    
    if use_stemming:
        tokens = [stemmer.stem(token) for token in tokens]
    else:
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return ' '.join(tokens)

def preprocess(texts, use_stemming=True, remove_stopwords=True):
    """Apply full preprocessing pipeline to a list of texts"""
    return [
        tokenize_and_process(text, use_stemming, remove_stopwords)
        for text in texts
    ]

df['processed_text'] = preprocess(df['clean_text'].tolist(), use_stemming=False)

print("\n=== Preprocessing Examples ===")
for i, row in df.sample(5).iterrows():
    print(f"Original: {row['message']}")
    print(f"Processed: {row['processed_text']}")
    print("-" * 50)


=== Preprocessing Examples ===
Original: @AbhorrentAspen I am  Guess I gotta find someone else then.
Processed: abhorrentaspen guess got find someone else
--------------------------------------------------
Original: studing for my last exam for the semester 
Processed: studing last exam semester
--------------------------------------------------
Original: haha i had so much fun last night! eating whip cream at 2 in the morning and watching Invader Zim and having a party by myself haha 
Processed: haha much fun last night eating whip cream 2 morning watching invader zim party haha
--------------------------------------------------
Original: I recommend returning your broken duck toy with the batteis it came with or the person who your giving it to may get mad!!  trust me!
Processed: recommend returning broken duck toy batteis came person giving may get mad trust
--------------------------------------------------
Original: @thekelliejane sadly, this is a grocery store, not a restaurant.

### Feature Engineering and Model Training

In [4]:
# Train_test_split
def prepare_features(df, test_size=0.2, random_state=42):
    """Prepare features and split data"""
    # Remove empty processed texts
    df_clean = df[df['processed_text'].str.len() > 0].copy()
    
    X = df_clean['processed_text']
    y = df_clean['is_positive']
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    
    print(f"Training set size: {len(X_train)}")
    print(f"Test set size: {len(X_test)}")
    print(f"Training set positive ratio: {y_train.mean():.3f}")
    print(f"Test set positive ratio: {y_test.mean():.3f}")
    
    return X_train, X_test, y_train, y_test, df_clean

X_train, X_test, y_train, y_test, df_clean = prepare_features(df)

Training set size: 15999
Test set size: 4000
Training set positive ratio: 0.503
Test set positive ratio: 0.503


In [None]:
# Define candidate models with pipelines to avoid data leakage
pipelines = {
    'Naive Bayes': Pipeline([
        ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1, 2), 
                                    min_df=2, max_df=0.8, stop_words='english')),
        ('clf', MultinomialNB(alpha=0.1))
    ]),

    'Logistic Regression': Pipeline([
        ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1, 2), 
                                  min_df=2, max_df=0.8, stop_words='english')),
        ('clf', LogisticRegression(random_state=42, max_iter=1000))
    ]),
    
    'Random Forest': Pipeline([
        ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1, 2), 
                                  min_df=2, max_df=0.8, stop_words='english')),
        ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
    ])
}

# Cross-validation step
cv_results = {}

for name, pipe in pipelines.items():
    start = time.time()
    scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='f1')
    duration = time.time() - start
    
    cv_results[name] = (scores.mean(), scores.std(), duration)
    print(f"{name}: mean F1 = {scores.mean():.4f} ± {scores.std():.4f} "
          f"(time: {duration:.2f} sec)")

# Train on full training set and evaluate on test set
final_results = {}

for name, pipe in pipelines.items():
    print(f"\nTraining {name} on full training set")
    start = time.time()
    
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    y_proba = pipe.predict_proba(X_test)[:, 1] if hasattr(pipe.named_steps['clf'], "predict_proba") else None
    
    duration = time.time() - start
    print(f"{name} finished in {duration:.2f} sec")
    
    final_results[name] = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
        'roc_auc': roc_auc_score(y_test, y_proba) if y_proba is not None else None,
        'cv_f1_mean': cv_results[name][0],
        'cv_f1_std': cv_results[name][1],
        'cv_time_sec': cv_results[name][2],
        'train_time_sec': duration
    }

# Display results
metrics_df = pd.DataFrame({
    name: {
        'Accuracy': f"{res['accuracy']:.4f}",
        'Precision': f"{res['precision']:.4f}",
        'Recall': f"{res['recall']:.4f}",
        'F1-Score': f"{res['f1']:.4f}",
        'ROC-AUC': f"{res['roc_auc']:.4f}" if res['roc_auc'] else "N/A",
        'CV F1 (μ±σ)': f"{res['cv_f1_mean']:.4f}±{res['cv_f1_std']:.4f}",
        'CV Time (s)': f"{res['cv_time_sec']:.2f}",
        'Train Time (s)': f"{res['train_time_sec']:.2f}"
    }
    for name, res in final_results.items()
}).T

print("\n=== Final Model Performance (with timings) ===")
display(metrics_df)

Naive Bayes
Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_df=0.8, max_features=10000, min_df=2,
                                 ngram_range=(1, 2), stop_words='english')),
                ('clf', MultinomialNB(alpha=0.1))])
Naive Bayes: mean F1 = 0.7059 ± 0.0072 (time: 1.00 sec)
Logistic Regression
Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_df=0.8, max_features=10000, min_df=2,
                                 ngram_range=(1, 2), stop_words='english')),
                ('clf', LogisticRegression(max_iter=1000, random_state=42))])
Logistic Regression: mean F1 = 0.7313 ± 0.0038 (time: 1.07 sec)
Random Forest
Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_df=0.8, max_features=10000, min_df=2,
                                 ngram_range=(1, 2), stop_words='english')),
                ('clf', RandomForestClassifier(random_state=42))])
Random Forest: mean F1 = 0.7124 ± 0.0053 (time: 33.93 sec)

Training Naive Bayes on full training set
N

Unnamed: 0,Accuracy,Precision,Recall,F1-Score,ROC-AUC,CV F1 (μ±σ),CV Time (s),Train Time (s)
Naive Bayes,0.7127,0.7167,0.7092,0.713,0.7755,0.7059±0.0072,1.0,0.27
Logistic Regression,0.7292,0.7265,0.7406,0.7334,0.8047,0.7313±0.0038,1.07,0.31
Random Forest,0.7057,0.7038,0.7167,0.7102,0.7827,0.7124±0.0053,33.93,9.01
