In [2]:
# Installation des packages nécessaires
!pip install transformers torch scikit-learn pandas numpy matplotlib seaborn nltk spacy imbalanced-learn

# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from transformers import AutoTokenizer, AutoModel, BertTokenizer, BertModel
from torch import nn
import torch
from torch.utils.data import Dataset, DataLoader
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from imblearn.over_sampling import SMOTE
import spacy



In [3]:
# Chargement et prétraitement
class TweetDataset:
    """Load a CSV of tweets, clean the text and integer-encode the labels.

    Parameters
    ----------
    file_path : str or path-like
        Passed unchanged to ``pandas.read_csv``.
    text_column : str, default 'text'
        Name of the column holding the raw tweet text.
    label_column : str, default 'label'
        Name of the column holding the class labels.

    After construction, ``df`` holds the loaded frame with an added
    'clean_text' column and an integer-encoded 'label' column, and
    ``label_encoder`` holds the fitted encoder so predictions can be mapped
    back to the original labels with ``inverse_transform``.
    """

    def __init__(self, file_path, text_column='text', label_column='label'):
        self.text_column = text_column
        self.label_column = label_column
        self.df = pd.read_csv(file_path)
        self.preprocess_data()

    def preprocess_data(self):
        """Add the 'clean_text' column and encode the labels into 'label'.

        Raises
        ------
        KeyError
            If the configured text or label column is absent from the CSV.
        """
        missing = [
            column
            for column in (self.text_column, self.label_column)
            if column not in self.df.columns
        ]
        if missing:
            raise KeyError(
                f"Missing column(s) {missing}; available columns: {list(self.df.columns)}"
            )

        # Text cleaning
        self.df['clean_text'] = self.df[self.text_column].apply(self.clean_text)

        # Label encoding; the encoder is kept so the mapping is not lost.
        self.label_encoder = LabelEncoder()
        self.df['label'] = self.label_encoder.fit_transform(self.df[self.label_column])

    @staticmethod
    def clean_text(text):
        """Normalise one tweet: strip URLs, the RT marker, mentions and '#'.

        Non-string input (e.g. the float NaN pandas uses for missing cells)
        yields an empty string instead of raising.

        Returns the lower-cased text with whitespace runs collapsed to a
        single space and no leading/trailing whitespace.
        """
        if not isinstance(text, str):
            return ''

        # URLs go first so characters inside them ('@', '#') are not touched
        # by the later rules; IGNORECASE because lower-casing happens later.
        text = re.sub(r'https?://\S+', '', text, flags=re.IGNORECASE)
        # The retweet marker is upper-case, so it must be removed BEFORE
        # lower-casing; \b keeps words that merely end in "RT" intact.
        text = re.sub(r'\bRT\s+', '', text)
        text = text.lower()
        # \w includes '_', which is legal in handles.
        text = re.sub(r'@\w+', '', text)
        # Drop only the '#' sign and keep the hashtag word itself.
        text = text.replace('#', '')
        # Collapse newlines and the gaps left by the removals above.
        return re.sub(r'\s+', ' ', text).strip()

    def get_data(self):
        """Return the ``(clean_text, label)`` pair of pandas Series."""
        return self.df['clean_text'], self.df['label']

In [5]:
# Modèles
class BertClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Parameters
    ----------
    model_name : str, default 'bert-base-uncased'
        Checkpoint name or path given to ``BertModel.from_pretrained``.
    num_classes : int, default 2
        Number of output logits.
    dropout : float, default 0.1
        Dropout probability applied to the pooled representation.
    """

    def __init__(self, model_name='bert-base-uncased', num_classes=2, dropout=0.1):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        # Read the width from the loaded checkpoint rather than hard-coding
        # 768, so other BERT sizes work with the same head.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a batch of token ids.

        ``input_ids`` and ``attention_mask`` are forwarded unchanged to the
        BERT encoder. The last dimension of the result has ``num_classes``
        entries.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Second element of the encoder output is used as the pooled sentence
        # representation (the pooler output in Hugging Face BertModel).
        pooled_output = outputs[1]
        return self.linear(self.dropout(pooled_output))

def train_models(X_train, y_train, random_state=None):
    """Fit a logistic regression, a random forest and an SVM on the same data.

    Parameters
    ----------
    X_train : feature matrix accepted by the scikit-learn estimators.
    y_train : target labels aligned with ``X_train``.
    random_state : int or None, default None
        Seed forwarded to every estimator; pass an int to make the fitted
        models reproducible across runs. ``None`` leaves them unseeded.

    Returns
    -------
    tuple
        ``(lr_model, rf_model, svm_model)``, all fitted.
    """
    # Logistic Regression - the default iteration budget frequently stops
    # before convergence on high-dimensional text features.
    lr_model = LogisticRegression(max_iter=1000, random_state=random_state)
    lr_model.fit(X_train, y_train)

    # Random Forest
    rf_model = RandomForestClassifier(random_state=random_state)
    rf_model.fit(X_train, y_train)

    # SVM - probability=True so the model exposes predict_proba.
    svm_model = SVC(probability=True, random_state=random_state)
    svm_model.fit(X_train, y_train)

    return lr_model, rf_model, svm_model

In [6]:
# Évaluation
def evaluate_models(models, X_test, y_test):
    """Compute confusion matrix, ROC data and classification report per model.

    The ROC curve is built from the score of the second class, so this
    function is meant for binary classification.

    Parameters
    ----------
    models : dict
        Maps a display name to a fitted estimator exposing ``predict`` and
        either ``predict_proba`` or ``decision_function``.
    X_test : feature matrix for the held-out set.
    y_test : true labels aligned with ``X_test``.

    Returns
    -------
    dict
        ``{name: {'confusion_matrix': ..., 'roc_data': (fpr, tpr, auc),
        'report': ...}}``.
    """
    results = {}
    for name, model in models.items():
        predictions = model.predict(X_test)

        # Confusion matrix
        cm = confusion_matrix(y_test, predictions)

        # ROC curve: prefer class probabilities, but fall back to the raw
        # decision scores for estimators without predict_proba (e.g. an SVC
        # built without probability=True) instead of crashing.
        if hasattr(model, 'predict_proba'):
            scores = model.predict_proba(X_test)[:, 1]
        else:
            scores = model.decision_function(X_test)
        fpr, tpr, _ = roc_curve(y_test, scores)
        roc_auc = auc(fpr, tpr)

        # Classification report
        report = classification_report(y_test, predictions)

        results[name] = {
            'confusion_matrix': cm,
            'roc_data': (fpr, tpr, roc_auc),
            'report': report
        }

    return results

In [7]:
# Visualisation
def plot_results(results):
    """Plot all ROC curves on one panel and one confusion matrix per model.

    Parameters
    ----------
    results : dict
        Maps a model name to a dict with the keys 'roc_data' (a
        ``(fpr, tpr, roc_auc)`` tuple) and 'confusion_matrix'.
    """
    n_models = len(results)
    # One panel for the ROC curves plus one panel per model. Drawing every
    # heatmap on a single shared axes would stack them (and their colorbars)
    # so that only the last model is readable.
    fig, axes = plt.subplots(
        1, n_models + 1, figsize=(6 * (n_models + 1), 5), squeeze=False
    )

    # ROC curves
    roc_ax = axes[0, 0]
    for name, data in results.items():
        fpr, tpr, roc_auc = data['roc_data']
        roc_ax.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})')

    roc_ax.plot([0, 1], [0, 1], 'k--')  # chance diagonal
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title('ROC Curves')
    if n_models:
        # A legend with no labelled curve only produces a warning.
        roc_ax.legend()

    # Confusion matrices, one axes each
    for ax, (name, data) in zip(axes[0, 1:], results.items()):
        sns.heatmap(data['confusion_matrix'], annot=True, fmt='d', ax=ax)
        ax.set_title(f'Confusion Matrix - {name}')
        ax.set_xlabel('Predicted label')
        ax.set_ylabel('True label')

    fig.tight_layout()
    plt.show()

In [3]:
# Import des bibliothèques nécessaires
import pandas as pd
import numpy as np

# Upload the CSV from the local machine (requires the Google Colab runtime)
from google.colab import files
uploaded = files.upload()

# A cancelled upload dialog leaves nothing to read; fail with a clear message
# instead of an opaque IndexError on the lookup below.
if not uploaded:
    raise FileNotFoundError("No file was uploaded; re-run this cell and select a CSV file.")

# The upload result is keyed by file name; read the first uploaded file
filename = next(iter(uploaded))
df = pd.read_csv(filename)

# Show basic information about the DataFrame
print("Colonnes disponibles :")
print(df.columns)
print("\nPremières lignes :")
print(df.head())

Saving tweets_suspect.csv to tweets_suspect (2).csv
Colonnes disponibles :
Index(['message', 'label'], dtype='object')

Premières lignes :
                                             message  label
0  @switchfoot http://twitpic.com/2y1zl - Awww, t...      1
1  is upset that he can't update his Facebook by ...      1
2  @Kenichan I dived many times for the ball. Man...      1
3    my whole body feels itchy and like its on fire       0
4  @nationwideclass no, it's not behaving at all....      1


In [None]:

# Import manquant pour TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE

# Single seed shared by every stochastic step (split, SMOTE, estimators) so a
# fresh "Restart & Run All" reproduces the same numbers.
SEED = 42

# Features (X) and target (y). Missing messages become empty strings because
# TfidfVectorizer rejects NaN documents.
X = df['message'].fillna('').astype(str)
y = df['label']

# Split the raw text BEFORE vectorizing: fitting TF-IDF on the full corpus
# would leak test-set vocabulary and document frequencies into training.
# Stratifying keeps the class ratio identical in both parts.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

# Text vectorization, fitted on the training part only
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)
# Full corpus in the same feature space, kept for any later cell that still
# expects this name; it is not used for training here.
X_vectorized = vectorizer.transform(X)

# SMOTE to correct the class imbalance - applied to the training part only
smote = SMOTE(random_state=SEED)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Model training
models = {
    'RandomForest': RandomForestClassifier(random_state=SEED),
    # Higher iteration budget: the default often stops before convergence on
    # high-dimensional TF-IDF features.
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=SEED),
    'SVM': SVC(probability=True, random_state=SEED)
}

for name, model in models.items():
    model.fit(X_train_balanced, y_train_balanced)
    predictions = model.predict(X_test)
    print(f"\nRésultats pour {name}:")
    print(classification_report(y_test, predictions))

    # Confusion matrix on its own figure, with labelled axes
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(confusion_matrix(y_test, predictions), annot=True, fmt='d', ax=ax)
    ax.set_title(f'Matrice de confusion - {name}')
    ax.set_xlabel('Classe prédite')
    ax.set_ylabel('Classe réelle')
    plt.show()