In [31]:
import re
import string
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [32]:
# Chargement des données
data = pd.read_csv('tweets_suspect.csv')

In [33]:
print(data.head())

                                             message  label
0  @switchfoot http://twitpic.com/2y1zl - Awww, t...      1
1  is upset that he can't update his Facebook by ...      1
2  @Kenichan I dived many times for the ball. Man...      1
3    my whole body feels itchy and like its on fire       0
4  @nationwideclass no, it's not behaving at all....      1


In [34]:
print(data.columns)

Index(['message', 'label'], dtype='object')


In [35]:
print(data.isnull().sum())


message    0
label      0
dtype: int64


In [36]:
# Afficher les dimensions du DataFrame
print(data.shape)  # Renvoie (3, 2), soit 3 lignes et 2 colonnes

(60000, 2)


In [37]:
# Fonction de nettoyage des tweets
def clean_tweet(tweet):
    tweet = tweet.lower()  # Minuscule
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)  # Supprimer les liens
    tweet = re.sub(r'\@w+|\#','', tweet)  # Supprimer les @ et #
    tweet = re.sub(r'[^\w\s]', '', tweet)  # Supprimer la ponctuation
    return tweet

data['cleaned_tweet'] = data['message'].apply(clean_tweet)

In [38]:
# Séparation des features et labels
X = data['cleaned_tweet']
y = data['label']  # 0 pour non suspect, 1 pour suspect
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorisation TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Modèle de régression logistique
model = LogisticRegression(class_weight='balanced')
model.fit(X_train_tfidf, y_train)

# Prédictions
y_pred = model.predict(X_test_tfidf)

# Évaluation
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.80      0.78      1222
           1       0.98      0.97      0.97     10778

    accuracy                           0.96     12000
   macro avg       0.87      0.89      0.88     12000
weighted avg       0.96      0.96      0.96     12000



In [41]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Matrice de confusion
conf_matrix = confusion_matrix(y_test, y_pred)
print("Matrice de confusion :\n", conf_matrix)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy :", accuracy)

Matrice de confusion :
 [[  978   244]
 [  294 10484]]
Accuracy : 0.9551666666666667
