In [1]:
# Importation des bibliothèques nécessaires
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Téléchargement des ressources NLTK nécessaires
nltk.download('punkt')
nltk.download('stopwords')

# 1. Chargement des données
df_imdb = pd.read_csv('/Users/ludovicveltz/Documents/Bootcamp_GENAI_2025/Crashcourse/WEEK_5/DAY_2/DATASET/IMDB_Dataset.csv')
df = df_imdb.iloc[:len(df_imdb) // 5]  # Utiliser 20% des données


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ludovicveltz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ludovicveltz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# 2. & 3. Exploration initiale
print("Premières lignes :")
print(df.head())
print("\nDimensions :", df.shape)
print("\nTypes des colonnes :")
print(df.dtypes)

# 4. Vérification des valeurs manquantes
print("\nValeurs manquantes :")
print(df.isnull().sum())
df = df.dropna()

# 5. Affichage des 5 premières reviews
print("\nPremières reviews :")
print(df[['review', 'sentiment']].head())

# 6. Fonction de comptage des mots
def count_words(text):
    return len(text.split())

df['words_count'] = df['review'].apply(count_words)


Premières lignes :
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive

Dimensions : (10000, 2)

Types des colonnes :
review       object
sentiment    object
dtype: object

Valeurs manquantes :
review       0
sentiment    0
dtype: int64

Premières reviews :
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [5]:
# Preprocessing
def simple_preprocessing(text):
    # Conversion en minuscules
    text = text.lower()
    # Suppression des balises HTML
    text = re.sub(r'<br\s*/>', ' ', text)
    # Suppression des URLs
    text = re.sub(r'http\S+|www.\S+', '', text)
    # Suppression des hashtags et @
    text = re.sub(r'[@#]\w+', '', text)
    # Suppression de la ponctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenization et suppression des stop words
    stop_words = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Application du preprocessing
df['processed_review'] = df['review'].apply(simple_preprocessing)

# Vérification des doublons
print("\nNombre de doublons :", df.duplicated().sum())
df = df.drop_duplicates()

# Stemming
def stemming(text):
    ps = PorterStemmer()
    tokens = text.split()
    stemmed = [ps.stem(word) for word in tokens]
    return ' '.join(stemmed)

df['stemmed_review'] = df['processed_review'].apply(stemming)



Nombre de doublons : 17


In [6]:
# Préparation des données
# Binarisation des sentiments
df['sentiment_binary'] = (df['sentiment'] == 'positive').astype(int)

# Vectorisation TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['stemmed_review'])
y = df['sentiment_binary']

# Split des données
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print("\nFormes des données :")
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


Formes des données :
X_train shape: (6988, 49871)
X_test shape: (2995, 49871)
y_train shape: (6988,)
y_test shape: (2995,)


In [7]:
# Modèle de régression logistique
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

# Évaluation
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("\nPrécision du modèle :", accuracy)

print("\nMatrice de confusion :")
print(confusion_matrix(y_test, y_pred))

print("\nRapport de classification :")
print(classification_report(y_test, y_pred))


Précision du modèle : 0.8761268781302171

Matrice de confusion :
[[1269  219]
 [ 152 1355]]

Rapport de classification :
              precision    recall  f1-score   support

           0       0.89      0.85      0.87      1488
           1       0.86      0.90      0.88      1507

    accuracy                           0.88      2995
   macro avg       0.88      0.88      0.88      2995
weighted avg       0.88      0.88      0.88      2995



In [8]:
# Test de prédictions
def predict_sentiment(text):
    # Prétraitement
    processed = simple_preprocessing(text)
    stemmed = stemming(processed)
    # Vectorisation
    vectorized = vectorizer.transform([stemmed])
    # Prédiction
    prediction = model.predict(vectorized)
    return "positive" if prediction[0] == 1 else "negative"

# Test des phrases
test_reviews = [
    "I loved this movie!",
    "This movie was a bad comedy movie!"
]

print("\nPrédictions :")
for review in test_reviews:
    sentiment = predict_sentiment(review)
    print(f"Review: {review}")
    print(f"Sentiment prédit: {sentiment}\n")


Prédictions :
Review: I loved this movie!
Sentiment prédit: positive

Review: This movie was a bad comedy movie!
Sentiment prédit: negative



In [None]:
# BONUS : 
def enhanced_preprocessing(text):
    # Gestion spécifique des expressions courantes dans les critiques de films
    text = text.lower()
    # Remplacer les contractions courantes
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    # Gestion des expressions spécifiques aux films
    text = re.sub(r"(\d+)/10", "rating", text)  # Remplacer les notes /10
    text = re.sub(r"oscar[s]?", "award", text)  # Standardiser les références aux oscars
    # Nettoyage standard
    text = re.sub(r'<br\s*/>', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenization et stopwords
    stop_words = set(stopwords.words('english'))
    # Garder certains mots importants pour les critiques
    important_words = {'not', 'no', 'but', 'very', 'good', 'bad'}
    stop_words = stop_words - important_words
    tokens = nltk.word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# 2. Modifier la fonction improve_imdb_model pour utiliser la fonction externe
def improve_imdb_model():
    vectorizer_improved = TfidfVectorizer(
        preprocessor=enhanced_preprocessing,
        ngram_range=(1, 3),     
        max_features=15000,     
        min_df=3,              
        max_df=0.95           
    )

    # Préparation des données
    X_improved = vectorizer_improved.fit_transform(df['review'])
    
    # Split avec stratification
    X_train_imp, X_test_imp, y_train_imp, y_test_imp = train_test_split(
        X_improved, 
        df['sentiment_binary'],
        test_size=0.3,
        random_state=42,
        stratify=df['sentiment_binary']
    )

    # Modèle amélioré
    model_improved = LogisticRegression(
        C=1.0,
        class_weight='balanced',
        max_iter=200,
        random_state=42
    )
    
    # Entraînement
    model_improved.fit(X_train_imp, y_train_imp)
    
    # Évaluation
    y_pred_imp = model_improved.predict(X_test_imp)
    accuracy_imp = accuracy_score(y_test_imp, y_pred_imp)
    
    return vectorizer_improved, model_improved, accuracy_imp

# 3. Test avec les mêmes critiques
test_reviews_imdb = [
    "This movie was absolutely brilliant, the acting was superb and the story kept me engaged throughout the entire film. 9/10",
    "Despite good intentions, the movie falls flat with poor acting and a confusing plot. The special effects couldn't save it. 3/10",
    "A decent film but nothing spectacular. The performances were okay but the story was predictable. 6/10",
    "One of the worst movies I've ever seen. Complete waste of time and money. The plot made no sense.",
    "While not perfect, the film manages to deliver solid entertainment with strong performances and beautiful cinematography."
]

# 4. Exécution et test
vectorizer_improved, model_improved, accuracy_improved = improve_imdb_model()

print(f"Précision du modèle amélioré : {accuracy_improved}")

print("\nPrédictions avec le modèle amélioré :")
for review in test_reviews_imdb:
    processed = enhanced_preprocessing(review)
    vectorized = vectorizer_improved.transform([processed])
    prediction = model_improved.predict(vectorized)
    sentiment = "positive" if prediction[0] == 1 else "negative"
    
    print(f"\nCritique : {review}")
    print(f"Sentiment prédit : {sentiment}")



Précision du modèle amélioré : 0.8874791318864774

Prédictions avec le modèle amélioré :

Critique : This movie was absolutely brilliant, the acting was superb and the story kept me engaged throughout the entire film. 9/10
Sentiment prédit : positive

Critique : Despite good intentions, the movie falls flat with poor acting and a confusing plot. The special effects couldn't save it. 3/10
Sentiment prédit : negative

Critique : A decent film but nothing spectacular. The performances were okay but the story was predictable. 6/10
Sentiment prédit : negative

Critique : One of the worst movies I've ever seen. Complete waste of time and money. The plot made no sense.
Sentiment prédit : negative

Critique : While not perfect, the film manages to deliver solid entertainment with strong performances and beautiful cinematography.
Sentiment prédit : positive
