In [5]:
# basic imports
import os
import re
import matplotlib.pyplot as plt
# display matplotlib graphics in notebook
%matplotlib inline 
import seaborn as sns
import numpy as np
import pandas as pd

from wordcloud import WordCloud
import matplotlib.pyplot as plt


# Silence warnings for the whole session.
# NOTE(review): with no category/module argument this filter ignores every
# warning, not only library noise - consider narrowing it if warnings from the
# notebook's own code should stay visible.
import warnings
warnings.filterwarnings("ignore")

# Configure logging: INFO and above, rendered as "<time> <LEVEL>:<message>".
# NOTE(review): '%I' is the 12-hour clock and the format emits no AM/PM marker,
# so morning and afternoon timestamps look the same.
import logging
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.INFO, datefmt='%I:%M:%S')
# Logger used by the loading code in the cells below.
logger = logging.getLogger(__name__)

In [13]:
import sys
import os

# Append the absolute path of the working directory's parent to the module
# search path.
# NOTE(review): presumably so that modules living one level up can be imported;
# no such import is visible in this part of the notebook - confirm.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
# NOTE(review): base_path is not referenced by the visible cells, which read
# their data through 'aclImdb/...' paths relative to the working directory -
# confirm whether it is still needed.
base_path = ".."

In [6]:
#Expected rate for each word in the vocabulary

def load_embeddings(rates_path='aclImdb/imdbEr.txt', vocab_path='aclImdb/imdb.vocab'):
    """
    Load the per-word expected rates together with the matching vocabulary.

    Args:
        rates_path (str): Text file of numeric values, parsed with
            ``np.genfromtxt``. Defaults to the path used by this notebook.
        vocab_path (str): UTF-8 text file holding one word per line.
            Defaults to the path used by this notebook.

    Returns:
        tuple: ``(rates, words)`` where ``rates`` is a NumPy array with at
        least one dimension and ``words`` is a list of whitespace-stripped
        lines. Both are guaranteed to have the same length.

    Raises:
        ValueError: If the number of words differs from the number of rates.
    """
    # Load rates
    logger.info("Loading embedding vectors...")
    # genfromtxt collapses a single-value file into a 0-d array, which would
    # make the rates.shape[0] lookups below fail; atleast_1d keeps the result
    # indexable and leaves arrays that already have a dimension untouched.
    rates = np.atleast_1d(np.genfromtxt(rates_path))
    logger.info(f"Loaded vectors with shape: {rates.shape}")

    # Load associated words
    logger.info("Loading words...")
    with open(vocab_path, 'r', encoding='utf-8') as f:
        # Iterate the handle directly instead of materialising readlines().
        words = [line.strip() for line in f]
    logger.info(f"Loaded {len(words)} words")

    # Callers pair words and rates by position, so the sizes must agree.
    if len(words) != rates.shape[0]:
        raise ValueError(f"Mismatch between number of words ({len(words)}) and vectors ({rates.shape[0]})")

    logger.info("Verification complete - sizes match!")

    return rates, words

# Load both artefacts once; the cells below use `rates` and `words`.
rates, words = load_embeddings()

# Print the summary in one go: each entry of the list is one output line.
summary_lines = [
    "\nDataset information:",
    f"Number of words (vocab): {len(words)}",
    f"Number of vectors (rates): {rates.shape[0]}",
    f"\nFirst few words and their expected rates {list(zip(words[:5], rates[:5]))}",
]
print("\n".join(summary_lines))
del summary_lines  # keep the notebook namespace as it was


05:39:22 INFO:Loading embedding vectors...
05:39:23 INFO:Loaded vectors with shape: (89527,)
05:39:23 INFO:Loading words...
05:39:23 INFO:Loaded 89527 words
05:39:23 INFO:Verification complete - sizes match!



Dataset information:
Number of words (vocab): 89527
Number of vectors (rates): 89527

First few words and their expected rates [('the', np.float64(0.0490972013402)), ('and', np.float64(0.201363575849)), ('a', np.float64(0.0333946807184)), ('of', np.float64(0.099837669572)), ('to', np.float64(-0.0790210365788))]


In [7]:
# Words whose expected rate is close to zero (absolute value strictly below the
# threshold) are treated as stop words.
# NOTE(review): the comment here used to say 0.25 while the code applied 0.7;
# the recorded run (63286 stop words) was produced with 0.7, so that value is
# kept - confirm which threshold is actually intended.
STOPWORD_RATE_THRESHOLD = 0.7
stopwords = [word for word, rate in zip(words, rates) if np.abs(rate) < STOPWORD_RATE_THRESHOLD]
print(f"Nombre de stop words détectés : {len(stopwords)}")

# Save the stop words, one per line
with open("aclImdb/stop_word_rate.txt", "w", encoding="utf-8") as f:
    f.writelines(word + "\n" for word in stopwords)

print("Le fichier stop_word_rate.txt a été créé avec succès.")

Nombre de stop words détectés : 63286
Le fichier stop_word_rate.txt a été créé avec succès.


In [9]:
def clean_review(text):
    """
    Replace HTML line-break tags in a review with a single space.

    Matches ``<br>``, ``<br/>`` and ``<br />`` (any amount of whitespace before
    the optional slash). Matching is case-sensitive, so ``<BR>`` is left as is.

    Args:
        text (str): Raw review text.
    Returns:
        str: The text with each matched tag replaced by one space.
    """
    line_break_tag = r'<br\s*/?>'
    return re.sub(line_break_tag, ' ', text)

def process_movie_reviews(directory_path: str) -> pd.DataFrame:
    """
    Read the review files of a directory and gather them into one DataFrame.

    Only entries whose name ends with ``.txt`` and splits into exactly two
    pieces on ``_`` are read; every other entry is skipped. The first piece is
    parsed as the integer id, the second piece (up to its first ``.``) as the
    integer rate. The file content is stripped and passed through
    ``clean_review``.

    Args:
        directory_path (str): Path to the directory holding the text files.
    Returns:
        pd.DataFrame: Columns ``id``, ``rate`` and ``comment``, sorted by
        ``id`` (the index is not reset after sorting).
    """
    columns = {'id': [], 'rate': [], 'comment': []}

    for entry in os.listdir(directory_path):
        if not entry.endswith('.txt'):
            continue

        # The id and the rate are both encoded in the file name.
        pieces = entry.split('_')
        if len(pieces) != 2:
            continue

        id_part, rate_part = pieces
        review_id = int(id_part)
        review_rate = int(rate_part.split('.')[0])

        with open(os.path.join(directory_path, entry), 'r', encoding='utf-8') as handle:
            review_text = clean_review(handle.read().strip())

        # Record the parsed values for this file.
        columns['id'].append(review_id)
        columns['rate'].append(review_rate)
        columns['comment'].append(review_text)

    # Build the frame in one go, then order the rows by id.
    return pd.DataFrame(columns).sort_values('id')

# Read both halves of the training split and attach the sentiment label to
# each one as it is loaded (1 for the 'pos' folder, 0 for the 'neg' folder).
pos_df = process_movie_reviews('aclImdb/train/pos').assign(sentiment=1)
neg_df = process_movie_reviews('aclImdb/train/neg').assign(sentiment=0)

# Stack the two halves, shuffle the rows with a fixed seed so the order is
# reproducible, then renumber the index from zero.
df_train = (
    pd.concat([pos_df, neg_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [10]:
# Persist the shuffled training set as CSV; index=False leaves the row index
# out of the file.
df_train.to_csv('aclImdb/df_train.csv', index=False)

In [11]:
# Same procedure as for the training split, applied to the test folders.
# Note that pos_df / neg_df are rebound here to the test reviews.
pos_df = process_movie_reviews('aclImdb/test/pos').assign(sentiment=1)
neg_df = process_movie_reviews('aclImdb/test/neg').assign(sentiment=0)

# Stack, shuffle with a fixed seed, and renumber the index from zero.
df_test = (
    pd.concat([pos_df, neg_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Persist the shuffled test set as CSV, leaving the row index out of the file.
df_test.to_csv('aclImdb/df_test.csv', index=False)