In [38]:
!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

--2025-04-03 20:48:09--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz.1’


2025-04-03 20:48:11 (40.5 MB/s) - ‘aclImdb_v1.tar.gz.1’ saved [84125825/84125825]



In [39]:
!tar xvzf aclImdb_v1.tar.gz

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
aclImdb/train/unsup/44983_0.txt
aclImdb/train/unsup/44982_0.txt
aclImdb/train/unsup/44981_0.txt
aclImdb/train/unsup/44980_0.txt
aclImdb/train/unsup/44979_0.txt
aclImdb/train/unsup/44978_0.txt
aclImdb/train/unsup/44977_0.txt
aclImdb/train/unsup/44976_0.txt


In [40]:
import os

def fetch_reviews(path):
    data = []

    # List all files in the given directory
    files = [f for f in os.listdir(path)]

    for file in files:
        with open(os.path.join(path, file), "r", encoding="utf-8") as f:
            data.append(f.read())

    return data


In [41]:
import pandas as pd
import matplotlib.pyplot as plt



# Creating dataframes for training and testing sets
df_train_pos = pd.DataFrame({'review': fetch_reviews('aclImdb/train/pos/'), 'label': 1})
df_train_neg = pd.DataFrame({'review': fetch_reviews('aclImdb/train/neg/'), 'label': 0})

df_test_pos = pd.DataFrame({'review': fetch_reviews('aclImdb/test/pos/'), 'label': 1})
df_test_neg = pd.DataFrame({'review': fetch_reviews('aclImdb/test/neg/'), 'label': 0})

# Merging all dataframes for data cleaning and preprocessing
df = pd.concat([df_train_pos, df_train_neg, df_test_pos, df_test_neg], ignore_index=True)

# Displaying dataset details
print("Total reviews in df:", df.shape)
df.head()


Total reviews in df: (50000, 2)


Unnamed: 0,review,label
0,I watched this on the movies with my girlfrien...,1
1,The power to dream is a wonderful thing. There...,1
2,My baby sitter was a fan so I saw many of the ...,1
3,A common plotline in films consists of the mai...,1
4,Lindy (Meryl Streep) and her husband Michael (...,1


In [50]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [51]:
def clean_text(text):

    # Lowercase
    text = text.lower()

    # Remove all characters except letters, digits, and whitespace
    text = re.sub(r"[^a-z0-9\s]", "", text)

    # Normalize multiple spaces to single space
    text = re.sub(r"\s+", " ", text).strip()

    return text



In [44]:
# Initialize stop words set
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):

    return [word for word in text if word not in stop_words]


In [45]:
lemmatizer = WordNetLemmatizer()
# lemmatize string
def lemmatize_word(text):

    lemmas = [lemmatizer.lemmatize(word, pos ='v') for word in text]
    return lemmas



In [46]:
df['cleaned'] = df['review'].apply(clean_text)


In [47]:
df["Tokenize_Text"]=df.apply(lambda row: nltk.word_tokenize(row["cleaned"]), axis=1)

In [48]:
df['no_stopwords'] = df['Tokenize_Text'].apply(remove_stopwords)


In [49]:
df["Lemmatized_Text"] = df["no_stopwords"].apply(lemmatize_word)
df.head(5)


Unnamed: 0,review,label,cleaned,Tokenize_Text,no_stopwords,Lemmatized_Text
0,I watched this on the movies with my girlfrien...,1,i watched this on the movies with my girlfrien...,"[i, watched, this, on, the, movies, with, my, ...","[watched, movies, girlfriend, time, say, didnt...","[watch, movies, girlfriend, time, say, didnt, ..."
1,The power to dream is a wonderful thing. There...,1,the power to dream is a wonderful thing theres...,"[the, power, to, dream, is, a, wonderful, thin...","[power, dream, wonderful, thing, theres, sayin...","[power, dream, wonderful, thing, theres, say, ..."
2,My baby sitter was a fan so I saw many of the ...,1,my baby sitter was a fan so i saw many of the ...,"[my, baby, sitter, was, a, fan, so, i, saw, ma...","[baby, sitter, fan, saw, many, older, episodes...","[baby, sitter, fan, saw, many, older, episodes..."
3,A common plotline in films consists of the mai...,1,a common plotline in films consists of the mai...,"[a, common, plotline, in, films, consists, of,...","[common, plotline, films, consists, main, char...","[common, plotline, film, consist, main, charac..."
4,Lindy (Meryl Streep) and her husband Michael (...,1,lindy meryl streep and her husband michael sam...,"[lindy, meryl, streep, and, her, husband, mich...","[lindy, meryl, streep, husband, michael, sam, ...","[lindy, meryl, streep, husband, michael, sam, ..."
