In [None]:
import pandas as pd

df = pd.read_csv('/home/shawon/NLP/archive (2)/IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
df.columns
df.shape


(50000, 2)

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenize the text
    words = text.split()
    
    # Remove stopwords and lemmatize
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    
    return ' '.join(words)

df['review'] = df['review'].apply(preprocess_text)  


[nltk_data] Downloading package stopwords to /home/shawon/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /home/shawon/nltk_data...


0    one reviewer mentioned watching oz episode you...
1    wonderful little production filming technique ...
2    thought wonderful way spend time hot summer we...
3    basically there family little boy jake think t...
4    petter matteis love time money visually stunni...
Name: review, dtype: object


In [6]:
df.tail()

Unnamed: 0,review,sentiment
49995,thought movie right good job wasnt creative or...,positive
49996,bad plot bad dialogue bad acting idiotic direc...,negative
49997,catholic taught parochial elementary school nu...,negative
49998,im going disagree previous comment side maltin...,negative
49999,one expects star trek movie high art fan expec...,negative


In [7]:
total_words = len(' '.join(df['review']).split())
print(f'Total number of words in the dataset: {total_words}')

Total number of words in the dataset: 5930080


In [8]:
unique_words = set(' '.join(df['review']).split())
print(f'Total number of unique words in the dataset: {len(unique_words)}')  

Total number of unique words in the dataset: 203439


In [9]:
all_words = ' '.join(df['review']).split()
vocab = sorted(set(all_words))
word_to_index = {word: idx for idx, word in enumerate(vocab, start=1)}  # Start indexing from 1
cleaned_reviews = df['review'].apply(preprocess_text)
first_review = cleaned_reviews.iloc[0].split()
vector = [0]*len(vocab)
for word in first_review:
    if word in word_to_index:
        index = word_to_index[word] - 1  # Adjust for 0-based index
        vector[index] += 1

print(f'First review: {cleaned_reviews.iloc[0]}')
print(f'Bag-of-Words vector: {vector}')
print("One-hot vector length:", len(vector))
print("First 50 elements of vector:", vector[:50])

First review: one reviewer mentioned watching oz episode youll hooked right exactly happened methe first thing struck oz brutality unflinching scene violence set right word go trust show faint hearted timid show pull punch regard drug sex violence hardcore classic use wordit called oz nickname given oswald maximum security state penitentary focus mainly emerald city experimental section prison cell glass front face inwards privacy high agenda em city home manyaryans muslim gangsta latino christian italian irish moreso scuffle death stare dodgy dealing shady agreement never far awayi would say main appeal show due fact go show wouldnt dare forget pretty picture painted mainstream audience forget charm forget romanceoz doesnt mess around first episode ever saw struck nasty surreal couldnt say ready watched developed taste oz got accustomed high level graphic violence violence injustice crooked guard wholl sold nickel inmate wholl kill order get away well mannered middle class inmate turn

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['review'])
bow_vector = X[0].toarray().flatten()
print(f'Bag-of-Words vector shape: {bow_vector.shape}')
print("First 50 elements of first review vector:", bow_vector[:50])
print("One-hot vector length:", len(bow_vector))

Bag-of-Words vector shape: (203415,)
First 50 elements of first review vector: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]
One-hot vector length: 203415


In [18]:
def tokenize_text(text :str):
    return preprocess_text(text).split()

def make_ngrams(tokens, n=2):
    return [ ' '.join(tokens[i:i+n]) for i in range(len(tokens)-n+1)]

df['tokens'] = df['review'].apply(tokenize_text)
bigram = df['tokens'].apply(lambda x: make_ngrams(x, n=2))
trigram = df['tokens'].apply(lambda x: make_ngrams(x, n=3))



In [20]:
print("Top 10 bigrams in the first review:", bigram.iloc[0][:10])
print("Top 10 trigrams in the first review:", trigram.iloc[0][:10])

Top 10 bigrams in the first review: ['one reviewer', 'reviewer mentioned', 'mentioned watching', 'watching oz', 'oz episode', 'episode youll', 'youll hooked', 'hooked right', 'right exactly', 'exactly happened']
Top 10 trigrams in the first review: ['one reviewer mentioned', 'reviewer mentioned watching', 'mentioned watching oz', 'watching oz episode', 'oz episode youll', 'episode youll hooked', 'youll hooked right', 'hooked right exactly', 'right exactly happened', 'exactly happened methe']


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer 

tfidf_uni = TfidfVectorizer(
    preprocessor=preprocess_text,
    tokenizer=tokenize_text,
    ngram_range=(1,1) ,
    min_df=5,
    max_df=0.8,
)

X_uni = tfidf_uni.fit_transform(df['review'])
vocab_uni = tfidf_uni.get_feature_names_out()

print(f'TF-IDF Unigram matrix shape: {X_uni.shape}')
print(f'Vocabulary size: {len(vocab_uni[:20])}')



TF-IDF Unigram matrix shape: (50000, 36118)
Vocabulary size: 20
