In [None]:
import pandas as pd

import numpy as np
import spacy

from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report

from sklearn.metrics import accuracy_score

import nltk
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

#from collections import defaultdict

# Load the IMDB reviews CSV; the relative path means the file must sit in the
# notebook's working directory.
df = pd.read_csv('IMDB Dataset.csv')

# Preview the first five rows (columns: review, sentiment).
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# **Data Preprocessing**

**1. Tokenization**

In [None]:
import spacy
# Small English pipeline; the `nlp` object created here is reused by the
# preprocessing cells that follow.
nlp = spacy.load('en_core_web_sm')

# Hard-coded sample sentence (apparently the opening of the first review).
# It is a literal, not a value read from `df`.
review_text = 'One of the other reviewers has mentioned that after watching just 1 Oz episode youll be hooked.'

# Running the pipeline returns a Doc, which iterates as Token objects.
doc = nlp(review_text)

# Collect the raw text of every token, in order.
spacy_tokens = list(map(lambda tok: tok.text, doc))

print("Original Review:\n", review_text)
print("Tokens:\n", spacy_tokens)

Original Review:
 One of the other reviewers has mentioned that after watching just 1 Oz episode youll be hooked.
Tokens:
 ['One', 'of', 'the', 'other', 'reviewers', 'has', 'mentioned', 'that', 'after', 'watching', 'just', '1', 'Oz', 'episode', 'you', 'll', 'be', 'hooked', '.']


**2. Lowercasing**

In [None]:
# Sample review fragment; it still contains raw <br /> markup
text = "A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion"

# Run the SpaCy pipeline to split the text into tokens
doc = nlp(text)

# Token.lower_ is the lowercase form of the token text (same as text.lower())
lowercased_tokens = [tok.lower_ for tok in doc]

print("Lowercased tokens:\n", lowercased_tokens)

Lowercased tokens:
 ['a', 'wonderful', 'little', 'production', '.', '<', 'br', '/><br', '/>the', 'filming', 'technique', 'is', 'very', 'unassuming-', 'very', 'old', '-', 'time', '-', 'bbc', 'fashion']


**3. Remove Punctuation**

In [None]:
# Sample review fragment containing parentheses, a comma, a hyphen and a full stop
text = "I thought this was a wonderful way to (spend time) on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. "

# Run the SpaCy pipeline to split the text into tokens
doc = nlp(text)

# Keep only the tokens that SpaCy does not flag as punctuation
filtered_tokens = []
for tok in doc:
    if tok.is_punct:
        continue
    filtered_tokens.append(tok.text)

print("Tokens without punctuation:\n", filtered_tokens)


Tokens without punctuation:
 ['I', 'thought', 'this', 'was', 'a', 'wonderful', 'way', 'to', 'spend', 'time', 'on', 'a', 'too', 'hot', 'summer', 'weekend', 'sitting', 'in', 'the', 'air', 'conditioned', 'theater', 'and', 'watching', 'a', 'light', 'hearted', 'comedy']


**4. Remove Stop Words**

In [None]:
# Sample review fragment
text = "Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time"

# Run the SpaCy pipeline to split the text into tokens
doc = nlp(text)

# Drop tokens flagged as stop words. Punctuation is NOT a stop word, so
# symbols such as '(' and '&' survive this step.
non_stop = filter(lambda tok: not tok.is_stop, doc)
filtered_tokens = [tok.text for tok in non_stop]

print("Tokens without stop words:\n", filtered_tokens)


Tokens without stop words:
 ['Basically', 'family', 'little', 'boy', '(', 'Jake', ')', 'thinks', 'zombie', 'closet', '&', 'parents', 'fighting', 'time']


**5. Remove extra whitespace**

In [None]:
# Sample with leading, trailing and repeated interior spaces
text = " Petter Mattei's    Love in the Time       of Money is a       visually stunning film to watch.   "

# Trim the ends, split on runs of whitespace, then re-join with single spaces
text = " ".join(text.strip().split())

print("Cleaned text:\n", text)

Cleaned text:
 Petter Mattei's Love in the Time of Money is a visually stunning film to watch.


**6. Remove URLs**

In [None]:
import re

# Sample review text ending in a URL
text = "Probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a noble cause, but it's not preachy or boring. Check out this article for more information: https://en.wikipedia.org/wiki/Natural_language_processing"

# URL pattern: scheme (http/ftp/https), "://", a dotted host name, then an
# optional path/query part that must not end in '.', ',' or ':'
pattern = r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"

# Delete every match, then trim the whitespace left at the ends
cleaned_text = re.compile(pattern).sub("", text).strip()

# Show the text with the URL removed
print("Text without URLs:\n", cleaned_text)


Text without URLs:
 Probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a noble cause, but it's not preachy or boring. Check out this article for more information:


**7. Remove HTML Code**

In [None]:
import re

def remove_html_tags(raw_text, tag_pattern=r"<[^>]+>"):
    """Strip HTML tags from ``raw_text`` and tidy the whitespace left behind.

    Each tag is replaced with a single space rather than an empty string, so
    words separated only by a tag (``"great.<br />Loved"``) are not glued
    together. Runs of whitespace are then collapsed to one space and the ends
    are trimmed.

    Parameters
    ----------
    raw_text : str
        Text that may contain HTML tags.
    tag_pattern : str, optional
        Regular expression matching one tag. The default matches ``<``,
        one or more non-``>`` characters, then ``>``.

    Returns
    -------
    str
        The text without tags, with single spaces between words.

    Notes
    -----
    This is a regex heuristic, not an HTML parser: literal text such as
    ``"a < b and c > d"`` also matches the default pattern.
    """
    without_tags = re.sub(tag_pattern, " ", raw_text)
    return " ".join(without_tags.split())


# Input text with HTML code
text = "A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion "

# Regular expression pattern matching a single HTML tag
pattern = r"<[^>]+>"

# Remove the tags without merging the words on either side of them
cleaned_text = remove_html_tags(text, pattern)

# Optionally process the cleaned text using SpaCy (if further processing is needed)
# NOTE: the resulting `doc` is not used again in this cell.
doc = nlp(cleaned_text)

# Print the cleaned text without HTML
print("Text without HTML code:\n", cleaned_text)


Text without HTML code:
 A wonderful little production. The filming technique is very unassuming- very old-time-BBC fashion


**8. Lemmatization**

In [None]:
# Sample review fragment
text = "I sure would like to see a resurrection of a up dated Seahunt series with the tech they have today it would bring back the kid excitement in me"

# Run the SpaCy pipeline so that each token carries its lemma
doc = nlp(text)

# Replace every token by its lemma (base form)
lemmatized_tokens = list(map(lambda tok: tok.lemma_, doc))

print("Lemmatized tokens:\n", lemmatized_tokens)

Lemmatized tokens:
 ['I', 'sure', 'would', 'like', 'to', 'see', 'a', 'resurrection', 'of', 'a', 'up', 'date', 'Seahunt', 'series', 'with', 'the', 'tech', 'they', 'have', 'today', 'it', 'would', 'bring', 'back', 'the', 'kid', 'excitement', 'in', 'I']


**9. Part-of-Speech Tagging**

In [None]:
# Sample review fragment
text = "This show was an amazing, fresh & innovative idea in the 70's when it first aired."

# Run the SpaCy pipeline so that each token carries a part-of-speech tag
doc = nlp(text)

# Pair each token's text with its coarse-grained POS tag
tagged_tokens = []
for tok in doc:
    tagged_tokens.append((tok.text, tok.pos_))

print("Tagged tokens:\n", tagged_tokens)

Tagged tokens:
 [('This', 'DET'), ('show', 'NOUN'), ('was', 'AUX'), ('an', 'DET'), ('amazing', 'ADJ'), (',', 'PUNCT'), ('fresh', 'ADJ'), ('&', 'CCONJ'), ('innovative', 'ADJ'), ('idea', 'NOUN'), ('in', 'ADP'), ('the', 'DET'), ('70', 'NUM'), ("'s", 'PART'), ('when', 'SCONJ'), ('it', 'PRON'), ('first', 'ADV'), ('aired', 'VERB'), ('.', 'PUNCT')]


**10. Named Entity Recognition**

In [None]:
# Sample review fragment
text = "Phil the Alien is one of those quirky films where the humour is based around the oddness of everything rather than actual punchlines."

# Run the SpaCy pipeline; detected entity spans are exposed as doc.ents
doc = nlp(text)

# Pair each entity span's text with its label
named_entities = []
for span in doc.ents:
    named_entities.append((span.text, span.label_))

print("Named entities:\n", named_entities)

Named entities:
 [('Phil', 'PERSON'), ('Alien', 'PERSON'), ('one', 'CARDINAL')]


# **Exploratory Data Analysis (EDA)**

In [None]:
# Summarise the frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [None]:
# Number of positive and negative reviews (class balance check)
df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [None]:
# Encode the labels as integers: 0 is a negative review, 1 is a positive review.
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
# Learn the label -> integer mapping first, then apply it to the same column
label_encoder.fit(df['sentiment'])
df['sentiment'] = label_encoder.transform(df['sentiment'])
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [None]:
# Keep only the first 500 rows
df = df.iloc[:500]
df.shape

(500, 2)

# **Word2Vec**

In [None]:
pip install gensim



In [None]:
import pandas as pd
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

# word_tokenize needs the NLTK Punkt tokenizer data ('punkt' and 'punkt_tab')
nltk.download('punkt')
nltk.download('punkt_tab')

# Reuse the frame prepared in the cells above (labels encoded, first 500 rows)
# instead of re-reading the CSV, which silently discarded both of those steps.
# .copy() detaches it from the earlier slice so that adding a column later
# writes to an independent frame rather than to a view.
df = df.copy()

# Tokenize the reviews: lowercase each review, then split it into a list of words
tokenized_reviews = df['review'].apply(lambda x: word_tokenize(x.lower()))

# Train a Word2Vec model on these reviews.
# min_count=1 keeps every token, including words that occur only once.
# workers=4 trains with multiple threads, so exact vectors can vary between runs.
model = Word2Vec(tokenized_reviews, vector_size=100, window=5, min_count=1, workers=4)

def get_review_vector(review, w2v_model=None):
    """Represent a tokenized review as the mean of its Word2Vec word vectors.

    Parameters
    ----------
    review : iterable of str
        Tokens of one review.
    w2v_model : optional
        Object exposing ``.wv`` (supports ``word in wv`` and ``wv[word]``) and
        ``.vector_size``. Defaults to the notebook-level ``model``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``vector_size``: the element-wise mean of the
        vectors of all tokens found in the vocabulary, or an all-zero vector
        when no token is found. Both branches return an ndarray; the zero
        vector was previously a plain Python list.
    """
    if w2v_model is None:
        # Looked up at call time, so the model only has to exist before the first call.
        w2v_model = model

    vocab = w2v_model.wv
    word_vectors = [vocab[word] for word in review if word in vocab]

    if not word_vectors:
        # float32 is the dtype gensim uses for its word vectors
        return np.zeros(w2v_model.vector_size, dtype=np.float32)

    # axis=0 averages across tokens, leaving one value per embedding dimension
    return np.mean(word_vectors, axis=0)

# Compute one averaged vector per review and store it on the frame
df['review_vector'] = tokenized_reviews.apply(get_review_vector)

# Expand the vectors into a table: one row per review, one column per
# vector component
vector_rows = df['review_vector'].tolist()
review_vectors_df = pd.DataFrame(vector_rows)

# Show the first five rows of the Word2Vec feature table
print(review_vectors_df.head(5))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


         0         1         2         3         4         5         6   \
0  0.359633  0.000345 -0.166439  0.977514  0.396527 -1.126511 -0.075848   
1  0.461648 -0.358288 -0.204040  0.882033 -0.241735 -1.700259 -0.672759   
2  0.338515  0.123076 -0.255154  0.783888  0.111142 -0.943613 -0.359936   
3  0.245513  0.048114 -0.233921  0.835297 -0.075368 -1.468500 -0.640680   
4  0.442462  0.066985 -0.126463  1.014085 -0.072636 -1.562164 -0.424977   

         7         8         9   ...        90        91        92        93  \
0 -0.171688 -0.542453 -0.001968  ...  0.301293  0.232945 -0.570636  0.845826   
1  0.076100 -0.828950  0.034663  ...  0.511795  0.081447 -0.724110  0.570205   
2 -0.150978 -0.495274 -0.106602  ...  0.254083  0.308633 -0.479926  0.730299   
3 -0.072478 -0.819626 -0.102287  ...  0.447963 -0.035852 -0.628953  0.618025   
4 -0.010798 -0.515602  0.074092  ...  0.326650  0.254197 -0.674061  0.709464   

         94        95        96        97        98        99  
0  0

# **Conclusion:**
The word embeddings learned from the reviews turn each word into a numerical vector, and averaging those vectors gives one fixed-length vector per review that a machine-learning model can consume. Because such embeddings capture aspects of the words' underlying meaning, these review vectors can serve as input features for analysing and predicting sentiment, i.e. whether a review is positive or negative.