In [4]:
import pandas as pd
import zipfile
zf = zipfile.ZipFile('/home/mustafa/nlp/Movie_Review_Sentiment_Analysis/IMDB Dataset.csv.zip') 

data = pd.read_csv(zf.open('IMDB Dataset.csv'))
print(data.head())

                                              review sentiment Unnamed: 2  \
0  One of the other reviewers has mentioned that ...  positive        NaN   
1  A wonderful little production. <br /><br />The...  positive        NaN   
2  I thought this was a wonderful way to spend ti...  positive        NaN   
3  Basically there's a family where a little boy ...  negative        NaN   
4  Petter Mattei's "Love in the Time of Money" is...  positive        NaN   

  Unnamed: 3 Unnamed: 4 Unnamed: 5 Unnamed: 6 Unnamed: 7  
0        NaN        NaN        NaN        NaN        NaN  
1        NaN        NaN        NaN        NaN        NaN  
2        NaN        NaN        NaN        NaN        NaN  
3        NaN        NaN        NaN        NaN        NaN  
4        NaN        NaN        NaN        NaN        NaN  


In [89]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

nltk.download('stopwords')

def preprocess_text(text):
    text = re.sub(r'https?://\S+', '', text)
    
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'<br\s*/?>', ' ', text)

    text = re.sub(r'[^a-zA-Z]', ' ', text)
    
    words = text.lower().split()
    
    ps = PorterStemmer()
    words = [ps.stem(word) for word in words if word not in set(stopwords.words('english'))]
    
    return ' '.join(words)

# Assuming 'data' is your DataFrame containing 'review' column
data['review'] = data['review'].apply(preprocess_text)
print(data.head())


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mustafa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                              review sentiment Unnamed: 2  \
0  one review mention watch oz episod hook right ...         1        NaN   
1  wonder littl product film techniqu unassum old...         1        NaN   
2  thought wonder way spend time hot summer weeke...         1        NaN   
3  basic famili littl boy jake think zombi closet...         0        NaN   
4  petter mattei love time money visual stun film...         1        NaN   

  Unnamed: 3 Unnamed: 4 Unnamed: 5 Unnamed: 6 Unnamed: 7  
0        NaN        NaN        NaN        NaN        NaN  
1        NaN        NaN        NaN        NaN        NaN  
2        NaN        NaN        NaN        NaN        NaN  
3        NaN        NaN        NaN        NaN        NaN  
4        NaN        NaN        NaN        NaN        NaN  


In [90]:
data['sentiment'] = data['sentiment'].replace({'positive': 1, 'negative': 0})


In [97]:
def build_freqs(data):
    freqs = {}
    
    for index, row in data.iterrows():
        review = row['review']
        sentiment = row['sentiment']
        words = review.split()
        #print(words,sentiment)
        
        for word in words:
            pair = (word, sentiment)
            if pair not in freqs:
                freqs[pair] = 1
            else:
                freqs[pair] += 1
    
    return freqs


In [114]:
freqs = build_freqs(data)
filtered_freqs={}
for pair, freq in freqs.items():
    if isinstance(pair[1], (int)):
        filtered_freqs[pair] = freq

In [325]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data['review'], data['sentiment'], test_size=0.15, random_state=100)


In [326]:
import numpy as np
from collections import defaultdict

def train_naive_bayes(freqs, train_y):
  
    loglikelihood = {}
    logprior = 0

    vocab = set([pair[0] for pair in freqs.keys()])
    V = len(vocab)

    N_pos = N_neg = 0
    for pair, freq in freqs.items():
        try:
            if pair[1] > 0:
                N_pos += freq
            else:
                N_neg += freq
        except TypeError:
            continue

    D = len(train_y)

    D_pos = np.sum(train_y == 1)

    D_neg = np.sum(train_y == 0)

    logprior = np.log(D_pos) - np.log(D_neg)

    for word in vocab:
        freq_pos = freqs.get((word, 1), 0)
        freq_neg = freqs.get((word, 0), 0)

        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)

        loglikelihood[word] = np.log(p_w_pos / p_w_neg)

    return logprior, loglikelihood


In [327]:
logprior, loglikelihood = train_naive_bayes(filtered_freqs, y_train)


In [328]:
import numpy as np

def predict_naive_bayes(review, logprior, loglikelihood):
    words = review.split()  # Split the review into words
    
    p = 0
    
    p += logprior
    
    for word in words:
        if word in loglikelihood:
            p += loglikelihood[word]
    
    if p > 0:
        return 1
    else:
        return 0



In [329]:
predictions = []

for review in X_test:
    predictions.append(predict_naive_bayes(review, logprior, loglikelihood))

predictions = np.array(predictions)

accuracy = np.mean(predictions == y_test) * 100
print(f"Accuracy on test set: {accuracy:.2f}%")


Accuracy on test set: 89.03%


In [330]:
predictions = []

for review in X_train:
    predictions.append(predict_naive_bayes(review, logprior, loglikelihood))

predictions = np.array(predictions)

accuracy = np.mean(predictions == y_train) * 100
print(predictions)
print(f"Accuracy on train set: {accuracy:.2f}%")


[0 1 0 ... 0 1 1]
Accuracy on train set: 88.57%


In [338]:
movie_review= "Screenplay structure, editing, cinematography, grand settings, graphics of new technology can go on and on. Best wishes for success."
movie_review=preprocess_text(movie_review)
p=predict_naive_bayes(movie_review, logprior,loglikelihood)
if(p>0):
    print("Positive Sentiment")
else:
    print("Negative Sentiment")

Positive Sentiment
