In [1]:
# -*- coding: utf-8 -*-
"""
Created on Mon Jul  6 23:59:31 2020

@author: Hamza
"""

import pandas as pd
import re
import string as st
from sklearn.model_selection import train_test_split as tts
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf
from sklearn.linear_model import LogisticRegression as lr
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score as acc,precision_score as prec, recall_score as rec
import pickle #for saving and loading the file

train_data=pd.read_csv(r'C:\Users\Shifa\Downloads\IMDB Dataset.csv')#reading the data
train_data.shape#shows the shape
train_data.head(10)#shows what the top 10 data looks like


def cleaning(text):
    text=text.lower()
    text=re.sub('\[.*?\]','',text)
    text=re.sub('[%s]'%re.escape(st.punctuation),'',text)
    text = re.sub('\w*\d\w*','',text)
    text = re.sub('[‘’“”…]','',text)
    text = re.sub('\n','',text)
    return text
#removing all the new line and special characters and converting everything into lower case
    
train_data['Description']=pd.DataFrame(train_data.review.apply(cleaning))#adding the cleaned  data in the training data
train_data.head()

#preparation for training
independent_var=train_data.review
dependent_var=train_data.sentiment
IV_train,IV_test,DV_train,DV_test=tts(independent_var,dependent_var,test_size=0.1,random_state=225)

print('IV_train: ', len(IV_train))
print('IV_test: ', len(IV_test))
print('DV_train: ', len(DV_train))
print('DV_test: ', len(DV_test))

#the data s split into training and test sets. Training part builds the model and test part tests for the accuracy.here, size of test set in 10% of training set

'''random state?
When the Random_state is not defined in the code for every run train data will change and accuracy might change 
for every run. When the Random_state = " constant integer" is defined then train data will be constant For every run
so that it will make easy to debug'''

#training

vectorizer=tfidf()
classifier=lr(solver = "lbfgs")#this solver is used for smaller datasets.
model=Pipeline([('vectorizer',vectorizer),('classifier',classifier)])
model.fit(IV_train, DV_train)
predictions=model.predict(IV_test)

'''
Scikit-learn is a free software machine learning library for the Python programming language. 
It features various classification, regression and clustering algorithms including support vector machines.

With Tfidfvectorizer you compute the word counts, idf and tf-idf values all at once. It’s really simple.

Logistic regression is a classification algorithm used to assign observations to a discrete set of classes. 
Some of the examples of classification problems are Email spam or not spam, Online transactions Fraud or not Fraud, Tumor Malignant or Benign. 
Logistic regression transforms its output using the logistic sigmoid function to return a probability value.
review positive or negative in this case.

it is a predictive analysis algorithm and based on the concept of probability.

pipelines are used to make the code look cleaner

'''

#checking the performance
print("Accuracy: ",acc(predictions,DV_test))
print("Precision: ",prec(predictions,DV_test,average='weighted'))
print("Recall: ",rec(predictions,DV_test,average='weighted'))

'''
The precision is the ratio tp / (tp + fp) where tp is the number of true positives and fp the number of false positives. 
The precision is intuitively the ability of the classifier not to label as positive a sample that is negative.

weighted: Calculate metrics for each label, and find their average weighted by support 

The recall is intuitively the ability of the classifier to find all the positive samples

'''

with open('sentimental_analyze_on_movie_reviews.pkl', 'wb') as f:
    pickle.dump(model, f)

IV_train:  45000
IV_test:  5000
DV_train:  45000
DV_test:  5000
Accuracy:  0.9032
Precision:  0.9033119893810588
Recall:  0.9032


In [5]:
#testing
with open(r'C:\Users\Shifa\Desktop\Sentiment Analysis\sentimental_analyze_on_movie_reviews.pkl','rb') as f:
    sentimental_analyze=pickle.load(f)

random_comments = [
    "I was thoroughly entertained by this film from start to finish. The plot twists kept me guessing, and the performances were outstanding.",
    "This movie was a total disappointment. The storyline was cliché, and the acting felt forced and unnatural.",
    "An absolute masterpiece! The cinematography was breathtaking, and the soundtrack complemented the emotions perfectly.",
    "I found the movie to be incredibly boring. The pacing was slow, and I struggled to stay awake through the whole thing.",
    "This film had me laughing out loud throughout. The humor was on point, and the characters were lovable.",
    "A visual spectacle with stunning special effects. However, the lack of a coherent plot left me feeling unsatisfied.",
    "I was moved to tears by the touching story. The actors delivered powerful performances that resonated deeply with me.",
    "I didn't enjoy this movie at all. It was too predictable, and the dialogue was cringeworthy.",
    "A thrilling ride from beginning to end! The action scenes were intense, and the plot was well thought out.",
    "This movie had a lot of potential, but it fell flat. The direction was poor, and the ending was unsatisfying.",
    "I liked this movie"
]

for cmt in random_comments:
    result = sentimental_analyze.predict([cmt])  # Wrap the comment in a list
    if result[0] == 'positive':
        print(cmt,": positive")
    else:
        print(cmt,": negative")
    print()

I was thoroughly entertained by this film from start to finish. The plot twists kept me guessing, and the performances were outstanding. : positive

This movie was a total disappointment. The storyline was cliché, and the acting felt forced and unnatural. : negative

An absolute masterpiece! The cinematography was breathtaking, and the soundtrack complemented the emotions perfectly. : positive

I found the movie to be incredibly boring. The pacing was slow, and I struggled to stay awake through the whole thing. : negative

This film had me laughing out loud throughout. The humor was on point, and the characters were lovable. : positive

A visual spectacle with stunning special effects. However, the lack of a coherent plot left me feeling unsatisfied. : negative

I was moved to tears by the touching story. The actors delivered powerful performances that resonated deeply with me. : positive

I didn't enjoy this movie at all. It was too predictable, and the dialogue was cringeworthy. : ne