# Install and import all required dependencies

In [1]:
# All required libraries and modules are already installed on the local machine, so they can be imported directly.
%reset -f
import numpy as np
import pandas as pd
import math

import spacy
import en_core_web_sm
# Load the spaCy English pipeline from the imported `en_core_web_sm` package;
# `nlp` is what the text-cleaning function later calls to parse each review.
nlp = en_core_web_sm.load()

In [2]:
# Rebinds `nlp` by loading the pipeline by name.
# NOTE(review): the first cell already loads a model with the same name, so one of
# the two loads looks redundant — confirm and keep only one.
nlp = spacy.load('en_core_web_sm')

In [3]:
from spacy.lang.en.stop_words import STOP_WORDS

In [4]:
# Materialize spaCy's English stop-word collection as a list for the cleaning step.
stopwords = [*STOP_WORDS]

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

### Load data



In [6]:
# The provided data was converted to a CSV file, which is the file loaded below.

In [7]:
# Load the book reviews from the CSV file prepared from the provided data.
b_data = pd.read_csv('book_review_sentiment.csv')

In [9]:
# Give the frame the column names used by the rest of the notebook
# (`columns_name` is kept as a module-level name as well).
b_data.columns = columns_name = ["Review", "Sentiment"]

In [10]:
# Preview the first rows to check the renamed columns.
b_data.head()

Unnamed: 0,Review,Sentiment
0,I like to use the Amazon reviews when purchasi...,0
1,THis book was horrible. If it was possible to...,0
2,"I'm not sure who's writing these reviews, but ...",0
3,I picked up the first book in this series (The...,0
4,"Not only do I disagree with his opinions, but ...",0


In [11]:
# (rows, columns) of the book review frame.
b_data.shape

(1981, 2)

In [12]:
# Class balance: number of reviews per sentiment label.
b_data['Sentiment'].value_counts()

1    996
0    985
Name: Sentiment, dtype: int64

In [13]:
# Count missing values per column.
b_data.isnull().sum()

Review       0
Sentiment    0
dtype: int64

## Text Classification and Tokenization

In [14]:
import string

In [15]:
# ASCII punctuation characters; the cleaning function uses this string to drop
# punctuation tokens.
punct = string.punctuation

In [16]:
# Display the punctuation string for reference.
punct

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

## Data cleaning process

In [17]:
def text_data_cleaning(sentence):
    """Tokenize and normalize a review so it can serve as a TF-IDF tokenizer.

    The text is parsed with the module-level spaCy pipeline ``nlp``. Each token
    is reduced to its lower-cased, stripped lemma. Tokens whose lemma is the
    ``"-PRON-"`` placeholder (presumably the pronoun lemma emitted by older
    spaCy releases) fall back to their lower-cased surface text instead.
    Tokens found in ``stopwords`` or in ``punct`` are then discarded.

    Parameters
    ----------
    sentence : str
        Raw review text.

    Returns
    -------
    list of str
        Cleaned tokens, in document order.
    """
    doc = nlp(sentence)

    tokens = []
    for token in doc:
        if token.lemma_ != "-PRON-":
            temp = token.lemma_.lower().strip()
        else:
            # Previously `temp - token.lower_`: a subtraction rather than an
            # assignment, which raised whenever this branch was reached.
            temp = token.lower_
        tokens.append(temp)

    # `punct` is a string, so `in` is a substring test: single punctuation
    # characters and the empty string (e.g. a stripped whitespace token) are dropped.
    cleaned_tokens = []
    for token in tokens:
        if token not in stopwords and token not in punct:
            cleaned_tokens.append(token)
    return cleaned_tokens

### Define the model (LinearSVC) and TF-IDF vectorization (feature engineering)

In [18]:
from sklearn.svm import LinearSVC
# NOTE(review): despite its name, `vectorizer` is bound to a LinearSVC classifier, and no
# later cell shown in this notebook reads it — the pipelines use `tfidf` and `classifier`.
vectorizer = LinearSVC()

In [19]:
# TF-IDF features built with the custom spaCy-based tokenizer `text_data_cleaning`.
tfidf = TfidfVectorizer(tokenizer = text_data_cleaning)
# Linear SVM classifier. A fixed seed makes the solver's internal data shuffling
# repeatable, in line with the seeded train/test split used in this notebook.
classifier = LinearSVC(random_state = 42)

In [20]:
# Features are the raw review texts; targets are the sentiment labels.
X, y = b_data['Review'], b_data['Sentiment']

## Split the data, define model, train and test the model

In [21]:
# Hold out 20% of the reviews for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [22]:
 X_train.shape, X_test.shape  # sizes of the training and held-out sets

((1584,), (397,))

In [23]:
# Chain vectorization and classification so raw text can be passed straight in.
clf = Pipeline(steps=[('tfidf', tfidf), ('clf', classifier)])

In [24]:
# Fit the TF-IDF step and the classifier on the training reviews.
clf.fit(X_train, y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<function text_data_cleaning at 0x000002A8E5BE4430>)),
                ('clf', LinearSVC())])

In [25]:
# Predict sentiment labels for the held-out reviews.
y_pred = clf.predict(X_test)

In [26]:
# Per-class precision, recall and F1 on the held-out reviews.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.73      0.77       195
           1       0.76      0.83      0.80       202

    accuracy                           0.78       397
   macro avg       0.79      0.78      0.78       397
weighted avg       0.79      0.78      0.78       397



In [27]:
# Confusion matrix of the held-out predictions
# (scikit-learn convention: rows are true labels, columns are predicted labels).
confusion_matrix(y_test, y_pred)

array([[143,  52],
       [ 34, 168]], dtype=int64)

## Testing Model and predicting outcome

In [28]:
# Sanity check on a clearly positive sentence: the model returns 1, i.e. positive.
clf.predict(['Wow, this is amazing lesson'])

array([1], dtype=int64)

In [29]:
# Sanity check on a clearly negative sentence: the model returns 0, i.e. negative.
clf.predict([" this is bad"])

array([0], dtype=int64)

## DVD review data

In [30]:
# Load the DVD reviews from their CSV file.
d_data = pd.read_csv('dvd_review.csv')

In [31]:
# Give the DVD frame the same column names as the book frame
# (`columns_name` is kept as a module-level name as well).
d_data.columns = columns_name = ["Review", "Sentiment"]

## Tokenization and cleaning data

In [32]:
# Repeats the `string` import and `punct` assignment from the book-review section;
# the resulting value is the same punctuation string.
import string
punct = string.punctuation

In [33]:
def text_data_cleaning(sentence):
    """Tokenize and normalize a review so it can serve as a TF-IDF tokenizer.

    This cell repeats the definition from the book-review section. The `tfidf`
    vectorizer created there holds a reference to that earlier function object,
    so rebinding the name here only affects vectorizers created afterwards.

    The text is parsed with the module-level spaCy pipeline ``nlp``. Each token
    is reduced to its lower-cased, stripped lemma. Tokens whose lemma is the
    ``"-PRON-"`` placeholder (presumably the pronoun lemma emitted by older
    spaCy releases) fall back to their lower-cased surface text instead.
    Tokens found in ``stopwords`` or in ``punct`` are then discarded.

    Parameters
    ----------
    sentence : str
        Raw review text.

    Returns
    -------
    list of str
        Cleaned tokens, in document order.
    """
    doc = nlp(sentence)

    tokens = []
    for token in doc:
        if token.lemma_ != "-PRON-":
            temp = token.lemma_.lower().strip()
        else:
            # Previously `temp - token.lower_`: a subtraction rather than an
            # assignment, which raised whenever this branch was reached.
            temp = token.lower_
        tokens.append(temp)

    # `punct` is a string, so `in` is a substring test: single punctuation
    # characters and the empty string (e.g. a stripped whitespace token) are dropped.
    cleaned_tokens = []
    for token in tokens:
        if token not in stopwords and token not in punct:
            cleaned_tokens.append(token)
    return cleaned_tokens

## Split the data, define model, train and test the model

In [34]:
# Features are the raw DVD review texts; targets are the sentiment labels.
X, y = d_data['Review'], d_data['Sentiment']

In [35]:
# Hold out 20% of the DVD reviews for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [36]:
# Sizes of the DVD training and held-out sets.
X_train.shape, X_test.shape


((1593,), (399,))

In [37]:
# Build the DVD pipeline from the same `tfidf` and `classifier` objects that the
# book-review pipeline used; `clf` is rebound to this new pipeline.
clf = Pipeline(steps=[('tfidf', tfidf), ('clf', classifier)])

In [38]:
# Fit the TF-IDF step and the classifier on the DVD training reviews.
clf.fit(X_train, y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<function text_data_cleaning at 0x000002A8E5BE4430>)),
                ('clf', LinearSVC())])

In [39]:
# Predict sentiment labels for the held-out DVD reviews.
y_pred = clf.predict(X_test)

In [40]:
# Per-class precision, recall and F1 on the held-out DVD reviews.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.73      0.77       194
           1       0.77      0.83      0.80       205

    accuracy                           0.78       399
   macro avg       0.79      0.78      0.78       399
weighted avg       0.79      0.78      0.78       399



In [41]:
# Confusion matrix of the held-out DVD predictions
# (scikit-learn convention: rows are true labels, columns are predicted labels).
confusion_matrix(y_test, y_pred)

array([[142,  52],
       [ 34, 171]], dtype=int64)

In [42]:
# Sanity check on a clearly positive sentence: the model returns 1, i.e. positive.
clf.predict(['Wow, this is amazing lesson'])

array([1], dtype=int64)

In [43]:
# Sanity check on a clearly negative sentence: the model returns 0, i.e. negative.
clf.predict([" this is bad"])

array([0], dtype=int64)

### References

##### Kant, L. (2020). NLP-Tutorial-8---Sentiment-Classification-using-Spacy-for-IMDB-and-Amazon-Review-Dataset Retrieved from https://github.com/laxmimerit