In [1]:
# !pip install -U spacy

In [7]:
import spacy
from spacy import displacy
from spacy.lang.en.stop_words import STOP_WORDS
import string

import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [3]:
# !python -m spacy download en_core_web_lg 

In [33]:
imdb_data = pd.read_table('datasets/imdb_labelled.txt', sep='\t', header=None)
amazon_data = pd.read_table('datasets/amazon_cells_labelled.txt', sep = '\t', header = None)
yelp_data = pd.read_table('datasets/yelp_labelled.txt', sep='\t', header = None)

In [46]:
# joiing the tables
combined_col = [amazon_data, yelp_data, imdb_data]

# adding headers
for col_name in combined_col:
    col_name.columns = ["Reviews", "Sentiments"]

# to recognize which dataset belongs to which company adding a company column as a key
company = ["amazon", "yelp", "imdb"]
data = pd.concat(combined_col, keys=company)

# checking for null vaues
data.isnull().sum()

Reviews       0
Sentiments    0
dtype: int64

In [44]:
data.head()

Unnamed: 0,Unnamed: 1,Reviews,Sentiments
amazon,0,So there is no way for me to plug it in here i...,0
amazon,1,"Good case, Excellent value.",1
amazon,2,Great for the jawbone.,1
amazon,3,Tied to charger for conversations lasting more...,0
amazon,4,The mic is great.,1


In [4]:
nlp = spacy.load('en_core_web_sm')



In [8]:
punct = string.punctuation
stopwords = list(STOP_WORDS)

In [25]:
# Text processing
def text_tokenizer(sentence):
    doc = nlp(sentence)
    
#     tokens = []
#     for token in doc:
#         if token.lemma_ != "-PRON-":
#             word = token.lemma_lower().strip()
#         else:
#             word = token.lower_
#         tokens.append(word)
    doc = [token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.lower_ for token in doc]
    doc = [token for token in doc if token not in punct and token not in stopwords]
#     clean_words = []
#     for token in tokens:
#         if token not in punct and token not in stopwords:
#             clean_words.append(token)
    
#     return clean_words
    return doc

In [47]:
# vectorization using tfidf
tfidf = TfidfVectorizer(tokenizer=text_tokenizer)
classifier = LinearSVC()

In [49]:
X = data["Reviews"]
y = data["Sentiments"]

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 8)

In [52]:
# creating a pipeline so that we don't need to prepare data separately for training and testing
sent_clf = Pipeline([("tfidf", tfidf), ("clf", classifier)])

In [53]:
sent_clf.fit(X_train, y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<function text_tokenizer at 0x0000017A911C1700>)),
                ('clf', LinearSVC())])

In [54]:
y_predict = sent_clf.predict(X_test)

In [57]:
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

           0       0.79      0.77      0.78       274
           1       0.78      0.79      0.78       276

    accuracy                           0.78       550
   macro avg       0.78      0.78      0.78       550
weighted avg       0.78      0.78      0.78       550



In [58]:
accuracy_score(y_test, y_predict)

0.7818181818181819

In [59]:
confusion_matrix(y_test, y_predict)

array([[211,  63],
       [ 57, 219]], dtype=int64)