# Imports

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import string

import spacy

nlp = spacy.load('en_core_web_sm')

from spacy.lang.en.stop_words import STOP_WORDS

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Load Data

In [None]:
# Column labels applied to each labelled-review file loaded in this notebook.
column_names = ['Review', 'Sentiment']

# The file is tab-separated and has no header row, so header=None keeps the
# first line as data; the labels are assigned afterwards.
data_yelp = pd.read_csv("yelp_labelled.txt", sep='\t', header=None)
data_yelp.columns = column_names

data_yelp.head()

In [None]:
# Load the Amazon reviews: tab-separated, no header row (header=None keeps the
# first line as data).
data_amazon = pd.read_csv("amazon_cells_labelled.txt", sep='\t', header=None)

# Reuse the column labels defined when the Yelp file was loaded.
data_amazon.columns = column_names

# Preview the first rows.
data_amazon.head()

In [None]:
# Load the IMDb reviews: tab-separated, no header row (header=None keeps the
# first line as data).
data_imdb = pd.read_csv("imdb_labelled.txt", sep='\t', header=None)

# Reuse the column labels defined when the Yelp file was loaded.
data_imdb.columns = column_names

# Preview the first rows.
data_imdb.head()

In [None]:
# Stack the three review sets into one frame, in the order Yelp, Amazon, IMDb.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; ignore_index=True renumbers the rows 0..n-1 so the
# combined frame has no duplicate index labels.
data = pd.concat([data_yelp, data_amazon, data_imdb], ignore_index=True)
data.shape

In [None]:
# Preview the first rows of the combined frame.
data.head()

# Understanding Data

In [None]:
# (rows, columns) of each source frame, in the order Yelp, IMDb, Amazon.
data_yelp.shape, data_imdb.shape, data_amazon.shape

In [None]:
# Number of rows per distinct value in the 'Sentiment' column (class balance).
# The Series method is `value_counts`; `value_count` does not exist and raises
# AttributeError.
data['Sentiment'].value_counts()

In [None]:
# Count of missing values in each column of the combined frame.
data.isnull().sum()

# Data processing

In [None]:
# Model input: the 'Review' column; target: the 'Sentiment' column.
x = data['Review']
y = data['Sentiment']

In [None]:
# String of ASCII punctuation characters; Data_cleaning uses it to filter
# punctuation tokens.
punct = string.punctuation
punct

In [None]:
# spaCy's English stop words as a list; Data_cleaning uses it to filter tokens.
stopwords = list(STOP_WORDS)
stopwords

In [None]:

# Lookup tables are built once, when this cell runs, so the per-token checks in
# Data_cleaning are O(1) set lookups instead of a scan of the `stopwords` list
# and a substring search in the `punct` string.  Re-run this cell after
# changing either of those variables.
_STOPWORD_SET = frozenset(stopwords)
_PUNCT_CHARS = frozenset(punct)


def Data_cleaning(sentance):
    """Tokenize a text into lowercase lemmas without stop words or punctuation.

    The function name and the ``sentance`` parameter spelling are kept as they
    are because the function is handed to ``TfidfVectorizer(tokenizer=...)``
    in this notebook.

    Args:
        sentance: Raw text to run through the spaCy pipeline ``nlp``.

    Returns:
        A list of strings, one per kept token, in document order.  Each entry
        is the token's lemma, lowercased and stripped of surrounding
        whitespace.  Tokens that end up empty, that appear in ``stopwords``,
        or that are a single character from ``punct`` are dropped.
    """
    normalized = [
        # "-PRON-" is the placeholder lemma spaCy v2 assigned to pronouns; for
        # those tokens fall back to the lowercased surface form.  spaCy v3
        # no longer emits the placeholder, in which case the lemma is always
        # used.
        token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.lower_
        for token in nlp(sentance)
    ]

    # `tok` is tested for emptiness explicitly: whitespace-only tokens strip
    # down to "" and must not reach the vectorizer.  (The previous
    # `token not in punct` string test dropped them only as a side effect of
    # "" being a substring of every string.)
    return [
        tok
        for tok in normalized
        if tok and tok not in _STOPWORD_SET and tok not in _PUNCT_CHARS
    ]


In [None]:
# Sanity check: show the cleaned tokens for the first review.
Data_cleaning(x[0])

In [None]:
# TF-IDF features computed over the tokens produced by Data_cleaning, which
# replaces the vectorizer's built-in tokenizer.
tfidf = TfidfVectorizer(tokenizer=Data_cleaning)

# Linear support-vector classifier with its default settings.
classifier = LinearSVC()

# Split data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=0,
)

In [None]:
# Shapes of the train/test inputs and targets, to confirm the split sizes.
x_train.shape, x_test.shape, y_train.shape, y_test.shape

In [None]:
# Create and Train model

In [None]:
# Chain the vectorizer and the classifier so raw review text can be passed
# straight to fit/predict.
clf = Pipeline(
    [
        ("tfidf", tfidf),
        ("clf", classifier),
    ]
)
clf.fit(x_train, y_train)


# Testing and metrics

In [None]:
# Predict labels for the held-out reviews with the fitted pipeline.
y_pred = clf.predict (x_test)

In [None]:
# Fraction of held-out reviews whose predicted label matches the true label.
accuracy_score (y_test, y_pred)

In [None]:
# Confusion matrix: rows are true labels, columns are predicted labels.
confusion_matrix(y_test, y_pred)

In [None]:
# Per-class precision, recall, F1 and support on the held-out set.
print(classification_report(y_test, y_pred))