# **Spacy**

In [1]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
nlp = spacy.load('en_core_web_sm')



In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 

from sklearn.metrics import accuracy_score 
from sklearn.model_selection import train_test_split
from sklearn.base import TransformerMixin 
from sklearn.svm import LinearSVC 
from sklearn.pipeline import Pipeline 

In [3]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score


In [17]:
import numpy as np
import pandas as pd
import nltk
import re
import string
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [33]:
from imblearn.over_sampling import RandomOverSampler, SMOTE

In [4]:
def text_cleaning(text):
    return clean_text.label().lower_case()


In [24]:
def spacy_tokenizer(doc):
    return [x.orth_ for x in nlp(doc)]

vectorizer = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))
classifier = LinearSVC()
tfvectorizer = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [15]:
reviews_csv = pd.read_csv('../reviews.csv')

In [18]:
# Function to convert text to lowercase
def convert_to_lower(text):
    return text.lower()

# Removes html notation, such as <br/>
def remove_html(text):
    regex = r"<[^>]+>"
    text_new = re.sub(regex, " ", text)
    return text_new

# Function to remove numbers and other numeric values
def remove_numbers(text):
    number_pattern = r'\d+'
    without_number = re.sub(pattern=number_pattern, repl=" ", string=text)
    return without_number

# Function to remove punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

# Function to remove stopwords
def remove_stopwords(text):
    removed = []
    stop_words = list(stopwords.words("english"))
    tokens = word_tokenize(text)
    for i in range(len(tokens)):
        if tokens[i] not in stop_words:
            removed.append(tokens[i])
    return " ".join(removed)

# Function to remove extra white spaces
def remove_extra_white_spaces(text):
    single_char_pattern = r'\s+[a-zA-Z]\s+'
    without_sc = re.sub(pattern=single_char_pattern, repl=" ", string=text)
    return without_sc

# Function to Lemmatize
def lemmatizing(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    for i in range(len(tokens)):
        lemma_word = lemmatizer.lemmatize(tokens[i])
        tokens[i] = lemma_word
    return " ".join(tokens)


reviews_csv['Text'] = reviews_csv['Text'].apply(lambda x: convert_to_lower(x))
reviews_csv['Text'] = reviews_csv['Text'].apply(lambda x: remove_html(x))
reviews_csv['Text'] = reviews_csv['Text'].apply(lambda x: remove_numbers(x))
reviews_csv['Text'] = reviews_csv['Text'].apply(lambda x: remove_punctuation(x))
reviews_csv['Text'] = reviews_csv['Text'].apply(lambda x: remove_stopwords(x))
reviews_csv['Text'] = reviews_csv['Text'].apply(lambda x: remove_extra_white_spaces(x))
reviews_csv['Text'] = reviews_csv['Text'].apply(lambda x: lemmatizing(x))


In [19]:
X = reviews_csv['Text']

label_map = {
    'positive': 1,
    'negative': 0,
}
reviews_csv['sentiment_label'] = reviews_csv['Sentiment'].map(label_map)
ylabels = reviews_csv['sentiment_label']

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.25, random_state=42)

In [28]:
pipe = Pipeline([#("cleaner", predictors())
                 ('vectorizer', tfvectorizer),
                 ('classifier', classifier)])

In [29]:
pipe.fit(X_train,y_train)



In [30]:
pred = pipe.predict(X_test)
f1_score(y_test, pred)

0.9364161849710984

In [31]:
print("F1 score: ", f1_score(y_test, pred))
print("PR_AUC score: ", average_precision_score(y_test, pred))
print("ROC_AUC score: ", roc_auc_score(y_test, pred))
print("Accuracy :",pipe.score(X_test, y_test))

F1 score:  0.9364161849710984
PR_AUC score:  0.9131134755677366
ROC_AUC score:  0.8539450941983062
Accuracy : 0.9030124908155768


# **Pattern**

In [39]:
import pattern
from pattern.en import parse
from pattern.en import pprint

In [42]:
from pattern.en import sentiment
print(sentiment("The course of true love never did run smooth."))

(0.4166666666666667, 0.5833333333333334)


In [None]:
polarity = []

for tweet in reviews_csv['Text'].to_list():
    s = sentiment(tweet)
    polarity.append(s[0])

# add probability and sentiment predictions to tweets dataframe
#tweets['probability'] = probs
#tweets['sentiment'] = sentiments
polarity

In [57]:
pattern_sentiment = []
for score in polarity:
  if score < 0:
    pattern_sentiment.append(0)
  elif score >= 0:
    pattern_sentiment.append(1)

pattern_sentiment

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


In [58]:
len(polarity)

5444

In [59]:
print("F1 score: ", f1_score(reviews_csv['sentiment_label'], pattern_sentiment))
print("PR_AUC score: ", average_precision_score(reviews_csv['sentiment_label'], pattern_sentiment))
print("ROC_AUC score: ", roc_auc_score(reviews_csv['sentiment_label'], pattern_sentiment))
print("Accuracy :", accuracy_score(reviews_csv['sentiment_label'], pattern_sentiment))

F1 score:  0.8754691231661549
PR_AUC score:  0.8050572447178244
ROC_AUC score:  0.6543468189427947
Accuracy : 0.7988611315209405


In [None]:
F1 score:  0.8740791896869244
PR_AUC score:  0.8108267234637438
ROC_AUC score:  0.6666363658698375
Accuracy : 0.799044819985305