# Sentiment Classification
The task of sentiment classification can be broadly divided into two parts:
1. Natural Language Processing
2. Sentiment Classification
    - Naive Bayes Classifier
    - Support Vector Machines
    - RoBERTa (AutoEncoder Transformers)

Let us first install all the prerequisites:

In [None]:
import sys
%pip install -U pip setuptools wheel
%pip install -U spacy
%pip install spacy[transformer]
%pip install pandas-profiling[notebook]
%pip install -U scikit-learn
%pip install -U matplotlib
%pip install -U pandas


In [None]:
# TODO: prep all the packages for NLP


In [None]:
%python -m spacy download en_core_web_sm
%python -m spacy download en_core_web_trf


In [None]:
#import packages
import pandas as pd
import matplotlib.pyplot as plt
import spacy
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the train and test splits from CSV files in the working directory.
raw_trainDF = pd.read_csv('train.csv')
raw_testDF = pd.read_csv('test.csv')

# Display the first five rows of the training data.
raw_trainDF.head()

In [None]:
# Work on copies so that columns added or rewritten later in the notebook
# never alter the raw frames that were loaded (and profiled) above.
trainDF = raw_trainDF.copy()
testDF = raw_testDF.copy()

In [None]:
# Build a pandas-profiling report for the raw training data, save it as HTML
# and show it inline as the cell output.
from pandas_profiling import ProfileReport

p_train = ProfileReport(raw_trainDF, title="Train profile report")
p_train.to_file(output_file="TrainReport.html")

p_train

In [None]:
# Same profiling report for the raw test data: write the HTML file, then
# display the report inline.
p_test = ProfileReport(raw_testDF, title="Test profile Report")
p_test.to_file(output_file="TestReport.html")

p_test

In [None]:
#Visualization For the EDA
#Train Set
# Count reviews per date. The x labels are taken from the grouped index so each
# bar is paired with its own date: groupby sorts its keys, whereas
# Series.unique() returns values in order of first appearance.
# NOTE(review): dates are grouped by their raw values; if they are strings the
# order is lexicographic, not chronological - confirm the column format.
sent = trainDF.groupby("ReviewAt")["Sentiment"].count()
review_dates = sent.index.astype(str)

fig, ax = plt.subplots(figsize=(15, 7))
ax.bar(review_dates, sent.values)
ax.tick_params(axis="x", labelrotation=70)
ax.set_xlabel("Date")
ax.set_ylabel("Count")
ax.set_title("Number of Reviews by time (Train)")
plt.show()


In [None]:
#Test Set
# Count reviews per date. The x labels are taken from the grouped index so each
# bar is paired with its own date: groupby sorts its keys, whereas
# Series.unique() returns values in order of first appearance.
sent = testDF.groupby("ReviewAt")["Sentiment"].count()
review_dates = sent.index.astype(str)

fig, ax = plt.subplots(figsize=(15, 7))
ax.bar(review_dates, sent.values)
ax.tick_params(axis="x", labelrotation=70)
ax.set_xlabel("Date")
ax.set_ylabel("Count")
ax.set_title("Number of Reviews by time (Test)")
plt.show()

In [None]:
#Visualizing of Sentiment Category
# Count each sentiment once per split instead of recomputing value_counts()
# inside the loop.
train_counts = trainDF.Sentiment.value_counts()
test_counts = testDF.Sentiment.value_counts()

# Sorted names give the same wedge order (and colours) on every run; iterating
# a set gives an arbitrary order.
sent_names = sorted(train_counts.index)
train_sent_cats = [train_counts[name] for name in sent_names]
# A category missing from the test split counts as 0 instead of raising KeyError.
test_sent_cats = [test_counts.get(name, 0) for name in sent_names]

fig, ax = plt.subplots(1, 2, figsize=(10, 5))
ax[0].pie(train_sent_cats, labels=sent_names, autopct='%1.1f%%')
ax[0].set_title("Sentiment Categories in Train Data")
ax[1].pie(test_sent_cats, labels=sent_names, autopct='%1.1f%%')
ax[1].set_title("Sentiment Categories in Test Data")
plt.show()

# Resplitting part
This part is tedious, as it repeats the splitting and merging process.

In [None]:
# Pick one training text to use as a running example in the cells below.
# The text column is `OriginalTweet` everywhere else in this notebook.
sample_text = trainDF.OriginalTweet[1]
print(sample_text)


In [None]:
# Load the English pipeline from spaCy
nlp = spacy.load("en_core_web_sm")

# Run the pipeline on every test text. Assign with item syntax: setting
# `testDF.nlp = ...` would only attach a Python attribute to the object
# instead of creating a DataFrame column.
testDF["nlp"] = testDF.OriginalTweet.apply(nlp)


In [None]:
# Noun phrases per document; the spaCy Doc attribute is `noun_chunks` (plural).
testDF["np"] = testDF.nlp.apply(lambda doc: [chunk.text for chunk in doc.noun_chunks])
# Lemmas of every token tagged as a verb.
testDF["vb"] = testDF.nlp.apply(
    lambda doc: [token.lemma_ for token in doc if token.pos_ == "VERB"]
)
print(testDF.np.head())
print(testDF.vb.head())


In [None]:
# Parse the sample text with the small English pipeline.
nlp = spacy.load("en_core_web_sm")
doc = nlp(sample_text)

# Syntax analysis: noun chunks and the lemmas of verb tokens.
noun_phrases = [chunk.text for chunk in doc.noun_chunks]
verb_lemmas = [token.lemma_ for token in doc if token.pos_ == "VERB"]
print("Noun phrase: ", noun_phrases)
print("Verbs:", verb_lemmas)

# Named entities found in the text, each with its label.
for entity in doc.ents:
    print(entity.text, entity.label_)
    

# PreProcessing

In [None]:
# Extra packages for the preprocessing section.
# contextualSpellCheck is imported by the spelling-correction cell below.
%pip install contextualSpellCheck
# NOTE(review): ipywidgets is not imported directly in this notebook -
# presumably needed for widget rendering; confirm it is still required.
%pip install ipywidgets

# Encoding the labels

In [None]:
# Label encoder for the classes in Sentiment
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the training labels only and reuse the same mapping for
# the test labels, so an integer always means the same class in both splits.
# transform() raises if the test split contains a label unseen in training.
encoder = LabelEncoder()
trainDF["encoded_sentiment"] = encoder.fit_transform(trainDF.Sentiment)
testDF["encoded_sentiment"] = encoder.transform(testDF.Sentiment)

# Item assignment creates real columns; show the encoded test labels.
testDF.encoded_sentiment

In [None]:
# Spelling correction demo on the sample text.
import contextualSpellCheck

spell_config = {"max_edit_dist": 5}

# Fresh English pipeline with the contextual spell checker appended.
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe("contextual spellchecker", config=spell_config)

doc = nlp(sample_text)

# Print the spell checker's outcome for the sample text.
print(doc._.outcome_spellCheck)

In [None]:
import string
import re

import contextualSpellCheck  # importing registers the spell-checker factory with spaCy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load("en_core_web_sm")

# Add the contextual spell checker to the pipeline. Factory names are matched
# exactly, so this uses the same lowercase name as the spelling-correction
# cell above.
nlp.add_pipe("contextual spellchecker", config={"max_edit_dist": 5})

# String of ASCII punctuation characters
punctuations = string.punctuation

# Set of English stop words from spaCy (imported explicitly rather than
# reached through `spacy.lang.en...` attribute access).
stopwords = STOP_WORDS
# Remove URLs
def remove_urls(text):
    """Return ``text`` with every http/https URL removed.

    A URL is any run of non-whitespace characters containing ``http:`` or
    ``https:``; the whole run is deleted. The surrounding whitespace is left
    untouched.
    """
    # The pattern must end in `\S*` (rest of the URL); the previous `\S&`
    # only matched a URL that was followed by a literal ampersand.
    return re.sub(r"\S*https?:\S*", "", text, flags=re.MULTILINE)
#create tokenizer
def spacy_tokenizer(sentence):
    """Tokenize ``sentence`` with the module-level ``nlp`` pipeline.

    Steps: lemmatize and lowercase each token (proper nouns keep their
    lowercased surface form), drop stop words and punctuation, strip URLs,
    and discard tokens left empty by the URL removal.

    Returns a list of token strings.
    """
    doc = nlp(sentence)

    # Lemmatize. The proper-noun check must look at the part-of-speech tag;
    # comparing the lemma text to "PROPN" was always true.
    tokens = [
        word.lemma_.lower().strip() if word.pos_ != "PROPN" else word.lower_
        for word in doc
    ]

    # Remove stop words and punctuation
    tokens = [word for word in tokens if word not in stopwords and word not in punctuations]

    # Remove links, then drop the empty strings that removal leaves behind so
    # they do not become a feature of their own.
    tokens = [remove_urls(word) for word in tokens]
    return [word for word in tokens if word]

spacy_tokenizer(sample_text)


# (BoW) Bag of Words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer: unigram counts over the tokens produced by the
# custom spaCy tokenizer.
bow_vector = CountVectorizer(
    ngram_range=(1, 1),
    tokenizer=spacy_tokenizer,
)


In [None]:
from sklearn.base import TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline


#custom class
class predictors(TransformerMixin):
    """Pipeline step that normalizes raw text before vectorization.

    Stateless: ``fit`` learns nothing and ``transform`` strips surrounding
    whitespace and lowercases every text.
    """

    @staticmethod
    def clean_text(text):
        """Return ``text`` stripped of surrounding whitespace and lowercased."""
        return text.strip().lower()

    def transform(self, X, **transform_params):
        """Return a list with the cleaned version of every text in ``X``."""
        # clean_text lives on the class, so it must be reached through self.
        return [self.clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; return ``self`` so the step can be chained."""
        return self

    def get_params(self, deep=True):
        """This transformer has no parameters."""
        return {}


# The objects below belong at module level: inside the class body the name
# `predictors` does not exist yet, so building the pipeline there fails.
bow_vector = CountVectorizer(tokenizer=spacy_tokenizer, ngram_range=(1, 1))

#Multinomial Naive Bayes Classifier
classifier = MultinomialNB()

#pipeline: clean text -> bag-of-words counts -> classifier
pipe = Pipeline([("cleaner", predictors()),
                 ("vectorizer", bow_vector),
                 ("classifier", classifier)])

# Statistical Model Training

In [None]:
# Features are the raw texts; targets are the integer-encoded sentiments.
X_train, y_train = trainDF.OriginalTweet, trainDF.encoded_sentiment
X_test, y_test = testDF.OriginalTweet, testDF.encoded_sentiment

In [None]:
# Multinomial Naive Bayes classifier on bag-of-words features.
from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB()

# Steps: clean text -> count vectors -> Naive Bayes.
nb_steps = [
    ("cleaner", predictors()),
    ("vectorizer", bow_vector),
    ("classifier", classifier),
]
pipeNB = Pipeline(nb_steps)

# Train the model
pipeNB.fit(X_train, y_train)


In [None]:
#SVM Classifier
from sklearn.svm import SVC
classifier_svm = SVC()

# The final step must be the SVC instance; passing `classifier` here would
# train a second Naive Bayes model under the SVM name.
pipeSVM = Pipeline([("cleaner", predictors()),
                    ("vectorizer", bow_vector),
                    ("classifier", classifier_svm)])

#model Generation
pipeSVM.fit(X_train, y_train)



# Time For Neural Network Shenanigans

In [None]:
#import
import spacy
import pandas as pd
import re
from spacy.tokens import DocBin
from tqdm import tqdm

In [None]:
#PreProcessing data
def remove_url(text):
    """Strip http/https URLs from ``text`` and return the result.

    Every run of non-whitespace characters that contains ``http:`` or
    ``https:`` is replaced by the empty string.
    """
    url_pattern = r"\S*https?:\S*"
    return re.sub(url_pattern, "", text, flags=re.MULTILINE)

def preprocess(df, embed):
    """Clean the texts in ``df`` and turn them into labelled spaCy docs.

    Parameters
    ----------
    df : DataFrame with ``OriginalTweet`` (text) and ``Sentiment`` (label)
        columns. It is not modified; a cleaned copy is returned.
    embed : name or path of the spaCy pipeline to load with ``spacy.load``.

    Returns
    -------
    (cleaned_df, docs) : the copy of ``df`` with URLs removed from
        ``OriginalTweet``, and a list of spaCy ``Doc`` objects whose ``cats``
        hold a one-hot positive/negative/neutral annotation.
    """
    # Work on a copy so the caller's frame is left untouched and the function
    # can be called repeatedly on the same input.
    df = df.copy()
    df["OriginalTweet"] = df.OriginalTweet.apply(remove_url)
    data = tuple(zip(df.OriginalTweet.tolist(), df.Sentiment.tolist()))

    nlp = spacy.load(embed)
    if data:  # an empty frame has no first example to show
        print(data[0])

    # 'Positive' and 'Neutral' map to their own category; every other label
    # is treated as negative.
    # NOTE(review): confirm that no further label values exist in the data -
    # they would all be counted as negative here.
    label_to_cat = {"Positive": "positive", "Neutral": "neutral"}

    docs = []
    for doc, label in tqdm(nlp.pipe(data, as_tuples=True), total=len(data)):
        active = label_to_cat.get(label, "negative")
        for cat in ("positive", "negative", "neutral"):
            doc.cats[cat] = 1 if cat == active else 0
        docs.append(doc)
    return df, docs


# Spacy Config

In [None]:
#init from base_config
%python -m spacy init fill-config ../config/base_config.cfg ../config/config.cfg

In [None]:
%python -m spacy debug data ../config/config.cfg

Convert the dataframes to spacy files for training

In [None]:
from pathlib import Path

# Write the .spacy files where the training command reads them
# (../data/spacy_data), not at the filesystem root.
spacy_data_dir = Path("../data/spacy_data")
spacy_data_dir.mkdir(parents=True, exist_ok=True)

#preprocess df for train data
trainData, trainDocs = preprocess(trainDF, "en_core_web_sm")
#save to disc
doc_bin = DocBin(docs=trainDocs)
doc_bin.to_disk(spacy_data_dir / "textcat_train.spacy")


#preprocess df for test data
testData, testDocs = preprocess(testDF, "en_core_web_sm")
#save to disc
doc_bin = DocBin(docs=testDocs)
doc_bin.to_disk(spacy_data_dir / "textcat_valid.spacy")

# Model Training
we'll see about that

In [None]:
#View the entities in the train and dev docs
# Same locations the training command reads from. The dev set is the
# *valid* file; previously both names pointed at the train file.
train_loc = "../data/spacy_data/textcat_train.spacy"
dev_loc = "../data/spacy_data/textcat_valid.spacy"

#Load library
nlp = spacy.load('en_core_web_sm')

# Load each serialized DocBin and report how many docs and entities it holds.
for split_name, loc in (("TRAIN", train_loc), ("DEV", dev_loc)):
    doc_bin = DocBin().from_disk(loc)
    docs = list(doc_bin.get_docs(nlp.vocab))
    entities = sum(len(doc.ents) for doc in docs)
    print(f"{split_name} docs: {len(docs)} with {entities} entities")

In [None]:
#train model
%python -m spacy train ../config/config.cfg --verbose --output ../data/textcat_output --path.train ../data/spacy_data/textcat_train.spacy --paths.dev ../data/spacy_data/textcat_valid.spacy

In [None]:
# Load the best checkpoint written by training and spot-check one test example.
nlp_model = spacy.load("../data/textcat_output/model-best")

test_text = testData.OriginalTweet.tolist()
test_cats = testData.Sentiment.tolist()

# Example used for the spot check.
sample_idx = 20
doc_test = nlp_model(test_text[sample_idx])

print("Text: "+ test_text[sample_idx])
print("Orig Cat: "+ test_cats[sample_idx])
print("Predicted Cats: ")
print(doc_test.cats)


# Pre-Trained BERT (RoBERTa)

In [None]:
#Convert the train and test dataframes to .spacy files for training
from pathlib import Path

# Write to the exact files the RoBERTa training command reads:
# ../data/spacy_data/textcat_roberta_{train,valid}.spacy
spacy_data_dir = Path("../data/spacy_data")
spacy_data_dir.mkdir(parents=True, exist_ok=True)

# Train data: reuses the docs already built by the earlier preprocess() call
# on trainDF.
doc_bin = DocBin(docs=trainDocs)
doc_bin.to_disk(spacy_data_dir / "textcat_roberta_train.spacy")

#Preprocess the dataframes for test data
test_data_roberta, testDocs = preprocess(testDF, "en_core_web_trf")

#Save data and docs in a binary file to disc
doc_bin = DocBin(docs=testDocs)
doc_bin.to_disk(spacy_data_dir / "textcat_roberta_valid.spacy")

# Model Training

In [None]:
%python -m spacy train ../config/config.cfg --verbose --output ../data/textcat_roberta_output --paths.train ../data/spacy_data/textcat_roberta_train.spacy --paths.dev ../data/spacy_data/textcat_roberta_valid.spacy

In [None]:
#Verification
# Load the best RoBERTa checkpoint and spot-check one test example.
nlp_model = spacy.load("../data/textcat_roberta_output/model-best")
test_text = test_data_roberta.OriginalTweet.tolist()
test_cats = test_data_roberta.Sentiment.tolist()
doc_test = nlp_model(test_text[20])
print("Text: "+ test_text[20])
# Show the true category here, not the text a second time.
print("Og Cat: "+ test_cats[20])
print("Predicted Cats: ")
print(doc_test.cats)

In [None]:
#Preprocess the validation dataframe for both models
# NOTE(review): `validDF` is not defined in any visible cell - confirm where
# the validation split is created before running this cell.

#preprocess df for valid data
valid_data, valid_docs = preprocess(validDF, "en_core_web_sm")
# The RoBERTa variant uses the transformer pipeline, like the RoBERTa test
# data above, and keeps its docs under a separate name instead of
# overwriting `valid_docs`.
valid_data_roberta, valid_docs_roberta = preprocess(validDF, "en_core_web_trf")


In [None]:
#verify for english model
# Load the best checkpoint of the small English model (`spacy.load`).
nlp_model = spacy.load("../data/textcat_output/model-best")
valid_text = valid_data.OriginalTweet.tolist()
valid_cats = valid_data.Sentiment.tolist()
doc_valid = nlp_model(valid_text[50])
print("Text: "+ valid_text[50])
print("OG cat: "+ valid_cats[50])

print("Predicted Cats:")
print(doc_valid.cats)



Done! The next cell verifies the RoBERTa model on the same validation example.

In [None]:
# Run the best RoBERTa checkpoint on the same validation example (index 50)
# and print the true category next to the predicted scores.
nlp_model_bert = spacy.load("../data/textcat_roberta_output/model-best")

sample_idx = 50
doc_valid_bert = nlp_model_bert(valid_text[sample_idx])

print("Text: "+ valid_text[sample_idx])
print("Orig Cat: "+ valid_cats[sample_idx])
print(" Predicted Cats:")
print(doc_valid_bert.cats)

# Generating a Report / Performance Analysis

In [None]:
# Classification report for the Naive Bayes pipeline.
from sklearn.metrics import classification_report

# Predict on the held-out test texts.
predicted = pipeNB.predict(X_test)

# Per-class precision / recall / F1.
class_names = ['Negative', 'Neutral', 'Positive']
print("Naive Bayes Model:\n")
print(classification_report(y_test, predicted, target_names=class_names))



In [None]:
# Classification report for the SVM pipeline.
from sklearn.metrics import classification_report

# Predict on the held-out test texts.
predicted = pipeSVM.predict(X_test)

# Per-class precision / recall / F1.
class_names = ['Negative', 'Neutral', 'Positive']
print("Support Vector Machine:\n")
print(classification_report(y_test, predicted, target_names=class_names))