In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import string
from sklearn.model_selection import train_test_split
import re
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, f1_score


[nltk_data] Downloading package punkt to /home/spyros/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/spyros/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [35]:
# Integer code assigned to each sentiment label found in the CSV.
SENTIMENT_CODES = {'positive': 0, 'neutral': 1, 'negative': 2}
cols2Keep = ['airline_sentiment', 'text']

# Build the working frame in one chain (no inplace edits on a column slice):
# keep the label and the tweet text, and give the label a shorter name.
df = (
    pd.read_csv('Tweets.csv', index_col=None)[cols2Keep]
    .rename(columns={'airline_sentiment': 'sentiment'})
)

# Encode the label column only. A frame-wide replace would also rewrite any
# tweet whose entire text happens to be 'neutral', 'positive' or 'negative'.
# Labels missing from SENTIMENT_CODES become NaN and would show up in the
# value counts displayed below.
df['sentiment'] = df['sentiment'].map(SENTIMENT_CODES)

display(df.shape, df.head(), df['sentiment'].value_counts())

(14640, 2)

Unnamed: 0,sentiment,text
0,1,@VirginAmerica What @dhepburn said.
1,0,@VirginAmerica plus you've added commercials t...
2,1,@VirginAmerica I didn't today... Must mean I n...
3,2,@VirginAmerica it's really aggressive to blast...
4,2,@VirginAmerica and it's a really big bad thing...


2    9178
1    3099
0    2363
Name: sentiment, dtype: int64

In [36]:
# balance the classes
# Sorting by the integer label puts the largest class (code 2) last, so cutting
# the frame after N_ROWS_KEPT rows keeps classes 0 and 1 whole and trims class 2.
N_ROWS_KEPT = 8400
BALANCE_SEED = 42  # fixed seed so the same rows are kept on every run

df = (
    df.sample(frac=1, random_state=BALANCE_SEED)   # shuffle first: the trimmed class becomes a random subset
      .sort_values(by='sentiment', kind='mergesort')  # stable sort keeps that shuffled order inside each class
      .iloc[:N_ROWS_KEPT, :]
      .sample(frac=1, random_state=BALANCE_SEED)   # mix the classes again after the cut
      .reset_index(drop=True)
)
display(df.shape, df.head(), df['sentiment'].value_counts())

(8400, 2)

Unnamed: 0,sentiment,text
0,0,“@JetBlue: @FinleyBklynCFS So glad to hear. Th...
1,0,@USAirways that's why u guys are my #1 choice.
2,0,@AmericanAir Haha I had a boarding pass for 12...
3,2,@USAirways if I try and call your reservations...
4,0,@united @jsumiyasu I am thankful to the Unite...


1    3099
2    2938
0    2363
Name: sentiment, dtype: int64

In [38]:
stop = stopwords.words('english')

# Negation terms carry the sentiment of a sentence ("not good"), so they must
# survive stop-word removal.
excluding = ['against', 'not', 'don', "don't",'ain', 'aren', "aren't", 'couldn', "couldn't",
             'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 
             'haven', "haven't", 'isn', "isn't", 'mightn', "mightn't", 'mustn', "mustn't",
             'needn', "needn't",'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', 
             "weren't", 'won', "won't", 'wouldn', "wouldn't"]

# Shrink the stop-word list: every negation term listed above is taken out of it.
stop_words = [word for word in stop if word not in excluding]

# Extend the punctuation characters with symbols that string.punctuation lacks.
# Only characters that are still missing are appended, so re-running this cell
# does not keep growing string.punctuation ('.' is already part of it, so
# nothing has to be added for '...').
for extra_symbol in '£’':
    if extra_symbol not in string.punctuation:
        string.punctuation += extra_symbol

    
# Function to clean the text
def _wordnet_pos(tag):
    """Map the first letter of an nltk.pos_tag tag to a WordNet POS constant (noun is the fallback)."""
    return {'J': wordnet.ADJ, 'V': wordnet.VERB, 'R': wordnet.ADV}.get(tag[:1], wordnet.NOUN)


def cleaningText(text, stem, lemm):
    """Normalise one piece of text and return it as a space-separated string.

    Steps: lowercase and strip, drop HTML tags, replace every character of
    ``string.punctuation`` with a space, collapse whitespace, tokenize, drop the
    tokens found in ``stop_words``, then optionally stem and/or lemmatize.

    Parameters
    ----------
    text : str
        Raw text to clean.
    stem : bool
        When truthy, every token is stemmed with the English Snowball stemmer.
    lemm : bool
        When truthy, every token is POS-tagged and lemmatized with WordNet.
        NOTE(review): this needs NLTK's WordNet and POS-tagger resources, which
        the download calls visible in this notebook do not fetch - confirm they
        are installed before enabling it.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    text = text.lower().strip()
    # Tags have to go before punctuation: once '<' and '>' have been replaced
    # by spaces the tag pattern can no longer match anything.
    text = re.sub(r'<.*?>', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub(r'\s+', ' ', text)  # collapse runs of spaces/tabs/newlines

    # Tokenize once and keep working on the token list.
    words = [w for w in word_tokenize(text) if w not in stop_words]

    if stem:
        snow = SnowballStemmer('english')
        words = [snow.stem(w) for w in words]

    if lemm:
        # Built only when needed so the default path does not touch WordNet.
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word, _wordnet_pos(tag))
                 for word, tag in nltk.pos_tag(words)]

    return " ".join(words)

# Coerce every entry to str so cleaningText can call string methods on it.
df['text'] = df['text'].astype(str)

# Stemming is switched off: it strips suffixes in a way that did not help here.
# NOTE(review): lemmatization is passed as False as well, although the original
# note says the lemmatizer does a better job - confirm which setting is intended.
df['textClean'] = df['text'].apply(lambda tweet: cleaningText(tweet, stem=False, lemm=False))

In [39]:
# pull the cleaned tweets and their integer labels out of the frame as arrays
labels = df['sentiment'].to_numpy()
input_texts = df['textClean'].to_numpy()

In [40]:
# split my data
# A fixed seed makes the split - and every count and score derived from it -
# reproducible after a kernel restart.
SPLIT_SEED = 42
train_text, test_text, Ytrain, Ytest = train_test_split(
    input_texts, labels, test_size=0.05, random_state=SPLIT_SEED
)
len(Ytrain), len(Ytest)

(7980, 420)

In [41]:
# index 0 is reserved for words that never occur in the training texts
word2index = {'<unknown>': 0}

# populate word2index: each new token receives the next free integer
for text in train_text:
    for token in text.split():
        # setdefault inserts only unseen tokens; the dict size at that moment
        # is exactly the next unused index
        word2index.setdefault(token, len(word2index))

# next index that would be handed out (kept for parity with the running counter)
counter = len(word2index)

In [42]:
# vocabulary size, counting the reserved '<unknown>' entry stored at index 0
voc_len = len(word2index)
print(voc_len)

10768


In [43]:
# convert data into integer format: one list of word indices per sentence

# every training token is in word2index by construction, so a plain lookup is enough
integerTrainText = [
    [word2index[token] for token in text.split()]
    for text in train_text
]

# test sentences may contain unseen words; those fall back to index 0 ('<unknown>')
integerTestText = [
    [word2index.get(token, 0) for token in text.split()]
    for text in test_text
]

In [None]:
# initialize A (word-to-word transition counts) and pi (first-word counts),
# one pair per sentiment class
V = len(word2index)

# Every cell starts at 1 instead of 0 (add-one smoothing), so no probability is
# zero after normalisation and the later np.log never sees 0.
# Each A is a dense V x V float64 array: with the V = 10768 printed above that
# is roughly 0.93 GB per matrix.
A0, A1, A2 = (np.ones((V, V)) for _ in range(3))
pi0, pi1, pi2 = (np.ones(V) for _ in range(3))

In [22]:
# compute counts for A and pi
# This is no other than the fitting part of any sklearn interface approximation
def compute_counts(integerText, A, pi):
    """Accumulate first-word and word-to-word transition counts in place.

    Parameters
    ----------
    integerText : iterable of iterables of int
        Sentences given as sequences of word indices.
    A : 2-D array
        ``A[i, j]`` is incremented for every occurrence of index ``j``
        directly after index ``i`` inside a sentence.
    pi : 1-D array
        ``pi[i]`` is incremented for every sentence that starts with index ``i``.

    Returns
    -------
    None
        ``A`` and ``pi`` are modified in place; empty sentences add nothing.
    """
    for sentence in integerText:
        word_ids = list(sentence)
        if not word_ids:
            continue

        # the opening word of the sentence feeds the initial-state counts
        pi[word_ids[0]] += 1

        # every adjacent (previous, current) pair is one observed transition
        for previous, current in zip(word_ids, word_ids[1:]):
            A[previous, current] += 1


# Fit one Markov model per class: only the training sentences carrying that
# label contribute to the class's A / pi arrays.
# compute_counts adds to the arrays in place, so running this cell a second
# time without re-initialising them counts every sentence twice.
for class_label, transition_counts, initial_counts in ((0, A0, pi0), (1, A1, pi1), (2, A2, pi2)):
    class_sentences = [t for t, y in zip(integerTrainText, Ytrain) if y == class_label]
    compute_counts(class_sentences, transition_counts, initial_counts)

In [23]:
# Turn the counts into probabilities, in place: every row of a transition
# matrix and every initial-state vector is rescaled to sum to 1.
for transition_counts, initial_counts in ((A0, pi0), (A1, pi1), (A2, pi2)):
    # keepdims keeps the row sums as a (V, 1) column so the division is applied row by row
    transition_counts /= transition_counts.sum(axis=1, keepdims=True)
    initial_counts /= initial_counts.sum()

In [24]:
# Work in log space: a sentence score becomes a sum of logs instead of a
# product of many small probabilities, which would underflow.
logA0, logA1, logA2 = (np.log(matrix) for matrix in (A0, A1, A2))
logpi0, logpi1, logpi2 = (np.log(vector) for vector in (pi0, pi1, pi2))

In [25]:
# prior probabilities: share of each class among the training labels
total = len(Ytrain)
count0, count1, count2 = (np.sum(np.asarray(Ytrain) == label) for label in (0, 1, 2))

p0, p1, p2 = (count / total for count in (count0, count1, count2))

# log priors are what gets added to the log-likelihood at prediction time
logp0, logp1, logp2 = np.log(p0), np.log(p1), np.log(p2)
p0, p1, p2

(0.2794486215538847, 0.3705513784461153, 0.35)

In [26]:
# build a classifier
class Classifier:
    """Bayes classifier with one first-order Markov model per class.

    Each class ``k`` is described by ``logAs[k]`` (log transition matrix),
    ``logpis[k]`` (log initial-word distribution) and ``logpriors[k]`` (log
    prior). A sentence gets the label whose log prior plus sentence
    log-likelihood is largest.
    """

    def __init__(self, logAs, logpis, logpriors):
        """Store the per-class log parameters; all three sequences are indexed by class."""
        self.logAs = logAs
        self.logpis = logpis
        self.logpriors = logpriors
        self.K = len(logpriors)  # number of classes

    def _compute_log_likelihood(self, input_, class_):
        """Return the log-likelihood of the index sequence ``input_`` under class ``class_``.

        The first index is scored with the class's initial distribution, every
        later index with the transition from its predecessor. An empty
        sequence scores 0.
        """
        transition_logs = self.logAs[class_]
        initial_logs = self.logpis[class_]

        score = 0
        previous = None
        for current in input_:
            # no predecessor yet -> initial distribution, otherwise a transition
            score += initial_logs[current] if previous is None else transition_logs[previous, current]
            previous = current

        return score

    def predict(self, inputs):
        """Return a float array holding the predicted class index for every sequence in ``inputs``."""
        predictions = np.zeros(len(inputs))
        for position, sequence in enumerate(inputs):
            # unnormalised log posterior of every class for this sequence
            log_posteriors = [
                self._compute_log_likelihood(sequence, label) + self.logpriors[label]
                for label in range(self.K)
            ]
            predictions[position] = np.argmax(log_posteriors)
        return predictions

In [27]:
# Position k of every list must hold the parameters of class k: predict()
# returns the list position with the best score as the class label.
clf = Classifier(
    logAs=[logA0, logA1, logA2],
    logpis=[logpi0, logpi1, logpi2],
    logpriors=[logp0, logp1, logp2],
)

In [34]:
# accuracy on the training sentences: share of predictions equal to the true label
Ptrain = clf.predict(integerTrainText)
train_accuracy = np.mean(Ptrain == Ytrain)
print(f"Train acc: {round(train_accuracy, 3)}")

Train acc: 0.966


In [33]:
# accuracy on the held-out sentences (unseen words were mapped to index 0)
Ptest = clf.predict(integerTestText)
test_accuracy = np.mean(Ptest == Ytest)
print(f"Test acc: {round(test_accuracy, 3)}")

Test acc: 0.707
