# Machine Learning Sentiment Analysis

References to similar ML implementations:

https://github.com/tthustla/yet_another_tiwtter_sentiment_analysis_part1/blob/master/Yet_Another_Twitter_Sentiment_Analysis_part1-Copy1.ipynb

https://towardsdatascience.com/fine-grained-sentiment-analysis-in-python-part-1-2697bb111ed4

https://sajalsharma.com/portfolio/sentiment_analysis_tweets

In [None]:
import numpy as np
import json

# Import the training, validation and test splits (paths are relative to the
# working directory).
training_file = '../scripts/train.json'
valid_file = '../scripts/valid.json'
test_file = '../scripts/test.json'

# Decode explicitly as UTF-8: JSON is UTF-8 by specification, and relying on
# the platform default encoding can fail or garble non-ASCII tweet text.
with open(training_file, 'r', encoding='utf-8') as json_file:
    training = json.load(json_file)

with open(valid_file, 'r', encoding='utf-8') as json_file:
    valid = json.load(json_file)

with open(test_file, 'r', encoding='utf-8') as json_file:
    test = json.load(json_file)

# Every file keeps its tweets under the 'data' key; the cleaned text and the
# sentiment label of each tweet are pulled out into parallel lists.
training_data = training['data']
training_texts = [tweet['cleaned_text'] for tweet in training_data]
training_labels = [tweet['sentiment'] for tweet in training_data]

test_data = test['data']
test_texts = [tweet['cleaned_text'] for tweet in test_data]
test_labels = [tweet['sentiment'] for tweet in test_data]

valid_data = valid['data']
valid_texts = [tweet['cleaned_text'] for tweet in valid_data]
valid_labels = [tweet['sentiment'] for tweet in valid_data]

# Merge the training and validation splits into one training set.
# axis=None flattens the inputs before joining them into a 1-D array.
train_data = np.concatenate((training_data, valid_data), axis=None)
train_texts = np.concatenate((training_texts, valid_texts), axis=None)
train_labels = np.concatenate((training_labels, valid_labels), axis=None)

### Import lexicon

In [None]:
# Load the sentiment lexicon. The file is decoded explicitly as UTF-8 so the
# result does not depend on the platform default encoding.
with open('../../data/senticon-en.json', 'r', encoding='utf-8') as json_file:
    lexicon = json.load(json_file)

# The lexicon is split into two mappings, looked up by token string.
lexicon_pos = lexicon['positive']
lexicon_neg = lexicon['negative']

### Manual bag-of-words (BoW), following https://sajalsharma.com/portfolio/sentiment_analysis_tweets

In [None]:
# English stopwords (NLTK list); words in this set are the ones to be removed.
from nltk.corpus import stopwords

# NOTE: this rebinds the name `stopwords` from the NLTK corpus reader to a
# plain set of words. After this line the corpus reader is no longer reachable
# under that name; the import above has to run again before this line can be
# re-executed.
stopwords = set(stopwords.words('english'))

#generate Bag of Words
def generateBoW(data, ignoreStopwords):
    """Count how often every token string occurs across all tweets.

    Args:
        data: iterable of tweets; each tweet is a mapping whose 'tokens'
            entry is an iterable of mappings carrying the word under 'token'.
        ignoreStopwords: when truthy, words contained in the module-level
            `stopwords` collection are left out of the counts.

    Returns:
        dict mapping each counted word to its number of occurrences.
    """
    frequencies = {}
    for tweet in data:
        for entry in tweet['tokens']:
            term = entry['token']
            # Guard clause: skip stopwords only when filtering is requested.
            if ignoreStopwords and term in stopwords:
                continue
            frequencies[term] = frequencies.get(term, 0) + 1
    return frequencies

# Word frequencies over the merged training set, with stopwords filtered out.
bow_train = generateBoW(train_data, ignoreStopwords=True)
# Number of distinct words that were counted.
len(bow_train)

### Generate manual feature extractors 

In [None]:

# syntactical features
def get_syntactic_features(data):
    """Count selected token tags for every tweet.

    For each tweet the number of tokens tagged as noun, adjective, adverb
    and emphasis is counted; tokens with any other tag are ignored.

    Args:
        data: iterable of tweets; each tweet is a mapping whose 'tokens'
            entry is an iterable of mappings carrying the tag under 'tag'.

    Returns:
        list with one dict per tweet, holding the integer counts under the
        keys 'nouns', 'adjectives', 'adverbs' and 'emphasis'.
    """
    # Tag value -> name of the feature it contributes to.
    tag_to_feature = {
        'NOUN': 'nouns',
        'ADJECTIVE': 'adjectives',
        'ADVERB': 'adverbs',
        'EMPHASIS': 'emphasis',
    }

    per_tweet_counts = []
    for tweet in data:
        # Every feature is present in the result, even when its count is 0.
        counts = dict.fromkeys(tag_to_feature.values(), 0)
        for token in tweet['tokens']:
            feature = tag_to_feature.get(token['tag'])
            if feature is not None:
                counts[feature] += 1
        per_tweet_counts.append(counts)

    return per_tweet_counts

# lexicon features
def get_lexicon_features(data, pos_lexicon=None, neg_lexicon=None):
    """Build lexicon-based polarity features for every tweet.

    Each token is looked up in the positive lexicon first and, only if it is
    not found there, in the negative lexicon. With p = number of positive
    hits and n = number of negative hits, the features per tweet are:

    - 'avg-pos' / 'avg-neg': mean score of the hits, rounded to 3 decimals
    - 'last-pos' / 'last-neg': score of the last hit
    - 'max-pos': largest positive score
    - 'max-neg': smallest (i.e. most negative) negative-lexicon score
    - 'polarity': 1 - n/p if p > n > 0; mean positive score if p > n == 0;
      p/n - 1 if n > p > 0; mean negative score if n > p == 0; 0 if p == n
      (the two ratio forms are rounded to 3 decimals)

    A feature whose hit list is empty is 0.

    Args:
        data: iterable of tweets; each tweet is a mapping whose 'tokens'
            entry is an iterable of mappings carrying the word under 'token'.
        pos_lexicon: mapping word -> score for positive words. Defaults to
            the module-level `lexicon_pos`.
        neg_lexicon: mapping word -> score for negative words. Defaults to
            the module-level `lexicon_neg`.

    Returns:
        list with one feature dict per tweet. Every dict contains all seven
        keys, including for tweets without tokens.

    Raises:
        ValueError / TypeError: if a lexicon score cannot be converted with
            float().
    """
    # Fall back to the notebook-level lexicon only when none is supplied.
    pos_lexicon = lexicon_pos if pos_lexicon is None else pos_lexicon
    neg_lexicon = lexicon_neg if neg_lexicon is None else neg_lexicon

    lexicon_feature_dict = []
    for tweet in data:
        pos_pol = []
        neg_pol = []
        for token in tweet['tokens']:
            word = token['token']
            if word in pos_lexicon:
                pos_pol.append(float(pos_lexicon[word]))
            elif word in neg_lexicon:
                neg_pol.append(float(neg_lexicon[word]))

        # Features are derived once per tweet, after all tokens have been
        # scanned. Doing this inside the token loop repeated the work for
        # every token and left tweets without tokens with an empty dict.
        p = len(pos_pol)
        n = len(neg_pol)
        tweet_dict = {}
        tweet_dict['avg-pos'] = np.round(np.mean(pos_pol), 3) if p > 0 else 0
        tweet_dict['avg-neg'] = np.round(np.mean(neg_pol), 3) if n > 0 else 0
        tweet_dict['last-pos'] = pos_pol[-1] if p > 0 else 0
        tweet_dict['last-neg'] = neg_pol[-1] if n > 0 else 0
        tweet_dict['max-pos'] = max(pos_pol) if p > 0 else 0
        # min() on purpose: this picks the most negative score.
        tweet_dict['max-neg'] = min(neg_pol) if n > 0 else 0

        # Tweet polarity from the balance of positive and negative hits.
        if p > n:
            if n > 0:
                tweet_dict['polarity'] = np.round(1 - n / p, 3)
            else:
                tweet_dict['polarity'] = np.mean(pos_pol)
        elif p < n:
            if p > 0:
                tweet_dict['polarity'] = np.round(p / n - 1, 3)
            else:
                tweet_dict['polarity'] = np.mean(neg_pol)
        else:
            tweet_dict['polarity'] = 0

        lexicon_feature_dict.append(tweet_dict)

    return lexicon_feature_dict
            
            
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn

from nltk.corpus import opinion_lexicon
# Positive and negative word lists from NLTK's opinion lexicon corpus.
# NOTE(review): neither name is referenced anywhere else in the visible part
# of this notebook - confirm whether these are still needed.
positive_words = opinion_lexicon.positive()
negative_words = opinion_lexicon.negative()

In [None]:
# Extract the per-tweet syntactic count dictionaries for the merged training
# set and for the test set.
train_synt_dict = get_syntactic_features(train_data)
test_synt_dict = get_syntactic_features(test_data)
# Convert the feature dictionaries to a matrix representation so that they can
# be used by the sklearn ML algorithms, see
# https://scikit-learn.org/stable/modules/feature_extraction.html
from sklearn.feature_extraction import DictVectorizer

# NOTE: `vectorizer` is a shared notebook-level name; this cell rebinds it.
vectorizer = DictVectorizer()

# The vectorizer is fitted on the training dictionaries only; the test
# dictionaries are transformed with that same fitted mapping.
train_synt_features = vectorizer.fit_transform(train_synt_dict)
test_synt_features = vectorizer.transform(test_synt_dict)
# Leftover debugging line; `train_custom_features` is not defined in the
# visible part of this notebook.
#train_custom_features.toarray()

In [None]:
# extract n-gram features
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# scikit-learn documents `stop_words` as 'english', a list, or None. Recent
# releases validate the parameter type and reject a set, so the stopword set
# is handed over as a (sorted, hence reproducible) list.
stop_word_list = sorted(stopwords)

# Counts of uni-, bi- and tri-grams, capped at the 100000 most frequent ones.
vectorizer = CountVectorizer(stop_words=stop_word_list, max_features=100000, ngram_range=(1,3))
#vectorizer = TfidfVectorizer(stop_words=stop_word_list)

# Fit the vocabulary on the training texts only; the test texts are encoded
# with that same vocabulary.
train_ngram_features = vectorizer.fit_transform(train_texts)
test_ngram_features = vectorizer.transform(test_texts)

In [None]:
# Extract the per-tweet lexicon feature dictionaries for the merged training
# set and for the test set.
train_lex_dict = get_lexicon_features(train_data)
test_lex_dict = get_lexicon_features(test_data)

# NOTE: `vectorizer` is a shared notebook-level name; this cell rebinds it to
# a fresh DictVectorizer.
vectorizer = DictVectorizer()

# The vectorizer is fitted on the training dictionaries only; the test
# dictionaries are transformed with that same fitted mapping.
train_lex_features = vectorizer.fit_transform(train_lex_dict)
test_lex_features = vectorizer.transform(test_lex_dict)

In [None]:
# Concatenate the n-gram, syntactic and lexicon feature matrices column-wise.
# The block order is the same for train and test so that the columns line up.
# NOTE(review): `vstack` is imported but not used in the visible part of this
# notebook.
from scipy.sparse import hstack, vstack
training_combined = hstack([train_ngram_features, train_synt_features, train_lex_features])
test_combined = hstack([test_ngram_features, test_synt_features, test_lex_features])

### Classification

In [None]:
import re

import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier
#train_scaler = StandardScaler(with_mean=False)
#scaled_features = train_scaler.fit_transform(train_ngram_features)

# The logistic-regression loss was named 'log' up to scikit-learn 1.0. It was
# renamed to 'log_loss' in 1.1 and the old spelling was removed in 1.3, so the
# name is chosen to match the installed release.
sklearn_version = re.match(r'(\d+)\.(\d+)', sklearn.__version__)
if (int(sklearn_version.group(1)), int(sklearn_version.group(2))) >= (1, 1):
    logistic_loss = 'log_loss'
else:
    logistic_loss = 'log'

# Logistic regression trained with SGD at a constant learning rate. Because
# early_stopping is on, 15% of the training rows are held out internally to
# decide when to stop.
classifier = SGDClassifier(
    loss=logistic_loss,
    random_state=0,
    learning_rate='constant',
    eta0=0.02,
    max_iter=300,
    early_stopping=True,
    validation_fraction=0.15,
)
classifier.fit(training_combined, train_labels)

In [None]:
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
# Leftover experiments, kept for reference (feature scaling, and prediction
# with an object named `lr` that is not defined in the visible part of this
# notebook).
#test_scaler = StandardScaler(with_mean=False)
#test_scaled_features = test_scaler.fit_transform(test_ngram_features)
#test_predictions = lr.predict(test_combined)
# Predict labels for the combined test feature matrix with the fitted
# classifier.
test_predictions = classifier.predict(test_combined)

In [None]:
# Per-class and aggregate metrics as a nested dict (output_dict=True).
test_report = classification_report(test_labels, test_predictions, output_dict=True)
# get metrics of interest
# Recall averaged over all classes present in the report, in percent.
avg_rec = test_report['macro avg']['recall']*100
# These lookups assume the label set contains the strings 'negative' and
# 'positive'; a missing label raises KeyError.
f1_neg = test_report['negative']['f1-score']
f1_pos = test_report['positive']['f1-score']
# NOTE: this averages the F1 of the negative and positive classes only; any
# other class in the report does not contribute, although the printed label
# reads "Macro avgF1".
avg_f1 = np.mean([f1_neg, f1_pos])*100
acc = test_report['accuracy']*100
print(f'Macro avgRec: {avg_rec:.2f}%')
print(f'Macro avgF1: {avg_f1:.2f}%')
print(f'Acc: {acc:.2f}%')