# IST 736 Text Mining
### HW1 - An Evaluation of Sentiment Classification Tools


Ryan Timbrook (RTIMBROO)  
DATE:10/12/2019<br>
Topic: Sentiment Analysis - AI Trend<br>


## 1. Objective
_____________________________________________________________________________________________
An Evaluation of Sentiment Classification Tools

Artificial Intelligence (AI) has become a popular topic recently. Assume that you are a consultant at a public relations firm, and a client of your firm would like you to evaluate the current public sentiment toward AI in social media like Facebook and Twitter. 

Since there are too many comments on social media, you can't manually collect and analyze them all. Fortunately you have discovered some free sentiment analysis tools, and now need to evaluate whether they are good enough to do sentiment analysis for your assigned task.

You need to collect a sample data set and choose two tools to compare their effectiveness in sentiment analysis. Write a report to describe (1) your sampling strategy and whether it would result in a representative sample of public sentiment toward AI, (2) data preparation and system evaluation process, and (3) your conclusion on whether these tools are suitable for your task.
 





## Findings / Recommendations
place findings and recommendations here  





In [1]:
# toggle for working with colab
isColab = False

In [None]:
#*ONLY RUN WHEN WORKING ON COLAB*
#===================================================
# mount google drive for working in colab

#from google.colab import drive
#drive.mount('/content/gdrive', force_remount=True)

# working within colab, set base working directory
#base_dir = "./gdrive/My Drive/IST707_PRJ_Realestate/buy_rent_sell/"

# validate directory mapping
#ls f'{base_dir}'

# upload custome python files
#from google.colab import files
#uploaded_files = files.upload()

# print files uploaded
#for f in uploaded_files.keys():
#  print(f'file name: {f}')

#isColab = True

______________________________________________________________________________________________
### Coding Environment Setup
Import packages

In [2]:
# import packages for analysis and modeling
import pandas as pd #data frame operations
import numpy as np #arrays and math functions
from scipy.stats import uniform #for training and test splits
from scipy.stats import gaussian_kde as kde # for resampling dataset
from scipy import stats #
import statsmodels.formula.api as smf #R-like model specification
import matplotlib.pyplot as plt #2D plotting
%matplotlib inline
import seaborn as sns #
import requests
import os
import pickle


In [3]:
# packages for twitter
import tweepy as tw
from tweepy import OAuthHandler
import json
from tweepy import Stream
from tweepy.streaming import StreamListener


# packages for NLTK
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import re

import sys
from os import path

In [4]:
# custome python packages
import rtimbroo_utils as br             # custome python helper functions

In [5]:
# set global properties
notebook_file_name = 'hw1_twitter_sentiment'
report_file_name = 'HW_1_Timbrook_Ryan'
app_name = 'AI_Public_Sentiment'
log_level = 10 # 10-DEBUG, 20-INFO, 30-WARNING, 40-ERROR, 50-CRITICAL

# setup working directory structure
# set global properties
if not isColab:
    dataDir = './data'
    outputDir = './output'
    configDir = './config'
    logOutDir = './logs'
    imageDir = './images'
    modelDir = './models'
else:
    # working within colab
    dataDir = f'{base_dir}data'
    outputDir = f'{base_dir}output'
    configDir = f'{base_dir}config'
    logOutDir = f'{base_dir}logs'
    imageDir = f'{base_dir}images'
    modelDir = f'{base_dir}models'

In [6]:
# create base output directories if they don't exist
if not os.path.exists(outputDir): os.mkdir(outputDir)
if not os.path.exists(logOutDir): os.mkdir(logOutDir)
if not os.path.exists(imageDir): os.mkdir(imageDir)
if not os.path.exists(modelDir): os.mkdir(modelDir)
if not os.path.exists(dataDir): os.mkdir(dataDir)
if not os.path.exists(configDir): os.mkdir(configDir)

In [7]:
# get a logger for troubleshooting / data exploration
logger = br.getFileLogger(logOutDir+'/',app_name,level=log_level)
np.random.seed(42) # NumPy

## 2. OBTAIN the data   
________________________________________________________________________________________________
Import external datasets for evaluation

In [8]:
# load twitter credentials
with open(f'{configDir}/twitter_credentials.json', 'r') as f:
    tw_cred = json.load(f)

# instantiate tweepy object
auth = OAuthHandler(tw_cred['CONSUMER_KEY'], tw_cred['CONSUMER_SECRET'])
auth.set_access_token(tw_cred['ACCESS_TOKEN'], tw_cred['ACCESS_SECRET'])
api = tw.API(auth)

#py_tweets = Twython(tw_cred['CONSUMER_KEY'],tw_cred['CONSUMER_SECRET'],tw_cred['ACCESS_TOKEN'],tw_cred['ACCESS_SECRET'])

# setup base twitter search query
search_terms = 'Artificial+Intelligence OR machine+learning'
search_start_date = '2019-10-15'
# add filters to search criteria
filtered_search_terms = search_terms + " -filter:retweets"
# number of tweets to return
num_tweets = 10000

# query
#base_tw_query = {
   # 'q':filtered_search_terms,
   # 'since':search_start_date,
   # 'count':num_tweets,
   # 'lang':'en',
    
#}

In [9]:
# use .Cursor() to search twitter for tweets containing the search term
tweets = tw.Cursor(api.search,q=filtered_search_terms,lang='en',since=search_start_date).items(num_tweets)
#tweets

In [None]:
#for i,t in enumerate(tweets):
    #print(t.text+'\n')
    #if i > 5: break

In [10]:
# save tweets off to a file
tweet_file_name = 'ai_tweets.txt'
tweet_file_name = search_start_date+'_'+tweet_file_name
raw_tweet_file_name = 'raw_ai_tweets.txt'
failed_tweets_text = []
failed_count = 0

try:
    with open(dataDir+'/'+tweet_file_name, 'w+') as f:
        for t in tweets:
            try:
                f.write(t.text+'\n')
                f.write('\n')
            except:
                failed_count = failed_count + 1
                failed_tweets_text.append(t.text)
                f.write('\n')
                print(f'Failed to write tweet:\n{t.text}')
            
except BaseException as be:
    print(f'Caught BaseException:\n {be}')
    pass

print(f'Failed to write {failed_count}')


Failed to write tweet:
The Complete TensorFlow Masterclass: Machine Learning Models

☞ https://t.co/TDPkEM2Cj8

#ai #TensorFlow https://t.co/yCCpdB077P
Failed to write tweet:
The Complete TensorFlow Masterclass: Machine Learning Models

☞ https://t.co/mjd3OCJ7zs

#ai #TensorFlow https://t.co/6VYoR74vV7
Failed to write tweet:
The Complete TensorFlow Masterclass: Machine Learning Models

☞ https://t.co/dnTBHpzYul

#ai #TensorFlow https://t.co/w4mb9shm4R
Failed to write tweet:
👍 on @YouTube: Artificial Intelligence in PowerApps with AI Builder https://t.co/bAzGUFNm2B
Failed to write tweet:
Machine Learning Infrastructure Engineer at Apple (Beijing, China) 👇 https://t.co/yQgNIgc4mh
Failed to write tweet:
You know who never goes on strike? Artificial Intelligence 🤔

Oh and AI also knows math...so there's that. https://t.co/YHN3CeEz6w
Failed to write tweet:
@dli_odoir @missbarton D'you I don't ever disagree with you so it's about time 😉. Back in the day I worked in AI (A… https://t.co/giB732

In [11]:
logger.info(f'Failed to write tweets length: {len(failed_tweets_text)}')


Failed to write tweets length: 122


### 2.1 SCRUB / CLEAN
Clean and perform initial transformations steps of the data

In [12]:
# perform scrubbing and cleaning techniques
#%%time
# read tweets text file
line_count=0
hash_count=0
word_count=0
tweet_index=0
bag_of_words=[]
bag_of_hashes=[]
bag_of_links=[]
tweets_cleaned={}
tweet_hashs={}
tweet_links={}

# setup some regular expressions
#------ Barrowed from Dr. Gates - TwitterMining_Tokens
regex1=re.compile('^#.+')
regex2=re.compile('[^\W\d]') #no numbers
regex3=re.compile('^http*')
regex4=re.compile('.+\..+')

with open(f'{dataDir}/{tweet_file_name}') as f:
    for i, line in enumerate(f):
        if line == '\n': 
            tweet_index+=1
            
        #print(line)
        #line = line.replace('\n','')
        line = line.strip()
        line_count+=1
        
        tweetSplitter = TweetTokenizer(strip_handles=True, reduce_len=True)
        tweetTokens = tweetSplitter.tokenize(line)
        
        tweet_tokens_words=[]
        tweet_tokens_links=[]
        tweet_tokens_hashes=[]
        
        for token in tweetTokens:
            if(len(token)>2):
                logger.debug(f'Token: {token}')
                
                if((re.match(regex1,token))):
                    logger.debug(f'Token: {token} | matched regex1 {regex1} - will be added to bag of hashes list')
                    n_token=token[1:]
                    bag_of_hashes.append(n_token) # capture all hashtags
                    tweet_tokens_hashes.append(n_token)
                    hash_count+=1
                elif(re.match(regex2,token)):
                    logger.debug(f'Token: {token} | matched regex2 {regex2}')
                    if(re.match(regex3,token) or re.match(regex4,token)):
                        logger.debug(f'Token: {token} | matched regex3 {regex3} or regex4 {regex4} - will be added to bag of links list')
                        bag_of_links.append(token) # capture all url links
                        tweet_tokens_links.append(token)
                    else:
                        logger.debug(f'Token: {token} | did not match any of the regex patterns - will be added to bag of words list')
                        bag_of_words.append(token) # capture all words
                        tweet_tokens_words.append(token)
                        word_count+=1
                else:
                    logger.debug(f'Token: {token} | did not match regex2 {regex2} | contains numbers')
                    pass
            else:
                logger.debug(f'Token: {token} | is less than 2')
                pass
        # capture tweet index
        tweets_cleaned[tweet_index] = tweet_tokens_words
        tweet_hashs[tweet_index] = tweet_tokens_hashes
        tweet_links[tweet_index] = tweet_tokens_links
        
        #--End For Loop over tokens
    #--End For loop over lines


logger.info(f'bag_of_words count: {len(bag_of_words)} | bag_of_hashes count: {len(bag_of_hashes)} | bag_of_links count: {len(bag_of_links)}')
logger.info(f'Tweets Collected: {tweet_index}')
logger.info(f'Tweets - Line Count: {line_count} | Hash Count: {hash_count}')

bag_of_words count: 25708 | bag_of_hashes count: 2102 | bag_of_links count: 2779
Tweets Collected: 2855
Tweets - Line Count: 5958 | Hash Count: 2102


In [13]:
len(tweets_cleaned)

2856

In [None]:
#bag_of_words

In [14]:
tweet_sents={}

with open(f'{dataDir}/{search_start_date}_ai_tweets_cleaned.txt','a') as f:
    
    for tweet_id, tokens in tweets_cleaned.items():
        if not len(tweets_cleaned[tweet_id]) == 0:
            sent = ''
            #tokens = list(filter(None,tokens))
        
            for t in tokens:
                sent = sent+t+' '
            
            sent = sent.strip()
            sent = sent.replace('\n','')
            tweet_sents[tweet_id] = sent
            f.write(sent)
            #print(tweet_sents)
            #break
    print(tweet_sents)



### 2.2 Initial EXPLORE
Explore the datasets

In [15]:
# create a list of words to be removed from BOWs for analysis and visualizations
def remove_words(word_list):
    stopwords_set = set(stopwords.words("english"))
    other_words = ['Artificial','artificial','Intelligence','intelligence','artificial intelligence',
                   'Machine','machine','Learning','learning','Data','Science',
                  'The','He','Via','via','New','new','Python','Google','HealthCare']
    
    new_list = set(list(stopwords_set) + other_words)
    
    words_without_stopwords = [word for word in word_list if not word in new_list]
    
    return words_without_stopwords

In [16]:
logger.info(set(stopwords.words("english")))

{'mustn', 'so', 'under', 'this', 'her', 'himself', 'is', 'from', 'ain', 'about', "needn't", "didn't", 'while', 'these', 'o', 'if', 'after', 'now', 'it', "don't", 'when', 'other', 'which', "shan't", "haven't", 'them', 'should', 'can', 'yours', 'itself', 'ourselves', 'over', 'who', 'the', 'few', 'd', "that'll", 'm', 'shouldn', 'won', 'a', 'into', "hasn't", 'very', "should've", 'being', 'she', 'any', 'against', "it's", 'or', "wouldn't", 'hers', 'because', 'you', 'are', 'by', 'through', "doesn't", "shouldn't", 'aren', 'why', "aren't", 'on', 'more', 'below', 'has', 'having', "mightn't", 'my', 'me', 'how', 'will', "you've", 'does', 'an', "isn't", 'i', 'here', 'have', 'with', 'out', "she's", 'down', 'were', 'wouldn', 'didn', 'such', 'shan', 'haven', "couldn't", 'before', "wasn't", 'same', 'there', 'between', 'again', 'not', 't', 'your', 'couldn', 'herself', 'needn', 'do', 'during', 'weren', 've', "you'd", 'he', 'and', 'at', 'to', 'that', 'mightn', 'but', 'yourself', 'we', 'his', 'don', 'doing

In [17]:
def get_word_features(word_list):
    word_list = nltk.FreqDist(word_list)
    features = word_list.keys()
    return features, word_list


In [18]:
w_features = get_word_features(remove_words(bag_of_words))
logger.info(w_features[1])

<FreqDist with 6134 samples and 16700 outcomes>


In [19]:
b = remove_words(bag_of_words)
print(b)



In [20]:
def wordcloud_draw(data, color='black', width=1000, height=750, max_font_size=50, max_words=100):
    words = ' '.join([word for word in data])
    #cleaned_word = " ".join([word for word in words])
    wordcloud = WordCloud(stopwords=STOPWORDS,
                    background_color=color,
                    width=width,
                    height=height,
                    max_font_size=max_font_size,
                    max_words=max_words,
                     ).generate(words)
    plt.figure(1,figsize=(10.5, 7))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

In [None]:
wordcloud_draw(bag_of_words, color='white', max_words=300)

In [None]:
wordcloud_draw(remove_words(bag_of_words),color='white', max_words=300)

In [None]:
wordcloud_draw(remove_words(bag_of_words))

In [None]:
# Create and generate a word cloud image:
wordcloud = WordCloud(stopwords=STOPWORDS).generate(ai_tweets_cleaned)

# Display the generated image:
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# use parameters to adjust your word cloud, such as 
# lower max_font_size, change the maximum number of word and lighten the background:
wordcloud = WordCloud(stopwords=STOPWORDS,max_font_size=50, max_words=100, background_color="white").generate(ai_tweets_cleaned)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# remove search terms 'Artificial, Intelligence, Machine Learning'
#ai_tweets_cleaned_v1 = ai_tweets_cleaned.replace()
ai_tweets_cleaned_v1 = ai_tweets_cleaned.replace('Artificial','')
ai_tweets_cleaned_v1 = ai_tweets_cleaned_v1.replace('artificial','')
ai_tweets_cleaned_v1 = ai_tweets_cleaned_v1.replace('Intelligence','')
ai_tweets_cleaned_v1 = ai_tweets_cleaned_v1.replace('intelligence','')
ai_tweets_cleaned_v1 = ai_tweets_cleaned_v1.replace('Machine','')
ai_tweets_cleaned_v1 = ai_tweets_cleaned_v1.replace('machine','')
ai_tweets_cleaned_v1 = ai_tweets_cleaned_v1.replace('Learning','')
ai_tweets_cleaned_v1 = ai_tweets_cleaned_v1.replace('learning','')

# Create and generate a word cloud image:
wordcloud = WordCloud(stopwords=STOPWORDS).generate(ai_tweets_cleaned_v1)

# Display the generated image:
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [30]:
# use parameters to adjust your word cloud, such as 
# lower max_font_size, change the maximum number of word and lighten the background:
wordcloud = WordCloud(stopwords=STOPWORDS,max_font_size=50, max_words=100, background_color="white").generate(ai_tweets_cleaned_v1)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

NameError: name 'ai_tweets_cleaned_v1' is not defined

In [21]:
# perform exploratory data analysis techiques
#with open(f'{dataDir}/ai_tweets_cleaned.txt','r') as f:
tweet_sent_clean = [line.strip().rstrip('\n') for line in open(f'{dataDir}/ai_tweets_cleaned.txt')]

In [22]:
#tweet_sent_clean = list(filter(None,tweet_sent_clean))
len(tweet_sent_clean)

30471

In [None]:
tweet_sent_clean

## 3. MODEL
_________________________________________________________________________________________________
Create models

### 3.1 Model Analysis
perform model creation and validation techniques

In [23]:
# NLTK downloads
import nltk
nltk.download('subjectivity')
nltk.download('vader_lexicon')

[nltk_data] Downloading package subjectivity to
[nltk_data]     C:\Users\rt310\AppData\Roaming\nltk_data...
[nltk_data]   Package subjectivity is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\rt310\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [24]:
# NLTK assessment
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from nltk.sentiment.util import mark_negation

n_instances = 1000
subj_docs = [(sent,'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
obj_docs = [(sent,'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]
len(subj_docs),len(obj_docs)

(1000, 1000)

In [25]:
subj_docs[0]

(['smart',
  'and',
  'alert',
  ',',
  'thirteen',
  'conversations',
  'about',
  'one',
  'thing',
  'is',
  'a',
  'small',
  'gem',
  '.'],
 'subj')

In [26]:
# NLTK create training / test splits
train_subj_docs = subj_docs[:800]
test_subj_docs = subj_docs[800:1000]
train_obj_docs = obj_docs[:800]
test_obj_docs = obj_docs[800:1000]

training_docs = train_subj_docs+train_obj_docs
testing_docs = test_subj_docs+test_obj_docs

In [None]:
testing_docs[0]

In [None]:
# instantiate a sentiment analyzer object
sentiment_analyzer = SentimentAnalyzer()
all_words_neg = sentiment_analyzer.all_words([mark_negation(doc) for doc in training_docs])


In [None]:
all_words_neg

In [None]:
# use simple unigram word features, handling negation
unigram_feats = sentiment_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
#len(unigram_feats)
sentiment_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

In [None]:
# apply features to obtain a feature-value representation of dataset
training_set = sentiment_analyzer.apply_features(training_docs)
test_set = sentiment_analyzer.apply_features(testing_docs)


### 3.2 Model Exploration

In [None]:
trainer = NaiveBayesClassifier.train
classifier = sentiment_analyzer.train(trainer, training_set)

#save SentimentAnalyzer
with open('sa_subjectivity.pickle','wb') as f:
    pickle.dump(sentiment_analyzer,f)

for k,v in sorted(sentiment_analyzer.evaluate(test_set).items()):
    print(f'{k}: {v}')


In [None]:
# tweets
#tweet_sent_clean
tw_sentiment_analyzer = SentimentAnalyzer()

for d in tweet_sent_clean:
    print(d)
    break

#[mark_negation(tw) for tw in tweet_sent_clean]
neg_tweets = tw_sentiment_analyzer.all_words([mark_negation(tw) for tw in tweet_sent_clean])

### 3.3 Model Build

In [None]:
tweet_sent_clean

### 4. VADER Sentiment Analyzer

In [27]:
def classify_vader_score_threshold(compound_score):
    pos_sent = 'positive'
    neu_sent = 'neutral'
    neg_sent = 'negative'
    sentiment_class = ''
    
    if compound_score >= 0.05:
        sentiment_class = pos_sent;
    elif compound_score > -0.05 and compound_score < 0.05:
        sentiment_class = neu_sent;
    elif compound_score <= -0.05:
        sentiment_class = neg_sent;
    else:
        logger.warning(f'classify_vader_score_threshold: compound score not in range: {compound_score}')
    return sentiment_class
    

In [28]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer 

sid = SentimentIntensityAnalyzer()
for sentence in tweet_sent_clean:
    logger.info(sentence)
    if len(sentence) > 1:
        ss = sid.polarity_scores(sentence)
        for k in ss:
            logger.info(f'{k}: {ss[k]}')
        logger.info('')

using for your defense
neg: 0.0
neu: 0.667
pos: 0.333
compound: 0.128

blacklists Chinese artificial intelligence firms ahead trade talks via
neg: 0.0
neu: 0.721
pos: 0.279
compound: 0.4767

Machine Learning Software Development Techniques and Tools
neg: 0.0
neu: 1.0
pos: 0.0
compound: 0.0

EHR Software With Artificial Intelligence The Next Big Thing
neg: 0.0
neu: 0.721
pos: 0.279
compound: 0.4767

Rahko raises seed from Balderton for quantum machine learning tech via
neg: 0.0
neu: 1.0
pos: 0.0
compound: 0.0

Artificial intelligence isn threat humanity natural stupidity Human being are only creatures Earth
neg: 0.301
neu: 0.431
pos: 0.268
compound: -0.1779

love this quote When folks think about artificial intelligence they tend think explicitly about the magi
neg: 0.0
neu: 0.657
pos: 0.343
compound: 0.8074

working against you How artificial intelligence bias can block you
neg: 0.299
neu: 0.486
pos: 0.215
compound: -0.0516



Are you passionate about Machine Learning Data Science And 

In [29]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
pol_scores=[]
with open(f'{outputDir}/vader_tweet_sentiment.txt','w+') as f:
    for sentence in tweet_sent_clean:
        vs = analyzer.polarity_scores(sentence)
        pol_scores.append(vs)
        f.write("{:-<65} {}".format(sentence, str(vs))+'\n')
        print("{:-<65} {}".format(sentence, str(vs)))


using for your defense------------------------------------------- {'neg': 0.0, 'neu': 0.667, 'pos': 0.333, 'compound': 0.128}
blacklists Chinese artificial intelligence firms ahead trade talks via {'neg': 0.0, 'neu': 0.721, 'pos': 0.279, 'compound': 0.4767}
Machine Learning Software Development Techniques and Tools------- {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
EHR Software With Artificial Intelligence The Next Big Thing----- {'neg': 0.0, 'neu': 0.721, 'pos': 0.279, 'compound': 0.4767}
Rahko raises seed from Balderton for quantum machine learning tech via {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}
Artificial intelligence isn threat humanity natural stupidity Human being are only creatures Earth {'neg': 0.301, 'neu': 0.431, 'pos': 0.268, 'compound': -0.1779}
love this quote When folks think about artificial intelligence they tend think explicitly about the magi {'neg': 0.0, 'neu': 0.657, 'pos': 0.343, 'compound': 0.8074}
working against you How artificial intell

In [None]:
# evaluate the polarity scoring
pol_scores_df = pd.DataFrame(pol_scores)
logger.info(pol_scores_df.describe())
#pol_scores_df.head()

# classify each sentence as 'positive', 'negative' or 'neutral' - see function above
sentiment_classes = [classify_vader_score_threshold(c) for c in pol_scores_df['compound'] ]
pol_scores_df['sentiment_label'] = sentiment_classes
pol_scores_df.head()

In [None]:
sns.scatterplot(x='neg',y='pos', hue='sentiment_label', data=pol_scores_df);

In [None]:
sns.countplot(x='sentiment_label', data=pol_scores_df);

In [None]:
logger.info(f'Negative Count: {len(pol_scores_df[pol_scores_df["sentiment_label"] == "negative"])}')
logger.info(f'Positive Count: {len(pol_scores_df[pol_scores_df["sentiment_label"] == "positive"])}')
logger.info(f'Neutral Count: {len(pol_scores_df[pol_scores_df["sentiment_label"] == "neutral"])}')

neg_count = len(pol_scores_df[pol_scores_df["sentiment_label"] == "negative"])
pos_count = len(pol_scores_df[pol_scores_df["sentiment_label"] == "positive"])

print(neg_count/pos_count)
print(pos_count/neg_count)            

In [None]:
# Initialize Figure and Axes object
fig, ax = plt.subplots()
# Create violinplot
ax.violinplot(pol_scores_df['compound'], vert=False)

# Show the plot
plt.show()

In [None]:
# Initialize Figure and Axes object
fig, ax = plt.subplots()
# Create violinplot
ax.violinplot(pol_scores_df['pos'], vert=False)

# Show the plot
plt.show()

In [None]:
pos_neg = pol_scores_df[['pos', 'neg']]
pos_neg.head()
pos_neg_melt = pd.melt(pos_neg)
pos_neg_melt.head()


In [None]:
# Swarmplot with melted_df
#sns.swarmplot(x='variable', y='value', data=pos_neg_melt)

In [None]:
# pull in SentiStrength classifier results for comparison Vader classification results
senti_strength_analysis = {'tweet':[],'pos_score':[],'neg_score':[]}
with open(f'{dataDir}/ai_tweets_cleaned+results2.txt','r') as f:
    for line in f.readlines():
        temp = line.split('\t')
        senti_strength_analysis['tweet'].append(temp[0])
        senti_strength_analysis['pos_score'].append(temp[1])
        senti_strength_analysis['neg_score'].append(temp[2].replace('\n',''))
        


In [None]:
#
def classify_senti_strength(scores):
    logger.debug(scores)
    pos_sent = 'positive'
    neu_sent = 'neutral'
    neg_sent = 'negative'
    sentiment_class = ''
    
    logger.debug(f'pos_score:{scores[0]} | neg_score: {scores[1]}')
    pos_score = scores[0]
    neg_score = scores[1]
    logger.debug(f'score = {(pos_score+neg_score)}')
    
    if (pos_score+neg_score) >= 1:
        sentiment_class = pos_sent
    elif (pos_score+neg_score)  < 0:
        sentiment_class = neg_sent
    elif (pos_score+neg_score)  == 0:
        sentiment_class = neu_sent
    else:
        logger.warning(f'classify_senti_strength: score not in range: {scores}')

        
    return sentiment_class

In [None]:
senti_df = pd.DataFrame(senti_strength_analysis)
senti_df = senti_df[1:]
senti_df.pos_score =  senti_df.pos_score.astype('int32', copy=False)
senti_df.neg_score =  senti_df.neg_score.astype('int32', copy=False)
senti_df.head()

In [None]:
# classify each sentence as 'positive', 'negative' or 'neutral' - see function above
sentiment_classes = []
for (idx,row) in senti_df.iterrows():
    sentiment_classes.append(classify_senti_strength([row['pos_score'],row['neg_score']]))

senti_df['sentiment_label'] = sentiment_classes
senti_df.head()

In [None]:
sns.scatterplot(x='neg_score',y='pos_score', hue='sentiment_label', data=senti_df);

In [None]:
sns.countplot(x='sentiment_label', data=senti_df);

In [None]:
logger.info(f'Negative Count: {len(senti_df[senti_df["sentiment_label"] == "negative"])}')
logger.info(f'Positive Count: {len(senti_df[senti_df["sentiment_label"] == "positive"])}')
logger.info(f'Neutral Count: {len(senti_df[senti_df["sentiment_label"] == "neutral"])}')

            
neg_count = len(senti_df[senti_df["sentiment_label"] == "negative"])
pos_count = len(senti_df[senti_df["sentiment_label"] == "positive"])
neu_count = len(senti_df[senti_df["sentiment_label"] == "neutral"])

print(neg_count/pos_count)
print(pos_count/neg_count)            