# Social Media Analytics | Module 10 Assignment | Paras Ahuja

In [1]:
import pandas as pd
import numpy as np
import re
import random
import math
import nltk
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
import string
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [2]:
def tweets(file):
    tweets = []
    with open(file, 'r') as file:
        for line in file:
            columns = line.split(',')
            sentiment = columns[0]
            tweet = columns[2]
            words_filtered = [e.lower().replace('"','') for e in tweet.split() if len(e) >= 3]
            tweets.append((words_filtered, sentiment))

    return tweets

def get_words_in_tweets(tweets):
    all_words = []
    for (words, sentiment) in tweets:
        all_words.extend(words)
    return all_words

def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features

def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

In [3]:
training_tweets = pd.read_csv('training.csv', header=None)
training_tweets = training_tweets.sample(frac=1)
training_tweets.head()

Unnamed: 0,0,1,2,3,4,5
509183,0,2189346567,Mon Jun 15 23:49:30 PDT 2009,NO_QUERY,steelslinger,Foot locking up again
602839,0,2221501095,Thu Jun 18 05:30:06 PDT 2009,NO_QUERY,roxiijonas,@Xxxsteviexxx Yeah I'm not sure if i'll be abl...
285851,0,1993728822,Mon Jun 01 10:50:22 PDT 2009,NO_QUERY,SexyMaMaD,"Dam, I really miss riding the light and Tempe ..."
435431,0,2065422981,Sun Jun 07 08:33:52 PDT 2009,NO_QUERY,Di_Elle,@Lemomina can you check my photobucket account...
552730,0,2203468435,Tue Jun 16 23:48:30 PDT 2009,NO_QUERY,givemestrength,@chriscardell I didn't back any problems with...


In [4]:
training_tweets.drop(columns=[1, 2, 3], inplace=True)
training_tweets.head()

Unnamed: 0,0,4,5
509183,0,steelslinger,Foot locking up again
602839,0,roxiijonas,@Xxxsteviexxx Yeah I'm not sure if i'll be abl...
285851,0,SexyMaMaD,"Dam, I really miss riding the light and Tempe ..."
435431,0,Di_Elle,@Lemomina can you check my photobucket account...
552730,0,givemestrength,@chriscardell I didn't back any problems with...


In [5]:
def tweet_sentiment(x):
    if x == 4:
        return 'positive'
#     elif x == 0:
#         return 'neutral'
    else:
        return 'negative'
    
training_tweets[0] = training_tweets[0].apply(lambda x: tweet_sentiment(x))
training_tweets = training_tweets.sample(frac=1)
training_tweets.to_csv('training_adjusted', index = False, header=None)

In [6]:
training_tweets = tweets('training_adjusted')[0:15000]

In [7]:
test_tweets = pd.read_csv('testing.csv', header=None)
test_tweets.head()

Unnamed: 0,0,1,2,3,4,5
0,4,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
2,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."
3,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...
4,4,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2...


In [8]:
test_tweets.drop(columns=[1, 2, 3], inplace=True)
test_tweets.head()

Unnamed: 0,0,4,5
0,4,tpryan,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,4,vcu451,Reading my kindle2... Love it... Lee childs i...
2,4,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."
3,4,SIX15,@kenburbary You'll love your Kindle2. I've had...
4,4,yamarama,@mikefish Fair enough. But i have the Kindle2...


In [9]:
test_tweets[0] = test_tweets[0].apply(lambda x: tweet_sentiment(x))
test_tweets.to_csv('test_adjusted', index = False, header=None)
test_tweets = tweets('test_adjusted')

In [10]:
word_features = get_word_features(get_words_in_tweets(training_tweets))

In [11]:
tweets = pd.read_csv('twitter.csv')
tweets.head()

Unnamed: 0,commenter_id,comment_text
0,JoeBiden,A safe and effective COVID-19 vaccine could s...
1,JoeBiden,1: What criteria will be used to ensure a vac...
2,JoeBiden,2: Who will validate that the Administration’...
3,JoeBiden,3: What is the plan to allocate and distribut...
4,blaklamb,Will the vaccine be made by the same lab that...


In [12]:
tweets.isna().sum()

commenter_id     0
comment_text    24
dtype: int64

In [13]:
tweets.dropna(axis=0, inplace=True)
tweets.drop(columns="commenter_id", inplace=True)
tweets.to_csv('twitter_adjusted', index = False, header=None)

In [14]:
def twitter_sentiment_analysis(classifier):
    print ('Twitter Sentiment Analysis: ')
    print("\n")
    lines = 1
    with open('twitter_adjusted', encoding="utf8") as f:
        comments = f.readlines()
        for comment in comments[1:]:
            if comment.startswith('"'):
                comment = comment.lstrip('"')
                comment = comment.lstrip(' ')
            if comment[0].isdigit():
                comment = comment[1:]
            if comment.startswith(':'):
                comment = comment[1:]
                comment = comment.lstrip(' ')
            print ('Tweet ' + str(lines) + ": " + comment.replace('\n',''))
            print ('Sentiment ' + str(lines) + ": " + classifier.classify(extract_features(comment.split())))
            print("\n")
            lines += 1
            if lines == 11:
                break

In [15]:
def classify():
    training_set = nltk.classify.apply_features(extract_features, training_tweets)
    classifier = nltk.NaiveBayesClassifier.train(training_set)
    print ('classifier accuracy:')
    test_set = nltk.classify.apply_features(extract_features, test_tweets)
    print ('\t' + str(nltk.classify.accuracy(classifier, test_set)) + '\n')
    twitter_sentiment_analysis(classifier)

In [16]:
classify()

classifier accuracy:
	0.642570281124498

Twitter Sentiment Analysis: 


Tweet 1: What criteria will be used to ensure a vaccine meets the scientific standard of safety and efficacy?  The FDA must clarify what criteria it will use, and President Trump must commit to respecting the independent judgment of the non-political professional staff at the FDA."
Sentiment 1: positive


Tweet 2: Who will validate that the Administration’s decision to greenlight a COVID-19 vaccine is driven by science and not political motivation?   We must hear uncensored statements on efficacy and safety from  @FDACBER , the  @US_FDA  vaccine advisory committee, and Dr. Fauci."
Sentiment 2: negative


Tweet 3: What is the plan to allocate and distribute vaccines to Americans cost-free, safely, equitably, and without politics?   President Trump should release a detailed distribution plan to ensure every American — not just the wealthy and well-connected — receives the vaccine."
Sentiment 3: negative


Tweet 4:  W

There are a few limitations of this sentiment analysis. Firstly note that the while the vast majority of the tweets here are negative. There is a lack of context. The first 3 tweets are by Joe Biden, they are mostly negative for Trump, but positive for himself. Furthermore, note that the analysis of tweet 5 is negative, but it is negative for Donald Trump, not for Joe Biden, so we have some limitations when it comes to context. Another limitation is that some tweets may be sarcastic or hyperbolic, and these will either be classfied as positive or negative when the opposite might be true. The program is also computationally expensive. There are 999999 examples in the training dataset we are only using 15000. Training all of the examples is quite expensive. Note that There are only 10 tweets being printed out for demonstration purposes, there are approximately 170 tweets. More tweets can be obtained by chainging the number of lines in the following code in instance [14]:

            
            if lines == 11:
                break