In [1]:
##################################################################
# In this notebook I conduct sentiment analysis
# through the following steps:
# 1) Compile all tweets mentioning Charlottesville from the scraped Twitter data
# 2) Pass these tweets through the sentiment analyzer
# 3) Calculate the V, A, and D (valence, arousal, dominance) scores for these texts
# and take the mean of each score
# 4) Format the results into a single table
#################################################################

In [2]:
# Here are the notebooks I needed
%run ImportStatements.ipynb
%run functions.ipynb

### Data formatting

In [3]:
# Read the scraped Charlottesville tweets (one JSON object per line).
# NOTE(review): the original cell read this exact path twice, so every tweet was
# loaded in duplicate. If a second, different file was intended, add its path
# to this list.
tweet_paths = [
    'data/charlottesvillerally/#charlottesville_2017-08-11_to_2017-08-20.json',
]

tweets = []
for tweet_path in tweet_paths:
    # `with` closes the file handle once the file has been consumed
    with open(tweet_path, 'r') as tweet_file:
        # skip blank lines (e.g. a trailing newline) instead of failing in json.loads
        tweets.extend(json.loads(line) for line in tweet_file if line.strip())

# Despite its name, `all_tweets_text` holds the full tweet dicts (not only the
# text); the sentiment cell below indexes each entry with tweet['text'].
all_tweets_text = list(tweets)

# Load the 2017 IRA tweets, normalise the publish date, and keep the rally window.
IRA2017 = pd.read_csv("data/2017/IRA2017.csv")
# Dates become zero-padded 'YYYY/MM/DD' strings, so the plain string comparison
# below orders them chronologically.
IRA2017['publish_date'] = pd.to_datetime(IRA2017['publish_date']).dt.strftime('%Y/%m/%d')
IRA2017 = IRA2017.sort_values(by='publish_date')
# between() is inclusive on both ends: 2017/08/11 <= publish_date <= 2017/08/20
in_rally_window = IRA2017['publish_date'].between('2017/08/11', '2017/08/20')
IRA2017_charlottesville = IRA2017[in_rally_window]

# Keep the text of the IRA tweets that mention Charlottesville (case-insensitive).
# "charlottesvillerally" contains "charlottesville", so a single substring test
# covers both spellings. na=False skips rows with missing content instead of
# raising on a non-string value.
mentions_charlottesville = (
    IRA2017_charlottesville['content']
    .str.lower()
    .str.contains('charlottesville', regex=False, na=False)
)
IRA2017_charlottesvillerally = IRA2017_charlottesville.loc[mentions_charlottesville, 'content'].tolist()

### All VAD Score

In [4]:
# Score every tweet with the sentiment analyzer; each result is a dict whose
# 'compound' entry is averaged in the cells that follow.
sid = SentimentIntensityAnalyzer()

# IRA corpus: entries are raw tweet strings
sid_scores_IRA = [sid.polarity_scores(ira_text) for ira_text in IRA2017_charlottesvillerally]

# General Twitter corpus: entries are tweet dicts, so pull out the 'text' field
sid_scores_all = [sid.polarity_scores(entry['text']) for entry in all_tweets_text]


# The next two cells compare the mean compound score of the IRA Tweets with that of the overall Twitter discussion

In [5]:
# Mean compound sentiment score over the IRA Charlottesville tweets
ira_compound_scores = [scores['compound'] for scores in sid_scores_IRA]
sum(ira_compound_scores) / len(IRA2017_charlottesvillerally)

-0.1962597586350404

In [6]:
# Mean compound sentiment score over the general Twitter corpus
all_compound_scores = [scores['compound'] for scores in sid_scores_all]
sum(all_compound_scores) / len(all_tweets_text)

-0.07746629999999999

- The IRA tone was more negative overall (mean compound score of about -0.196 vs. -0.077 for all of Twitter)
- The difference is not as large as I would have expected

### VAD Analysis

In [7]:
# Load the NRC VAD lexicon: a tab-separated file whose first row is skipped as a
# header, followed by one "word<TAB>V<TAB>A<TAB>D" row per word.
# The file is read once and closed by the `with` block (the original opened it
# twice and never closed either handle).
with open('data/NRC-VAD-Lexicon.txt') as lexicon_file:
    NRC_VAD_lexicon = lexicon_file.readlines()  # raw lines, header row included

# word -> {'V': ..., 'A': ..., 'D': ...} with the three scores parsed as floats
NRC_VAD = {}

for line in NRC_VAD_lexicon[1:]:  # skip the header row
    fields = line.strip()
    if not fields:
        continue  # tolerate blank lines (e.g. a trailing newline at end of file)
    word, V, A, D = fields.split('\t')
    NRC_VAD[word] = {'V': float(V),
                     'A': float(A),
                     'D': float(D)}

In [8]:
# Tokenizer instance; it is not referenced in this notebook's visible cells, so it is
# presumably consumed by process_tweet from functions.ipynb — TODO confirm
tt = TweetTokenizer()

In [9]:
# Turn each row of the date-filtered IRA frame into its own dict so the rows can be
# handed to process_tweet in the same way as the scraped tweet dicts.
# NOTE(review): this uses every IRA tweet in the date window (IRA2017_charlottesville),
# not only those mentioning Charlottesville (IRA2017_charlottesvillerally) — confirm intended.
ira_rows_by_index = IRA2017_charlottesville.T.to_dict()
dictIRA2017 = ira_rows_by_index.values()

# process_tweet comes from functions.ipynb; presumably it annotates each dict in place
# with the Valence/Arousal/Dominance entries read in the next cell — TODO confirm
for scraped_tweet in tweets:
    process_tweet(scraped_tweet)

for ira_row in dictIRA2017:
    process_tweet(ira_row)
    
    

In [10]:
# Collect the per-tweet Valence / Arousal / Dominance values for each corpus so
# they can be averaged. A tweet lacking a given score contributes '' instead.

# General Twitter corpus
totalVTwitter = [entry.get("Valence", '') for entry in tweets]
totalATwitter = [entry.get("Arousal", '') for entry in tweets]
totalDTwitter = [entry.get("Dominance", '') for entry in tweets]

# IRA corpus
totalVIRA = [row.get("Valence", '') for row in dictIRA2017]
totalAIRA = [row.get("Arousal", '') for row in dictIRA2017]
totalDIRA = [row.get("Dominance", '') for row in dictIRA2017]

### VAD Table

In [11]:
# Build the cell values for the VAD comparison table. Each inner list is one table
# COLUMN: the first holds the row labels, the others hold Valence, Arousal and
# Dominance as [All Twitter mean, IRA mean, rounded difference].
values = [['<b>All Twitter</b>', '<b>Internet Research Agency</b>', '<b>Difference</b>']]  # 1st col
for twitter_scores, ira_scores in [(totalVTwitter, totalVIRA),
                                   (totalATwitter, totalAIRA),
                                   (totalDTwitter, totalDIRA)]:
    # compute each mean once and reuse it for both the cell and the difference
    twitter_mean = average(twitter_scores)
    ira_mean = average(ira_scores)
    values.append([str(twitter_mean), str(ira_mean), str(round(twitter_mean - ira_mean, 2))])

# Not referenced by the table below; kept in case other cells rely on it.
colors = ['royalblue', 'white', 'white',
          'gray']


fig_VAD = go.Figure(data=[go.Table(
    columnorder=[1, 2, 3, 4],
    columnwidth=[80, 80, 80, 80],  # one width per column (the original listed five for four columns)
    header=dict(
        values=[['<b>Corpus</b>'],
                ['<b>Valence</b>'],
                ['<b>Arousal</b>'],
                ['<b>Dominance</b>']],
        line_color='darkslategray',
        fill_color='royalblue',
        align=['left', 'center'],
        font=dict(color='white', size=12),
        height=20,
    ),
    cells=dict(
        values=values,
        line_color='darkslategray',
        align=['left', 'center'],
        font_size=12,
        height=30,
    ),
)])

# Save the table as a standalone HTML file
fig_VAD.write_html("images/VAD_table.html")



Key Findings:
- IRA tweets are more negative overall than the general Twitter discussion
- When comparing the individual V, A, and D scores, there is not much of a difference, suggesting echo theory
- This emphasizes the "adding to noise" effort