# 3M Russian tweets
This notebook uses Bokeh to build an interactive visualization of the tweets from accounts categorized as "Fearmonger" in a dataset of roughly 3M tweets.

Some notes on the dataset from the source article:
<em>"The data set published here includes 2,973,371 tweets from 2,848 Twitter handles. It includes every tweet’s author, text and date; the author’s follower count and the number of accounts the author followed; and an indication of whether the tweet was a retweet. The entire corpus of tweets published here dates from February 2012 to May 2018, with the vast majority from 2015 to 2017."</em>

1. Source article, published when the tweets were released: https://fivethirtyeight.com/features/why-were-sharing-3-million-russian-troll-tweets/
2. Follow-up article describing what readers found in the data: https://fivethirtyeight.com/features/what-you-found-in-3-million-russian-troll-tweets/

## Important
You need to have downloaded the Tweet CSVs located in the following repository to rerun this notebook:
https://github.com/fivethirtyeight/russian-troll-tweets

In [47]:
import datetime as dt
import numpy as np
import pandas as pd
import bokeh

In [48]:
# Note: the downloaded CSVs (IRAhandle_tweets_1.csv through IRAhandle_tweets_9.csv) must be in this notebook's working directory.

### Exploratory analysis and data prep

In [49]:
# Read each CSV shard (IRAhandle_tweets_1.csv .. IRAhandle_tweets_9.csv) from the
# working directory and keep only the rows whose account_category is 'Fearmonger'.
# pd.read_csv replaces DataFrame.from_csv, which is deprecated and was removed
# in pandas 1.0.
frames = []
for x in range(1, 10):
    shard = pd.read_csv('IRAhandle_tweets_' + str(x) + '.csv')
    # Filter before appending so only the matching rows of each file are retained.
    frames.append(shard[shard['account_category'] == 'Fearmonger'])

  This is separate from the ipykernel package so we can avoid doing imports until


In [50]:
# Stack the filtered per-file frames into a single DataFrame.
# Each frame keeps its own row labels, so index values may repeat across files.
df = pd.concat(frames)

In [51]:
# Preview the first three rows to check the available columns.
df.head(n=3)

Unnamed: 0,external_author_id,author,content,region,language,publish_date,harvested_date,following,followers,updates,post_type,account_type,new_june_2018,retweet,account_category
425,2535565000.0,1D_NICOLE_,#FoodPoisoning is not a joke! #Walmart #KochFa...,United States,English,11/26/2015 22:20,11/26/2015 22:20,48,40,394,RETWEET,Koch,0,1,Fearmonger
426,2535565000.0,1D_NICOLE_,Thanks God that #turkey i ate few days ago was...,United States,English,11/26/2015 22:35,11/26/2015 22:35,48,40,395,RETWEET,Koch,0,1,Fearmonger
427,2535565000.0,1D_NICOLE_,Kim and kanye wedding http://t.co/6YdR6QlR6p,United States,Tagalog (Filipino),5/25/2015 12:38,5/25/2015 12:38,59,51,353,,Koch,0,0,Fearmonger


In [52]:
# (rows, columns) of the combined 'Fearmonger' subset.
df.shape

(11140, 15)

In [53]:
# Number of distinct account handles in the filtered data.
len(df['author'].drop_duplicates())

124

In [54]:
# Parse the raw 'publish_date' strings (month/day/year hour:minute) into timestamps.
publish_ts = pd.to_datetime(df['publish_date'], format='%m/%d/%Y %H:%M')
df['publish_day'] = publish_ts
# Day-level views of the same timestamp: a formatted string ('_date') and a
# datetime truncated to midnight ('date'), used later for grouping and plotting.
df['_date'] = publish_ts.dt.strftime('%m/%d/%Y')
df['date'] = publish_ts.dt.normalize()

In [55]:
# Distinct author handles in order of first appearance; used later as the
# categorical y-range of the tweet-reading plot.
authors = list(df['author'].unique())

In [56]:
# Keep only the columns the visualization needs.
cols = ['author', 'date', 'content', 'followers']
df = df.loc[:, cols]

In [57]:
# Preview the trimmed frame (author, date, content, followers).
df.head(n=3)

Unnamed: 0,author,date,content,followers
425,1D_NICOLE_,2015-11-26,#FoodPoisoning is not a joke! #Walmart #KochFa...,40
426,1D_NICOLE_,2015-11-26,Thanks God that #turkey i ate few days ago was...,40
427,1D_NICOLE_,2015-05-25,Kim and kanye wedding http://t.co/6YdR6QlR6p,51


In [58]:
# Count rows per (date, author) pair. The count is taken over the non-null
# 'followers' values and exposed under the name 'tweet_count'.
grp = (
    df.groupby(['date', 'author'], as_index=False)
      .agg({'followers': 'count'})
      .rename(columns={'followers': 'tweet_count'})
)
grp.head()

Unnamed: 0,date,author,tweet_count
0,2014-12-10,GWEN_GARLAND,9
1,2014-12-10,RYANNCOOPERWOOD,1
2,2014-12-11,GWEN_GARLAND,6
3,2014-12-12,GWEN_GARLAND,33
4,2014-12-13,GWEN_GARLAND,21


In [59]:
# Attach the per-day tweet count to every tweet row, then add a log-scaled
# follower count (the +1 keeps zero-follower accounts finite).
df = pd.merge(df, grp, on=['date', 'author'])
df['followers_log'] = np.log(df['followers'].add(1))

In [60]:
# Followers multiplied by that day's tweet count; plotted later on the y-axis
# labelled "Influence (tweets * followers)".
df['tweets_times_followers'] = df['followers'].mul(df['tweet_count'])

In [61]:
# Snapshot of the frame as a nested dict ({column: {row label: value}}).
# NOTE(review): `dct` is not referenced by any other cell visible in this
# notebook — confirm whether it is still needed.
dct = df.to_dict()

In [62]:
# Collapse to one row per (date, author), keeping the first tweet of each pair.
df = df.drop_duplicates(subset=['date', 'author'], keep='first')

In [63]:
# (rows, columns) after de-duplicating on (date, author).
df.shape

(801, 7)

### Bokeh interactive visualization

In [67]:
# Bokeh imports for this cell, consolidated so each name is imported exactly once.
from bokeh.io import output_file, show
from bokeh.layouts import column, gridplot, row
from bokeh.models import ColumnDataSource, CustomJS, HoverTool
from bokeh.palettes import Spectral6
from bokeh.plotting import figure
from bokeh.transform import factor_cmap

# The rendered layout is written to this standalone HTML file.
output_file("bar_colormapped.html")

HEIGHT = 400
WIDTH = 600

# One row per (date, author); drives the selectable scatter plot on top.
source = ColumnDataSource(df)

# --- Top plot: lasso-select (date, author) points by "influence" ------------
# NOTE(review): Bokeh 3.x renamed plot_width/plot_height to width/height —
# adjust these keyword names if the notebook is run on a newer Bokeh.
TOOLS = 'crosshair,lasso_select'
left = figure(plot_height=HEIGHT, plot_width=WIDTH,
              tools=TOOLS, title="Select tweets here:", x_axis_type="datetime")
left.circle(x='date', y='tweets_times_followers', fill_color='#02A695', alpha=.5,
            size=8, source=source,
            line_color='grey')
custom_hove = HoverTool(point_policy='follow_mouse')
TOOLTIPS1 = """
    <div>
        <div>
            <span style="font-size: 14px; font-weight: bold;">@author</span>
        </div>
        <div>
            <span style="font-size: 10px; color: #1B919E;">Tweets today: @tweet_count</span>
            <br>
            <span style="font-size: 10px; color: #1B919E;">Followers today: @followers</span>
        </div>
    </div>
"""
custom_hove.tooltips = TOOLTIPS1
left.add_tools(custom_hove)

# --- Bottom plot: shows only the points currently selected in the top plot ---
# Starts empty; the selection callback below fills it.
s2 = ColumnDataSource(data=dict(date=[], author=[], content=[]))
right = figure(plot_width=WIDTH, plot_height=HEIGHT,
               title="Hover to read each tweet (y-axis is tweet author)",
               y_range=authors,
               tools='crosshair',
               x_axis_type="datetime")
right.circle(x='date', y='author', fill_color='#A60D71', size=15,
             alpha=.7,
             line_color='white',
             source=s2)

# Copy the selected rows of `source` into `s2` whenever the selection changes.
# The callback is attached to the selection's 'indices' property via
# js_on_change instead of the deprecated `source.callback` property. Inside
# the callback `cb_obj` is the selection object, so `source` is passed in
# explicitly to reach the underlying column data.
selection_callback = CustomJS(args=dict(source=source, s2=s2), code="""
        var inds = cb_obj.indices;
        var d1 = source.data;
        var selected = {date: [], author: [], content: []};
        for (var i = 0; i < inds.length; i++) {
            selected['date'].push(d1['date'][inds[i]]);
            selected['author'].push(d1['author'][inds[i]]);
            selected['content'].push(d1['content'][inds[i]]);
        }
        s2.data = selected;
        s2.change.emit();
    """)
source.selected.js_on_change('indices', selection_callback)


custom_hove_2 = HoverTool()

# The inline style on the @content span uses ';' between declarations; the
# previous ',' separators made the leading declarations invalid CSS.
TOOLTIPS2 = """
    <div>
        <div>
            <img
                src="http://www.lter-europe.net/document-archive/image-gallery/albums/logos/TwitterLogo_55acee.png"
                height="20" alt="Tweet" width="20"
                style="float: left; margin: 0px 5px 5px 0px;"
                border="0"
            ></img>
        </div>
        <div>
            <span style="font-size: 14px; font-weight: bold;">@author</span>
        </div>
        <div>
            <span style="max-width:50px; margin:auto; font-size: 12px; color: #1B919E;">@content</span>
        </div>
    </div>
"""
custom_hove_2.tooltips = TOOLTIPS2

right.add_tools(custom_hove_2)

left.xaxis.axis_label = "Date"
left.yaxis.axis_label = "Influence (tweets * followers)"
right.yaxis.axis_label = "Tweet author"
# Author names are read from the hover tooltip, so the y-axis itself is hidden.
right.yaxis.visible = False

# Stack the two plots vertically and let them scale with the page width.
p = column([left, right])
p.sizing_mode = "scale_width"


In [68]:
# Render the two-plot layout (written to the file configured via output_file).
show(p)