In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import glob
import seaborn as sns
from datetime import datetime

In [None]:
# Read in all the data
# Maps each CSV path (minus its '.csv' suffix) to the DataFrame loaded from it.
d = {}

# glob.glob returns paths in arbitrary, filesystem-dependent order. Sorting
# keeps the row order of the combined data (and therefore every seeded sample
# and positional lookup made from it) the same on every machine and re-run.
for filename in sorted(glob.glob('russian-troll-tweets-master/*.csv')):
    d[filename[:-4]] = pd.read_csv(filename, header=0)

# Fail here with a clear message rather than later when the frames are combined.
if not d:
    raise FileNotFoundError(
        "No CSV files found under 'russian-troll-tweets-master/'")

In [None]:
# Create Dataframe
# Each CSV was read with its own 0..n-1 index, so a plain concat would repeat
# row labels across files. ignore_index=True renumbers the combined rows so
# every label is unique.
data = pd.concat(d.values(), ignore_index=True)

In [None]:
# Peek at the first five rows of the combined data
data.head(n=5)

In [None]:
# Size of dataframe, displayed as (number of rows, number of columns)
data.shape

In [None]:
# Plot tweets function
def plot_tw(d, col, title, ax=None):
    """Scatter the number of tweets published on each calendar day.

    Parameters
    ----------
    d : pandas.DataFrame
        Must contain a 'publish_date' column of '%m/%d/%Y %H:%M' strings.
    col : str
        Matplotlib color used for the markers.
    title : str
        Title drawn inside the axes.
    ax : matplotlib.axes.Axes, optional
        Axes to draw on; the current pyplot axes are used when omitted.

    Returns
    -------
    matplotlib.axes.Axes
        The axes that were drawn on.
    """
    if ax is None:
        ax = plt.gca()

    # Parse the whole column in one call and reduce each timestamp to its date.
    # Missing dates become NaT, which value_counts() leaves out.
    publish_days = pd.to_datetime(d['publish_date'],
                                  format='%m/%d/%Y %H:%M').dt.date
    daily_counts = publish_days.value_counts()

    # plot() handles date objects directly; plot_date() is deprecated in
    # recent Matplotlib releases.
    ax.plot(list(daily_counts.index), daily_counts.values, '.', color=col)

    # An empty frame has no maximum, so fall back to a 0..1 axis.
    highest = int(daily_counts.max()) if len(daily_counts) else 0
    ax.set_ylim(0, highest + 1)

    # Work through the axes that was passed in: a global `fig` may not exist
    # (or may be a different figure), and plt.xlabel/plt.ylabel would label
    # whichever axes pyplot currently considers active.
    ax.figure.autofmt_xdate()
    ax.set_xlabel('Date', fontsize=18)
    ax.set_ylabel('Counts', fontsize=18)
    ax.grid(color='grey', linestyle='-', linewidth=0.25, alpha=0.9)
    ax.set_facecolor("#f7f7f7")
    ax.set_title(title, y=0.9, fontsize=18)
    # Same label size for major and minor ticks
    ax.tick_params(axis='both', which='major', labelsize=15)
    ax.tick_params(axis='both', which='minor', labelsize=15)

    return ax

# Draw the daily tweet count for the whole dataset on one large figure
fig, ax = plt.subplots(figsize=(20, 12))
plot_tw(d=data, col='purple', title='Total Tweets by Bots Over Time', ax=ax)

In [None]:
# packages needed for text mining
import nltk

# NLTK resources fetched for the steps below; 'punkt' is the tokenizer
for resource in ('stopwords', 'punkt', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(resource)

# import necessary modules
import re
import string
from nltk.stem import WordNetLemmatizer

# Single lemmatizer instance shared by lemmatize_text
wnl = WordNetLemmatizer()

In [None]:
# Tokenize words
def tokenize_text(text):
    """Split ``text`` with ``nltk.word_tokenize`` and strip whitespace from each token."""
    return [piece.strip() for piece in nltk.word_tokenize(text)]

In [None]:
# Remove special characters
def remove_special_characters(text):
    """Delete every ``string.punctuation`` character from each token of ``text``.

    Tokens that end up empty are dropped; the remaining tokens are joined
    with single spaces.
    """
    punctuation = re.compile('[{}]'.format(re.escape(string.punctuation)))
    cleaned = (punctuation.sub('', token) for token in tokenize_text(text))
    return ' '.join(token for token in cleaned if token)

In [None]:
# Remove stopwords
# ENGLISH_STOP_WORDS is imported from sklearn.feature_extraction.text: the
# sklearn.feature_extraction.stop_words module path was deprecated and is no
# longer importable in current scikit-learn releases.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
def remove_stopwords(text,stopword_list):
    """Drop every token of ``text`` that appears in ``stopword_list``.

    Parameters
    ----------
    text : str
        Text to filter; it is split with ``tokenize_text``.
    stopword_list : container of str
        Tokens to remove. Matching is exact and case-sensitive.

    Returns
    -------
    str
        The surviving tokens joined with single spaces.
    """
    tokens = tokenize_text(text)
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    return ' '.join(filtered_tokens)

In [None]:
# Remove twitter link
def remove_custom(text):
    """Return the part of ``text`` before the first 'htt', or all of it if 'htt' is absent."""
    head, _, _ = text.partition('htt')
    return head

In [None]:
from nltk import pos_tag
from nltk.corpus import wordnet as wn

# Annotate text tokens with POS tags
def pos_tag_text(text):
    """Tag ``text`` with ``pos_tag`` and return (lowercased word, WordNet POS) pairs.

    The WordNet POS is chosen from the first letter of the tag returned by
    ``pos_tag`` (J, V, N or R); any other tag maps to ``None``.
    """
    # First letter of the tag -> WordNet part-of-speech constant
    wn_by_initial = {
        'J': wn.ADJ,
        'V': wn.VERB,
        'N': wn.NOUN,
        'R': wn.ADV,
    }

    tagged_lower_text = []
    for word, penn_tag in pos_tag(text):
        tagged_lower_text.append((word.lower(), wn_by_initial.get(penn_tag[:1])))
    return tagged_lower_text

In [None]:
# lemmatize text based on POS tags
def lemmatize_text(text):
    """Tokenize and POS-tag ``text``, then lemmatize each word that has a WordNet POS.

    Words without a WordNet POS are kept as returned by ``pos_tag_text``.
    The result is the words joined with single spaces.
    """
    lemmas = []
    for word, wn_pos in pos_tag_text(tokenize_text(text)):
        lemmas.append(wnl.lemmatize(word, wn_pos) if wn_pos else word)
    return ' '.join(lemmas)

In [None]:
# Text normalization pipeline
# ENGLISH_STOP_WORDS is imported from sklearn.feature_extraction.text: the
# sklearn.feature_extraction.stop_words module path was deprecated and is no
# longer importable in current scikit-learn releases.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import re

def keep_text_characters(text):
    """Keep only the tokens of ``text`` that contain at least one ASCII letter.

    Parameters
    ----------
    text : str
        Text to filter; it is split with ``tokenize_text``.

    Returns
    -------
    str
        The surviving tokens joined with single spaces.
    """
    tokens = tokenize_text(text)
    filtered_tokens = [token for token in tokens if re.search('[a-zA-Z]', token)]
    return ' '.join(filtered_tokens)

# Main normalize text function
def normalize_text(text,tokenize=False):
    """Run ``text`` through the full cleaning pipeline and return the cleaned string.

    Steps, in order: lemmatize, strip punctuation, lowercase, remove English
    stopwords, keep only tokens containing letters, then cut at the first
    'htt'. The ``tokenize`` argument is accepted but not used.
    """
    without_punctuation = remove_special_characters(lemmatize_text(text))
    without_stopwords = remove_stopwords(without_punctuation.lower(),
                                         ENGLISH_STOP_WORDS)
    return remove_custom(keep_text_characters(without_stopwords))

In [None]:
# Tweet text only, leaving out rows whose content is missing
content_column = data['content']
tweets = content_column.dropna()

In [None]:
# Show one tweet before and after normalization
example_tweet = tweets.iloc[101]
print("Example sentence:", example_tweet)
print("Normalized sentence:", normalize_text(example_tweet))

In [None]:
# Normalize all the tweets (takes several hours)
normalized_list = [normalize_text(word_entry) for word_entry in tweets]

# Save in file, one normalized tweet per line
with open('norm.txt', 'w') as f:
    f.writelines("%s\n" % item for item in normalized_list)

In [None]:
# Load the saved normalized tweets back, one list entry per line of the file
with open('norm.txt', 'r') as f:
    norm_contents = f.read()
mylist = norm_contents.split('\n')

In [None]:
from nltk.tokenize import word_tokenize

# Flatten the normalized tweets into one list of single words
single_word_list = [word for entry in mylist for word in word_tokenize(entry)]

single_word_list[:10]

In [None]:
# Finds most frequent words
fdist = nltk.FreqDist(single_word_list)

# Remove leftover tokenization noise. Each key is handled on its own, so a
# missing key is reported without affecting the removal of the others,
# whether or not deleting a missing key raises KeyError.
for noise_token in ("s", 'nt', 'rt'):
    if noise_token in fdist:
        del fdist[noise_token]
    else:
        print("Key not found")

fdist

In [None]:
# bar chart with count of most common words

# most_common yields (word, count) pairs; transpose them into two tuples
top_words = fdist.most_common(n=20)
x, y = zip(*top_words)
bar_positions = range(len(x))

plt.figure(figsize=(20,8))
plt.bar(bar_positions, y)
plt.xticks(bar_positions, x)
for tick_kind, label_size in (('major', 14), ('minor', 16)):
    plt.tick_params(axis='both', which=tick_kind, labelsize=label_size)
plt.title("Highest Frequency Words in Tweets", fontsize = 24)
plt.ylabel("Number of Occurances", fontsize=18)
plt.show()

In [None]:
# Find tweets with word 'Trump'

data_subset = data.sample(n = 100, random_state = 122)

# 'content' can be missing (NaN), and the previous `'trump' in row['content']`
# test raises TypeError on such rows. str.contains with na=False treats them
# as "no match". The pattern keeps the original case-sensitive pair of
# spellings. `.values` turns the mask into a positional array so that row
# labels play no part in the selection.
mentions_trump = data_subset['content'].str.contains('trump|Trump', na=False).values

for index, row in data_subset[mentions_trump].iterrows():
    print(row['author'], ',', row['publish_date'], ':', row['content'], '\n')

In [None]:
# Find tweets with word 'Obama'

data_subset2 = data.sample(n = 500, random_state = 122)

# 'content' can be missing (NaN), and the previous `'Obama' in row['content']`
# test raises TypeError on such rows. str.contains with na=False treats them
# as "no match". The pattern keeps the original case-sensitive pair of
# spellings. `.values` turns the mask into a positional array so that row
# labels play no part in the selection.
mentions_obama = data_subset2['content'].str.contains('Obama|obama', na=False).values

for index, row in data_subset2[mentions_obama].iterrows():
    print(row['author'], ',', row['publish_date'], ':', row['content'], '\n')

In [None]:
# Number of distinct account categories, then non-null counts per category
category_column = data['account_category']
print(category_column.nunique())
data.groupby('account_category').count()

In [None]:
# One frame per account category of interest
account_category = data['account_category']
fearmonger = data[account_category == 'Fearmonger']
leftTroll = data[account_category == 'LeftTroll']
rightTroll = data[account_category == 'RightTroll']

In [None]:
# Plot tweets throughout time per group

fig, axs = plt.subplots(2, 2, sharex=True, sharey=True,
                        gridspec_kw={'hspace': 0, 'wspace': 0},
                        figsize=(20,12))
fig.suptitle('Bot Tweets over Time', fontsize = 20)

(ax1, ax2), (ax3, ax4) = axs

# One panel per subset: (axes, frame, marker color, panel title)
panels = (
    (ax1, data, 'green', 'All Bots'),
    (ax2, fearmonger, 'orange', 'Fearmonger'),
    (ax3, leftTroll, 'blue', 'Left Troll'),
    (ax4, rightTroll, 'red', 'Right Troll'),
)
for panel_ax, subset, colour, panel_title in panels:
    plot_tw(subset, colour, panel_title, panel_ax)

# With shared axes, show labels only on the outer edge of the grid
for ax in axs.flat:
    ax.label_outer()

In [None]:
# Plot bigram frequencies: how often each pair of adjacent words occurs
# across the normalized tweets (this is a bigram count, not a biplot).
# Pairs of consecutive entries of single_word_list
biTradeWords = nltk.bigrams(single_word_list)
# Count how many times each pair appears
biFdist = nltk.FreqDist(biTradeWords)
print(biFdist.most_common(10))
biFdist.plot(20, cumulative=False)
# NOTE(review): this runs after biFdist.plot(); if that call already shows the
# figure, the tick font size will not be applied to it. Confirm.
plt.xticks(fontsize=20)

In [None]:
# Histogram of the 'updates' value attached to each tweet

fig, ax = plt.subplots(figsize=(20,12))

# Number of tweets for each distinct 'updates' value
update_list = data.groupby('updates')['tweet_id'].count()

# Draw onto the axes created above (kde=False gives counts only)
ax = sns.distplot(data['updates'], kde=False, ax=ax)
ax.set_title("Distribution of Tweet Updates for a Tweet", fontsize = 20)
ax.set_xlabel('Tweet Reception (Likes, Retweets, Replies)', fontsize=18)
ax.set_ylabel('Count', fontsize=18)
for tick_kind in ('major', 'minor'):
    ax.tick_params(axis='both', which=tick_kind, labelsize=15)
ax.set_xlim(-100, 70000)

In [None]:
# Find median of tweet responses (the 'updates' column)
# NOTE(review): presumably 'updates' has no missing values; np.median would
# return nan otherwise. Confirm.
np.median(data['updates'])

In [None]:
# Check to see if tweets on October 6 were unique

# Get the calendar date of every tweet, in row order
time_list2 = [datetime.strptime(item, '%m/%d/%Y %H:%M').date()
              for item in data['publish_date']]

# Build the target date from the `datetime` class that is already imported.
# A bare `import datetime` here would rebind that name to the module and break
# every `datetime.strptime` call (this cell on re-run, and plot_tw).
target_day = datetime(2016, 10, 6).date()

# Get positions of the tweets published on Oct 6, 2016
indexes = [i for i, day in enumerate(time_list2) if day == target_day]

import random
random.seed(122)
# Cap the sample size so that fewer than 200 matches does not raise ValueError
idx_subset = random.sample(indexes, min(200, len(indexes)))

# View tweets on that day
for i in idx_subset:
    print(data['author'].iloc[i], ',', data['publish_date'].iloc[i], ':', data['content'].iloc[i], '\n')

In [None]:
# View tweets from category Hashtag Gamer

data_subset4 = data.sample(n = 2000, random_state = 122)

# Narrow the sample to HashtagGamer rows first, then print each one
is_hashtag_gamer = (data_subset4['account_category'] == 'HashtagGamer').values
for _, tweet in data_subset4[is_hashtag_gamer].iterrows():
    print(tweet['author'], ',', tweet['publish_date'], ':', tweet['content'], '\n')

In [None]:
# Plot wordcloud

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# generate() is given one string, so join the words back together first
all_words_text = " ".join(single_word_list)
cloud_builder = WordCloud(background_color="white")
wordcloud = cloud_builder.generate(all_words_text)

# Show the cloud as an image without axis ticks or frame
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Print a small seeded sample of tweets from the fearmonger subset

data_subset_fear = fearmonger.sample(n = 5, random_state = 1234)

for _, tweet in data_subset_fear.iterrows():
    print(tweet['author'], ',', tweet['publish_date'], ':', tweet['content'], '\n')

In [None]:
# Non-null counts of every column, grouped by tweet language
data.groupby('language').agg('count')