In [1]:
import pandas as pd
import string

# put in: a list of all the words from a set of tweets
# get out: a list of all the unique words in that set of tweets
def get_unique(vector):
    """Return the distinct items of ``vector`` in first-seen order.

    Args:
        vector: iterable of items (e.g. the words split out of a set of tweets).

    Returns:
        A new list containing each distinct item exactly once, ordered by
        its first appearance in ``vector``.
    """
    # Materialise once so a one-shot iterator can still be re-read by the
    # fallback path below.
    items = list(vector)
    try:
        # dict keys keep insertion order, so this de-duplicates in a single
        # pass instead of re-scanning a growing list for every item.
        return list(dict.fromkeys(items))
    except TypeError:
        # Unhashable items (e.g. lists) cannot be dict keys; fall back to the
        # equality-based linear scan.
        unique_list = []
        for item in items:
            if item not in unique_list:
                unique_list.append(item)
        return unique_list

# put in: vector of words you want to count and a dict of all possible words
# get out: the frequency of all possible words in that vector of words
def get_freq(vector, my_dict):
    """Count how often each key of ``my_dict`` occurs in ``vector``.

    Args:
        vector: iterable of words to count.
        my_dict: dict whose keys are all the words that may occur. Only the
            keys are used; the dict itself is not modified.

    Returns:
        A new dict mapping every key of ``my_dict`` to the number of times it
        appears in ``vector`` (0 for keys that never appear).

    Raises:
        KeyError: if ``vector`` contains a word that is not a key of
            ``my_dict``.
    """
    # Start every known word at zero so absent words still get an entry.
    counts = dict.fromkeys(my_dict, 0)
    for word in vector:
        counts[word] = counts[word] + 1
    return counts

# put in: the full df and the category you want to pull out
# get out: a list of all the words from all of the tweets in that category
def get_category(series, category):
    """List every whitespace-separated word of the tweets in one category.

    Args:
        series: DataFrame with an ``'account_category'`` column and a
            ``'content'`` column.
        category: value of ``'account_category'`` to select.

    Returns:
        A flat list of strings: the words of each selected row's content, in
        row order. Each content value is passed through ``str()`` before
        splitting, so non-string values (for example missing values)
        contribute their string form as a token.
    """
    in_category = series[series['account_category'] == category]
    return [
        word
        for tweet in in_category['content']
        for word in str(tweet).split()
    ]

# put in: df of all the tweets
# get out: list of all the words in those tweets
def get_content(series):
    """List every whitespace-separated word of every tweet in ``series``.

    Args:
        series: DataFrame (or other mapping-like object) whose ``'content'``
            entry is an iterable of tweet texts.

    Returns:
        A flat list of strings: the words of each content value, in order.
        Each value is passed through ``str()`` before splitting, so
        non-string values contribute their string form as a token.
    """
    return [
        word
        for tweet in series['content']
        for word in str(tweet).split()
    ]

# the total number of words in that category
# sum values in the dictionary of word frequencies for that category
def get_total(d):
    """Return the sum of all values stored in the dict ``d``.

    Args:
        d: dict of word -> count.

    Returns:
        The sum of the dict's values (0 for an empty dict).
    """
    word_counts = d.values()
    return sum(word_counts)

# get the multiplier for a category: the overall word total divided by that
# category's word total (i.e. the inverse of the category's share of all words)
# if that category does not have words, return 0
def get_multiplier(total, bot_total):
    """Return ``total / bot_total``, or 0 when ``bot_total`` is zero.

    Note that this is the inverse of the category's share of all words, not
    the share itself.

    Args:
        total: overall number of words across all categories.
        bot_total: number of words in one category.

    Returns:
        ``total / bot_total``, or 0 if ``bot_total`` is zero (the category
        has no words).
    """
    try:
        return total / bot_total
    except ZeroDivisionError:
        # Only the empty-category case is expected here; any other error
        # (e.g. a non-numeric argument) should surface instead of silently
        # turning into a multiplier of 0.
        return 0

# calculate the uniqueness score of each word for that category
def get_scores(bot_dict, master_dict, bot_multiplier):
    """Compute a uniqueness score for every word in ``master_dict``.

    The score of a word is
    ``(bot_dict[word] ** 2 / master_dict[word]) * bot_multiplier``.

    Args:
        bot_dict: word -> count within one category; must contain every key
            of ``master_dict``.
        master_dict: word -> count across all categories.
        bot_multiplier: scaling factor applied to every score.

    Returns:
        A new dict mapping each word of ``master_dict`` to a one-element list
        holding its score. Words whose overall count is 0 get a score of 0.0.

    Raises:
        KeyError: if a word of ``master_dict`` is missing from ``bot_dict``.
    """
    score_dict = {}
    for word, overall_count in master_dict.items():
        if overall_count == 0:
            # A word with no recorded occurrences would divide by zero;
            # give it a score of 0.0 instead of aborting the whole run.
            score = 0.0
        else:
            score = ((bot_dict[word] ** 2) / overall_count) * bot_multiplier
        score_dict[word] = [score]
    return score_dict


In [2]:
# Load the combined tweet dataset and keep only the rows whose language is
# English. The source datasets were compiled in R into this one very large CSV,
# so avoid displaying these frames: rendering them takes a very long time.
tweets = pd.read_csv("IRAhandle_tweets.csv")
english_tweets = tweets.loc[tweets['language'] == 'English']

In [3]:
# Collect every word of every English tweet, then build a zero-filled
# template dict with one key per distinct word.
All_Tweets = get_content(english_tweets)
blank_dict = dict.fromkeys(All_Tweets, 0)
# Frequency of each word across all English tweets.
master_dict = get_freq(All_Tweets, blank_dict)

In [4]:
# Unknown category: list its words, then count them against the full vocabulary
Unknown = get_category(series=english_tweets, category='Unknown')
Unknown_dict = get_freq(vector=Unknown, my_dict=blank_dict)

In [5]:
# Fearmonger category: list its words, then count them against the full vocabulary
Fearmonger = get_category(series=english_tweets, category='Fearmonger')
Fearmonger_dict = get_freq(vector=Fearmonger, my_dict=blank_dict)

In [6]:
# NonEnglish category: list its words, then count them against the full vocabulary
NonEnglish = get_category(series=english_tweets, category='NonEnglish')
NonEnglish_dict = get_freq(vector=NonEnglish, my_dict=blank_dict)

In [7]:
# Commercial category: list its words, then count them against the full vocabulary
Commercial = get_category(series=english_tweets, category='Commercial')
Commercial_dict = get_freq(vector=Commercial, my_dict=blank_dict)

In [8]:
# HashtagGamer category: list its words, then count them against the full vocabulary
HashtagGamer = get_category(series=english_tweets, category='HashtagGamer')
HashtagGamer_dict = get_freq(vector=HashtagGamer, my_dict=blank_dict)

In [9]:
# LeftTroll category: list its words, then count them against the full vocabulary
LeftTroll = get_category(series=english_tweets, category='LeftTroll')
LeftTroll_dict = get_freq(vector=LeftTroll, my_dict=blank_dict)

In [10]:
# NewsFeed category: list its words, then count them against the full vocabulary
NewsFeed = get_category(series=english_tweets, category='NewsFeed')
NewsFeed_dict = get_freq(vector=NewsFeed, my_dict=blank_dict)

In [11]:
# RightTroll category: list its words, then count them against the full vocabulary
RightTroll = get_category(series=english_tweets, category='RightTroll')
RightTroll_dict = get_freq(vector=RightTroll, my_dict=blank_dict)

In [12]:
# Sum up the total frequency of each word across the eight categories.
# Build a fresh dict rather than binding master_dict to blank_dict: assignment
# does not copy, so writing through an alias would overwrite the zero-filled
# template that the per-category cells rely on.
master_dict = {
    key: (
        Unknown_dict[key]
        + Fearmonger_dict[key]
        + NonEnglish_dict[key]
        + Commercial_dict[key]
        + HashtagGamer_dict[key]
        + LeftTroll_dict[key]
        + NewsFeed_dict[key]
        + RightTroll_dict[key]
    )
    for key in blank_dict
}

In [13]:
# Total number of words counted in each category...
NewsFeed_total = get_total(d=NewsFeed_dict)
RightTroll_total = get_total(d=RightTroll_dict)
Commercial_total = get_total(d=Commercial_dict)
NonEnglish_total = get_total(d=NonEnglish_dict)
Fearmonger_total = get_total(d=Fearmonger_dict)
HashtagGamer_total = get_total(d=HashtagGamer_dict)
LeftTroll_total = get_total(d=LeftTroll_dict)
Unknown_total = get_total(d=Unknown_dict)

# ...and across all eight categories combined.
total = sum((
    NewsFeed_total,
    RightTroll_total,
    Commercial_total,
    NonEnglish_total,
    Fearmonger_total,
    HashtagGamer_total,
    LeftTroll_total,
    Unknown_total,
))

In [14]:
# Per-category multiplier: the overall word total divided by the category's
# word total (get_multiplier returns 0 for a category with no words).
NewsFeed_multiplier = get_multiplier(total=total, bot_total=NewsFeed_total)
RightTroll_multiplier = get_multiplier(total=total, bot_total=RightTroll_total)
Commercial_multiplier = get_multiplier(total=total, bot_total=Commercial_total)
NonEnglish_multiplier = get_multiplier(total=total, bot_total=NonEnglish_total)
Fearmonger_multiplier = get_multiplier(total=total, bot_total=Fearmonger_total)
HashtagGamer_multiplier = get_multiplier(total=total, bot_total=HashtagGamer_total)
LeftTroll_multiplier = get_multiplier(total=total, bot_total=LeftTroll_total)
Unknown_multiplier = get_multiplier(total=total, bot_total=Unknown_total)

In [15]:
# Uniqueness score of every word, computed separately for each category from
# that category's counts, the overall counts and the category's multiplier.
NewsFeed_scores = get_scores(
    bot_dict=NewsFeed_dict, master_dict=master_dict, bot_multiplier=NewsFeed_multiplier)
RightTroll_scores = get_scores(
    bot_dict=RightTroll_dict, master_dict=master_dict, bot_multiplier=RightTroll_multiplier)
Commercial_scores = get_scores(
    bot_dict=Commercial_dict, master_dict=master_dict, bot_multiplier=Commercial_multiplier)
NonEnglish_scores = get_scores(
    bot_dict=NonEnglish_dict, master_dict=master_dict, bot_multiplier=NonEnglish_multiplier)
Fearmonger_scores = get_scores(
    bot_dict=Fearmonger_dict, master_dict=master_dict, bot_multiplier=Fearmonger_multiplier)
HashtagGamer_scores = get_scores(
    bot_dict=HashtagGamer_dict, master_dict=master_dict, bot_multiplier=HashtagGamer_multiplier)
LeftTroll_scores = get_scores(
    bot_dict=LeftTroll_dict, master_dict=master_dict, bot_multiplier=LeftTroll_multiplier)
Unknown_scores = get_scores(
    bot_dict=Unknown_dict, master_dict=master_dict, bot_multiplier=Unknown_multiplier)

In [16]:
# Save each category's scores to its own CSV: one row per word, with the score
# in a single column labelled 0.
# orient='index' builds that tall frame directly from the {word: [score]}
# dicts, instead of first creating a one-row frame with one column per word
# and then transposing it.
pd.DataFrame.from_dict(RightTroll_scores, orient='index').to_csv('RightTroll_Scores2a.csv')
pd.DataFrame.from_dict(LeftTroll_scores, orient='index').to_csv('LeftTroll_Scores2a.csv')
pd.DataFrame.from_dict(Commercial_scores, orient='index').to_csv('Commercial_Scores2a.csv')
pd.DataFrame.from_dict(Fearmonger_scores, orient='index').to_csv('Fearmonger_Scores2a.csv')
pd.DataFrame.from_dict(HashtagGamer_scores, orient='index').to_csv('HashtagGamer_Scores2a.csv')
pd.DataFrame.from_dict(Unknown_scores, orient='index').to_csv('Unknown_Scores2a.csv')
pd.DataFrame.from_dict(NonEnglish_scores, orient='index').to_csv('NonEnglish_Scores2a.csv')
pd.DataFrame.from_dict(NewsFeed_scores, orient='index').to_csv('NewsFeed_Scores2a.csv')