In [129]:
import numpy as N
import pandas as pd
import json
import matplotlib.pyplot as plt
from IPython.display import clear_output
import re
import string
plt.style.use('nuala')

In [287]:
# path (relative to the working directory) of the tweet dump that is read line by line below
filename='data/stream__thelindywest___Lesdoggg___KimKardashian.json'

In [288]:
# read in the training set data: every line that parses as JSON becomes one entry
tweets_data = []
# the context manager closes the file handle even if reading fails part-way
with open(filename, "r") as tweets_file:
    for line in tweets_file:
        try:
            tweet = json.loads(line)
        except ValueError:
            # json.loads raises ValueError (JSONDecodeError on Python 3) for blank
            # or malformed lines; those are skipped, anything else is a real error
            continue
        tweets_data.append(tweet)

In [401]:
# get the relevant info about each tweet -- this might not be exhaustive
# Columns are built with list comprehensions rather than map(): map() returns a
# lazy iterator on Python 3, while a list comprehension gives a list on both
# Python 2 and Python 3.
tweets = pd.DataFrame()
tweets['text'] = [tweet['text'] for tweet in tweets_data]
tweets['retweeted'] = [tweet['retweeted'] for tweet in tweets_data]
tweets['username'] = [tweet['user']['screen_name'] for tweet in tweets_data]
tweets['verified'] = [tweet['user']['verified'] for tweet in tweets_data]
tweets['hashtags'] = [tweet['entities']['hashtags'] for tweet in tweets_data]
tweets['number of hashtags'] = [len(tweet['entities']['hashtags']) for tweet in tweets_data]
# one list of mentioned screen names per tweet
tweets['user_mentions'] = [[mention['screen_name'] for mention in tweet['entities']['user_mentions']]
                           for tweet in tweets_data]

tweets['favorited'] = [tweet['favorited'] for tweet in tweets_data]
tweets['timestamp'] = [tweet['created_at'] for tweet in tweets_data]
tweets['source'] = [tweet['source'] for tweet in tweets_data]

In [415]:
# normalise the tweet text in one pass: lower-case it, then blank out URLs
# NOTE(review): read as a regular expression, 'https?:.*' matches from the start
# of a link to the end of its line, so any words following a link are removed too
tweets['text'] = tweets['text'].str.lower().str.replace('https?:.*', '')

In [403]:
#training set classification (as found in the other notebook)
# loaded via a path relative to the working directory, like the other data/ files
# in this notebook, rather than an absolute path into one user's home directory
# NOTE(review): assumes the notebook is run from the directory containing data/ -- confirm
troll_class = N.load('data/training_set_classification.npy')

In [416]:
#list of words in each tweet - this will be used to figure out the most common words used
# the text is split on whitespace, commas and full stops; the pattern is a raw
# string so that '\s' and '\.' are not left to unrecognised string-escape handling
text_lists = tweets['text'].str.split(r'[\s,\.]')

In [417]:
# data sets of positive / negative / curse words, these may come in handy
def _read_word_list(path, **loadtxt_options):
    """Read a text file with N.loadtxt as strings and return the result of .tolist()."""
    return N.loadtxt(path, dtype=str, **loadtxt_options).tolist()


# the opinion lexicon files are read with ';' as the comment marker
negative_words = _read_word_list('data/opinion-lexicon-English/negative-words.txt', comments=';')
positive_words = _read_word_list('data/opinion-lexicon-English/positive-words.txt', comments=';')
# the curse word files are read with a newline as the delimiter
curse_words1 = _read_word_list('data/opinion-lexicon-English/curse-words.txt', delimiter='\n')
curse_words2 = _read_word_list('data/opinion-lexicon-English/swearWords.txt', delimiter='\n')

In [418]:
# very frequent filler words (articles, prepositions, ...) that we don't care about
common_words = 'and a an the is are in of to this that it its on at as'.split()



In [419]:
# get 2 lists of all the (interesting) words used by trolls and non-trolls

troll_text = []
normal_text = []

# prefixes that mark mentions, hash tags and URLs
_SKIPPED_PREFIXES = ('@', '#', 'htt')
# matches any single ASCII punctuation character
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))

for i, words in enumerate(text_lists):
    # send this tweet's words to the list that matches its training label;
    # the explicit comparison is kept because the dtype of troll_class is not
    # visible here and `== True` differs from plain truthiness for non-booleans
    if troll_class[i] == True:
        wordlist = troll_text
    else:
        wordlist = normal_text
    for word in words:
        # drop every non-ASCII character
        tmp = word.encode('ascii', 'ignore')
        if not isinstance(tmp, str):
            # on Python 3 encode() returns bytes, so convert back to text;
            # on Python 2 the encoded value already is a str and is left alone
            tmp = tmp.decode('ascii')
        # we don't care about mentions or hash tags or RT or URLs
        if tmp.startswith(_SKIPPED_PREFIXES) or tmp == 'rt':
            continue
        # take out punctuation
        tmp = _PUNCTUATION_RE.sub('', tmp)
        if tmp == '':
            continue
        # don't include words that are just numbers
        if re.search('[a-z]', tmp) is None:
            continue
        # don't include the common words
        if tmp in common_words:
            continue
        # if it makes it here, we probably have a proper word
        wordlist.append(tmp)
    
    



In [420]:
# number of words collected in normal_text and troll_text respectively
# print is called with a single parenthesised argument, which produces the same
# output under the Python 2 print statement and the Python 3 print function
print(len(normal_text))
print(len(troll_text))

10828
638


In [421]:
# the following are a list of the words most commonly used by things classified as trolls and non-trolls
# it is in order of the most common: argsort orders the counts ascending, so the
# resulting index array is reversed before it is used to reorder the unique words

# words from normal_text
utext1, count1 = N.unique(normal_text, return_counts=True)
index1 = N.argsort(count1)[::-1]
nontroll_words = utext1[index1]

# words from troll_text
utext2, count2 = N.unique(troll_text, return_counts=True)
index2 = N.argsort(count2)[::-1]
troll_words = utext2[index2]

In [423]:
# show the 30 most frequent words of each group (troll list first)
# print is called with a single parenthesised argument, which produces the same
# output under the Python 2 print statement and the Python 3 print function
print(troll_words[0:30])
print(nontroll_words[0:30])

['you' 'i' 'kim' 'god' 'u' 'so' 'like' 'ya' 'lol' 'what' 'kimisaripoff'
 'shit' 'me' 'your' 'my' 'know' 'amp' 'slayer' 'for' 'get' 'was' 'just'
 'fucking' 'ur' 'got' 'her' 'good' 'whore' 'all' 'do']
['about' 'report' 'much' 'petition' 'online' 'ado' 'you' 'i' 'be' 'de'
 'robbery' 'for' 'do' 'des' 'what' 'your' 'her' 'she' 'en' 'like' 'was'
 'so' 'shes' 'people' 'new' 'not' 'robbed' 'my' 'amp' 'paris']


In [366]:
# numeric code derived from the 'source' column; 0 means none of the patterns matched
# rules are applied in the order listed, so a later match overwrites an earlier one
_source_rules = [
    ('iPhone|iPad', 1),
    ('Android', 2),
    ('Web Client', 3),
    ('twittbot.net', 4),
    ('SocialFlow', 5),
    ('Windows Phone', 6),
    ('BlackBerry', 7),
]

tweets['source category'] = 0
for _pattern, _category in _source_rules:
    tweets.loc[tweets['source'].str.contains(_pattern), 'source category'] = _category


In [425]:
# time stamps of the tweets whose training label equals 1
_is_troll = troll_class == 1
tweets.loc[_is_troll, 'timestamp']

26      Wed Oct 05 23:17:40 +0000 2016
27      Wed Oct 05 23:17:44 +0000 2016
29      Wed Oct 05 23:17:49 +0000 2016
30      Wed Oct 05 23:17:53 +0000 2016
33      Wed Oct 05 23:18:06 +0000 2016
34      Wed Oct 05 23:18:09 +0000 2016
36      Wed Oct 05 23:18:13 +0000 2016
37      Wed Oct 05 23:18:16 +0000 2016
39      Wed Oct 05 23:18:20 +0000 2016
41      Wed Oct 05 23:18:25 +0000 2016
42      Wed Oct 05 23:18:28 +0000 2016
43      Wed Oct 05 23:18:33 +0000 2016
45      Wed Oct 05 23:18:38 +0000 2016
49      Wed Oct 05 23:18:43 +0000 2016
51      Wed Oct 05 23:18:47 +0000 2016
52      Wed Oct 05 23:18:51 +0000 2016
54      Wed Oct 05 23:18:54 +0000 2016
56      Wed Oct 05 23:18:56 +0000 2016
57      Wed Oct 05 23:18:57 +0000 2016
59      Wed Oct 05 23:19:10 +0000 2016
61      Wed Oct 05 23:19:18 +0000 2016
63      Wed Oct 05 23:19:26 +0000 2016
67      Wed Oct 05 23:19:33 +0000 2016
71      Wed Oct 05 23:19:41 +0000 2016
74      Wed Oct 05 23:19:29 +0000 2016
84      Wed Oct 05 23:20:

In [285]:
# inspect the raw 'source' value at index label 5 among the tweets still in category 0
tweets.loc[tweets['source category'] == 0, 'source'][5]

u'<a href="http://twittbot.net/" rel="nofollow">twittbot.net</a>'

In [274]:
#features to include:
# *whether the tweet contains the N most common troll words
# *whether the tweet contains a few really bad curse words (these might not get pulled out as separate words
#  because they could be in a hashtag, so just check if the full text of the tweet contains those words)
# *number of hash tags
# *device used (iphone/ipad, android, blackberry, etc)
# *time stamp?
# *verified






0        True
1       False
2       False
3       False
4        True
5       False
6       False
7       False
8       False
9        True
10       True
11       True
12      False
13      False
14      False
15      False
16      False
17       True
18       True
19      False
20      False
21      False
22      False
23      False
24      False
25      False
26      False
27      False
28      False
29      False
        ...  
1493     True
1494    False
1495     True
1496    False
1497    False
1498    False
1499     True
1500    False
1501    False
1502    False
1503    False
1504    False
1505     True
1506    False
1507     True
1508    False
1509     True
1510    False
1511    False
1512    False
1513    False
1514    False
1515    False
1516     True
1517    False
1518    False
1519    False
1520    False
1521    False
1522    False
Name: source, dtype: bool