In [119]:
import numpy as N
import pandas as pd
import json
import matplotlib.pyplot as plt
from IPython.display import clear_output
import re
import string
%matplotlib inline
plt.style.use('nuala')

In [120]:
filename='data/stream__thelindywest___Lesdoggg___KimKardashian.json'

In [121]:
# read in the training set data
tweets_data = []
tweets_file = open(filename, "r")
for line in tweets_file:
    try:
        tweet = json.loads(line)
        tweets_data.append(tweet)
    except:
        continue

In [122]:
# get the relevant info about each tweet -- this might not be exhaustive
tweets = pd.DataFrame()

# Text of the tweet
tweets['text'] = map(lambda tweet: tweet['text'], tweets_data)

# Whether or not the tweet was retweeted
tweets['retweeted'] = map(lambda tweet: tweet['retweeted'], tweets_data)

# Username of person who tweeted it
tweets['username'] = map(lambda tweet: tweet['user']['screen_name'], tweets_data)

# Whether the account is verified
tweets['verified'] = map(lambda tweet: tweet['user']['verified'], tweets_data)

# List of hashtags used
tweets['hashtags'] = map(lambda tweet: [tweet['entities']['hashtags'][i]['text'].lower() for i in N.arange(len(tweet['entities']['hashtags']))], tweets_data)

# Number of hashtags used
tweets['number of hashtags'] = map(lambda tweet: len(tweet['entities']['hashtags']), tweets_data)

# List of users mentioned in the tweet
tweets['user_mentions'] = map(lambda tweet: [tweet['entities']['user_mentions'][i]['screen_name'] for i in N.arange(len(tweet['entities']['user_mentions']))], tweets_data)

# Whether the tweet has been favorited
tweets['favorited'] = map(lambda tweet: tweet['favorited'], tweets_data)

# Time stamp of tweet
tweets['timestamp'] = map(lambda tweet: int(tweet['created_at'].split()[3].split(':')[0])+int(tweet['created_at'].split()[3].split(':')[1])/60., tweets_data)

# Source of tweet (iPhone, Android, etc)
tweets['source'] = map(lambda tweet: tweet['source'], tweets_data)

# Language
tweets['language'] = map(lambda tweet: tweet['lang'], tweets_data)

In [123]:
# Change text to lower case
tweets['text']=tweets['text'].str.lower()

# Replace URLs with 'httpaddr'
tweets['text'] = tweets['text'].str.replace('https?:.*', 'httpaddr')

In [124]:
# Create a new column called 'Source Category'
tweets['source category'] = 0
tweets.loc[tweets['source'].str.contains('iPhone|iPad'), 'source category'] = 1
tweets.loc[tweets['source'].str.contains('Android'), 'source category'] = 2
tweets.loc[tweets['source'].str.contains('Web Client'), 'source category'] = 3
tweets.loc[tweets['source'].str.contains('twittbot.net'), 'source category'] = 4
tweets.loc[tweets['source'].str.contains('SocialFlow'), 'source category'] = 5
tweets.loc[tweets['source'].str.contains('Windows Phone'), 'source category'] = 6
tweets.loc[tweets['source'].str.contains('BlackBerry'), 'source category'] = 7

In [125]:
# Training set classification (as found in the other notebook)
troll_class = N.load('/Users/nuala/Documents/Research/Code/repos/twitter-trolls/data/training_set_classification.npy')

In [126]:
# List of words in each tweet - this will be used to figure out the most common words used
text_lists = tweets.loc[:, 'text'].str.split('[\s,\.]')

In [127]:
# data sets of positive / negative / curse words, these may come in handy
negative_words = N.loadtxt('data/opinion-lexicon-English/negative-words.txt', dtype=str, comments=';').tolist()
positive_words = N.loadtxt('data/opinion-lexicon-English/positive-words.txt', dtype=str, comments=';').tolist()
curse_words1 = N.loadtxt('data/opinion-lexicon-English/curse-words.txt', dtype=str, delimiter='\n').tolist()
curse_words2 = N.loadtxt('data/opinion-lexicon-English/swearWords.txt', dtype=str, delimiter='\n').tolist()
really_bad_words = N.loadtxt('data/opinion-lexicon-English/really_bad_words.txt', dtype =str, delimiter='\n').tolist()

In [128]:
# A list of common words we don't care about
common_words = ['and', 'a','an', 'the', 'is', 'are', 'in', 'of', 'to', 'this', 'that', 'it', 'its', 'on', 'at', 'as']

In [129]:
# get 2 lists of all the (interesting) words used by trolls and non-trolls

troll_text = []
normal_text = []

for i in text_lists.index:
    # Only consider tweets that are in English
    if (tweets.loc[i, 'language']!='en'):
        continue
        
    # If this is a troll tweet, add text to troll_text, otherwise add it to normal_text
    if (troll_class[i]==True):
        wordlist = troll_text
    else:
        wordlist = normal_text
        
    # Go through the words in the tweet
    for word in text_lists[i]:
        tmp = word.encode('ascii', 'ignore')
        # we don't care about mentions or hash tags or RT or URLs
        if ((tmp.startswith('@')) | (tmp.startswith('#')) | (tmp=='rt') | (tmp.startswith('htt'))):
            continue
        # take out punctuation
        tmp = tmp.translate(None, string.punctuation)
        if (tmp == ''):
            continue
        # don't include words that are just numbers
        if (re.search('[a-z]', tmp) == None):
            continue
        # don't include the common words
        if (tmp in common_words):
            continue
        if (tmp.startswith('kim')):
            continue
        # if it makes it here, we probably have a proper word
        wordlist.append(tmp)


In [130]:
print 'Number of words added to non-troll word list:', len(normal_text)
print 'Number of words added to troll word list:', len(troll_text)

Number of words added to non-troll word list: 9437
Number of words added to troll word list: 605


In [131]:
# Get unique words in these lists, sort by most common
utext1, count1 = N.unique(normal_text, return_counts=True)
utext2, count2 = N.unique(troll_text, return_counts=True)
index1 = N.argsort(count1)[::-1]
index2 = N.argsort(count2)[::-1]

# the following are a list of the words most commonly used by things classified as trolls and non-trolls
# it is in order of the most common
nontroll_words = utext1[index1]
troll_words = utext2[index2]

In [132]:
print 'Most common troll words:', troll_words[0:30]
print 'Most common non-troll words:', nontroll_words[0:30]

Most common troll words: ['you' 'i' 'god' 'u' 'like' 'so' 'ya' 'shit' 'me' 'what' 'for' 'know' 'get'
 'amp' 'my' 'was' 'your' 'got' 'good' 'ur' 'do' 'make' 'her' 'lol' 'just'
 'robbery' 'all' 'fucking' 'whore' 'black']
Most common non-troll words: ['about' 'report' 'much' 'online' 'petition' 'ado' 'you' 'i' 'be' 'robbery'
 'for' 'what' 'her' 'your' 'do' 'she' 'so' 'was' 'shes' 'like' 'people'
 'new' 'not' 'robbed' 'my' 'but' 'amp' 'or' 'im' 'paris']


In [133]:
# Use the 50 most common troll words and 50 most common normal words
num = 50
word_list = troll_words[0:num].tolist() + nontroll_words[0:num].tolist()

In [134]:

# Initialize new columns
for i in N.arange(len(word_list)):
    column_name = 'used word %i'%i
    tweets[column_name]=False
for i in N.arange(len(really_bad_words)):
    column_name = 'bad word %i'%i
    tweets[column_name]=False
    

for i in N.arange(len(tweets)):
    for word in text_lists[i]:
        tmp = word.encode('ascii', 'ignore')
        #first check the list of actual words in the tweet (not hashtags)
        if (word in word_list):
            ii = word_list.index(word)
            column_name = 'used word %i'%ii
            tweets.loc[i, column_name]=True
        if (word in really_bad_words):
            ii = really_bad_words.index(word)
            column_name = 'bad word %i'%ii
            tweets.loc[i, column_name]=True
        else:
            continue
    #next check the list of hashtags
    for word in tweets.loc[i, 'hashtags']:
        tmp = word.encode('ascii', 'ignore')
        for bad_word in really_bad_words:
            if (word.find(bad_word)>-1):
                ii = really_bad_words.index(bad_word)
                column_name = 'bad word %i'%ii
                tweets.loc[i, column_name]=True
            else:
                continue
                    




In [135]:
#features to include:
# *whether the tweet contains the N most common troll words
# *whether the tweet contains a few really bad curse words (these might not get pulled out as separate words
#  because they could be in a hashtag, so just check if the full text of the tweet contains those words)
# *number of hash tags
# *device used (iphone/ipad, android, blackberry, etc)
# *time stamp?
# *verified

columns = ['verified', 'timestamp', 'number of hashtags', 'source category']

for i in N.arange(len(word_list)):
    column_name = 'used word %i'%i
    columns.append(column_name)
for i in N.arange(len(really_bad_words)):
    column_name = 'bad word %i'%i
    columns.append(column_name)

In [136]:
features = tweets.loc[:, columns]

In [137]:
ntot = len(features)
ntrain = N.int(ntot / 3)
ncross = N.int(ntot / 3)
ntest = ntot - ntrain - ncross

# Randomize indices
rind = N.transpose([N.arange(ntot), N.random.permutation(ntot)])

# training set
training_set = features.loc[rind[0:ntrain, 1], :]
ytrain = troll_class[rind[0:ntrain, 1]]

# cross validation set
validation_set = features.loc[rind[ntrain:ntrain+ncross, 1], :]
yval = troll_class[rind[ntrain:ntrain+ncross, 1]]

# test set
test_set = features.loc[rind[ntrain+ncross:, 1], :]
ytest = troll_class[rind[ntrain+ncross:, 1]]


In [138]:
print 'Size of training set:', len(training_set)
print 'Size of validation set:', len(validation_set)
print 'Size of test set:', len(test_set)

Size of training set: 507
Size of validation set: 507
Size of test set: 509


In [139]:
# Support vector machine
from sklearn import svm

In [140]:
# Find the values of C and kernel that minimize false positives (wrongly classify non-trolling tweets as trolling)
numc = 20
nkernel = 3

precision = N.zeros((numc, nkernel), dtype=N.float)

j = 0
for kernel in ['linear', 'rbf', 'poly']:
    i = 0
    for c in N.logspace(-3, 1, numc):
        clf = svm.SVC(kernel=kernel, C = c)
        clf.fit(training_set, ytrain)
        ypredict = clf.predict(validation_set)
        
        precision[i,j] = N.sum((ypredict == True) & (yval == True))/ N.float(N.sum(yval==True))
        i = i+1
    j = j+1
        
        

In [143]:
ind = N.unravel_index(precision.argmax(), precision.shape)
cval = N.logspace(-3, 1, numc)[ind[0]]
kernel = ['linear', 'rbf', 'poly'][ind[1]]

print 'Best value of C:', cval
print 'Best kernel:', kernel
print 'Precision:', N.max(precision)

Best value of C: 0.88586679041
Best kernel: linear
Precision: 0.645833333333


In [144]:
# Using optimal value of C and kernel, train the algorithm again
clf = svm.SVC(kernel=kernel, C=cval)
clf.fit(training_set, ytrain)

SVC(C=0.88586679041008232, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [145]:
ypredict = clf.predict(test_set)

In [146]:
N.sum(ypredict == ytest) / N.float(len(ytest))

0.93320235756385073

In [147]:
false_negative = (ypredict == False) & (ytest == True)
print 'False negative rate:', N.sum(false_negative)/ N.float(len(ytest))

False negative rate: 0.0550098231827


In [148]:
false_positive = (ypredict == True) & (ytest == False)
print 'False positive rate:', N.sum(false_positive)/ N.float(len(ytest))

False positive rate: 0.0117878192534


In [152]:
# An example of false positive (I didn't think it was a troll but algorithm marked it as troll)
print tweets.loc[test_set.loc[false_positive].index[0], 'text']

rt @mxtros: @theshoenice @bethrose669 @kimkardashian @kanyewest fuck haha


In [153]:
# An example of false negative (troll not picked up by algorithm)
print tweets.loc[test_set.loc[false_negative].index[0], 'text']

@kimkardashian so are your boobs
