In [129]:
import numpy as N
import pandas as pd
import json
import matplotlib.pyplot as plt
from IPython.display import clear_output
import re
import string
plt.style.use('nuala')

In [518]:
filename='data/stream__thelindywest___Lesdoggg___KimKardashian.json'

In [519]:
# read in the training set data
tweets_data = []
tweets_file = open(filename, "r")
for line in tweets_file:
    try:
        tweet = json.loads(line)
        tweets_data.append(tweet)
    except:
        continue

In [542]:
# get the relevant info about each tweet -- this might not be exhaustive
tweets = pd.DataFrame()
tweets['text'] = map(lambda tweet: tweet['text'], tweets_data)
tweets['retweeted'] = map(lambda tweet: tweet['retweeted'], tweets_data)
tweets['username'] = map(lambda tweet: tweet['user']['screen_name'], tweets_data)
tweets['verified'] = map(lambda tweet: int(tweet['user']['verified']), tweets_data)
tweets['hashtags'] = map(lambda tweet: [xx['text'].lower() for xx in tweet['entities']['hashtags']], tweets_data)
tweets['number of hashtags'] = map(lambda tweet: len(tweet['entities']['hashtags']), tweets_data)
tweets['user_mentions'] = map(lambda tweet: [tweet['entities']['user_mentions'][i]['screen_name'] for i in N.arange(len(tweet['entities']['user_mentions']))], tweets_data)

tweets['favorited'] = map(lambda tweet: tweet['favorited'], tweets_data)
tweets['timestamp'] = map(lambda tweet: int(tweet['created_at'].split()[3].split(':')[0])+int(tweet['created_at'].split()[3].split(':')[1])/60., tweets_data)
tweets['source'] = map(lambda tweet: tweet['source'], tweets_data)

In [543]:
#change text to lower case
tweets['text']=tweets['text'].str.lower()
tweets['text'] = tweets['text'].str.replace('https?:.*', 'httpaddr')

In [544]:
#training set classification (as found in the other notebook)
troll_class = N.load('/Users/nuala/Documents/Research/Code/repos/twitter-trolls/data/training_set_classification.npy')

In [545]:
#list of words in each tweet - this will be used to figure out the most common words used


text_lists = tweets['text'].str.split('[\s,\.]')

In [546]:
# data sets of positive / negative / curse words, these may come in handy
negative_words = N.loadtxt('data/opinion-lexicon-English/negative-words.txt', dtype=str, comments=';').tolist()
positive_words = N.loadtxt('data/opinion-lexicon-English/positive-words.txt', dtype=str, comments=';').tolist()
curse_words1 = N.loadtxt('data/opinion-lexicon-English/curse-words.txt', dtype=str, delimiter='\n').tolist()
curse_words2 = N.loadtxt('data/opinion-lexicon-English/swearWords.txt', dtype=str, delimiter='\n').tolist()

In [547]:
# a list of common words we don't care about

common_words = ['and', 'a','an', 'the', 'is', 'are', 'in', 'of', 'to', 'this', 'that', 'it', 'its', 'on', 'at', 'as']



In [548]:
# get 2 lists of all the (interesting) words used by trolls and non-trolls

troll_text = []
normal_text = []

for i in N.arange(len(text_lists)):
    if (troll_class[i]==True):
        wordlist = troll_text
    else:
        wordlist = normal_text
    for word in text_lists[i]:
            
        tmp = word.encode('ascii', 'ignore')
        # we don't care about mentions or hash tags or RT or URLs
        if ((tmp.startswith('@')) | (tmp.startswith('#')) | (tmp=='rt') | (tmp.startswith('htt'))):
            continue
        # take out punctuation
        tmp = tmp.translate(None, string.punctuation)
        if (tmp == ''):
            continue
        # don't include words that are just numbers
        if (re.search('[a-z]', tmp) == None):
            continue
        # don't include the common words
        if (tmp in common_words):
            continue
        if (tmp.startswith('kim')):
            continue
        # if it makes it here, we probably have a proper word
        wordlist.append(tmp)
    
    



In [549]:
print len(normal_text)
print len(troll_text)

10798
613


In [550]:
utext1, count1 = N.unique(normal_text, return_counts=True)
utext2, count2 = N.unique(troll_text, return_counts=True)
index1 = N.argsort(count1)[::-1]
index2 = N.argsort(count2)[::-1]

# the following are a list of the words most commonly used by things classified as trolls and non-trolls
# it is in order of the most common
nontroll_words = utext1[index1]
troll_words = utext2[index2]

In [551]:
print troll_words[0:30]
print nontroll_words[0:30]

['you' 'i' 'god' 'so' 'u' 'like' 'ya' 'lol' 'shit' 'what' 'me' 'amp' 'was'
 'get' 'slayer' 'your' 'my' 'know' 'for' 'fucking' 'got' 'good' 'do' 'make'
 'just' 'whore' 'her' 'all' 'ur' 'slut']
['about' 'report' 'much' 'petition' 'online' 'ado' 'you' 'i' 'be' 'de'
 'robbery' 'for' 'do' 'des' 'what' 'your' 'her' 'she' 'en' 'so' 'like'
 'was' 'shes' 'people' 'new' 'not' 'robbed' 'my' 'amp' 'paris']


In [552]:
tweets['source category'] = 0
tweets.loc[tweets['source'].str.contains('iPhone|iPad'), 'source category'] = 1
tweets.loc[tweets['source'].str.contains('Android'), 'source category'] = 2
tweets.loc[tweets['source'].str.contains('Web Client'), 'source category'] = 3
tweets.loc[tweets['source'].str.contains('twittbot.net'), 'source category'] = 4
tweets.loc[tweets['source'].str.contains('SocialFlow'), 'source category'] = 5
tweets.loc[tweets['source'].str.contains('Windows Phone'), 'source category'] = 6
tweets.loc[tweets['source'].str.contains('BlackBerry'), 'source category'] = 7


In [594]:
num = 50
word_list = troll_words[0:num].tolist()

#check for these words as well, these are the only ones that will be checked for in the hashtags
really_bad_words = ['cunt', 'nigger', 'slut']


for i in N.arange(len(word_list)):
    column_name = 'used word %i'%i
    tweets[column_name]=0
for i in N.arange(len(really_bad_words)):
    column_name = 'bad word %i'%i
    tweets[column_name]=0
    
for i in N.arange(len(tweets)):
    for word in text_lists[i]:
        tmp = word.encode('ascii', 'ignore')
        #first check the list of actual words in the tweet (not hashtags)
        if (word in word_list):
            ii = word_list.index(word)
            column_name = 'used word %i'%ii
            tweets.loc[i, column_name]=1
        if (word in really_bad_words):
            ii = really_bad_words.index(word)
            column_name = 'bad word %i'%ii
            tweets.loc[i, column_name]=1
        else:
            continue
    #next check the list of hashtags
    for word in tweets.loc[i, 'hashtags']:
        tmp = word.encode('ascii', 'ignore')
        
        for bad_word in really_bad_words:
            if (word.find(bad_word)>-1):
                ii = really_bad_words.index(bad_word)
                column_name = 'bad word %i'%ii
                tweets.loc[i, column_name]=1
            else:
                continue
                    




In [595]:
tweets.loc[tweets['bad word 0']==1, 'username']

1288    InfamousSOB247
Name: username, dtype: object

In [596]:
#features to include:
# *whether the tweet contains the N most common troll words
# *whether the tweet contains a few really bad curse words (these might not get pulled out as separate words
#  because they could be in a hashtag, so just check if the full text of the tweet contains those words)
# *number of hash tags
# *device used (iphone/ipad, android, blackberry, etc)
# *time stamp?
# *verified

columns = ['verified', 'timestamp', 'number of hashtags', 'source category']

for i in N.arange(len(word_list)):
    column_name = 'used word %i'%i
    columns.append(column_name)
for i in N.arange(len(really_bad_words)):
    column_name = 'bad word %i'%i
    columns.append(column_name)

In [597]:
features = tweets.loc[:, columns]

In [598]:
ntot = len(features)
ntrain = N.int(ntot / 2)
ncross = N.int(ntot / 2)
rind = N.random.permutation(ntot)
training_set = features.loc[rind[0:ntrain], :]
ytrain = troll_class[rind[0:ntrain]]
validation_set = features.loc[rind[ntrain:], :]
yval = troll_class[rind[ntrain:]]


In [599]:
print len(training_set)
print len(ytrain)
print len(validation_set)

761
761
762


In [557]:
from sklearn import svm

In [624]:
clf = svm.SVC(gamma = 0.1)

In [625]:
clf.fit(training_set, ytrain)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [626]:
ypredict = clf.predict(validation_set)

In [627]:
N.sum(ypredict == yval) / N.float(len(yval))

0.92125984251968507

In [593]:
clf.n_support_

array([105,  85], dtype=int32)