In [1]:
import numpy as N
import pandas as pd
import json
import matplotlib.pyplot as plt
from IPython.display import clear_output
plt.style.use('nuala')



In [2]:
filename='data/stream__thelindywest___Lesdoggg___KimKardashian.json'

In [4]:
# Read in json file
tweets_data = []
tweets_file = open(filename, "r")
for line in tweets_file:
    try:
        tweet = json.loads(line)
        tweets_data.append(tweet)
    except:
        continue

In [7]:
print 'total number of tweets in dataset:', len(tweets_data)

total number of tweets in dataset: 1523


In [12]:
print 'here is info from the first tweet:'
tweets_data[0]

here is info from the first tweet:


{u'contributors': None,
 u'coordinates': None,
 u'created_at': u'Wed Oct 05 23:14:59 +0000 2016',
 u'entities': {u'hashtags': [],
  u'symbols': [],
  u'urls': [],
  u'user_mentions': [{u'id': 215952307,
    u'id_str': u'215952307',
    u'indices': [3, 12],
    u'name': u'James Corden',
    u'screen_name': u'JKCorden'},
   {u'id': 25365536,
    u'id_str': u'25365536',
    u'indices': [40, 54],
    u'name': u'Kim Kardashian West',
    u'screen_name': u'KimKardashian'}]},
 u'favorite_count': 0,
 u'favorited': False,
 u'filter_level': u'low',
 u'geo': None,
 u'id': 783807726799421441,
 u'id_str': u'783807726799421441',
 u'in_reply_to_screen_name': None,
 u'in_reply_to_status_id': None,
 u'in_reply_to_status_id_str': None,
 u'in_reply_to_user_id': None,
 u'in_reply_to_user_id_str': None,
 u'is_quote_status': False,
 u'lang': u'en',
 u'place': None,
 u'retweet_count': 0,
 u'retweeted': False,
 u'retweeted_status': {u'contributors': None,
  u'coordinates': None,
  u'created_at': u'Mon Oct 03 

In [97]:
# Load relevant info into a pandas data frame

tweets = pd.DataFrame()

# Text of the tweet
tweets['text'] = map(lambda tweet: tweet['text'], tweets_data)

# Whether or not the tweet was retweeted
tweets['retweeted'] = map(lambda tweet: tweet['retweeted'], tweets_data)

# Username of person who tweeted it
tweets['username'] = map(lambda tweet: tweet['user']['screen_name'], tweets_data)

# Whether the account is verified
tweets['verified'] = map(lambda tweet: tweet['user']['verified'], tweets_data)

# List of hashtags used
tweets['hashtags'] = map(lambda tweet: tweet['entities']['hashtags'], tweets_data)

# List of users mentioned in the tweet
tweets['user_mentions'] = map(lambda tweet: [tweet['entities']['user_mentions'][i]['screen_name'] for i in N.arange(len(tweet['entities']['user_mentions']))], tweets_data)

# Whether the tweet has been favorited
tweets['favorited'] = map(lambda tweet: tweet['favorited'], tweets_data)

# Time stamp of tweet
tweets['timestamp'] = map(lambda tweet: tweet['created_at'], tweets_data)

# Source of tweet (iPhone, Android, etc)
tweets['source'] = map(lambda tweet: tweet['source'], tweets_data)

# Language
tweets['language'] = map(lambda tweet: tweet['lang'], tweets_data)

In [98]:
# Do some editing of the tweet text:

# Make all letters lowercase
tweets['text'] = tweets['text'].str.lower()

# Remove all URLs
tweets['text'] = tweets['text'].str.replace('https?:.*', '')

# Strip all whitespace
tweets['text'] = tweets['text'].str.strip()


In [99]:
print 'Percent of tweets in the dataset that are directed at Kim K:', N.sum(['KimKardashian' in list for list in tweets['user_mentions']]) / 1523.

Percent of tweets in the dataset that are directed at Kim K: 0.923834537098


In [100]:
# Load in some lists of positive words, negative words, curse words, etc. These might be useful for identifying trolling tweets

negative_words = N.loadtxt('data/opinion-lexicon-English/negative-words.txt', dtype=str, comments=';').tolist()
positive_words = N.loadtxt('data/opinion-lexicon-English/positive-words.txt', dtype=str, comments=';').tolist()
curse_words1 = N.loadtxt('data/opinion-lexicon-English/curse-words.txt', dtype=str, delimiter='\n').tolist()
curse_words2 = N.loadtxt('data/opinion-lexicon-English/swearWords.txt', dtype=str, delimiter='\n').tolist()
bad_words = N.loadtxt('data/opinion-lexicon-English/really_bad_words.txt', dtype =str, delimiter='\n').tolist()

In [101]:
total_pattern = ''
for pattern in bad_words:
    first_char = pattern[0]
    num_stars = len(pattern)-1
    newstring = first_char + '*'*num_stars
    print 'number of tweets containing the word %s:'%newstring, \
    tweets['text'].str.contains(pattern).sum()
    total_pattern = total_pattern + pattern + '|'

number of tweets containing the word b****: 63
number of tweets containing the word w****: 14
number of tweets containing the word c***: 1
number of tweets containing the word f***: 50
number of tweets containing the word s***: 5
number of tweets containing the word a******: 7
number of tweets containing the word n*****: 6
number of tweets containing the word f*****: 0


In [102]:
sep = '|'
total_pattern = sep.join(bad_words)
print 'Total number of tweets using the', len(bad_words), 'bad words', N.sum(tweets['text'].str.contains(total_pattern))

Total number of tweets using the 8 bad words 122


In [103]:
#one option is to make a list of bad users and label all of their tweets as trolling
bad_users = N.unique(tweets['username'][tweets['text'].str.contains(sep.join([bad_words[2], bad_words[6]]))])
print 'These are definitely trolls in this dataset:', bad_users

These are definitely trolls in this dataset: [u'DashtoKrap' u'GameChanger6371' u'InfamousSOB247' u'KIMisisSTUPID']


In [104]:
#is this one a troll?
tweets.loc[19, 'text']

u'@kimkardashian is too stupid to learn her lesson from robbery - watch her turn it to an episode of kuwtk -'

In [105]:
# Make a new column classifying all tweets as trolling or not
# Initialize as false. We will go through and label the trolling ones afterward.
tweets['is_troll'] = False

In [106]:
# Mark these users' tweets as trolling
for user in bad_users:
    tweets.loc[tweets['username']==user, 'is_troll'] = True    

In [107]:
print 'Between these', len(bad_users), 'users there are', N.sum(tweets['is_troll']==True), 'tweets in the dataset'

Between these 4 users there are 254 tweets in the dataset


In [110]:
# go through any tweets containing swear words
cwstring = sep.join(curse_words2)

# Get a list of tweets of non-trolls that contain swear words
# Also require the tweets are in English
# there are a bunch of tweets with the word petition which are being classified as trolling because they contain "tit"... exclude those
possible_trolls = tweets.loc[(tweets['is_troll']==False) & (tweets['text'].str.contains(cwstring)) & (tweets['text'].str.contains('petition')==False) & (tweets['language'] == 'en'), 'text']
print 'Number of tweets to classify manually:', len(possible_trolls)

Number of tweets to classify manually: 76


In [111]:
tweets.iloc[possible_trolls.index[0]]


text             rt @jibzonlinemmg: jibz scrilla talks about @k...
retweeted                                                    False
username                                                 babyy_xoo
verified                                                     False
hashtags         [{u'indices': [87, 90], u'text': u'np'}, {u'in...
user_mentions           [JibzOnlineMMG, KimKardashian, NICKIMINAJ]
favorited                                                    False
timestamp                           Wed Oct 05 23:18:47 +0000 2016
source           <a href="http://twitter.com/download/android" ...
language                                                        en
is_troll                                                     False
Name: 50, dtype: object

In [117]:
tweets.loc[0, 'is_troll']

False

In [118]:
# Go through the possible trolls and classify tweets as trolling (1) or not (0)
for i in N.arange(len(possible_trolls)):
    print possible_trolls.iloc[i]
    s = input('troll? [0/1]')
    if (N.int(s)==1):
        tweets.loc[possible_trolls.index[i], 'is_troll'] = True
    clear_output()
    
    



In [120]:
print 'Total number of trolling tweets:', N.sum(tweets['is_troll']==True)

Total number of trolling tweets: 274


In [122]:
# Save the 'is_troll' column so we can use it for our classification algorithm

N.save('/Users/nuala/Documents/Research/Code/repos/twitter-trolls/data/training_set_classification', tweets['is_troll'])