# Twitter Social Network
I collected about 5000 tweets on a topic (e.g. Trump). The script for collecting twitter data comes from http://www.karambelkar.info/2015/01/how-to-use-twitters-search-rest-api-most-effectively./. Then I wrote a script that parses through the tweets and does the following: 

Any retweet (RT), mention or reply should result in an arrow from the person retweeting to the person retweeted, mentioned or replied to. Create a three-column .CSV file that contains Column 1 (person who retweets, if N/A then original tweet author), Column 2 (original tweet author), Column 3 (type of content, e.g. tweet or RT). The result will allow social network analysis tools to take the first two columns and draw arrows from the user in the left column to the one in the right, creating a network.

In [4]:
import tweepy

# Application-only authentication; the two placeholders are the consumer
# key and consumer secret.
auth = tweepy.AppAuthHandler('__', '__') # replace with your own

# Let tweepy sleep (and report it) when the rate limit is hit instead of
# raising an error.
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

if not api:
    # `sys` is only imported by a later cell, so import it here to make
    # this exit path work when the cell is run on its own.
    import sys

    print("Can't Authenticate")
    sys.exit(-1)

In [5]:
import sys
import jsonpickle
import os

searchQuery = '#Trump'
maxTweets = 6000
tweetsPerQry = 100  # this is the max the API permits
fName = 'tweets_2.txt'

# If results from a specific ID onwards are required, set sinceId to that ID.
# Otherwise default to no lower limit and go as far back as the API allows.
sinceId = None

# If only results below a specific ID are required, set max_id to that ID.
# Otherwise default to no upper limit and start from the most recent tweet
# matching the search query.
# (A plain int: the Python 2 long literal `-1L` is a SyntaxError on Python 3.)
max_id = -1

tweetCount = 0
print("Downloading max {0} tweets".format(maxTweets))
with open(fName, 'w') as f:
    while tweetCount < maxTweets:
        # Assemble the query once; the optional bounds are only sent when set.
        search_kwargs = {'q': searchQuery, 'count': tweetsPerQry}
        if max_id > 0:
            # Page backwards: ask only for tweets older than the last one seen.
            search_kwargs['max_id'] = str(max_id - 1)
        if sinceId:
            search_kwargs['since_id'] = sinceId

        try:
            new_tweets = api.search(**search_kwargs)
        except tweepy.TweepError as e:
            # Just exit if any error
            print("some error : " + str(e))
            break

        if not new_tweets:
            print("No more tweets found")
            break

        # One JSON document per line.
        for tweet in new_tweets:
            f.write(jsonpickle.encode(tweet._json, unpicklable=False) +
                    '\n')
        tweetCount += len(new_tweets)
        print("Downloaded {0} tweets".format(tweetCount))
        # The last tweet of the page becomes the upper bound of the next page.
        max_id = new_tweets[-1].id

print("Downloaded {0} tweets, Saved to {1}".format(tweetCount, fName))

Downloading max 6000 tweets
Downloaded 100 tweets
Downloaded 200 tweets
Downloaded 300 tweets
Downloaded 397 tweets
Downloaded 497 tweets
Downloaded 597 tweets
Downloaded 695 tweets
Downloaded 795 tweets
Downloaded 891 tweets
Downloaded 991 tweets
Downloaded 1091 tweets
Downloaded 1191 tweets
Downloaded 1291 tweets
Downloaded 1391 tweets
Downloaded 1491 tweets
Downloaded 1573 tweets
Downloaded 1673 tweets
Downloaded 1773 tweets
Downloaded 1867 tweets
Downloaded 1957 tweets
Downloaded 2054 tweets
Downloaded 2140 tweets
Downloaded 2240 tweets
Downloaded 2340 tweets
Downloaded 2440 tweets
Downloaded 2540 tweets
Downloaded 2640 tweets
Downloaded 2740 tweets
Downloaded 2840 tweets
Downloaded 2940 tweets
Downloaded 3040 tweets
Downloaded 3140 tweets
Downloaded 3240 tweets
Downloaded 3340 tweets
Downloaded 3440 tweets
Downloaded 3540 tweets
Downloaded 3637 tweets
Downloaded 3736 tweets
Downloaded 3836 tweets
Downloaded 3936 tweets
Downloaded 4036 tweets
Downloaded 4136 tweets
Downloaded 4236 

## Convert twitter data to dataframe

In [2]:
try:
    import json
except ImportError:
    import simplejson as json

tweets_filename = 'trumptweets.txt'

# Column-oriented accumulator: one list per output column, all of which
# must stay the same length so they can be turned into a DataFrame.
data = {'user_id': [], 'text': [], 'screen_name': [], 'created_at': [],
        'retweet_count': [], 'favorite_count': [],
        'friends_count': [], 'followers_count': []}

# The context manager guarantees the file is closed once it has been read.
with open(tweets_filename, "r") as tweets_file:
    for line in tweets_file:
        try:
            # Read in one line of the file, convert it into a json object
            tweet = json.loads(line.strip())
            if 'text' not in tweet:
                # only a message that contains a 'text' field is a tweet
                continue

            # Extract every field BEFORE touching `data`, so that a record
            # with a missing key is skipped as a whole rather than leaving
            # the column lists with unequal lengths.
            user = tweet['user']
            record = {
                'user_id': user['id'],
                'text': tweet['text'],
                'screen_name': user['screen_name'],
                'created_at': tweet['created_at'],
                'retweet_count': tweet['retweet_count'],
                'favorite_count': tweet['favorite_count'],
                'friends_count': user['friends_count'],
                'followers_count': user['followers_count'],
            }
        except (ValueError, KeyError, TypeError):
            # The line is not valid JSON (ValueError) or does not have the
            # expected tweet structure (KeyError / TypeError): skip it.
            continue

        for column, value in record.items():
            data[column].append(value)

In [3]:
# take 5000 unique user/tweet
import pandas as pd
# Build the frame, drop rows that repeat an earlier (screen_name, text)
# pair, and keep at most the first 5000 remaining rows.
df = (
    pd.DataFrame(data)
    .drop_duplicates(subset=['screen_name', 'text'])
    .iloc[:5000]
)
len(df.index)

5000

In [7]:
# Keep only characters whose code point is below 128 (plain ASCII);
# everything else is dropped from the tweet text.
df['text'] = df['text'].map(
    lambda tweet_text: "".join([ch for ch in tweet_text if ord(ch) < 128])
)

## Create csv file

In [8]:
import re

# if retweet, create row for original tweet and retweet
def check_RT(row):
    """Return the two edges described by a retweet, or None.

    A retweet is a text that starts with ``RT`` followed by whitespace and
    an ``@handle``.  The handle is taken from the ``@`` up to the first
    non-word character, so it is extracted correctly whether or not it is
    followed by a colon.

    Args:
        row: mapping-like object (e.g. a DataFrame row) providing the keys
            'text' and 'screen_name'.

    Returns:
        None when the text is not a retweet, otherwise a 6-tuple
        ``(author, author, 'Tweet', retweeter, author, 'RT')``: the first
        three items describe the original tweet, the last three describe
        the retweet.
    """
    # re.match anchors at the start of the text; a bare "RT" with nothing
    # after it simply does not match instead of raising.
    found = re.match(r'RT\s+@(\w+)', row['text'])
    if found is None:
        return None
    author = found.group(1)
    return author, author, 'Tweet', row['screen_name'], author, 'RT'

# if reply, create row for reply
def check_reply(row):
    """Return the edge described by a reply, or None.

    A reply is a text that starts with an ``@handle``.  The handle is taken
    from the ``@`` up to the first non-word character, so trailing
    punctuation is not part of it, and a bare ``@`` is not a reply.

    Args:
        row: mapping-like object (e.g. a DataFrame row) providing the keys
            'text' and 'screen_name'.

    Returns:
        None when the text is not a reply, otherwise the 3-tuple
        ``(replier, replied_to, 'Reply')``.
    """
    # re.match anchors at the start of the text.
    found = re.match(r'@(\w+)', row['text'])
    if found is None:
        return None
    return row['screen_name'], found.group(1), 'Reply'

# if mention, create row for mention
def check_mention(row):
    """Return the mentions found in a tweet, or None.

    A tweet counts as containing mentions when at least one of its
    whitespace-separated words starts with ``@``.  In that case every
    ``@handle`` occurrence in the text is collected.

    Args:
        row: mapping-like object (e.g. a DataFrame row) providing the keys
            'text' and 'screen_name'.

    Returns:
        None when no word starts with ``@``, otherwise the 3-tuple
        ``(author, handles, 'Mention')`` where ``handles`` is a single
        string with the mentioned handles joined by ``', '``.  It is the
        empty string when a word starts with ``@`` but no handle follows.
    """
    text = row['text']
    if not any(word.startswith('@') for word in text.split()):
        return None
    handles = ", ".join(re.findall(r'@(\w+)', text))
    return row['screen_name'], handles, 'Mention'

# Run each classifier over every row, spread the returned tuples over
# columns, and drop the rows that came back entirely empty.
rt, reply, mention = (
    df.apply(classifier, axis=1).apply(pd.Series).dropna(how='all')
    for classifier in (check_RT, check_reply, check_mention)
)

In [9]:
rt.head()

Unnamed: 0,0,1,2,3,4,5
2,TrumpSuperPAC,TrumpSuperPAC,Tweet,scks386,TrumpSuperPAC,RT
3,LyndaKinkade,LyndaKinkade,Tweet,99_treble,LyndaKinkade,RT
5,LVNancy,LVNancy,Tweet,KevinCrabtree1,LVNancy,RT
6,StopTrump2020,StopTrump2020,Tweet,NancyRGold,StopTrump2020,RT
7,bocavista2016,bocavista2016,Tweet,newheart4sandy,bocavista2016,RT


In [10]:
reply.head()

Unnamed: 0,0,1,2
1,jabbaoolie,realDonaldTrump,Reply
28,Isabellarowling,DalaiLama,Reply
59,Lwbayfront,Joy_Villa,Reply
64,HullDockster,NIHAustin,Reply
72,Geoff_InBoston,Joy_Villa,Reply


In [11]:
import numpy as np
import itertools

# Drop rows whose mention list (column 1) is blank or missing.
# The filter is built from a new Series and applied to the frame itself;
# calling an in-place method on the `mention[1]` selection would act on a
# temporary object and is not guaranteed to update `mention`.
has_target = mention[1].replace('', np.nan).notnull()
mention = mention[has_target]

# reformat mentions dataframe so each mention gets its own row
w = mention[1].str.split(', ')                          # list of handles per row
c = w.map(len)                                          # number of handles per row
idx = np.repeat(c.index, c.values)                      # row label repeated once per handle
words = list(itertools.chain.from_iterable(w.values))   # all handles, flattened
s = pd.Series(words, index=idx)
s.name = "words"
# Joining on the index duplicates each original row once per handle.
mention = mention.join(s)[[0,'words',2]]
mention.columns = ['0','1','2']
mention.head()

Unnamed: 0,0,1,2
1,jabbaoolie,realDonaldTrump,Mention
2,scks386,TrumpSuperPAC,Mention
3,99_treble,LyndaKinkade,Mention
3,99_treble,Deanofcomedy,Mention
3,99_treble,cnni,Mention


In [13]:
# more reformatting for RT and reply - currently each observation is next to each other, so extract observations and make them into one column
def transform_layout(df):
    """Stack consecutive groups of three columns on top of each other.

    Args:
        df: DataFrame whose column labels are the integers 0, 1, 2, ... and
            whose column count is a multiple of three.

    Returns:
        A new DataFrame with the columns '0', '1', '2'.  It holds the rows
        of columns 0-2, followed by the rows of columns 3-5, and so on.
        The original row labels are kept, so they repeat once per group.
        An empty DataFrame is returned when `df` has no columns.
    """
    groups = []
    for start in range(0, len(df.columns), 3):
        # Selecting with a list yields a new frame, so relabelling its
        # columns leaves the caller's frame untouched.
        group = df[[start, start + 1, start + 2]]
        group.columns = ['0', '1', '2']
        groups.append(group)
    if not groups:
        # pd.concat refuses an empty list.
        return pd.DataFrame()
    # pd.concat replaces the repeated DataFrame.append calls, which were
    # removed in pandas 2.0.
    return pd.concat(groups)

# Stack the retweet, reply and mention edges into one frame with a fresh
# 0..n-1 index.  pd.concat is used because DataFrame.append was removed in
# pandas 2.0.
three_col_file = pd.concat(
    [transform_layout(rt), transform_layout(reply), mention],
    ignore_index=True,
)
three_col_file

Unnamed: 0,0,1,2
0,TrumpSuperPAC,TrumpSuperPAC,Tweet
1,LyndaKinkade,LyndaKinkade,Tweet
2,LVNancy,LVNancy,Tweet
3,StopTrump2020,StopTrump2020,Tweet
4,bocavista2016,bocavista2016,Tweet
5,noclador,noclador,Tweet
6,LVNancy,LVNancy,Tweet
7,GaltsGultch,GaltsGultch,Tweet
8,FoxBusiness,FoxBusiness,Tweet
9,bocavista2016,bocavista2016,Tweet


In [24]:
# output to csv
# index=False keeps the row index out of the file, so it holds exactly the
# three data columns and the first two are the source and target users.
three_col_file.to_csv('Three_Column_File.csv', index=False)