# Extracting Twitter Data

* The dataset only provides the tweet id with the emotion label. This code below extracts the text data for each tweet id.
* final dataset is smaller since it seems like some tweets were deleted after the dataset was created

## Dataset information:

Paper: http://knoesis.org/sites/default/files/wenbo_socialcom_2012_0.pdf

Data: http://knoesis.org/projects/emotion


In [2]:
#Importing libraries
import tweepy
import pandas as pd
import datetime
import time

__Importing Data__

In [2]:
filelist = ['./data/test.txt','./data/dev.txt','./data/train_1.txt','./data/train_2_1.txt','./data/train_2_10.txt','./data/train_2_2.txt','./data/train_2_3.txt','./data/train_2_4.txt','./data/train_2_5.txt','./data/train_2_6.txt','./data/train_2_7.txt','./data/train_2_8.txt','./data/train_2_9.txt']
df = pd.concat([pd.read_csv(item , sep = '\t' , header = None , names = ['tweet_id','emotion']) for item in filelist], axis=0)
df.reset_index(inplace = True)
df.drop(columns = 'index', inplace = True)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9718024 entries, 0 to 9718023
Data columns (total 2 columns):
tweet_id    int64
emotion     object
dtypes: int64(1), object(1)
memory usage: 148.3+ MB


In [4]:
df.drop_duplicates(keep = 'first', inplace = True)
df.reset_index(inplace = True)
df.drop(columns = 'index', inplace = True)

In [5]:
df.shape

(2488982, 2)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2488982 entries, 0 to 2488981
Data columns (total 2 columns):
tweet_id    int64
emotion     object
dtypes: int64(1), object(1)
memory usage: 38.0+ MB


In [7]:
df.emotion.value_counts()

joy             706182
sadness         616471
anger           574170
love            301759
fear            135154
thankfulness    131340
surprise         23906
Name: emotion, dtype: int64

## Subsetting by emotion

* This will help to run the extract in batches

In [8]:
sad = df.loc[df.emotion == 'sadness']
sad.shape

(616471, 2)

In [9]:
joy = df.loc[df.emotion == 'joy']
joy.shape

(706182, 2)

In [10]:
anger = df.loc[df.emotion == 'anger']
anger.shape

(574170, 2)

In [11]:
fear = df.loc[df.emotion == 'fear']
fear.shape

(135154, 2)

In [12]:
surprise = df.loc[df.emotion == 'surprise']
surprise.shape

(23906, 2)

In [13]:
thankfulness = df.loc[df.emotion == 'thankfulness']
thankfulness.shape

(131340, 2)

In [14]:
love = df.loc[df.emotion == 'love']
love.shape

(301759, 2)

## Setting up Twitter API

* credentials removed since this code is being shared in github

In [15]:
# Variables that contains the credentials to access Twitter API
ACCESS_TOKEN =  ''
ACCESS_SECRET = ''
CONSUMER_KEY = ''
CONSUMER_SECRET = ''


# Setup access to API
def connect_to_twitter_OAuth():
    auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)

    api = tweepy.API(auth)
    return api

# Create API object
api = connect_to_twitter_OAuth()

__Get Tweet posts by Tweet ID__

Twitter Reference: https://developer.twitter.com/en/docs/tweets/search/api-reference/get-search-tweets

In [16]:
def get_tweets(category,label):
    """This helper function to download tweet posts based on ID
    each request pulls 100 id's, the max amount of request is 450 every 15 minutes
    will process 45k id's then will wait 15 mins and will continue.
    
    Inputs: category->the dataset that contains the tweet ids you want to download
            label-> the name of the emotion corresponding to those tweets
    """
    
    e = list(category.tweet_id)
    data = []
    total = len(e)
    start = 0
    end = 100
    counter = 0
    e_batch = e[start:end]
    st = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
    print(st,'- extracting ',label)
    while total>0:
        status = api.statuses_lookup(e_batch)
        count = list(range(0,len(status)))
        d_counter1 = len(data)         
        for x in count:
            post = status[x]
            tweet = post._json
            data.append({'id': tweet['id'] , 'text': tweet['text']})
        d_counter2 = len(data) 
        delta = d_counter2 - d_counter1
        counter += delta
        start += 100
        end += 100
        total -= 100
        e_batch = e[start:end]      
        #when counter is 45k or more it will stop for 15 mins, then the counter is restarted
        if counter >= 45000:
            st = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
            print('records so far:', len(data))
            #wait 15 mins
            st = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
            print(st,'- waiting 15 mins for next',label,'batch...')
            time.sleep(900)
            counter = 0
            st = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
            print(st,'restarted counter:',counter)
    edf = pd.DataFrame(data)
    name = './data/'+label+'.csv'
    st = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')
    print(st,'done with',label,'batch.')
    edf.to_csv(name)           

In [21]:
get_tweets(joy,'joy')

2019-11-06 17:10:18 - extracting  joy
records so far: 45028
counter = , 45028
2019-11-06 17:10:18 - waiting 15 mins for next joy batch...
restarted counter: 0
records so far: 90081
counter = , 45053
2019-11-06 17:10:18 - waiting 15 mins for next joy batch...
restarted counter: 0
records so far: 135092
counter = , 45011
2019-11-06 17:10:18 - waiting 15 mins for next joy batch...
restarted counter: 0
records so far: 180127
counter = , 45035
2019-11-06 17:10:18 - waiting 15 mins for next joy batch...
restarted counter: 0
records so far: 225167
counter = , 45040
2019-11-06 17:10:18 - waiting 15 mins for next joy batch...
restarted counter: 0
records so far: 270198
counter = , 45031
2019-11-06 17:10:18 - waiting 15 mins for next joy batch...
restarted counter: 0
records so far: 315230
counter = , 45032
2019-11-06 17:10:18 - waiting 15 mins for next joy batch...
restarted counter: 0
records so far: 360269
counter = , 45039
2019-11-06 17:10:18 - waiting 15 mins for next joy batch...
restarted

In [23]:
time.sleep(900)
get_tweets(surprise,'surprise')

2019-11-06 20:32:57 - extracting  surprise
2019-11-06 20:35:15 done with surprise batch.


In [None]:
time.sleep(900)
get_tweets(thankfulness,'thankfulness')

2019-11-06 20:50:15 - extracting  thankfulness
records so far: 45054
2019-11-06 20:57:32 - waiting 15 mins for next thankfulness batch...
2019-11-06 21:12:32 restarted counter: 0
2019-11-06 21:18:00 done with thankfulness batch.


In [None]:
time.sleep(900)
get_tweets(fear,'fear')

2019-11-06 21:33:00 - extracting  fear
records so far: 45039
2019-11-06 21:40:56 - waiting 15 mins for next fear batch...
2019-11-06 21:55:56 restarted counter: 0
2019-11-06 22:00:41 done with fear batch.


In [17]:
get_tweets(love,'love')

2019-11-06 22:35:30 - extracting  love
records so far: 45053
2019-11-06 22:43:21 - waiting 15 mins for next love batch...
2019-11-06 22:58:21 restarted counter: 0
records so far: 90056
2019-11-06 23:06:08 - waiting 15 mins for next love batch...
2019-11-06 23:21:08 restarted counter: 0
records so far: 135091
2019-11-06 23:28:37 - waiting 15 mins for next love batch...
2019-11-06 23:43:37 restarted counter: 0
2019-11-06 23:49:20 done with love batch.


In [19]:
get_tweets(anger,'anger')

2019-11-07 01:21:00 - extracting  anger
records so far: 45019
2019-11-07 01:28:18 - waiting 15 mins for next anger batch...
2019-11-07 01:43:18 restarted counter: 0
records so far: 90042
2019-11-07 01:50:35 - waiting 15 mins for next anger batch...
2019-11-07 02:05:35 restarted counter: 0
records so far: 135087
2019-11-07 02:13:17 - waiting 15 mins for next anger batch...
2019-11-07 02:28:17 restarted counter: 0
records so far: 180117
2019-11-07 02:36:03 - waiting 15 mins for next anger batch...
2019-11-07 02:51:03 restarted counter: 0
records so far: 225153
2019-11-07 02:58:51 - waiting 15 mins for next anger batch...
2019-11-07 03:13:51 restarted counter: 0
records so far: 270179
2019-11-07 03:21:58 - waiting 15 mins for next anger batch...
2019-11-07 03:36:58 restarted counter: 0
2019-11-07 03:40:40 done with anger batch.


In [20]:
time.sleep(900)
get_tweets(sad,'sadness')

2019-11-07 14:04:59 - extracting  sadness
records so far: 45044
2019-11-07 14:13:13 - waiting 15 mins for next sadness batch...
2019-11-07 14:28:13 restarted counter: 0
records so far: 90069
2019-11-07 14:35:14 - waiting 15 mins for next sadness batch...
2019-11-07 14:50:14 restarted counter: 0
records so far: 135088
2019-11-07 14:57:33 - waiting 15 mins for next sadness batch...
2019-11-07 15:12:33 restarted counter: 0
records so far: 180094
2019-11-07 15:19:46 - waiting 15 mins for next sadness batch...
2019-11-07 15:34:46 restarted counter: 0
records so far: 225098
2019-11-07 15:42:16 - waiting 15 mins for next sadness batch...
2019-11-07 15:57:16 restarted counter: 0
records so far: 270123
2019-11-07 16:05:20 - waiting 15 mins for next sadness batch...
2019-11-07 16:20:20 restarted counter: 0
records so far: 315131
2019-11-07 16:28:50 - waiting 15 mins for next sadness batch...
2019-11-07 16:43:50 restarted counter: 0
2019-11-07 16:46:47 done with sadness batch.
