In [1]:
import os

import jsonlines
import pandas as pd
import requests
from requests_oauthlib import OAuth1

**PART 1**

In [12]:
# Twitter API credentials. Each value is read from an environment variable so
# that real secrets do not have to be saved inside the notebook; the original
# placeholder string is used whenever the variable is not set.
API_KEY = os.environ.get('TWITTER_API_KEY', 'enter_api_key')
API_SECRET = os.environ.get('TWITTER_API_SECRET', 'enter_api_secret_key')
USER_OAUTH_TOKEN = os.environ.get('TWITTER_ACCESS_TOKEN', 'enter_access_token')
USER_OAUTH_TOKEN_SECRET = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', 'enter_access_token_secret')
# Screen name of the account whose timeline is fetched
SCREEN_NAME = 'midasIIITD'

Get all the tweets made by the user. The Twitter API returns at most 200 tweets per request, so this function pages backwards through the timeline: each request asks only for tweets older than the oldest tweet already fetched (using `max_id`), and the loop stops once a request shows that there are no more tweets to retrieve.

In [13]:
def get_user_tweets(handle, auth):
    """Fetch every retrievable tweet from a user's timeline.

    The timeline endpoint is queried page by page. After each page the
    ``max_id`` parameter is set to one less than the id of the last tweet
    received, so the next request only returns older tweets.

    Args:
        handle: Screen name of the account to fetch.
        auth: Authentication object passed straight to ``requests.get``.

    Returns:
        A list with the decoded tweet objects of all pages, in the order
        they were returned by the API.

    Raises:
        requests.HTTPError: If a request comes back with an error status.
    """
    url = 'https://api.twitter.com/1.1/statuses/user_timeline.json'
    params = {
        'screen_name': handle,
        'tweet_mode': 'extended',  # extended mode to get full-text and all images
        'count': 200,  # maximum number of tweets that can be requested at a time
    }
    final_data = []
    while True:
        response = requests.get(url, auth=auth, params=params)
        # Fail loudly instead of mixing an error payload into the tweet list
        response.raise_for_status()
        page = response.json()
        # `count` is only an upper bound, so a short page does not prove the
        # timeline is exhausted; only an empty page does.
        if not page:
            break
        final_data.extend(page)
        # Ask for tweets strictly older than the last one on this page
        params['max_id'] = page[-1]['id'] - 1
    return final_data

**Fetch all the tweets posted by the @midasIIITD Twitter handle and dump the responses into a JSON Lines file**

In [14]:
# OAuth1 signer built from the credentials configured above
auth = OAuth1(API_KEY, API_SECRET, USER_OAUTH_TOKEN, USER_OAUTH_TOKEN_SECRET)

# Check the validity of the user credentials; the response object is printed
url = 'https://api.twitter.com/1.1/account/verify_credentials.json'
verify_response = requests.get(url, auth=auth)
print(verify_response)

# Download the timeline of the configured account
data = get_user_tweets(SCREEN_NAME, auth)

# Dump the data into the JSON Lines file. The whole list of tweets is written
# as a single record, i.e. one line holding the full list.
with jsonlines.open('midas_tweets.jsonl', mode='w') as writer:
    writer.write(data)

<Response [200]>


**PART 2**

This function counts the images in each tweet. It looks for media in `extended_entities` first and falls back to `entities`, and it only counts media whose type is `photo`, because other media types (such as GIFs or videos) should not be taken into account.

In [33]:

def _count_photo_media(media_items):
    """Return how many entries of a tweet's media list have type 'photo'."""
    return sum(1 for item in media_items if item.get('type') == 'photo')


def get_images_count(data):
    """Count the photos attached to each tweet.

    For every tweet the media list under ``extended_entities`` is used when it
    exists; otherwise the media list under ``entities`` is used. Only media
    whose ``type`` is ``'photo'`` is counted.

    Args:
        data: Sequence of tweet dictionaries.

    Returns:
        A list with one entry per tweet, in input order: the number of photos
        when a media list was found (this can be 0), or the string ``'None'``
        when the tweet has no media list in either place.
    """
    images_count = []
    for tweet in data:
        # `or {}` also covers a key that is present but holds None
        extended_media = (tweet.get('extended_entities') or {}).get('media')
        if extended_media is not None:
            images_count.append(_count_photo_media(extended_media))
            continue
        # Fall back to the plain entities when no extended media is available
        basic_media = (tweet.get('entities') or {}).get('media')
        if basic_media is not None:
            images_count.append(_count_photo_media(basic_media))
        else:
            # Sentinel kept as the string 'None' so existing callers see the
            # same values as before
            images_count.append('None')
    return images_count

In [34]:
# Read the data from the JSON Lines file and show it in tabular format.
# The dump cell stores the whole list of tweets as one record; a file with one
# tweet per line is accepted as well. Starting from an empty list guarantees
# that no stale `data` from an earlier cell is reused when the file is empty.
data = []
with jsonlines.open('midas_tweets.jsonl') as reader:
    for obj in reader:
        if isinstance(obj, list):
            data.extend(obj)
        else:
            data.append(obj)

images_count = pd.Series(get_images_count(data))
df = pd.DataFrame(data)
# Build the trimmed table in one chain so that no column slice is renamed in
# place (which can trigger pandas' SettingWithCopyWarning); `df` is untouched.
df2 = (
    df.assign(images_count=images_count.values)
    [['created_at', 'full_text', 'favorite_count', 'retweet_count', 'images_count']]
    .rename(columns={'created_at': 'data_and_time'})
)
df2

Unnamed: 0,data_and_time,full_text,favorite_count,retweet_count,images_count
0,Fri Mar 29 19:43:24 +0000 2019,RT @isarth23: Thanks for the support and help ...,0,2,
1,Fri Mar 29 17:16:40 +0000 2019,"Since SemEval-2019 will be held June 6-7, 2019...",9,1,
2,Fri Mar 29 17:04:30 +0000 2019,+@aggarwal_kartik.\nCongrats! Wish you many mo...,2,0,
3,Fri Mar 29 17:03:29 +0000 2019,RT @aggarwal_kartik: Our work (@midasIIITD ) a...,0,1,
4,Fri Mar 29 17:02:24 +0000 2019,"Congratulations! @midasIIITD team, @isarth23 @...",8,1,
5,Fri Mar 29 05:35:22 +0000 2019,@EEMLcommunity @radamihalcea too many deadline...,0,0,
6,Thu Mar 28 16:55:01 +0000 2019,RT @stanfordnlp: CS224N Natural Language Proce...,0,619,
7,Thu Mar 28 16:54:37 +0000 2019,RT @ylecun: Learn PyTorch by running on Google...,0,153,
8,Wed Mar 27 16:09:09 +0000 2019,Dr. Vineeth N Balasubramanian will present a T...,4,1,1
9,Wed Mar 27 11:53:40 +0000 2019,RT @ylecun: I am extremely honored to be the r...,0,1506,


In [37]:
# Display up to the first 50 rows of the trimmed tweet table
df2.head(50)

Unnamed: 0,data_and_time,full_text,favorite_count,retweet_count,images_count
0,Fri Mar 29 19:43:24 +0000 2019,RT @isarth23: Thanks for the support and help ...,0,2,
1,Fri Mar 29 17:16:40 +0000 2019,"Since SemEval-2019 will be held June 6-7, 2019...",9,1,
2,Fri Mar 29 17:04:30 +0000 2019,+@aggarwal_kartik.\nCongrats! Wish you many mo...,2,0,
3,Fri Mar 29 17:03:29 +0000 2019,RT @aggarwal_kartik: Our work (@midasIIITD ) a...,0,1,
4,Fri Mar 29 17:02:24 +0000 2019,"Congratulations! @midasIIITD team, @isarth23 @...",8,1,
5,Fri Mar 29 05:35:22 +0000 2019,@EEMLcommunity @radamihalcea too many deadline...,0,0,
6,Thu Mar 28 16:55:01 +0000 2019,RT @stanfordnlp: CS224N Natural Language Proce...,0,619,
7,Thu Mar 28 16:54:37 +0000 2019,RT @ylecun: Learn PyTorch by running on Google...,0,153,
8,Wed Mar 27 16:09:09 +0000 2019,Dr. Vineeth N Balasubramanian will present a T...,4,1,1.0
9,Wed Mar 27 11:53:40 +0000 2019,RT @ylecun: I am extremely honored to be the r...,0,1506,
