In [1]:
import pandas as pd
import requests
import os
import json

In [2]:
import tweepy

In [3]:
from tweepy import OAuthHandler # Used for authentication
from tweepy import Cursor # Used to perform pagination

# Parsing Tweets Using Tweepy

Tweepy is a Python library for retrieving tweets through Twitter's API. To download tweets with it, we need three helper functions:
- An authentication function, which authenticates with the consumer key and access token that Twitter generates for developers; these credentials should never be shared publicly.
- A client function to interact with the Twitter API.
- A function that collects the tweets, which are returned in JSON format, and transfers them into a pandas DataFrame.

In [4]:
# Twitter API credentials.
# They are read from environment variables so that real keys never have to be
# saved inside the notebook. Each value falls back to an empty string when its
# variable is not set.
cons_key = os.environ.get('TWITTER_CONSUMER_KEY', '')
cons_secret = os.environ.get('TWITTER_CONSUMER_SECRET', '')
acc_token = os.environ.get('TWITTER_ACCESS_TOKEN', '')
acc_secret = os.environ.get('TWITTER_ACCESS_SECRET', '')

#creating function for authentication 
def get_auth():
    """
    Build the Tweepy OAuth handler from the module-level credentials.

    @return:
        - a tweepy.OAuthHandler created from cons_key / cons_secret, with the
          access token set from acc_token / acc_secret

    @raises:
        - ValueError: if any of the four credentials is empty
    """
    credentials = {
        'cons_key': cons_key,
        'cons_secret': cons_secret,
        'acc_token': acc_token,
        'acc_secret': acc_secret,
    }

    # Fail early with a readable message rather than building a handler from
    # blank credentials. (Plain variable reads cannot raise KeyError, so an
    # explicit emptiness check is the only way to detect unset values here.)
    missing = [name for name, value in credentials.items() if not value]
    if missing:
        raise ValueError(
            "Twitter credentials not set: {}".format(", ".join(missing))
        )

    auth = tweepy.OAuthHandler(cons_key, cons_secret)
    auth.set_access_token(acc_token, acc_secret)

    return auth

In [8]:
# Tweepy API client factory
def get_twitter_api():
    """
    Create the client used to talk to the Twitter API.

    @return:
        - a tweepy.API instance built from the handler returned by get_auth(),
          created with wait_on_rate_limit=True
    """
    return tweepy.API(get_auth(), wait_on_rate_limit=True)



In [9]:
# (3). Function creating final dataframe
def get_tweets_from_user(twitter_user_name, page_limit=16, count_tweet=200):
    """
    Download a user's timeline and return it as a dataframe, one row per tweet.

    @params:
        - twitter_user_name: the twitter username of a user (company, etc.)
        - page_limit: the total number of pages (max=16)
        - count_tweet: maximum number to be retrieved from a page
        
    @return
        - a dataframe with the columns date, author, twitter_name, text,
          number_of_likes and number_of_retweets. Rows whose text repeats an
          earlier row are dropped (the first occurrence is kept) and the
          remaining rows keep their original index labels. The columns are
          present even when no tweet was retrieved.
    """
    columns = [
        'date',
        'author',
        'twitter_name',
        'text',
        'number_of_likes',
        'number_of_retweets',
    ]

    client = get_twitter_api()

    pages = Cursor(client.user_timeline,
                   screen_name=twitter_user_name,
                   count=count_tweet).pages(page_limit)

    # Flatten every page into one list of plain records
    all_tweets = [
        {
            'date': tweet.created_at,
            'author': tweet.user.name,
            'twitter_name': tweet.user.screen_name,
            'text': tweet.text,
            'number_of_likes': tweet.favorite_count,
            'number_of_retweets': tweet.retweet_count,
        }
        for page in pages
        for tweet in page
    ]

    # Passing the column list explicitly keeps the schema stable: an empty
    # timeline yields an empty frame that still has all six columns.
    df = pd.DataFrame(all_tweets, columns=columns)

    # Remove duplicates if there are any
    return df.drop_duplicates(subset="text", keep='first')

In [10]:
# Fetch the @nytimes timeline using the default paging settings, then report
# how many rows and columns came back.
NYT = get_tweets_from_user(twitter_user_name="nytimes")
print("Data Shape: {}".format(NYT.shape))

Data Shape: (3197, 6)


In [11]:
# Preview the first five rows
NYT.head(n=5)

Unnamed: 0,date,author,twitter_name,text,number_of_likes,number_of_retweets
0,2022-02-14 09:29:07+00:00,The New York Times,nytimes,Russia has continued to strengthen its militar...,29,7
1,2022-02-14 08:30:11+00:00,The New York Times,nytimes,Remote work is giving people the chance to mov...,73,22
2,2022-02-14 08:17:05+00:00,The New York Times,nytimes,#Beijing2022 has been a lonely Olympics for mo...,130,14
3,2022-02-14 07:29:03+00:00,The New York Times,nytimes,Newcomers are flocking to a rural area in Tenn...,76,15
4,2022-02-14 07:07:06+00:00,The New York Times,nytimes,"RT @RaR: “Cover curling,” the editors said.\n\...",0,37


In [12]:
# Display the whole frame. The row labels are the ones assigned before
# duplicate texts were dropped, so they may contain gaps.
NYT

Unnamed: 0,date,author,twitter_name,text,number_of_likes,number_of_retweets
0,2022-02-14 09:29:07+00:00,The New York Times,nytimes,Russia has continued to strengthen its militar...,29,7
1,2022-02-14 08:30:11+00:00,The New York Times,nytimes,Remote work is giving people the chance to mov...,73,22
2,2022-02-14 08:17:05+00:00,The New York Times,nytimes,#Beijing2022 has been a lonely Olympics for mo...,130,14
3,2022-02-14 07:29:03+00:00,The New York Times,nytimes,Newcomers are flocking to a rural area in Tenn...,76,15
4,2022-02-14 07:07:06+00:00,The New York Times,nytimes,"RT @RaR: “Cover curling,” the editors said.\n\...",0,37
...,...,...,...,...,...,...
3194,2022-01-16 17:20:05+00:00,The New York Times,nytimes,Coverage of Novak Djokovic loomed large over t...,889,120
3195,2022-01-16 16:55:03+00:00,The New York Times,nytimes,For the generation who grew up in Beirut durin...,178,32
3196,2022-01-16 16:47:25+00:00,The New York Times,nytimes,"Russia hints that if its demands aren’t met, M...",359,170
3197,2022-01-16 16:15:07+00:00,The New York Times,nytimes,Ask the people who know Penélope Cruz best to ...,426,57


In [14]:
# Save the collected tweets to a CSV file in the working directory
NYT.to_csv(path_or_buf='NYT_twitter_14-02-22.csv')