In [1]:
# NOTE(review): `!` runs the command in a shell, so this activation presumably
# does not change the environment the notebook kernel itself runs in — TODO confirm
# that the kernel was started from the twitterCOVID environment.
!source activate twitterCOVID

We will use tweepy to extract data from the Twitter APIs. This requires a Twitter API (developer) account and its keys.

In [2]:
from tweepy import OAuthHandler
from tweepy import API
from secrets import *
from textblob import TextBlob
import os
import jsonpickle
import dataset
from datafreeze import freeze

# Build the authenticated tweepy client used by the rest of the notebook.
# NOTE(review): consumer_key, consumer_secret, access_token and access_token_secret
# are not defined in this cell; presumably they come from `from secrets import *`
# (a local secrets module) — confirm.

# Consumer key authentication
auth = OAuthHandler(consumer_key, consumer_secret)

# Access key authentication
auth.set_access_token(access_token, access_token_secret)

# Set up the API with the authentication handler
api = API(auth, wait_on_rate_limit=True)

# Sample tweets that include keywords

In [3]:
from tweepy.streaming import StreamListener
import json
import time
import sys

class OriginalListener(StreamListener):
    '''Batch extractor: writes raw tweet messages to JSON files, one file per batch.

    Each message routed to on_status is written verbatim to the current output
    file. Once `batchsize` messages have been written, the file is closed and a
    new time-stamped file is opened in the same folder.

    Parameters:
        api:        tweepy API object; a default API() is created when omitted.
        fprefix:    prefix of every output file name.
        foldername: directory the output files are created in (must already exist).
        batchsize:  number of tweets written to a file before rotating.
    '''
    def __init__(self, api = None, fprefix = 'streamer', foldername = "StreamDir", batchsize = 100000):
        # set up API
        self.api = api or API()
        self.counter = 0  # tweets written to the current output file
        self.fprefix = fprefix
        # Keep the folder on the instance: on_status needs it to rotate files.
        self.foldername = foldername
        self.batchsize = batchsize
        self.output = self._open_output()


    def _open_output(self):
        '''Open a new, time-stamped output file inside self.foldername.'''
        return open('%s/%s_%s.json' % (self.foldername, self.fprefix, time.strftime('%Y%m%d-%H%M%S')), 'w')


    def on_data(self, data):
        '''Dispatch one raw stream message (a JSON string) by the keys it mentions.

        Returns False only when on_delete/on_limit return False; otherwise None.
        '''
        if 'in_reply_to_status' in data:
            # Tweets carry the in_reply_to_status_* fields; store them raw.
            self.on_status(data)
        elif 'delete' in data:
            delete = json.loads(data)['delete']['status']
            if self.on_delete(delete['id'], delete['user_id']) is False:
                return False
        elif 'limit' in data:
            if self.on_limit(json.loads(data)['limit']['track']) is False:
                return False
        elif 'warning' in data:
            payload = json.loads(data)
            # The message key is 'warning'; the previous lookup of 'warnings'
            # raised KeyError. The old key is still accepted as a fallback.
            warning = payload.get('warning') or payload.get('warnings') or {}
            print("WARNING: %s" % warning.get('message'))
            return


    def on_status(self, status):
        '''Write one raw tweet to the output file and rotate the file when the batch is full.'''
        self.output.write(status)
        self.counter += 1
        if self.counter >= self.batchsize: #tweet batch size
            self.output.close()
            # Use the folder given to __init__ (previously an undefined/global name).
            self.output = self._open_output()
            self.counter = 0 # reset so streaming continues into the new file
        return


    def on_delete(self, status_id, user_id):
        '''Report a delete notice; the ids are not used.'''
        print("Delete notice")
        return


    def on_limit(self, track):
        '''Report a limitation notice; `track` is the number of missed tweets.'''
        print("WARNING: Limitation notice received, tweets missed: %d" % track)
        return


    def on_error(self, status_code):
        '''Report an error status code.'''
        print('Encountered error with status code:', status_code)
        return 


    def on_timeout(self):
        '''Report a timeout and pause for 60 seconds.'''
        print("Timeout, sleeping for 60 seconds...")
        time.sleep(60)
        return 

In [4]:
class MyListener(StreamListener):
    '''
    Custom tweepy.StreamListener that, for every incoming tweet, stores a selection
    of tweet/user fields plus the TextBlob sentiment of the text as one row of the
    table db[StreamSettings.TABLE_NAME].
    '''
    def __init__(self, api = None, retrieve = "custom"):
        self.retrieve = retrieve  # stored only; not read anywhere in this class
        self.api = api or API()

    def on_status(self, status):
        '''
        Handle one tweet: pull the fields of interest out of `status`, score the
        text with TextBlob and insert the resulting row into the database table.
        A ProgrammingError raised by the insert is printed instead of propagated.
        '''
        # Retweet filter, currently disabled:
        #     if status.retweeted_status:
        #         return

        author = status.user
        tweet_text = status.text
        sentiment = TextBlob(tweet_text).sentiment

        # Place and coordinates are stored jsonpickle-encoded when present;
        # a missing value stays None.
        place = status.place
        if place is not None:
            place = jsonpickle.encode(place)

        coordinates = status.coordinates
        if coordinates is not None:
            coordinates = jsonpickle.encode(coordinates)

        table = db[StreamSettings.TABLE_NAME]
        try:
            table.insert({
                # NOTE(review): filled from status.id — presumably the tweet's id
                # rather than the author's; confirm the intended value.
                "UserID": status.id,
                "UserDescription": author.description,
                "UserAccountCreation": author.created_at,
                "UserLocation": author.location,
                "TweetTime": status.created_at,
                "TweetText": tweet_text,
                "TweetCoordinates": coordinates,
                "TweetPlace": place,
                "NumOfFollowers": author.followers_count,
                "NumOfRetweets": status.retweet_count,
                "Polarity": sentiment.polarity,
                "Subjectivity": sentiment.subjectivity,
            })
        # NOTE(review): ProgrammingError is not imported in any visible cell;
        # verify it is in scope, otherwise a failing insert ends in a NameError.
        except ProgrammingError as err:
            print(err)

    def on_error(self, status_code):
        # Status 420 yields False (stop signal for the stream); any other code yields None.
        return False if status_code == 420 else None

#### Now a JSON file has been created in the current working directory

In [5]:
# Ask whether to stream; the result is a string, and the streaming cell
# only starts the stream when it is exactly "1".
toStream = input("Stream: 1(Yes) or 0(No)? ")

Stream: 1(Yes) or 0(No)? 1


In [4]:
from tweepy import Stream
import StreamSettings
# Connect to the database named by StreamSettings.CONNECTION_STRING.
# `db` is the global handle that MyListener.on_status writes through.
db = dataset.connect(StreamSettings.CONNECTION_STRING)

In [7]:
# Terms the stream is filtered on, and the folder to make sure exists
keywords_to_track = ['corona']
foldername = "StreamSupper"

if toStream != "1":
    print("Not streaming")
else:
    # Create the folder on first use
    if not os.path.exists(foldername):
        os.makedirs(foldername)

    # Listener that handles each incoming tweet
    listen = MyListener(api)

    # Stream bound to the authenticated session and the listener
    stream = Stream(auth, listen)

    # Start collecting tweets that match the keywords
    # (the recorded run of this cell ended with a KeyboardInterrupt)
    stream.filter(track = keywords_to_track)

KeyboardInterrupt: 

In [5]:
# Reconnect to the database and request all rows of the "tweets" table.
# NOTE(review): the table name is hard-coded here, while MyListener writes to
# StreamSettings.TABLE_NAME — confirm both refer to the same table.
db = dataset.connect(StreamSettings.CONNECTION_STRING)
result = db["tweets"].all()

In [6]:
# Show the current working directory, then export `result` as CSV to "where.csv".
# The filename has no directory part, so it is presumably written relative to
# the directory printed here — confirm.
print(os.getcwd())
freeze(result, format='csv', filename="where.csv")

/Users/luischavesrodriguez/OneDrive - Imperial College London/ExtratimeWork/HackCOVID/Coding


## Loading our JSON file and making sense of it

In [2]:
# Load JSON
import json
import pandas as pd
import numpy as np
import os

In [3]:
# Parse every file in `filedir` as line-delimited JSON (one object per line)
# and collect the parsed objects in `tweets`.
tweets = []
filedir = "StreamDir"
for file in sorted(os.listdir(filedir)):  # sorted: deterministic load order
    path = os.path.join(filedir, str(file))
    if not os.path.isfile(path):
        continue  # skip sub-directories, which cannot be opened as files
    # `with` closes each file; the previous bare open() leaked the handle.
    with open(path, 'r') as handle:
        for line in handle:
            if line.strip():  # ignore blank lines, which are not valid JSON
                tweets.append(json.loads(line))
    print(file)


streamer_20200404-163353.json


In [4]:
# Turn the list of parsed tweet objects into a DataFrame.
# NOTE: this rebinds `tweets` from the list built above to the DataFrame.
tweets = pd.DataFrame(tweets)
tweets

Unnamed: 0,created_at,id,id_str,text,source,truncated,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,...,lang,timestamp_ms,quoted_status_id,quoted_status_id_str,quoted_status,quoted_status_permalink,display_text_range,extended_tweet,possibly_sensitive,extended_entities
0,Sat Apr 04 15:33:49 +0000 2020,1246460976595009536,1246460976595009536,RT @okmvnny: bro i wanna be babied rn wtf,"<a href=""http://twitter.com/download/iphone"" r...",False,,,,,...,en,1586014429602,,,,,,,,
1,Sat Apr 04 15:33:49 +0000 2020,1246460976435802114,1246460976435802114,RT @MollyJongFast: Imagine if we had a functio...,"<a href=""http://twitter.com/download/iphone"" r...",False,,,,,...,en,1586014429564,1.246457e+18,1246457007214931968,{'created_at': 'Sat Apr 04 15:18:03 +0000 2020...,"{'url': 'https://t.co/t1thqySXd4', 'expanded':...",,,,
2,Sat Apr 04 15:33:49 +0000 2020,1246460976637108224,1246460976637108224,RT @pnplstorm: Retuíta aí que se o Babu for ca...,"<a href=""http://twitter.com/download/android"" ...",False,,,,,...,pt,1586014429612,,,,,,,,
3,Sat Apr 04 15:33:49 +0000 2020,1246460976435789825,1246460976435789825,@CNN Similar experience here in Louisville. My...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",True,1.246415e+18,1246414986919936000,7.592510e+05,759251,...,en,1586014429564,,,,,"[5, 140]",{'full_text': '@CNN Similar experience here in...,,
4,Sat Apr 04 15:33:49 +0000 2020,1246460976389644290,1246460976389644290,kız junmyeona demiş ki sevgilimle ne dinlemeli...,"<a href=""http://twitter.com/download/android"" ...",True,,,,,...,tr,1586014429553,1.246457e+18,1246457208893624321,{'created_at': 'Sat Apr 04 15:18:51 +0000 2020...,"{'url': 'https://t.co/df0mdPdBP6', 'expanded':...",,{'full_text': 'kız junmyeona demiş ki sevgilim...,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,Sat Apr 04 15:34:11 +0000 2020,1246461069188628481,1246461069188628481,RT @Esther9monteiro: agora a questão é:\nas pe...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",False,,,,,...,pt,1586014451678,,,,,,,,
1196,Sat Apr 04 15:34:11 +0000 2020,1246461069217992705,1246461069217992705,@serkan_kaptan18 Eliniz boş oturmayın aslanlar.d,"<a href=""http://twitter.com/download/android"" ...",False,1.246461e+18,1246460879060848640,9.798322e+17,979832211288592384,...,tr,1586014451685,,,,,"[17, 48]",,,
1197,Sat Apr 04 15:34:11 +0000 2020,1246461069180260352,1246461069180260352,CORRENTE DO QUÃO CUZÃO TU É:\n\n01- Não\n02- S...,"<a href=""http://twitter.com/download/iphone"" r...",True,,,,,...,pt,1586014451676,,,,,,{'full_text': 'CORRENTE DO QUÃO CUZÃO TU É: 0...,,
1198,Sat Apr 04 15:34:11 +0000 2020,1246461069238902784,1246461069238902784,RT @DenizDepboylu: Vefatının 23. yılında Kuruc...,"<a href=""http://twitter.com/download/android"" ...",False,,,,,...,tr,1586014451690,,,,,,,,


## Could either use tweet location (place attribute)

In [None]:
# Boolean mask selecting the tweets that carry a `place` value.
# notna() yields a reusable Series (the previous map() object was a one-shot
# iterator) and treats both None and NaN as missing.
index = tweets.place.notna()
tweets.loc[index, :]

In [None]:
# List the column names of the tweets DataFrame.
tweets.columns

## Or user location (see user attribute)

# Using twint

In [None]:
import twint
import nest_asyncio # for compatibility of notebooks and twint
nest_asyncio.apply()
# Configure a twint search for the term "covid".
c = twint.Config()
c.Location = True
c.Limit = 3
# presumably makes the results available through twint.storage.panda,
# which a later cell reads — confirm against the twint documentation
c.Pandas = True
c.Search = "covid"

# Run the search with the configuration above.
twint.run.Search(c)

In [None]:
# Take twint's pandas storage object as the new `tweets`.
# NOTE: this rebinds `tweets`, replacing the DataFrame built from the streamed JSON files.
tweets = twint.storage.panda.Tweets_df

In [None]:
# Display `result`, which was assigned from db["tweets"].all() in an earlier cell.
result