# Imports

In [1]:
import os
from datetime import datetime

import numpy as np
import pandas as pd

import twint

# Fixes runtime errors with twint
import nest_asyncio
nest_asyncio.apply()

from textblob import TextBlob

import matplotlib
%matplotlib inline

# Scrape Tweets

In [2]:
def twint_search(search, username=None, since=None, until=None, drop_cols=None, limit=None):
    '''
    Function to return a pandas dataframe of tweets in English containing search terms using twint.

    Parameters:
        search: search term handed to twint (required).
        username: optional account name to restrict the search to.
        since: optional start date of the search window.
        until: optional end date of the search window.
        drop_cols: accepted so callers can pass it through, but NOT applied here;
            the caller is expected to drop unwanted columns from the result.
        limit: optional maximum number of tweets to fetch.

    Returns:
        A new dataframe of the scraped tweets whose 'date' column holds
        datetime.date objects. When the search produced no tweets, an empty
        dataframe (no columns) is returned instead of raising.
    '''
    c = twint.Config()
    c.Lang = 'en'
    c.Search = search
    c.Username = username
    c.Since = since
    c.Until = until
    c.Limit = limit
    # Have twint collect the results into its pandas storage
    c.Pandas = True
    # Hide the printing of every tweet during scrape
    c.Hide_output = True
    twint.run.Search(c)

    scraped = twint.storage.panda.Tweets_df
    # A search without results may leave twint's storage as None or as an empty
    # frame with no 'date' column to convert; hand back an empty frame instead of failing.
    if scraped is None or scraped.empty:
        return pd.DataFrame()

    # Transform date string into datetime.date objects. assign() builds a new frame,
    # so twint's module-level dataframe is left untouched.
    return scraped.assign(date=pd.to_datetime(scraped['date']).dt.date)

In [3]:
def search_loop(start_date, end_date, search, filename, username=None, drop_cols=None, limit=None):
    '''
    Function to loop over date range and perform twint_search function for each day, returning one combined dataframe.
    Saves progress to Datasets/<filename>.csv after each daily search.

    Parameters:
        start_date: first day to search (required).
        end_date: closing bound of the range (required); it is only used as the
            `until` value of the last daily search, never as a `since` value.
        search: search term (required).
        filename: name of the CSV file, without extension, written inside the Datasets folder (required).
        username: optional account name to restrict the search to.
        drop_cols: optional list of columns to drop from each day's results;
            names that are not present in a day's results are skipped.
        limit: optional maximum number of tweets per day.

    Returns:
        The combined dataframe of all daily results with a fresh 0..n-1 index.
    '''
    csv_path = f'Datasets/{filename}.csv'
    # Create the output folder up front so a long scrape is not lost on the first save
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)

    df = pd.DataFrame()
    date_range = pd.date_range(start_date, end_date)
    # Pair every day with the following one: (since, until) for each daily search
    for day_start, day_end in zip(date_range[:-1], date_range[1:]):
        since = day_start.strftime('%Y-%m-%d')
        until = day_end.strftime('%Y-%m-%d')
        day_df = twint_search(search=search, username=username, since=since, until=until, drop_cols=drop_cols, limit=limit)
        # Drop unwanted columns. Skipped when none were requested (drop() rejects columns=None);
        # errors='ignore' tolerates days whose results lack some of the columns.
        if drop_cols:
            day_df = day_df.drop(columns=drop_cols, errors='ignore')
        # Add new daily data to dataframe with a reset index, save to CSV
        df = pd.concat([df, day_df], ignore_index=True)
        del day_df
        df.to_csv(csv_path)
        print(datetime.now(), f'{since} Saved!')
    return df

In [4]:
# Parameters for the March tweet scrape, passed to search_loop in the next cell

# Date range to walk through one day at a time; the last date only closes the final daily search
start_date, end_date = '2020-03-17', '2020-04-01'

# Search term, and the name (without extension) of the CSV that receives the results
search = '(mask OR masks)'
filename = 'march17_tweets'

# Result columns to discard from every daily dataframe
drop_cols = [
    'timezone',
    'place',
    'cashtags',
    'user_id_str',
    'day',
    'hour',
    'search',
    'near',
    'geo',
    'source',
    'user_rt_id',
    'user_rt',
    'retweet_date',
    'translate',
    'trans_src',
    'trans_dest',
]

In [None]:
# Run the scrape for the parameters defined above, timing it with %time; the combined result is kept in `df`.
# NOTE: long-running — in the recorded run the per-day "Saved!" timestamps are well over an hour apart.
%time df = search_loop(start_date=start_date, end_date=end_date, search=search, filename=filename, drop_cols=drop_cols)

CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 1.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 8.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 27.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 64.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 125.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 216.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 1.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 8.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 27.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecti

2020-06-16 15:10:32.864793 2020-03-17 Saved!


CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 1.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 8.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 27.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 64.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 125.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 216.0 secs
CRITICAL:root:twint.output:checkData:copyrightedTweet
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 1.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 8.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 27.0

2020-06-16 16:56:39.256866 2020-03-18 Saved!


CRITICAL:root:twint.output:checkData:copyrightedTweet
CRITICAL:root:twint.output:checkData:copyrightedTweet
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 1.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 8.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 27.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 64.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 125.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 216.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 1.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpecting value: line 1 column 1 (char 0)
sleeping for 8.0 secs
CRITICAL:root:twint.run:Twint:Feed:noDataExpec

In [None]:
# Number of rows in the combined dataframe returned by search_loop
df.shape[0]