# Pulling data from Twitter with twint

In [1]:
# Run this block if you need to install the relevant modules

!pip3 install twint==2.1.20 --user

!pip3 install nest_asyncio==1.4.0 --user
# ^^ Need to install this for twint event loop to work in a jupyter notebook setting

Collecting twint==2.1.20
  Downloading twint-2.1.20.tar.gz (31 kB)
Collecting aiodns
  Downloading aiodns-2.0.0-py2.py3-none-any.whl (4.8 kB)
Collecting aiohttp
  Downloading aiohttp-3.6.2-py3-none-any.whl (441 kB)
[K     |████████████████████████████████| 441 kB 9.3 MB/s eta 0:00:01
[?25hCollecting aiohttp_socks
  Downloading aiohttp_socks-0.5.3-py3-none-any.whl (32 kB)
Collecting cchardet
  Downloading cchardet-2.1.6-cp38-cp38-manylinux2010_x86_64.whl (244 kB)
[K     |████████████████████████████████| 244 kB 2.1 MB/s eta 0:00:01
[?25hCollecting elasticsearch
  Downloading elasticsearch-7.8.1-py2.py3-none-any.whl (188 kB)
[K     |████████████████████████████████| 188 kB 1.8 MB/s eta 0:00:01
[?25hCollecting fake-useragent
  Downloading fake-useragent-0.1.11.tar.gz (13 kB)
Collecting geopy
  Downloading geopy-2.0.0-py3-none-any.whl (111 kB)
[K     |████████████████████████████████| 111 kB 1.0 MB/s eta 0:00:01
[?25hCollecting googletransx
  Downloading googletransx-2.4.2.tar.gz (

Import and apply `nest_asyncio` if the notebook is running inside a Jupyter kernel:

In [1]:
# This block of code is needed for using twint within a jupyter notebook.
# Without it, twint searches generate runtime errors because an event loop
# is already running.
try:
    ip = get_ipython()
except NameError:
    # get_ipython is not defined, so this is not an IPython session.
    print('Not working in an iPython environment, skipping this step.')
else:
    # Only patch the event loop when a kernel is attached to the shell.
    if ip.has_trait('kernel'):
        #TODO: Create outcome
        # Import/patch failures are no longer swallowed: if nest_asyncio is
        # missing, the ImportError surfaces here instead of in a later cell.
        import nest_asyncio
        nest_asyncio.apply()

Import other modules and define some key variables:

In [2]:
import os
import sys
import pandas as pd

import twint
# Report the twint version in use.
print(f'twint: {twint.__version__}')

# Location for data to be saved, relative to the notebook location.
data_path = '../data/raw/'

# Show the current working directory that the relative path is resolved from.
print(os.getcwd())

twint: 2.1.20
D:\S2DS\Ditchley\Aug20_Ditchley\notebooks


## Extracting lists of followers

In [5]:
def get_followers(username, fp, full=False, suppress=True):
    '''
    Scrape the followers of a specific user into a database file with twint.

    The file is created inside ``fp`` and is named
    ``followers_<username>.db`` when ``full`` is true, or
    ``followers_names_<username>.db`` otherwise.

    Parameters
    ----------
    username : str
        The twitter handle of the user whose followers to pull.
    fp : str
        Filepath to directory where data should be stored.  A trailing
        path separator is optional.
    full : bool
        If true, scrapes all user info; only name if false.
    suppress : bool
        If true, suppress the printed output of the scraping.

    Returns
    -------
    None
        The scraped data is only written to the database file; nothing is
        returned to the caller.
    '''
    file_name = 'followers_' + ('' if full else 'names_') + username + '.db'
    # os.path.join inserts a separator only when ``fp`` does not already end
    # with one, so both '../data/raw' and '../data/raw/' give the same file.
    full_path = os.path.join(fp, file_name)

    c = twint.Config()
    c.Username = username
    c.Hide_output = suppress
    c.User_full = full
    c.Database = full_path

    twint.run.Followers(c)

    print('Follower data saved to "'+full_path+'"')

In [6]:
# Use the shared data_path constant rather than repeating the directory literal.
get_followers('bobthephysicist', data_path, suppress=True)

[+] Inserting into Database: ../data/raw/followers_names_bobthephysicist.db


CRITICAL:root:twint.get:User:
CRITICAL:root:twint.feed:Follow:IndexError
CRITICAL:root:twint.feed:Follow:IndexError


Follower data saved to "../data/raw/followers_names_bobthephysicist.db"


In [20]:
# NOTE(review): `followers` is not assigned by any cell visible in this
# notebook, and get_followers() has no return statement, which is consistent
# with the TypeError recorded below. TODO: populate `followers` (e.g. from the
# saved follower data) before this cell, or remove the cell.
print(len(followers))
print(followers[:5])

TypeError: object of type 'NoneType' has no len()

In [6]:
def get_following(username, fp, full=False, suppress=True):
    '''
    Scrape the users followed by a specific user into a database file with twint.

    The file is created inside ``fp`` and is named
    ``following_<username>.db`` when ``full`` is true, or
    ``following_names_<username>.db`` otherwise.

    Parameters
    ----------
    username : str
        The twitter handle of the user whose follows to pull.
    fp : str
        Filepath to directory where data should be stored.  A trailing
        path separator is optional.
    full : bool
        If true, scrapes all user info; only name if false.
    suppress : bool
        If true, suppress the printed output of the scraping.

    Returns
    -------
    None
        The scraped data is only written to the database file; nothing is
        returned to the caller.

    TODO: Abstract RAM storage of output
    '''
    file_name = 'following_' + ('' if full else 'names_') + username + '.db'
    # os.path.join inserts a separator only when ``fp`` does not already end
    # with one, so both '../data/raw' and '../data/raw/' give the same file.
    full_path = os.path.join(fp, file_name)

    c = twint.Config()
    c.Username = username
    c.Hide_output = suppress
    c.User_full = full
    c.Database = full_path

    twint.run.Following(c)

    print('Follow data saved to "'+full_path+'"')

In [7]:
# Use the shared data_path constant rather than repeating the directory literal.
get_following('bobthephysicist', data_path, suppress=True)

[+] Inserting into Database: ../data/raw/following_names_bobthephysicist.db


CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable
CRITICAL:root:twint.feed:Follow:IndexError
CRITICAL:root:twint.feed:Follow:IndexError


Follow data saved to "../data/raw/following_names_bobthephysicist.db"


In [27]:
# Drop any twint config left over from earlier experiments in this kernel.
# On a fresh kernel no global `c` exists yet, so a missing name is not an error.
try:
    del c
except NameError:
    pass

In [13]:
# Scrape the user's tweets and keep them in memory as tweet objects.
c = twint.Config()
c.Username = 'bobthephysicist'
c.Store_object = True
c.Debug = True  # NOTE(review): presumably left on while investigating the errors below; confirm it is still wanted

# twint.output.tweets_list is shared module state: empty it first so that
# re-running this cell does not accumulate duplicates from earlier runs.
twint.output.tweets_list.clear()

twint.run.Search(c)

tweets = twint.output.tweets_list

CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable


1291729444265230342 2020-08-07 14:34:33 GMT Daylight Time <bobthephysicist> Condragulations Dr. @alisoncowan0 🎉🎉🎉  https://twitter.com/UofG_MCMP/status/1291728227325616129 … pic.twitter.com/eF4Cuv5qki
1291719029191856129 2020-08-07 13:53:10 GMT Daylight Time <bobthephysicist> The importance of good #DataVisualization in one graph... https://twitter.com/reina_sabah/status/1291509085855260672 …
1291483906919866370 2020-08-06 22:18:52 GMT Daylight Time <bobthephysicist> Another day, another #python environment obliterated beyond all salvation.   Why is Windows the way it is??? pic.twitter.com/44ZL0cBAA0
1290390082856386562 2020-08-03 21:52:24 GMT Daylight Time <bobthephysicist> Whew. Day one of #S2DS20 was a blast. Excited to get stuck in with a fantastic #DataScience project. But first, 😴😴😴. pic.twitter.com/XcdMmap09r
1289553585659813889 2020-08-01 14:28:28 GMT Daylight Time <bobthephysicist> Paddy is a cheery #doggo today, making the most of Scotland's one day of #summer. pic.twitter.co

In [8]:
# Display the attributes of the last tweet object in the list as a dict.
vars(tweets[-1])

{'id': 960855259617873921,
 'id_str': '960855259617873921',
 'conversation_id': '960855259617873921',
 'datetime': 1517920720000,
 'datestamp': '2018-02-06',
 'timestamp': '12:38:40',
 'user_id': 927293298280198145,
 'user_id_str': '927293298280198145',
 'username': 'bobthephysicist',
 'name': 'Robert Webster',
 'place': '',
 'timezone': 'GMT Daylight Time',
 'mentions': ['uofg_mcmp', 'stem4brit'],
 'urls': [],
 'photos': [],
 'video': 0,
 'tweet': 'Excited to be accepted to present my work on #thermoelectrics as part of @UofG_MCMP in #Parliament at the @STEM4Brit event in March!',
 'hashtags': ['#thermoelectrics', '#parliament'],
 'cashtags': [],
 'replies_count': '0',
 'retweets_count': '3',
 'likes_count': '8',
 'link': 'https://twitter.com/bobthephysicist/status/960855259617873921',
 'user_rt_id': '',
 'user_rt': '',
 'retweet': False,
 'retweet_id': '',
 'retweet_date': '',
 'quote_url': '',
 'near': '',
 'geo': '',
 'source': '',
 'reply_to': [{'user_id': '927293298280198145', 'u

In [9]:
# Number of tweet objects currently held in `tweets`.
len(tweets)

147

In [12]:
!twint -u bobthephysicist -o bobthephysicist.csv --csv

unicode error [x] output._output
1291719029191856129 2020-08-07 13:53:10 GMT Daylight Time <bobthephysicist> The importance of good #DataVisualization in one graph... https://twitter.com/reina_sabah/status/1291509085855260672 …
1291483906919866370 2020-08-06 22:18:52 GMT Daylight Time <bobthephysicist> Another day, another #python environment obliterated beyond all salvation.   Why is Windows the way it is??? pic.twitter.com/44ZL0cBAA0
unicode error [x] output._output
1289553585659813889 2020-08-01 14:28:28 GMT Daylight Time <bobthephysicist> Paddy is a cheery #doggo today, making the most of Scotland's one day of #summer. pic.twitter.com/ctqWZntjBQ
1288043175249051649 2020-07-28 10:26:38 GMT Daylight Time <bobthephysicist> Thanks Rachel!
1288043029752877061 2020-07-28 10:26:03 GMT Daylight Time <bobthephysicist> Thanks Bianca!
1287765103396487168 2020-07-27 16:01:40 GMT Daylight Time <bobthephysicist> Thanks Katie! How are you doing?
1287761983144693760 2020-07-27 15:49:16 GMT Dayligh

CRITICAL:root:twint.get:User:'NoneType' object is not subscriptable
CRITICAL:root:twint.output:_output:UnicodeEncodeError
CRITICAL:root:twint.output:_output:UnicodeEncodeError
CRITICAL:root:twint.output:_output:UnicodeEncodeError
CRITICAL:root:twint.output:_output:UnicodeEncodeError
CRITICAL:root:twint.output:_output:UnicodeEncodeError
CRITICAL:root:twint.output:_output:UnicodeEncodeError
CRITICAL:root:twint.output:_output:UnicodeEncodeError
CRITICAL:root:twint.output:_output:UnicodeEncodeError
CRITICAL:root:twint.output:_output:UnicodeEncodeError
CRITICAL:root:twint.output:_output:UnicodeEncodeError
CRITICAL:root:twint.output:_output:UnicodeEncodeError
CRITICAL:root:twint.output:_output:UnicodeEncodeError
CRITICAL:root:twint.output:_output:UnicodeEncodeError
CRITICAL:root:twint.output:_output:UnicodeEncodeError
CRITICAL:root:twint.output:_output:UnicodeEncodeError



1267950302809993218 2020-06-02 23:44:44 GMT Daylight Time <bobthephysicist> Ah yes, Milton Keynes... That we'll known region of Scotland....  https://twitter.com/paulhutcheon/status/1267876435492036608 … pic.twitter.com/VvajQl7gbr
1262388929313099776 2020-05-18 15:25:49 GMT Daylight Time <bobthephysicist> Oooh I know that feel. pic.twitter.com/LAd0D6lutN
1259522007068225546 2020-05-10 17:33:41 GMT Daylight Time <bobthephysicist> Some important Sunday content... https://twitter.com/Pandamoanimum/status/1259207916403462144 …
1257036966094405633 2020-05-03 20:59:01 GMT Daylight Time <bobthephysicist> This is Paddy, the family doggo.
1256952263874134017 2020-05-03 15:22:27 GMT Daylight Time <bobthephysicist> My new supervisor, peering over my shoulder. Will woof-read thesis chapters for snackos. #phdchat #phdlife #thesis #DogsofTwittter pic.twitter.com/Llzo1wLDug
unicode error [x] output._output
1245359502268018689 2020-04-01 15:36:57 GMT Daylight Time <bobthephysicist>  pic.twitter.com/f

CRITICAL:root:twint.output:_output:UnicodeEncodeError
CRITICAL:root:twint.output:_output:UnicodeEncodeError
CRITICAL:root:twint.output:_output:UnicodeEncodeError
CRITICAL:root:twint.output:_output:UnicodeEncodeError
CRITICAL:root:twint.output:_output:UnicodeEncodeError
