### README

### Importing Modules

In [0]:
import requests
import time
import re
import unicodedata

### Global Constants

In [0]:
# Instagram base url preffix
tagurl_prefix = 'https://www.instagram.com/explore/tags/'

# suffix to append to tag request url to retrieve data in JSON format
tagurl_suffix = '/?__a=1'

# suffix to end cursor when requesting posts by tag
tagurl_endcursor = '&max_id='

# a generic media post preffix (concat with media shortcode to view)
posturl_prefix = 'https://www.instagram.com/p/'

### Defining Functions

In [0]:
import unicodedata

def strip_accents(text):
    
    """
    Strip accents from input String.

    :param text: The input string.
    :type text: String.

    :returns: The processed String.
    :rtype: String.
    """
    
    try:
        text = unicode(text, 'utf-8')
    except (TypeError, NameError):
        pass
    
    text = unicodedata.normalize('NFD', text)
    text = text.encode('ascii', 'ignore')
    text = text.decode("utf-8")
    
    return str(text)

In [0]:
import re

def text2tags(text, striptag=True):
    
    pattern = '#\S+'
    
    text = text.lower()
    
    text = strip_accents(text)
    
    matches = re.findall(pattern, text)
    
    if striptag :
        matches = [ match.replace('#','') for match in matches ]
    
    return matches

In [0]:
def json2medias(json_info, infilter=False):

    medias_list = json_info['graphql']['hashtag']['edge_hashtag_to_media']['edges']

    medias = []

    for media in medias_list:

        node = media['node']

        id_media = node['id']

        id_owner = node['owner']['id']

        shortcode = node['shortcode']

        edges = node['edge_media_to_caption']['edges']
        
        text = edges[0]['node']['text'].replace('\n','') if len(edges) else ''
        
        tags = text2tags(text)

        mediaurl = posturl_prefix + shortcode + '/'

        media_dict = {
            'id_media': id_media,
            'id_owner': id_owner,
            'shortcode': shortcode,
            'text': text,
            'mediaurl': mediaurl,
            'tags': tags
        }
        
        if infilter :
            if len(tags) :
                medias.append( media_dict )
        else:
            pass
    
    else:
        medias.append( media_dict )
    
    return medias

In [0]:
import requests
import time

def snowball(url, deep=1, end_cursor='', count=0, showurl=False, 
             sleep=0, forever=False, progress=False, pause=60 ):

    request_url = url + tagurl_endcursor + end_cursor

    if showurl :
        print(request_url)
    else:
        if progress :
            print( count, end=' ' )
    
    while True :
        try :
            json_info = requests.get( request_url ).json()
            break
        except:
            if forever :
                print('Fail, retrying in ' + pause + ' seconds')
                time.sleep(pause)
            else:
                print('Fail, ' + count + ' requests done')
                return []
    
    end_cursor = json_info['graphql']['hashtag']['edge_hashtag_to_media']['page_info']['end_cursor']

    medias = json2medias( json_info, True )

    time.sleep(sleep)
  
    count = count + 1

    if count < deep :
        medias += snowball(
            url=url, 
            deep=deep, 
            end_cursor=end_cursor, 
            count=count, 
            showurl=showurl, 
            sleep=sleep,
            forever=forever,
            progress=progress, 
            pause=pause)
    else:
        pass
    
    if showurl :
        pass
    else:
        if progress :
            if count == deep :
                print()

    return medias

### Collecting Data

In [0]:
# target initial tags
tags = ['bolsonaro', 'haddad', 'dilma', 'ciro', 'guedes', 'moro', 'lula']

In [0]:
# urls to initial tags
queries = [ tagurl_prefix + tag + tagurl_suffix for tag in tags ]

In [0]:
data = {}

In [0]:
%%time

for tag, query in zip( tags, queries ) :
    
    print( 'Querying ' + tag + '...' )
    
    medias = snowball(query, deep=40, forever=True, sleep=0.5, pause=60, progress=True)
    
    data[tag] = medias
    
    print( 'Done' )
    
    time.sleep(30)

Querying bolsonaro...
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 
Done
Querying haddad...
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 
Done
Querying dilma...
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 
Done
Querying ciro...
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 
Done
Querying guedes...
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 
Done
Querying moro...
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 
Done
Querying lula...
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 
Done
CPU times: user 28.2 s, sys: 1.51 s, total: 29.7 s
Wall tim

In [0]:
data

In [0]:
# checking number of medias
for key, medias in data.items() :
    
    print(key, len(medias))

moro 2570
guedes 2678
bolsonaro 2435
lula 2500
haddad 2660
ciro 2738
dilma 2655


In [0]:
import json

# saving data to a JSON file
f = open('data.json', 'w')
json.dump(data, f)
f.close()

### Filtering Data

This code is a plus. The function `json2medias` has the boolean attribute `infilter`, which filters and drops out the posts with no tags.

In case this attribute is `False` (not recommended), we can check the non-tagged posts here. 

In [0]:
# checking non-tagged medias

for key, medias in data.items():
    
    for media in medias:
        
        tags = media['tags']
        
        if len(tags) :
            pass
        else:
            print(media['mediaurl'], media['text'])

### Development

In [0]:
import requests
import time

# from tqdm import tqdm, tnrange
from tqdm import tqdm_notebook as tqdm

def snowball(url, deep=1, end_cursor='', count=0, showurl=False, 
             sleep=0, forever=False, progress=False, pause=60, pbar=None ):
    
    if pbar is None :
        pbar = tqdm(total=deep)

    request_url = url + tagurl_endcursor + end_cursor

    if showurl :
        print(request_url)
    else:
        if progress :
            pbar.update()
    
    while True :
        try :
            json_info = requests.get( request_url ).json()
            break
        except:
            if forever :
                print('Fail, retrying in ' + str(pause) + ' seconds...', end=' ')
                time.sleep(pause)
                print('Go!')
            else:
                print('Fail, ' + str(count) + ' requests done')
                return []
    
    end_cursor = json_info['graphql']['hashtag']['edge_hashtag_to_media']['page_info']['end_cursor']

    medias = json2medias( json_info, True )

    time.sleep(sleep)
  
    count = count + 1

    if count < deep :
        medias += snowball(
            url=url, 
            deep=deep, 
            end_cursor=end_cursor, 
            count=count, 
            showurl=showurl, 
            sleep=sleep,
            forever=forever,
            progress=progress, 
            pause=pause,
            pbar=pbar)
    else:
        pass
    
    if showurl :
        pass
    else:
        if progress :
            if count == deep :
                pass
                print()

    return medias

In [78]:
snowball(queries[1], deep=5, progress=True, forever=True)[0]

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

Fail, retrying in 60 seconds... Go!
Fail, retrying in 60 seconds... Go!



{'id_media': '2069048540710857446',
 'id_owner': '431798494',
 'mediaurl': 'https://www.instagram.com/p/By2vK98HGrm/',
 'shortcode': 'By2vK98HGrm',
 'tags': ['morotraidordapatria',
  'infuxwetrust',
  'libertemlula',
  'morocriminoso',
  'moronacadeia',
  'dallagnolnacadeia',
  'lulalivreja',
  'lulanobeldapaz2019',
  'lula',
  'lulalivre',
  'lulalibre',
  'lulavalealuta',
  'lulapresidente',
  'freelula',
  'lulainocente',
  'eusoulula',
  'lula2018',
  'lulapresopolitico',
  'partidodostrabalhadores',
  'haddad',
  'haddad13',
  'haddadpresidente',
  'obrasilfelizdenovo',
  'mulherescontrabolsonaro',
  'elenao',
  'elenao',
  'elenunca',
  'elejamais'],
 'text': 'VÍDEO PARA QUEM AINDA NÃO ENTENDEU A ATUAÇÃO DO MORO NO PROCESSO DO LULABANDIDO BOM É BANDIDO MORO!!! #morotraidordapatria #infuxwetrust #libertemlula #morocriminoso #moronacadeia #dallagnolnacadeia #lulalivreja #lulanobeldapaz2019 #lula #lulalivre #lulalibre #lulavalealuta #lulapresidente #freelula #lulainocente #eusoulula