# Notebook Summary

This notebook is a web scraper that crawls Medium.com based on an initial seed user. The scraper loops through the graph of followers/followees (called leaders generally in this code), and creates 2 main data structures:
1) A dictionary of source:target pairs that describes the connections between leaders and followers in medium
2) A dictionary of nodes that contains each user examined and metadata of #of followers and #of leaders

This code was written for 2 main purposes:
1) To generate a list of users, whose articles will then be scraped in Step 2 for analysis and modeling in Step 3 of the project.
2) To produce source:target pairs in a format readable by d3, for making a force-directed network graph of a given Medium network

Note: Medium.com does not provide a comprehensive list of users or published articles, so following these connections between users is the only way to get a large sample from the site.



# Initial imports

In [None]:
import json
import requests
from lxml import html
from bs4 import BeautifulSoup
from collections import OrderedDict
import argparse
import re
import dateutil.parser
import pandas as pd
import time
from selenium import webdriver
import pickle

# Declare Data Structures and seed iterator

In [200]:
# ## Declare the structures we'll be working in


# #nodes = {node1:{followers:3, leaders:4},node2:{followers:12, leaders: 13},node3:{followers:2, leaders: 9}...}
# nodes = {}

# #iterator = [node1,node2,node3...]
# iterator = []

# #links_dict = {source: node1, target: f1},{source: node1, target: f2},{source: node2, target: f3}
# links_dict = {'links':[]}


# #LEGACY
# # followers_dict = {node1:[f,f,f],node2:[f,f,f,f,f,f,f,f,],node3:[f,f,f]...}
# followers_dict = {}

# #leaders_dict = {node1:[l,l,l,l], node2:[l,l], node3:[l,l,1,1,1,1,1,1,l]...}
# leaders_dict = {}



In [201]:
# Seed the crawl frontier with the profile URL the crawl starts from.
# NOTE(review): `iterator` must already exist when this cell runs; its
# declaration in the cell above is commented out — confirm it is defined
# (or restored from a pickle) before executing.
iterator.append('https://medium.com/@mmidzik')
# Bare expression so the notebook displays the current frontier.
iterator

['https://medium.com/@mmidzik']

# Define global functions

In [202]:
## Define functions to scrape a person's overview page
def fix_K_or_M(n):
    """Convert an abbreviated count as shown on the page into an integer.

    Handles plain numbers (``"12"``), a trailing `` Follower``/`` Followers``
    label (``"1 Follower"``), thousands separators (``"1,234"``) and the
    ``K`` / ``M`` suffixes (``"1.2K"``, ``"3M"``).

    Args:
        n: The count to convert; a string, or a plain number.

    Returns:
        The count as an ``int``.

    Raises:
        ValueError: If the text left after normalisation is not numeric.
    """
    # Normalise once up front so every branch parses the same clean text.
    text = str(n)
    for noise in (' Followers', ' Follower', ','):
        text = text.replace(noise, '')
    text = text.strip()

    for suffix, multiplier in (('K', 1000), ('M', 1000000)):
        if suffix in text:
            scaled = float(text.replace(suffix, '')) * multiplier
            # round() instead of a bare int(): scaling a binary float can land
            # just below the intended whole number, which int() would truncate.
            return int(round(scaled))

    return int(float(text))

def count_followers(soup):
    """Return the follower count found in a parsed profile page.

    Looks up the element whose ``data-action-value`` attribute is
    ``followers``, strips the `` Followers`` label from its text and converts
    the remainder with ``fix_K_or_M``.

    Args:
        soup: Parsed page exposing ``find(attrs=...)``.

    Returns:
        The follower count as an int, or 0 when an ``AttributeError`` occurs
        (for example because no matching element was found).
    """
    try:
        element = soup.find(attrs={'data-action-value': 'followers'})
        label = element.text.replace(" Followers", "")
        return fix_K_or_M(label)
    except AttributeError:
        return 0
        
def count_leaders(soup):
    """Return the number of accounts a user follows, from a parsed profile page.

    Looks up the element whose ``data-action-value`` attribute is
    ``following``, strips the `` Following`` label from its text and converts
    the remainder with ``fix_K_or_M``.

    Args:
        soup: Parsed page exposing ``find(attrs=...)``.

    Returns:
        The following count as an int, or 0 when an ``AttributeError`` occurs
        (for example because no matching element was found).
    """
    try:
        element = soup.find(attrs={'data-action-value': 'following'})
        label = element.text.replace(" Following", "")
        return fix_K_or_M(label)
    except AttributeError:
        return 0

def get_name(soup):
    """Return the text of the page's ``h1`` with class ``ui-h2 hero-title``.

    Args:
        soup: Parsed page exposing ``find(tag, attrs)``.

    Returns:
        The ``text`` attribute of the matching heading element.
    """
    heading_attrs = {'class': 'ui-h2 hero-title'}
    return soup.find('h1', heading_attrs).text

In [203]:
#scroll to the bottom of the followers page and get the soup html again

def scroll_page(max_avatar_count):
    """Scroll the page open in the global ``driver`` and return its parsed body.

    Scrolls to the bottom of the page repeatedly so that further entries get
    loaded, until at least ``max_avatar_count`` elements with class
    ``ui-captionStrong`` are present or the element count has stopped
    changing for several consecutive attempts.

    Args:
        max_avatar_count: Number of entries expected on the page.

    Returns:
        A ``BeautifulSoup`` (``lxml`` parser) of ``document.body.innerHTML``
        taken after scrolling has finished.
    """
    # Local import: the notebook's import cell does not bring this name in,
    # so the ``except`` clause below would otherwise raise NameError.
    from selenium.common.exceptions import TimeoutException

    stall_window = 7  # identical consecutive counts that mean "page is hung"
    seen_counts = []
    stream = []
    while len(stream) < max_avatar_count:
        try:
            time.sleep(.4)
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
            stream = driver.find_elements_by_class_name('ui-captionStrong')
        except TimeoutException:
            # A timed-out attempt made no progress; it is still recorded below
            # so that repeated timeouts end the loop instead of spinning forever.
            pass

        seen_counts.append(len(stream))
        # Stop once the newest ``stall_window`` counts are all the same, i.e.
        # scrolling no longer loads anything new.
        recent = seen_counts[-stall_window:]
        if len(recent) == stall_window and len(set(recent)) == 1:
            print('hung before end')
            break

    innerHTML = driver.execute_script("return document.body.innerHTML")
    return BeautifulSoup(innerHTML, "lxml")


In [204]:
def pickle_save(item, name):
    """Serialise ``item`` with pickle into the file at path ``name``.

    Args:
        item: Object to pickle.
        name: Destination file path; the file is opened in binary write
            mode, so existing content is replaced.
    """
    with open(name, mode='wb') as handle:
        pickle.dump(item, handle)
    

In [205]:
def get_followers_dict(url, soup, followers_dict, iterator):
    """Record the profile links found on a followers page under ``url``.

    Args:
        url: Key under which the collected links are stored.
        soup: Parsed followers page exposing ``find_all(tag, attrs)``.
        followers_dict: Mapping updated in place; ``followers_dict[url]`` is
            set to the list of ``href`` values found.
        iterator: List extended in place with every ``href`` found.

    Returns:
        The same ``followers_dict`` object that was passed in.
    """
    avatar_attrs = {'class': 'link u-baseColor--link avatar u-width60 u-marginRight20 u-flex0'}
    hrefs = []
    for anchor in soup.find_all('a', avatar_attrs):
        href = anchor['href']
        iterator.append(href)
        hrefs.append(href)
    followers_dict[url] = hrefs
    return followers_dict

def get_leaders_dict(url, soup, leaders_dict, iterator):
    """Record the profile links found on a following page under ``url``.

    Args:
        url: Key under which the collected links are stored.
        soup: Parsed following page exposing ``find_all(tag, attrs)``.
        leaders_dict: Mapping updated in place; ``leaders_dict[url]`` is set
            to the list of ``href`` values found.
        iterator: List extended in place with every ``href`` found.

    Returns:
        The same ``leaders_dict`` object that was passed in.
    """
    avatar_attrs = {'class': 'link u-baseColor--link avatar u-width60 u-marginRight20 u-flex0'}
    hrefs = []
    for anchor in soup.find_all('a', avatar_attrs):
        href = anchor['href']
        iterator.append(href)
        hrefs.append(href)
    leaders_dict[url] = hrefs
    return leaders_dict

#links_dict = {'links': [{source: f1, target: node1}, {source: node1, target: l1}, ...]}
#source == follower
#target == leader

def add_follower_links(url, soup):
    """Add one follower -> ``url`` edge per profile link on a followers page.

    Every ``href`` found is appended to the global ``iterator`` and an edge
    ``{'source': href, 'target': url}`` is appended to the global
    ``links_dict['links']``.

    Args:
        url: The user whose followers page was parsed (the edge target).
        soup: Parsed followers page exposing ``find_all(tag, attrs)``.
    """
    avatar_attrs = {'class': 'link u-baseColor--link avatar u-width60 u-marginRight20 u-flex0'}
    for anchor in soup.find_all('a', avatar_attrs):
        href = anchor['href']
        iterator.append(href)
        links_dict['links'].append({'source': href, 'target': url})
        
def add_leader_links(url, soup):
    """Add one ``url`` -> leader edge per profile link on a following page.

    Every ``href`` found is appended to the global ``iterator`` and an edge
    ``{'source': url, 'target': href}`` is appended to the global
    ``links_dict['links']``.

    Args:
        url: The user whose following page was parsed (the edge source).
        soup: Parsed following page exposing ``find_all(tag, attrs)``.
    """
    avatar_attrs = {'class': 'link u-baseColor--link avatar u-width60 u-marginRight20 u-flex0'}
    for anchor in soup.find_all('a', avatar_attrs):
        href = anchor['href']
        iterator.append(href)
        links_dict['links'].append({'source': url, 'target': href})
        
        

    


# Iterate through nodes & followers

In [213]:
# Position in `iterator` of the next URL to examine.
i = 0
driver = webdriver.Chrome(executable_path="/Users/mayamidzik/tools/chromedriver")
# Indices into `iterator` of the URLs that were actually crawled.
recover_list = []

# Walk the frontier in order. Stop at the node cap, or when the frontier is
# exhausted (previously this ran off the end of `iterator` with IndexError).
while len(nodes) < 2000000 and i < len(iterator):
    url = iterator[i]
    # Skip users already crawled, and the Medium staff account.
    if url not in nodes and url != 'https://medium.com/@MediumStaff':
        followers_url = url + '/followers'
        driver.get(followers_url)
        innerHTML = driver.execute_script("return document.body.innerHTML")  # inner HTML as a string

        soup = BeautifulSoup(innerHTML, "lxml")

        followers_count = count_followers(soup)
        leaders_count = count_leaders(soup)

        nodes[url] = {'followers': followers_count, 'leaders': leaders_count}

        recover_list.append(i)

        # Followers page: scroll until the whole list is loaded, then record it.
        soup = scroll_page(followers_count)
        get_followers_dict(url, soup, followers_dict, iterator)
        add_follower_links(url, soup)

        # Following page: same again for the accounts this user follows.
        leaders_url = url + '/following'
        driver.get(leaders_url)

        soup = scroll_page(leaders_count)
        get_leaders_dict(url, soup, leaders_dict, iterator)
        add_leader_links(url, soup)

        # Checkpoint every 50 nodes. Done only right after a node is added, so
        # skipped URLs no longer re-pickle unchanged data on every iteration.
        if len(nodes) % 50 == 0:
            pickle_save(links_dict, 'links_dict2.pkl')
            pickle_save(followers_dict, 'followers_dict2.pkl')
            pickle_save(leaders_dict, 'leaders_dict2.pkl')
            pickle_save(iterator, 'iterator2.pkl')
            pickle_save(nodes, 'nodes2.pkl')

    i += 1

## Re-pickle all outputs to be safe

In [218]:
# Write every crawl structure to its checkpoint file one final time.
for payload, filename in (
    (links_dict, 'links_dict2.pkl'),
    (followers_dict, 'followers_dict2.pkl'),
    (leaders_dict, 'leaders_dict2.pkl'),
    (iterator, 'iterator2.pkl'),
    (nodes, 'nodes2.pkl'),
):
    pickle_save(payload, filename)

In [216]:
# Number of users recorded in `nodes` (one entry per crawled URL).
len(nodes)

442

In [217]:
# Last three `iterator` indices that were crawled.
recover_list[-3:]

[613, 614, 615]

# Reformat for D3 graph

In [None]:
# #getting into the right format for d3:
# for key, value in nodes.items():
#     nodes[key]['id'] = key
# nodes

In [None]:
#format needed for d3

# {
#     "nodes":[
#         {"id": "user", "group": 1},
#         {"id": "user", "group": 1},
#         {"id": "user", "group": 1},
#         {"id": "user", "group": 1},
#     ],
#     "links":[
#         {"source": "Napoleon", "target": "Myriel", "value": 1},
#         {"source": "Mlle.Baptistine", "target": "Myriel", "value": 8},
#         {"source": "Mme.Magloire", "target": "Myriel", "value": 10},
#         {"source": "Mme.Magloire", "target": "Mlle.Baptistine", "value": 6},
#         {"source": "CountessdeLo", "target": "Myriel", "value": 1},
#         {"source": "Geborand", "target": "Myriel", "value": 1},
#         {"source": "Champtercier", "target": "Myriel", "value": 1}
#     ]
# }