# Notebook Summary

This web scraper takes in a list of Medium user profiles and collectsa list of articles written by each user. It then scrapes each of those articles for a set of traits, ranging from article length to a readibility score. These traits are used for analysis in Step 3 of this project.

# Inital Imports

In [2]:
import json
import requests
from lxml import html
from bs4 import BeautifulSoup
from collections import OrderedDict
import argparse
import re
import dateutil.parser
import pandas as pd
import time
from selenium import webdriver
from textatistic import Textatistic
import pickle
import numpy as np
from selenium.common.exceptions import TimeoutException
from textblob import TextBlob
import random
import os

# Get urls for articles from inital scrape

In [8]:
#bring in the 'iterator' list from medium-graph-scrape, and dedupe it into a set
with open("pickles/jonathonmorgan_all_the_people.pkl", 'rb') as picklefile: 
    all_the_people = pickle.load(picklefile)
all_the_people = set(all_the_people)

In [9]:
#make a sample to work with
people = list(all_the_people)
sample= people[-10:]
sample

['https://medium.com/@sabinakrupic',
 'https://medium.com/@oh_steph',
 'https://medium.com/@mawwwk',
 'https://medium.com/@namitajamwal19',
 'https://medium.com/@michaeltbaker22',
 'https://medium.com/@mrdollywaggle',
 'https://medium.com/@charlesliu2012',
 'https://medium.com/@neutunlabs',
 'https://medium.com/@msf_fr',
 'https://medium.com/@rachele.gilman']

In [10]:
#Declare data structures
master_articles = []
master_people = {}

## Create global functions

In [11]:
def get_home_articles(soup):
'''This function gets the articles from a given users home page'''
    article_urls = []
    try:
        articles = soup.find_all('a', {'data-action':'open-post'})
        for item in articles:
            if item.text == 'Read more…':
                url = (item['href'])
                url = re.sub('\?source(.*)','',url)
                article_urls.append(url)
        return article_urls
    except: []

IndentationError: expected an indented block (<ipython-input-11-41282f89d616>, line 2)

In [12]:
def count_followers(soup):
    '''count followers from the home page'''
    try: 
        followers = soup.find(attrs ={'data-action-value':'followers'})
        follower_count = followers.text
        return follower_count.replace(" Followers", "")
    except AttributeError:
        return 0

        
def count_leaders(soup):
    '''count leaders from the home page'''
    try: 
        following = soup.find(attrs ={'data-action-value':'following'})
        following_count= following.text
        return following_count.replace(" Following", "")
    except AttributeError:
        return 0

def get_name(soup):
    try: 
        name = soup.find('h1',{'class':'ui-h2 hero-title' })
        return name.text
    except AttributeError:
        return 0

## Loop through people and get article urls

In [303]:
driver = webdriver.Chrome(executable_path="/Users/mayamidzik/tools/chromedriver")

for url in people:
    url_latest= url+'/latest'
    driver.get(url_latest)
    
    #scroll to bottom of page
    SCROLL_PAUSE_TIME = 0.1
    
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait to load page
        time.sleep(SCROLL_PAUSE_TIME)

        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    
    #get the html for the page
    innerHTML = driver.execute_script("return document.body.innerHTML")
    soup = BeautifulSoup(innerHTML,"lxml")
    
    
#     pickle_name = re.search('@(.*)',url).group(0)
#     print(pickle_name)
#     with open('/Users/mayamidzik/backups/pickles/person_homes/'+ pickle_name+'_home.pkl', 'wb') as picklefile:
#         pickle.dump(soup, picklefile)
    
    followers_count = count_followers(soup)
    leaders_count = count_leaders(soup)
    name = get_name(soup)
    master_people[url]=({'followers':followers_count})
    master_people[url].update({'leaders':leaders_count})
    master_people[url].update({'name':name})
    
    
    articles = get_home_articles(soup)
    master_articles.extend(articles)

    

In [304]:
#save to pickle
with open('SenateGOP_master_articles.pkl', 'wb') as picklefile:
    pickle.dump(master_articles, picklefile)

In [310]:
len(master_people)

2233

In [311]:
master_people_df = pd.DataFrame.from_dict(master_people).T

In [312]:
master_people_df.reset_index(level=0, inplace=True)
master_people_df.rename(columns = {'index':'url'},inplace = True)
master_people_df.head()

Unnamed: 0,url,followers,leaders,name
0,https://medium.com/@0504567335,26,536,AHMAD ALASHRAM
1,https://medium.com/@12Sherwoodcp,2,10,L Sherwood
2,https://medium.com/@1800Ethernet,24,547,Ethernet.GLOBAL
3,https://medium.com/@1LuisFarias,14,87,Luis Farias
4,https://medium.com/@1PNR,13,404,PIERRE N. ROLIN


In [313]:
with open('SenateGOP_people.pkl', 'wb') as picklefile:
    pickle.dump(master_people, picklefile)

# Get article traits from urls

## Define functions for attributes

In [121]:
with open("pickles/jonathonmorgan_master_articles.pkl", 'rb') as picklefile: 
    master_articles = pickle.load(picklefile)

In [122]:
master_articles

['https://blog.medium.com/let-s-take-it-to-medium-6f3607a77f70',
 'https://blog.medium.com/good-embed-eacdce5f2933',
 'https://blog.medium.com/taking-a-side-on-net-neutrality-11bca5a2f118',
 'https://blog.medium.com/blocking-and-tracking-3b74da2fd881',
 'https://medium.com/@feerst/on-the-death-of-a-very-good-lawyer-i-never-met-a1cc1d43d446',
 'https://medium.com/@biggreenbox/our-5-step-methodology-for-predictive-analytics-79fa22c5f3be',
 'https://medium.com/@danielresende/como-fazer-um-computador-ver-356849d3c86b',
 'https://medium.com/@mlbayless92/what-i-have-read-and-what-i-am-currently-reading-dff6b8a8db55',
 'https://medium.com/@mlbayless92/challenging-your-opinions-c2bc011dcbdc',
 'https://medium.com/@mlbayless92/the-very-short-history-of-fbi-directors-1f4e550cecf5',
 'https://blog.politicsmeanspolitics.com/about-time-a8f07b276c38',
 'https://medium.com/@mlbayless92/the-biggest-loser-usaid-eeb2fd867c3',
 'https://medium.com/@biggreenbox/our-5-step-methodology-for-predictive-analyt

In [123]:
sample = master_articles[:20]

In [124]:
#check if the article is associated with a publication
def check_pub(soup):
    try:
        soup.find_all('a',{'class':'js-collectionLogoOrName'})
        return 1
    except: return 0

In [125]:
#get_writer

def get_author(soup):
    try:
        author_url = soup.find('a',{'data-action-source':'post_header_lockup'})
        author_url = author_url['href']
        author_url = re.sub('\?source(.*)','',author_url)
        return author_url
    except: return np.nan
    

In [126]:
#get the date and time of the article:
def get_datetime(soup): 
    try: 
        published = soup.find('time')
        return published['datetime']
    except: return np.nan

In [127]:
#get the reading time for the article

def get_reading_time(soup):
    try:
        reading = soup.find('span',{'class':'readingTime'})
        reading = reading['title']
        #reading = list(filter(str.isdigit, reading))
        #return int(''.join(reading))
        return reading
    except: return np.nan

In [128]:
#get the tags for the article
def get_tags(soup):
    tag_list = []
    try:
        tags = articlesoup.find('ul',{'class':'tags tags--postTags tags--borderless'})
        tags = tags.find_all('li')
        for tag in tags:
            tag_list.append(tag.text)
        return tag_list
    except: return tag_list


In [129]:
def get_claps(soup):
    try: 
        claps = articlesoup.find('button',{'data-action':'show-recommends'})
        return claps.text
    except: return 0

def get_claps2(soup):
    try: 
        claps = articlesoup.find('span',{'class':'u-textAlignCenter u-relative u-background js-actionMultirecommendCount u-marginLeft10'})
        if len(claps.text) == 0:
            return 0
        else: return claps.text
    except: return 0

In [130]:
def get_meta_traits(soup):
    article_traits = {}
    currentURL =  driver.current_url
    article_traits[currentURL] = {}
    article_traits[currentURL] = {'publisher': check_pub(articlesoup)}
    article_traits[currentURL].update({'author': get_author(articlesoup)})
    article_traits[currentURL].update({'datetime': get_datetime(articlesoup)})
    article_traits[currentURL].update({'reading_time': get_reading_time(articlesoup)})
    article_traits[currentURL].update({'claps': get_claps2(articlesoup)})
    article_traits[currentURL].update({'tags': get_tags(articlesoup)})

    return article_traits


In [131]:
#get all attributes around the article text

def get_text_traits(soup):
    article_text_traits= {}
    #get the number of images and add to dictionary
    article_images = soup.find_all('img',{'class' : 'progressiveMedia-image js-progressiveMedia-image'})
    if article_images:
        #add to dictionary
        article_text_traits['images'] = len(article_images)
    else: article_text_traits['images'] = 0
    
    #get the tota; text block for the article
    article_text = articlesoup.find_all('p')
    total_text = ''
    if article_text:
        for text in article_text:
            total_text+=(text.text)
            
        #get total word count
        article_text_traits['words'] = len(total_text.split())
        
        #get sentence count
        article_text_traits['sentences'] = total_text.count('.')
        
        #get flesch readability
        try:
            s = Textatistic(total_text)
            article_text_traits['flesch'] = s.flesch_score
        except: article_text_traits['flesch'] = np.nan
            
        #get sentiments
        try:
            testimonial = TextBlob(total_text)
            article_text_traits['polarity'] = testimonial.sentiment.polarity
            article_text_traits['subjectivity'] = testimonial.sentiment.subjectivity
            article_text_traits['language'] = testimonial.detect_language()
        except: 
            article_text_traits['polarity'] = np.nan
            article_text_traits['subjectivity'] = np.nan
            article_text_traits['language'] = np.nan
            
    else:
        article_text_traits['words'] = np.nan
        article_text_traits['sentences'] = np.nan
        article_text_traits['flesch'] = np.nan
        article_text_traits['polarity'] = np.nan
        article_text_traits['subjectivity'] = np.nan
        article_text_traits['language'] = np.nan
        
    
    return article_text_traits



## Loop through articles, applying all trait functions

In [140]:
master_article_traits = {}
#if have closed driver
driver = webdriver.Chrome(executable_path="/Users/mayamidzik/tools/chromedriver")
for url in master_articles:
    driver.get(url)
    try:
        articleHTML = driver.execute_script("return document.body.innerHTML")
        articlesoup = BeautifulSoup(articleHTML,"lxml")

        currentURL =  driver.current_url
        article_meta_traits = get_meta_traits(articlesoup)
        article_text_traits = get_text_traits(articlesoup)
        article_meta_traits[currentURL].update(article_text_traits)
        master_article_traits.update(article_meta_traits)
    except TimeoutException:
        pass

with open('jonathonmorgan_articles_claps_redo.pkl', 'wb') as picklefile:
    pickle.dump(master_article_traits, picklefile)

#write to a dataframe
df = pd.DataFrame.from_dict(master_article_traits).T
df.reset_index(level=0, inplace=True)
df['index'] = df
df.rename(columns = {'index':'url'},inplace = True)

df.to_pickle('jonathonmorgan_articles_claps_redo_df.pkl')


    

In [141]:
#1415
len(master_article_traits)

3757

In [142]:
df.to_pickle('jonathonmorgan_articles_claps_redo_df.2pkl')
