# NaNoWriMo Word Count Scraping

This notebook scrapes word count information for a NaNoWriMo user. Word counts are scraped across all of the user's projects and, within each project, across all of its events.

In [None]:
import os
import time
from datetime import datetime
from itertools import chain

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary

In [None]:
# get wordcount stats across all events for one project
def get_project_stats(html):
    """Collect daily word counts for every event of a single project.

    Uses the module-level ``driver``, which must currently be showing the
    project page that ``html`` was taken from. The driver clicks into each
    event in turn and is sent back to the project page after each one.

    Args:
        html: Page source of the project page.

    Returns:
        A DataFrame stacking the per-event tables returned by
        ``get_wordcount_stats``, each with added ``Project``, ``Genre``,
        ``Event`` and ``Goal`` columns, or ``None`` when no event yielded a
        word count table.
    """
    soup = BeautifulSoup(html, "html.parser")

    # basic stats for project
    title = soup.find("div", class_="nw-title").text.strip()
    genre = soup.find("div", class_="nw-project-genres").text.strip()

    # the first "nw-title" div is the project title read above; every
    # following one is treated as an event the project was written during
    events = [x.text.strip() for x in soup.find_all("div", class_="nw-title")[1:]]
    # goal for each event: the text after the "/" in each "goal" span
    # (the first span is skipped - presumably it belongs to the project itself)
    goals = [x.text.split("/")[1].strip() for x in soup.find_all("span", class_="goal")[1:]]

    dfs = []

    # remember where to come back to after visiting each event
    project_url = driver.current_url

    # iterate over all the events
    for i, event in enumerate(events):
        # give the project page time to load, then look the links up again:
        # elements found before a navigation cannot be reused afterwards
        time.sleep(10)
        # find_elements(By..., ...) replaces find_elements_by_class_name,
        # which newer Selenium releases no longer provide
        event_links = driver.find_elements(By.CLASS_NAME, 'nw-title')
        # +1 skips the project title, mirroring the [1:] slice above
        event_links[i + 1].click()

        time.sleep(5)
        stats = get_wordcount_stats(driver.page_source)
        if stats is not None:
            stats['Project'] = title
            stats['Genre'] = genre
            stats['Event'] = event
            stats['Goal'] = goals[i]

            dfs.append(stats)

        # go back to the project page
        driver.get(project_url)

    # return combo of all info for this project
    if not dfs:
        return None
    return pd.concat(dfs, axis=0)

# parse the word count table
def get_entry(row):
    """Turn one row of the word count table into ``[date, word_count]``.

    Args:
        row: A parsed table row exposing ``find_all("td")``. The first cell
            holds a date such as ``"Nov 01, 2020"``; the second holds the
            count as its first whitespace-separated token.

    Returns:
        A two-item list: the date as a ``datetime`` and the count as an
        ``int``.

    Raises:
        ValueError: If the date does not match ``"%b %d, %Y"`` or the count
            cell is empty or not a number.
        IndexError: If the row has fewer than two cells.
    """
    cells = row.find_all("td")

    # surrounding whitespace/newlines from the markup would make strptime fail
    date = datetime.strptime(cells[0].text.strip(), "%b %d, %Y")

    # split() with no separator ignores leading and repeated whitespace
    tokens = cells[1].text.split()
    if not tokens:
        raise ValueError("word count cell is empty")
    # int() does not accept thousands separators such as "1,667"
    count = int(tokens[0].replace(",", ""))

    return [date, count]

# get daily word count stats for a single nanowrimo event
def get_wordcount_stats(html):
    """Build a per-date word count table from an event page.

    Args:
        html: Page source of the event page.

    Returns:
        A DataFrame indexed by ``Date`` whose ``WordCount`` column is the sum
        of all entries for that date, or ``None`` when the page contains no
        ``nw-table-full-width`` table.
    """
    page = BeautifulSoup(html, "html.parser")
    table = page.find("table", class_="nw-table-full-width")

    # pages without the word count table have nothing to report
    if table is None:
        return None

    # the first row is skipped (treated as the header row)
    data_rows = table.find_all("tr")[1:]
    entries = [get_entry(tr) for tr in data_rows]

    # several entries can share a date, so collapse them into one total per day
    per_entry = pd.DataFrame(entries, columns=['Date', 'WordCount'])
    return per_entry.groupby("Date").sum()
    

# Set up driver


In [None]:
# start a Firefox session; executable_path points at the local geckodriver
# binary, so change it to match your own machine
driver = webdriver.Firefox(
    executable_path=r'/Users/morganoneka/Documents/PersonalProjects/geckodriver',
)

In [None]:
# NaNoWriMo account whose projects will be scraped
username = "morgandanielle"
# that account's project listing page
url = f"https://nanowrimo.org/participants/{username}/projects"

In [None]:
# open the user's project listing page in the Selenium-controlled browser
driver.get(url)

IMPORTANT!!! Before moving on to the next step, you need to log in to the NaNoWriMo website with your credentials in the browser window that Selenium opened.

# Scraping word count
This part of the notebook iterates over all of a user's projects to get word count. There are many long `time.sleep` calls because the NaNoWriMo website takes a bit to load.

In [None]:
# get project links for first pass
# (find_elements(By..., ...) replaces find_elements_by_class_name, which
# newer Selenium releases no longer provide)
project_links = driver.find_elements(By.CLASS_NAME, 'goal-text')

# the number of projects the user has
number_projects = len(project_links)

# list to save stats in - one df per project (None for projects without data)
all_stats = []

# iterate over all projects
for i in range(number_projects):
    # progress indicator
    print(i)

    # if the reassignment of project_links went well, it will equal the number
    # of projects; otherwise the listing had not finished loading, so reload it
    # once and wait longer before reading the links again
    if len(project_links) != number_projects:
        driver.get(url)
        time.sleep(15)
        project_links = driver.find_elements(By.CLASS_NAME, 'goal-text')

    # click link to go to project page
    project_links[i].click()

    # get html from page
    time.sleep(10)
    html = driver.page_source

    # get project page info
    stats = get_project_stats(html)
    all_stats.append(stats)

    print(url)
    # return to homepage
    driver.get(url)
    # update project links: elements found before the navigation cannot be
    # reused, so they are looked up again once the page has had time to load
    time.sleep(10)
    project_links = driver.find_elements(By.CLASS_NAME, 'goal-text')
    

In [None]:
# projects that produced no word count table are stored as None; drop them
# here so an all-empty scrape fails with a clear message
scraped = [frame for frame in all_stats if frame is not None]
if not scraped:
    raise ValueError("No word count data was scraped for " + username)

# one table with every project's daily word counts
combo = pd.concat(scraped, axis=0)

# to_csv does not create missing folders, so make sure the output one exists
os.makedirs("rawdata", exist_ok=True)
combo.to_csv("rawdata/" + username + ".csv")