In [2]:
import csv
import html

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Web Scraping

#### This section focuses on collecting data from the CSV provided by Mountain Project.
#### Notes to consider:
The csv is pre-filtered with Boulders in Joshua Tree Bouldering, 1.5+ Stars, sorted first by popularity and then by difficulty

In [8]:
# Load the Mountain Project export and preview its first five rows.
routes = pd.read_csv(filepath_or_buffer="route-finder.csv")
routes.head(n=5)

Unnamed: 0,Route,Location,URL,Avg Stars,Your Stars,Route Type,Rating,Pitches,Length,Area Latitude,Area Longitude
0,White Rastafarian,White Rastafarian Boulder > Outback Bouldering...,https://www.mountainproject.com/route/10572259...,3.9,-1,Boulder,V2 R,1,20.0,34.02073,-116.16212
1,Slashface,Slash Boulder > Western Belt > Geology Tour Ro...,https://www.mountainproject.com/route/10572275...,3.9,-1,Boulder,V3 R,1,25.0,33.95344,-116.08706
2,Pigpen,Pigpen Boulder > Manx Boulders Circuit > Manx/...,https://www.mountainproject.com/route/10572299...,3.9,-1,Boulder,V4,1,10.0,34.0153,-116.15811
3,JBMFP,JBMF Boulder > JBMF Boulders > Roadside Rocks ...,https://www.mountainproject.com/route/10572546...,3.9,-1,Boulder,V5,1,16.0,34.0152,-116.16631
4,Gunsmoke,Gunsmoke Wall > Gunsmoke Area > Barker Dam Bou...,https://www.mountainproject.com/route/10572235...,3.7,-1,Boulder,V3,1,80.0,34.02858,-116.14508


In [9]:
# Scrape each route's description from the page linked in the Mountain Project
# csv (route-finder.csv) and store it in a new 'Descriptions' column.

# NOTE: This parsing relies on the assumption that the first "fr-view" class is always the description.
# Considering the websites' structure, it is possible some descriptions are in different divs or classes.

# Upper bound, in seconds, on how long a single page request may take.
# Without a timeout one stalled connection would hang the whole cell.
REQUEST_TIMEOUT_SECONDS = 30


def fetch_route_description(session, url):
    """Download one route page and return its description markup.

    Parameters
    ----------
    session : requests.Session
        Session used for the request, so connections are reused between routes.
    url : str
        Address of the route page.

    Returns
    -------
    str
        The first ``<div class="fr-view">`` element rendered as an HTML string,
        or ``""`` when the page has no such element (instead of the misleading
        literal ``"None"`` that ``str(None)`` would produce).

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or returns an HTTP error status.
    """
    response = session.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail loudly on 4xx/5xx rather than parsing an error page as a description.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    description_div = soup.find("div", {"class": "fr-view"})
    return "" if description_div is None else str(description_div)


# One entry per route, in row order; assigned to the data frame afterwards.
descriptions = []

with requests.Session() as session:
    for route_link in routes['URL']:
        try:
            descriptions.append(fetch_route_description(session, route_link))
        except requests.RequestException as error:
            # Keep the list aligned with the data frame rows: report the
            # failure and record an empty description for this route.
            print("Could not fetch", route_link, "-", error)
            descriptions.append("")

routes['Descriptions'] = descriptions

print("done")

done


In [11]:
# Preview the scraped description markup next to each route name.
routes[['Route', 'Descriptions']].head()

Unnamed: 0,Route,Descriptions
0,White Rastafarian,"<div class=""fr-view"">This problem is located o..."
1,Slashface,"<div class=""fr-view"">Another of Joshua Tree's ..."
2,Pigpen,"<div class=""fr-view"">Behind Cyclops rock is th..."
3,JBMFP,"<div class=""fr-view""><p>This classic problem t..."
4,Gunsmoke,"<div class=""fr-view"">Classic. Like Yosemite's..."


# Pre-processing
#### Here are the steps we plan on implementing for cleaning our "Descriptions"
1. Lowercasing
2. Cleaning
3. Splitting
4. Stopwords
5. Stemming

Let's implement a function that cleans our descriptions.

In [12]:
import multiprocessing as mp
import nltk
import string
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [19]:
# This function takes a string and returns a cleaned list of words.


# Non-greedy match of anything that looks like an HTML tag. DOTALL lets a tag
# whose attributes wrap onto another line still be matched.
_HTML_TAG_PATTERN = re.compile(r"<.*?>", flags=re.DOTALL)

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Filled on first use so the stop-word corpus is read once, not once per row.
_NLTK_RESOURCES = {}


def _load_nltk_resources():
    """Return ``(stop_words, stemmer)``, building them on the first call only."""
    if not _NLTK_RESOURCES:
        english_stop_words = set(stopwords.words('english'))
        # Punctuation is stripped from the text before filtering, so also keep
        # the stripped spelling of contractions ("don't" -> "dont"); otherwise
        # those stop words would slip through.
        english_stop_words |= {
            word.translate(_PUNCTUATION_TABLE) for word in english_stop_words
        }
        _NLTK_RESOURCES['stop_words'] = frozenset(english_stop_words)
        _NLTK_RESOURCES['stemmer'] = PorterStemmer()
    return _NLTK_RESOURCES['stop_words'], _NLTK_RESOURCES['stemmer']


def preprocess_descriptions(text):
    """Turn one scraped description into a list of stemmed, lowercase tokens.

    Steps, in order: strip HTML tags, decode HTML entities, lowercase, remove
    ASCII punctuation, split on whitespace, drop English stop words, and apply
    the Porter stemmer.

    Parameters
    ----------
    text : str
        Raw description, possibly containing HTML markup.

    Returns
    -------
    list of str
        The cleaned tokens. An empty list is returned for non-string input
        (for example ``None`` or ``NaN`` from a missing description).
    """
    if not isinstance(text, str):
        return []

    # Replace tags with a space rather than nothing, so the words on either
    # side of e.g. "</p><p>" are not glued together.
    without_tags = _HTML_TAG_PATTERN.sub(" ", text)

    # Decode entities ("&amp;" -> "&") so they do not survive as junk tokens
    # once punctuation is removed; non-breaking spaces become plain spaces.
    decoded_text = html.unescape(without_tags).replace("\xa0", " ")

    lowercased_text = decoded_text.lower()
    without_punctuation = lowercased_text.translate(_PUNCTUATION_TABLE)

    # split() with no argument also discards leading/trailing whitespace.
    tokens = without_punctuation.split()

    stop_words, stemmer = _load_nltk_resources()
    return [stemmer.stem(token) for token in tokens if token not in stop_words]

In [20]:
# Clean every description so the column is ready for modeling.
# NOTE: this overwrites the raw HTML strings with token lists, so the cell is meant to be run once per load.
routes['Descriptions'] = routes['Descriptions'].map(preprocess_descriptions)

In [22]:
# Let's take a look at the cleaned token lists next to each route name.
routes[['Route', 'Descriptions']].head(5)

Unnamed: 0,Route,Descriptions
0,White Rastafarian,"[problem, locat, larg, boulder, southeast, end..."
1,Slashface,"[anoth, joshua, tree, finest, boulder, problem..."
2,Pigpen,"[behind, cyclop, rock, awesom, boulder, proble..."
3,JBMFP,"[classic, problem, take, proud, line, directli..."
4,Gunsmoke,"[classic, like, yosemit, midnight, lightn, hue..."


In [25]:
# Checkpoint: write the routes and their cleaned descriptions to a csv, without the index column.
routes.to_csv(path_or_buf='Routes_With_Cleaned_Descriptions.csv', index=False)

# NOTE: to_csv writes each token list as its string representation, so convert the column back to lists when loading the csv:
# import ast
# df = pd.read_csv('Routes_With_Cleaned_Descriptions.csv')
# df['Descriptions'] = df['Descriptions'].apply(ast.literal_eval)