In [87]:
import pandas as pd
import numpy as np
import csv
from bs4 import BeautifulSoup
import requests

# Web Scraping

#### This section focuses on collecting data from the CSV provided by Mountain Project.
#### Notes to consider:
The CSV is pre-filtered to boulders in Joshua Tree Bouldering with 1.5+ stars, sorted first by popularity and then by difficulty.

In [88]:
# Load the route listing into a DataFrame. The path is relative, so it is
# resolved against the kernel's current working directory.
routes = pd.read_csv("route-finder.csv")

In [90]:
# Testing scraping for descriptions, can easily be scaled.

# NOTE: This parsing relies on the assumption that the first "fr-view" class is always the description
# This can be expanded upon later in the process while refining data collection

# Work on a copy of the first 20 rows so the full `routes` frame is left untouched.
test_routes = routes[:20].copy()

test_descriptions = []

# Only the URL column is needed, so iterate over it directly instead of iterrows().
for route_link in test_routes['URL']:
    # Without a timeout, requests waits forever on a stalled connection and
    # a single unresponsive page would hang the whole cell.
    response = requests.get(route_link, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")
    route_description = soup.find("div", {"class": "fr-view"})
    # find() returns None when the page has no matching element; store an empty
    # string in that case rather than the literal text "None", which would
    # otherwise be tokenised as a real word downstream.
    test_descriptions.append("" if route_description is None else str(route_description))

# One description per row, in the same order as the rows of test_routes.
test_routes['Descriptions'] = test_descriptions

print("done")

done


In [100]:
# Sample input for the pre-processing step: take the first five descriptions
# by position, then pick the one whose index label is 2.
first_five_descriptions = test_routes['Descriptions'].iloc[:5]
test_string = first_five_descriptions.loc[2]
test_string

'<div class="fr-view">Behind Cyclops rock is this awesome boulder problem. A sit start in a cave starts you out 10\' of finger crack through the roof in the cave, to a tight-hands swing around the lip and then fingers, off fingers and tight hands lead to a funky top out that is for many the crux.</div>'

# Pre-processing
#### Here are the steps we plan on implementing for cleaning our "Descriptions"
1. Lowercasing
2. Cleaning
3. Splitting
4. Stopwords
5. Stemming

In [134]:
import multiprocessing as mp
import nltk
import string
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [148]:
# This takes a string and returns a cleaned list of words.


def preprocess_descriptions(text):
    """Convert a raw description string into a list of cleaned, stemmed tokens.

    The steps are applied in this order:
      1. lowercase the text
      2. drop anything that looks like an HTML tag (``<...>``)
      3. turn non-breaking spaces into regular spaces
      4. delete every character in ``string.punctuation``
      5. split on whitespace
      6. drop NLTK English stopwords
      7. stem each remaining word with the Porter stemmer

    Because punctuation is deleted rather than replaced by a space, hyphenated
    words are joined (e.g. "tight-hands" becomes one token before stemming).

    Args:
        text: The description to clean, as a string (may contain HTML markup).

    Returns:
        A list of stemmed word strings; empty if nothing survives the cleaning.

    Note:
        Relies on ``stopwords.words('english')``; presumably the NLTK stopwords
        corpus must already be downloaded for that call to succeed.
    """
    # Lowercasing
    lowercased_text = text.lower()

    # Cleaning - removing html notation and punctuation
    removed_html_text = re.sub(r"<.*?>", "", lowercased_text)
    # A non-breaking space separates words, so it must become a normal space;
    # deleting it outright would glue the neighbouring words together.
    removed_html_text = removed_html_text.replace("\xa0", " ")
    removed_punctuation = removed_html_text.translate(
        str.maketrans('', '', string.punctuation)
    )

    # Splitting - split() with no arguments already ignores leading/trailing
    # whitespace, so no separate strip() is needed.
    split_text = removed_punctuation.split()

    # Stopwords
    stop_words = set(stopwords.words('english'))
    stopwords_removed = [word for word in split_text if word not in stop_words]

    # Stemming
    ps = PorterStemmer()
    cleaned_description = [ps.stem(word) for word in stopwords_removed]

    return cleaned_description

In [146]:
# Run the cleaning function on the sample description; as the last expression
# in the cell, the returned token list is displayed as the cell output.
preprocess_descriptions(test_string)


['behind',
 'cyclop',
 'rock',
 'awesom',
 'boulder',
 'problem',
 'sit',
 'start',
 'cave',
 'start',
 '10',
 'finger',
 'crack',
 'roof',
 'cave',
 'tighthand',
 'swing',
 'around',
 'lip',
 'finger',
 'finger',
 'tight',
 'hand',
 'lead',
 'funki',
 'top',
 'mani',
 'crux']