In [1]:
import pandas as pd
import numpy as np
import csv
from bs4 import BeautifulSoup
import requests

# Web Scraping

#### This section focuses on collecting data from the csv provided by mountain project.
#### Notes to consider:
The csv is pre-filtered with Boulders in Joshua Tree Bouldering, 1.5+ Stars, sorted first by popularity and then by difficulty

In [9]:
routes = pd.read_csv("route-finder.csv")

In [11]:
# Testing scraping for descriptions, can easily be scaled.

# NOTE: This parsing relies on the assumption that the first "fr-view" class is always the description
# This can be expanded upon later in the processes while refining data collection

def scrape_descriptions(df):
    # Empty List of Descriptions
    test_descriptions = []
    
    for index, row in df.iterrows():
        route_link = row['URL']
        response = requests.get(route_link)
        soup = BeautifulSoup(response.content, "html.parser")
        route_description = soup.find("div", {"class": "fr-view"})
        test_descriptions.append(str(route_description))

    # Add list of Descriptions to column in df.
    df['Descriptions'] = test_descriptions
    
    print("Done Scraping")
    return df

In [13]:
with_descriptions = scrape_descriptions(routes)

Done Scraping


In [17]:
with_descriptions.to_csv('Routes_With_Scraped_Descriptions.csv', index=False)

# Pre-processing
### *Start here when reloading the notebook
#### Here are the steps we plan on implementing for cleaning our "Descriptions"
1. Lowercasing
2. Cleaning
3. Splitting
4. Stopwords
5. Stemming

In [19]:
loaded_with_descriptions = pd.read_csv('Routes_With_Scraped_Descriptions.csv')

In [23]:
import multiprocessing as mp
import nltk
import string
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
nltk.download('stopwords')

# This takes a string and returns a cleaned list of words.

def preprocess_descriptions(text):
    
    # lowercasing
    lowercased_text = text.lower()
    
    # Cleaning - removing punctuation and html notation
    removed_html_text = re.sub("\<.*?\>", "", lowercased_text)
    removed_html_text = re.sub("\xa0", "", removed_html_text)
    removed_punctuation = removed_html_text.translate(str.maketrans('', '', string.punctuation))
    
    removed_white_space = removed_punctuation.strip()
    
    # Splitting
    split_text = removed_white_space.split()
    
    # Stopwords
    stop_words = set(stopwords.words('english'))
    stopwords_removed = [word for word in split_text if word not in stop_words]
    cleaned_string = stopwords_removed
    
    # Stemming
    ps = PorterStemmer()
    stemmed_text = [ps.stem(word) for word in stopwords_removed]
    cleaned_string = stemmed_text

    
    return cleaned_string


def preprocess_df(df):
    for index, row in df.iterrows():
        row['Descriptions'] = preprocess_descriptions(row['Descriptions'])

    return df

[nltk_data] Downloading package stopwords to C:\Users\Noah
[nltk_data]     Kim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
loaded_with_descriptions['Descriptions'] = loaded_with_descriptions['Descriptions'].apply(preprocess_descriptions)

In [27]:
loaded_with_descriptions.head()

Unnamed: 0,Route,Location,URL,Avg Stars,Your Stars,Route Type,Rating,Pitches,Length,Area Latitude,Area Longitude,Descriptions
0,White Rastafarian,White Rastafarian Boulder > Outback Bouldering...,https://www.mountainproject.com/route/10572259...,3.9,-1,Boulder,V2 R,1,20.0,34.02073,-116.16212,"[problem, locat, larg, boulder, southeast, end..."
1,Slashface,Slash Boulder > Western Belt > Geology Tour Ro...,https://www.mountainproject.com/route/10572275...,3.9,-1,Boulder,V3 R,1,25.0,33.95344,-116.08706,"[anoth, joshua, tree, finest, boulder, problem..."
2,Pigpen,Pigpen Boulder > Manx Boulders Circuit > Manx/...,https://www.mountainproject.com/route/10572299...,3.9,-1,Boulder,V4,1,10.0,34.0153,-116.15811,"[behind, cyclop, rock, awesom, boulder, proble..."
3,JBMFP,JBMF Boulder > JBMF Boulders > Roadside Rocks ...,https://www.mountainproject.com/route/10572546...,3.9,-1,Boulder,V5,1,16.0,34.0152,-116.16631,"[classic, problem, take, proud, line, directli..."
4,Gunsmoke,Gunsmoke Wall > Gunsmoke Area > Barker Dam Bou...,https://www.mountainproject.com/route/10572235...,3.7,-1,Boulder,V3,1,80.0,34.02858,-116.14508,"[classic, like, yosemit, midnight, lightn, hue..."


# Recommendation System

### Recommendations using Cosine Similarity
### Steps for implementation:
1. Use Multilabel Binarizer to one hot encode the lists of strings

### Multilabel Binarizer

In [55]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()

mlb_df = pd.DataFrame(mlb.fit_transform(loaded_with_descriptions['Descriptions']),columns=mlb.classes_, index=loaded_with_descriptions.index)
mlb_df.head()

Unnamed: 0,1,10,100,1015,10a,11ft,12,125,12a,12foot,...,zippi,zone,zoomo,zshape,chunkers,locker,“black,“fri,“pillar”,“spud
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Bag of Words

In [148]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

descriptions_as_string = loaded_with_descriptions['Descriptions'].astype(str)

vectorizer = CountVectorizer()
bow_encoded = vectorizer.fit_transform(descriptions_as_string)

similarity_matrix = cosine_similarity(bow_encoded)

In [302]:
def n_nearest_routes(input_index, similarity_matrix, n):
    
    nearest_indices = similarity_matrix[input_index].argsort()

    names =[]
    for i in nearest_indices.tolist():
        names.append(loaded_with_descriptions.loc[i, 'Route'])
    
    return names[-(n+1):-1][::-1]

In [304]:
n_nearest_routes(2, similarity_matrix, 5)

['Street Zen',
 'Sex Magician Sit Start',
 'Dark Matter',
 'The Ejector',
 'The Egg Timer']

In [234]:
loaded_with_descriptions[loaded_with_descriptions['Route'] == 'Pigpen'].index

Index([2], dtype='int64')