Navigate to a school's page via the search results (Selenium)

In [79]:
def get_school(school):
    '''Get to school via search results using selenium.

    Opens the RateMyProfessors search page for ``school`` in Chrome,
    dismisses the cookie notice, clicks the first search result and returns
    the URL the browser ends up on.

    Parameters
    ----------
    school : str
        Free-text school name to search for.

    Returns
    -------
    str
        ``driver.current_url`` after the first result has been clicked.

    Notes
    -----
    The browser is always closed before returning, even if an element lookup
    fails, so repeated calls do not accumulate Chrome processes.
    '''
    from urllib.parse import quote_plus
    from selenium import webdriver
    # URL-encode the query so names containing spaces, '&', '#', ... are
    # transmitted as a single intact query value.
    link = 'http://www.ratemyprofessors.com/search.jsp?query={}'.format(quote_plus(school))
    driver = webdriver.Chrome()
    try:
        driver.get(link)
        # Second link inside the cookie notice; clicked so the notice is out
        # of the way before the result is clicked.
        x = '//*[@id="cookie_notice"]/a[2]'
        driver.find_element_by_xpath(x).click()
        first_result = '//*[@id="searchResultsBox"]/div[2]/ul/li[1]/a/span[2]/span[1]'
        driver.find_element_by_xpath(first_result).click()
        return driver.current_url
    finally:
        # Release the browser whether or not the lookups above succeeded.
        driver.quit()

Get rating for a single school given school url (bs4)

In [197]:
def get_rating(url):
    '''Get rating for a school given school url using bs4.

    Parameters
    ----------
    url : str
        URL of the school page to download.

    Returns
    -------
    str or float
        Text of the first element with class ``score medium``, or ``np.nan``
        when the page contains no such element.
    '''
    from bs4 import BeautifulSoup
    import requests
    import numpy as np
    headers = {'User-Agent': 'Mozilla/5.0'}
    resp = requests.get(url, headers=headers)
    soup = BeautifulSoup(resp.text, 'lxml')
    try:
        avg_prof_rating = soup.find(class_='score medium').text
    except AttributeError:
        # soup.find() returned None: the element is missing on this page.
        # Only that case maps to NaN; other errors are allowed to surface.
        avg_prof_rating = np.nan
    return avg_prof_rating

In [195]:
import numpy as np

Find the URLs of all California State University campuses

In [92]:
# Search-results URL querying schools by name for "California State University".
link = (
    'http://www.ratemyprofessors.com/search.jsp'
    '?queryBy=schoolName&queryoption=HEADER'
    '&query=California+State+University&facetSearch=true'
)

In [93]:
def get_soup(url):
    '''Download ``url`` and return its HTML parsed into a bs4 object.

    The request is sent with a browser-like User-Agent header and the
    response body is parsed with the ``lxml`` parser.
    '''
    import requests
    from bs4 import BeautifulSoup
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    return BeautifulSoup(response.text, 'lxml')

In [94]:
# Download and parse the search-results page (performs one HTTP request).
soup = get_soup(link)

In [147]:
# Build a {school name: absolute URL} mapping from the search-result entries.
# Each <li class="listing SCHOOL"> carries a relative href on its first <a>
# and the school name inside <span class="main">.
base = 'http://www.ratemyprofessors.com'
listings = soup.find_all('li', class_='listing SCHOOL')
dic = {
    entry.find('span', class_='main').text: base + entry.a.get('href')
    for entry in listings
}

In [148]:
# Display the school-name -> URL mapping for a visual sanity check.
dic

{'California Polytechnic State University': 'http://www.ratemyprofessors.com/campusRatings.jsp?sid=153',
 'California State Polytechnic University, Pomona': 'http://www.ratemyprofessors.com/campusRatings.jsp?sid=14774',
 'California State University': 'http://www.ratemyprofessors.com/campusRatings.jsp?sid=14767',
 'California State University -  East Bay': 'http://www.ratemyprofessors.com/campusRatings.jsp?sid=167',
 'California State University Antelope Valley': 'http://www.ratemyprofessors.com/campusRatings.jsp?sid=5802',
 'California State University Bakersfield (CSUB)': 'http://www.ratemyprofessors.com/campusRatings.jsp?sid=158',
 'California State University Channel Islands': 'http://www.ratemyprofessors.com/campusRatings.jsp?sid=4766',
 'California State University Chico': 'http://www.ratemyprofessors.com/campusRatings.jsp?sid=159',
 'California State University Dominguez Hills': 'http://www.ratemyprofessors.com/campusRatings.jsp?sid=160',
 'California State University Fresno': '

Create dataframe

In [117]:
import pandas as pd

In [149]:
# orient='index' turns each dict key (school name) into a row label and puts
# the URL into a single column.
df = pd.DataFrame.from_dict(dic,orient='index')

Unnamed: 0,0
California State University Antelope Valley,http://www.ratemyprofessors.com/campusRatings....
"California State Polytechnic University, Pomona",http://www.ratemyprofessors.com/campusRatings....
California Polytechnic State University,http://www.ratemyprofessors.com/campusRatings....
California State University - East Bay,http://www.ratemyprofessors.com/campusRatings....
"California State University, Northridge",http://www.ratemyprofessors.com/campusRatings....


In [155]:
# Move the school names out of the index into an ordinary column.
df = df.reset_index()

In [161]:
# Give the two columns (former index, URL) meaningful names.
df.columns=['school','url']

In [179]:
# The school id is whatever remains of the URL once the fixed prefix is removed.
df['sid'] = [
    u.replace('http://www.ratemyprofessors.com/campusRatings.jsp?sid=', '')
    for u in df['url']
]

In [200]:
# Scrape one rating per school; get_rating issues one HTTP request per row.
df['rating']=df['url'].apply(get_rating)

In [251]:
# Remove the single-rating column again.
# NOTE(review): inplace drop raises KeyError if this cell is re-run after the
# column is already gone.
df.drop('rating',axis=1,inplace=True)

Now that fetching a single rating works, fetch all of the ratings for each school

In [247]:
def get_ratings(url):
    '''Get all ratings for a school given school url using bs4.

    Parameters
    ----------
    url : str
        URL of the school page to download.

    Returns
    -------
    tuple
        ``(overall_rating, avg_prof_rating)``. The page's elements with class
        ``overall-rating`` are read in document order: the first one's
        ``<span>`` text is taken as the average professor rating and the
        second one's as the overall rating. If the page does not contain
        exactly two such elements, or one of them has no ``<span>``, the
        result is ``(np.nan, np.nan)`` so that a single odd page does not
        abort a whole ``DataFrame.apply`` run.
    '''
    from bs4 import BeautifulSoup
    import requests
    import numpy as np
    headers = {'User-Agent': 'Mozilla/5.0'}
    resp = requests.get(url, headers=headers)
    soup = BeautifulSoup(resp.text, 'lxml')
    overalls = soup.find_all(class_='overall-rating')
    try:
        avg_prof_rating, overall_rating = [o.find('span').text for o in overalls]
    except (AttributeError, ValueError):
        # AttributeError: an element has no <span> (find() returned None).
        # ValueError: the page did not yield exactly two values to unpack.
        return np.nan, np.nan
    return overall_rating, avg_prof_rating

In [255]:
# get_ratings returns an (overall_rating, avg_prof_rating) pair per URL;
# zip(*...) transposes the Series of pairs into one tuple per column.
df['overall_rating'],df['avg_prof_rating'] = zip(*df['url'].apply(get_ratings))

Get number of ratings

In [286]:
def num_ratings(url):
    '''Get number of ratings for a school given school url using bs4.

    Parameters
    ----------
    url : str
        URL of the school page to download.

    Returns
    -------
    str or float
        Text of the element with class
        ``table-toggle rating-count active h1`` with the trailing
        ``' School Ratings'`` label removed, or ``np.nan`` when the page has
        no such element. The count is returned as text, not converted to int.
    '''
    from bs4 import BeautifulSoup
    import requests
    import numpy as np
    headers = {'User-Agent': 'Mozilla/5.0'}
    resp = requests.get(url, headers=headers)
    soup = BeautifulSoup(resp.text, 'lxml')
    try:
        count = soup.find(class_='table-toggle rating-count active h1').text.replace(' School Ratings','')
    except AttributeError:
        # soup.find() returned None: the element is missing on this page.
        count = np.nan
    return count

In [287]:
# One HTTP request per row; pages without a rating count yield NaN.
df['num_ratings'] = df['url'].apply(num_ratings)

In [296]:
# get rid of null rows
# NOTE(review): row labels 0 and 8 are hard-coded; which column(s) were null
# is not visible here, so confirm these labels still match the scraped data.
# NOTE(review): not idempotent -- because the index is renumbered below,
# re-running this cell drops two further rows.
df.drop([0,8],inplace=True)
# Renumber the remaining rows 0..n-1, discarding the old labels.
df.reset_index(drop=True,inplace=True)

Get all the subratings

In [335]:
def sub_ratings(url):
    '''Return the texts of the sub-rating scores on a school page.

    Downloads ``url`` via ``get_soup``, looks up the first element with class
    ``quality-breakdown`` and collects the text of every descendant with class
    ``score``, in document order, as a list of strings.
    '''
    breakdown = get_soup(url).find(class_='quality-breakdown')
    return [score.text for score in breakdown.find_all(class_='score')]

In [336]:
# sub_ratings returns one list of score texts per URL; zip(*...) transposes
# them into one tuple per column. Unpacking into ten targets requires every
# page to yield exactly ten scores.
# NOTE(review): the column order below is assumed to match the order in which
# the scores appear on the page -- confirm against the site.
df['clubs'],df['facilities'],df['food'],df['happiness'],df['internet'],df['location'],df['opportunity'],df['reputation'],df['safety'],df['social']  = zip(*df['url'].apply(sub_ratings))

In [360]:
def campus(s):
    '''Extract the campus part of a school name.

    Takes whatever follows the last occurrence of ``'University'`` in ``s``
    (or all of ``s`` if the word does not occur), deletes every comma and
    hyphen, and strips surrounding whitespace.
    '''
    tail = s.rpartition('University')[2]
    without_punct = tail.translate(str.maketrans('', '', ',-'))
    return without_punct.strip()

In [362]:
# Derive a short campus label from each full school name.
df['campus'] = df['school'].map(campus)

In [369]:
# campus() keeps the "(CSUB)" suffix of the Bakersfield entry, so set that
# label explicitly. The row is selected by school name rather than by a
# hard-coded row label, which would silently hit the wrong school whenever
# the scrape order or the dropped rows change.
df.loc[df['school'].str.contains('Bakersfield', regex=False, na=False), 'campus'] = 'Bakersfield'

Save dataframe to database

In [None]:
# import sqlite3
# con = sqlite3.connect('data.db')
# df.to_sql('ratings',con)
# con.close()

Gather data from output side 