## <font color='blue'>This Jupyter Notebook covers data acquisition from College Confidential</font>

In [None]:
# table goes here

### 1.1 Import the packages needed to scrape and save serialized data, and set the College Confidential URL

In [1]:
from bs4 import BeautifulSoup
from datetime import datetime 
from helpers import bot_policy
import json
import os
import requests
import sys

In [3]:
# Root URL of the College Confidential site whose bot policy is checked below
cc = 'https://www.collegeconfidential.com'

### 1.2 Checking website policy using [bot_policy](helpers.py) function

In [4]:
bot_policy(cc)

User-agent: *

Disallow: /cgi-bin

Disallow: /wp-admin

Disallow: /wp-includes

Disallow: /wp-content/plugins

Disallow: /wp-content/cache

Disallow: /wp-content/themes

Disallow: /vibe

Disallow: /trackback

Disallow: /feed

Disallow: /comments

Disallow: /category/*/*

Disallow: */trackback

Disallow: */feed

Disallow: */comments

Disallow: /*?*

Disallow: /*?

Disallow:

Disallow:

Disallow: /

Disallow: /


### 1.3 After navigating to the Penn forum, test the response and figure out the API call structure

In [7]:
# Smoke-test that the Penn forum landing page is reachable. The timeout bounds
# the wait so a stalled connection cannot hang the kernel indefinitely.
requests.get('https://talk.collegeconfidential.com/university-pennsylvania/', timeout=100).status_code

200

In [None]:
# First page of the Penn forum thread listing; listing pages are addressed by a trailing p<N>
penn_url = 'https://talk.collegeconfidential.com/university-pennsylvania//p1'

In [None]:
# Fetch the first listing page. The timeout bounds the wait so a stalled
# connection cannot hang the kernel indefinitely.
response = requests.get(penn_url, timeout=100)

In [None]:
response.status_code

In [None]:
# Keep the raw HTML and peek at its first 250 characters to confirm what came back
page = response.text
print(page[:250])

In [None]:
# Parse the fetched HTML into a searchable tree; 'html5' is the parser feature requested from BeautifulSoup
soup = BeautifulSoup(page, 'html5')

In [None]:
# by inspection, there are 250 pages; build the URL of each listing page (p1 .. p250)
page_list = [
    'https://talk.collegeconfidential.com/university-pennsylvania//p{}'.format(page_number)
    for page_number in range(1, 251)
]

In [None]:
def get_one_page_titles(soup):
    """
    Collect the thread titles and links shown on a single page of results.

    Every anchor nested inside a <div class="Title"> element of ``soup``
    contributes one entry, in document order.

    Returns a list of dicts with keys 'topic' (the anchor's text) and
    'url' (the anchor's href attribute).
    """
    return [
        {'topic': anchor.text, 'url': anchor.get("href")}
        for title_div in soup.find_all('div', class_='Title')
        for anchor in title_div.find_all('a')
    ]

In [None]:
def get_all_topics(page_list, timeout=100):
    """
    Scrape the thread titles from every listing page in ``page_list``.

    Each URL is downloaded, parsed and passed to get_one_page_titles; the
    per-page results are concatenated in page order. Progress is printed
    after every 25th page.

    page_list: iterable of listing-page URLs.
    timeout:   seconds to wait on each request before requests raises,
               so one stalled connection cannot hang the whole crawl.

    Returns a flat list of the entries produced by get_one_page_titles.
    """
    topics_json = []

    for count, url in enumerate(page_list, start=1):
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html5')
        # extend (not append) keeps the result flat across pages
        topics_json.extend(get_one_page_titles(soup))

        if count % 25 == 0:
            print("Finished page {}".format(count))
    return topics_json

In [None]:
#topics_json = get_all_topics(page_list)

#with open("penn_topics.json", 'w', encoding='utf-8') as outfile:
    #json.dump(topics_json, outfile)

In [None]:
# Reload the thread list saved by the (commented-out) scraping cell above.
# The file was written as utf-8, so read it back the same way; the with-block
# closes the file on exit, so no explicit close() is needed.
with open("penn_topics.json", 'r', encoding='utf-8') as f:
    topics_json = json.load(f)

In [None]:
def page_comments(url):
    """
    Scrape every comment shown on one page of a thread.

    Downloads ``url``, parses it, and gathers (each in document order)
    the comment timestamps, the comment bodies and the author details
    (name, id, thread count, comment count). The i-th entry of each
    collection is paired with the i-th comment body.

    Returns a list with one dict per comment, keyed by 'date', 'comment',
    'user_id', 'user_name', 'user_thread_count' and 'user_comment_count'.
    'date' is itself a dict with keys 'month', 'day', 'year' and 'time'.
    """
    html = requests.get(url, timeout=100).text
    soup = BeautifulSoup(html, 'html5')

    # comment dates: the <time> element's title is split on whitespace
    dates = []
    for meta_div in soup.find_all('div', class_="Meta Discussion DiscussionInfo"):
        for created in meta_div.find_all('span', class_="MItem DateCreated"):
            for stamp in created.find_all('time'):
                parts = stamp.get('title').split()
                dates.append({
                    'month': parts[0],
                    'day': parts[1],
                    'year': parts[2],
                    'time': parts[3]
                })

    # comment text
    comments = [body.text for body in soup.find_all('div', class_="Message userContent")]

    # author details: one pass over the author blocks fills all four lists
    user_names = []
    user_id = []
    user_thread_count = []
    user_comment_count = []
    for author_div in soup.find_all('div', class_="AuthorWrap"):
        for name_span in author_div.find_all('span', class_="Author"):
            user_names.append(name_span.text.replace('\n', ''))
        for link in author_div.find_all('a'):
            user_id.append(link.get("data-userid"))
        for threads_span in author_div.find_all('span', class_="MItem CountDiscussions"):
            # leading token of the span text is the count
            user_thread_count.append(threads_span.text.split()[0])
        for posts_span in author_div.find_all('span', class_="MItem CountComments"):
            user_comment_count.append(posts_span.text.split()[0])

    # make json: positional lookup, so a shorter list raises IndexError
    return [
        {
            'date': dates[idx],
            'comment': body,
            'user_id': user_id[idx],
            'user_name': user_names[idx],
            'user_thread_count': user_thread_count[idx],
            'user_comment_count': user_comment_count[idx],
        }
        for idx, body in enumerate(comments)
    ]

In [None]:
# returns a flat list with one entry per comment in the thread
def topic_comments(url):
    """
    Collect the comments from every page of one thread.

    Thread pages are addressed by inserting "-p<N>" before the ".html"
    suffix of ``url``. Pages are visited from N = 1 upward until the site
    answers 404 for a page.

    Each page is probed exactly once, and a page is handed to page_comments
    only after its probe came back with something other than 404, so the
    404 page that ends the thread is never scraped.

    NOTE(review): only a 404 ends the loop; any other status keeps paging.

    Returns a flat list of the comment dicts produced by page_comments,
    in page order.
    """
    all_comments = []
    base_url = url.split(".html")[0]

    page_number = 1
    while True:
        page_url = base_url + "-p{}".format(page_number) + ".html"
        response = requests.get(page_url, timeout=100)
        if response.status_code == 404:
            break
        # page_comments performs its own download of the page
        all_comments.extend(page_comments(page_url))
        page_number += 1

    return all_comments
    

In [None]:
def get_forum_comms(topics_json, lower, upper):
    """
    Scrape the comments of a batch of threads and save them as one JSON file.

    topics_json: list of dicts with keys 'topic' and 'url' (one per thread).
    lower:       number reported for the first thread in progress messages;
                 also used in the output file name.
    upper:       only used in the output file name.

    A thread whose scrape raises an ordinary exception is reported and
    skipped; the remaining threads are still processed. KeyboardInterrupt
    is not caught, so a long run can be stopped from the notebook.

    The batch is written to penn_data/<lower>_to_<upper>.json under the
    current working directory; the directory is created if it is missing.
    Returns None.
    """
    forum = []

    for i, topic in enumerate(topics_json, start=lower):
        start_time = datetime.now()
        thread = topic['topic']
        url = topic['url']

        try:
            forum.append({
                        'topic': thread,
                        'url': url,
                        'comments' : topic_comments(url)
                         })
        except Exception as exc:
            # report the cause rather than swallowing it silently
            print("Thread {} failed: {!r}".format(i, exc))

        time_elapsed = datetime.now() - start_time
        print("{}: Thread {}:".format(datetime.now(), i), 'Time elapsed {}(hh:mm:ss.ms)'.format(time_elapsed))

    # os.path.join yields the same path as before on Windows and a valid one elsewhere
    out_dir = os.path.join(os.getcwd(), "penn_data")
    # create the folder up front so a finished batch is not lost at save time
    os.makedirs(out_dir, exist_ok=True)
    filename = os.path.join(out_dir, "{}_to_{}.json".format(lower, upper))

    with open(filename, 'w', encoding = 'utf-8') as outfile:
        json.dump(forum, outfile)

    print("saved {}".format(filename))

In [None]:
# Batch boundaries for scraping 100 threads at a time:
# lowers holds every multiple of 100 below the number of topics (starting at 0),
# uppers holds every multiple of 100 below the number of topics (starting at 100).
total_topics = len(topics_json)

lowers = list(range(0, total_topics, 100))
uppers = list(range(100, total_topics, 100))

In [None]:
# Close the last batch at the end of the topic list. The guard keeps this cell
# idempotent: re-running it does not append a duplicate boundary.
if not uppers or uppers[-1] != len(topics_json):
    uppers.append(len(topics_json))

In [None]:
# Pair each batch's start index with its end index as (lower, upper) tuples
lims = list(zip(lowers, uppers))

In [None]:
# Print one ready-to-paste get_forum_comms call per batch; the slice uses the
# exclusive end index while the last argument shows end - 1.
call_template = "get_forum_comms(topics_json[{}:{}], {}, {})"
for start, stop in lims:
    print(call_template.format(start, stop, start, stop - 1))

In [None]:
#get_forum_comms(topics_json[0:99], 0, 99)

In [None]:
#get_forum_comms(topics_json[100:199], 100, 199)

In [None]:
#get_forum_comms(topics_json[200:299], 200, 299)

In [None]:
#get_forum_comms(topics_json[300:399], 300, 399)

In [None]:
len(topics_json[0:100])

In [None]:
#get_forum_comms(topics_json[400:500], 400, 500)

In [None]:
#get_forum_comms(topics_json[500:600], 500, 600)

In [None]:
#get_forum_comms(topics_json[600:700], 600, 700)

In [None]:
#get_forum_comms(topics_json[700:800], 700, 800)

In [None]:
#get_forum_comms(topics_json[800:900], 800, 900)

In [None]:
#get_forum_comms(topics_json[900:1000], 900, 1000)

In [None]:
#get_forum_comms(topics_json[1000:1100], 1000, 1100)

In [None]:
#get_forum_comms(topics_json[1100:1200], 1100, 1200)

In [None]:
#get_forum_comms(topics_json[1200:1300], 1200, 1300)

In [None]:
#get_forum_comms(topics_json[1300:1400], 1300, 1400)

In [None]:
#get_forum_comms(topics_json[1400:1500], 1400, 1500)

In [None]:
#get_forum_comms(topics_json[1500:1600], 1500, 1600)

In [None]:
#get_forum_comms(topics_json[1600:1700], 1600, 1700)

In [None]:
#get_forum_comms(topics_json[1700:1800], 1700, 1800)

In [None]:
#get_forum_comms(topics_json[1800:1900], 1800, 1900)

In [None]:
#get_forum_comms(topics_json[1900:2000], 1900, 2000)

In [None]:
#get_forum_comms(topics_json[2000:2100], 2000, 2100)

In [None]:
#get_forum_comms(topics_json[2100:2200], 2100, 2200)

In [None]:
#get_forum_comms(topics_json[2200:2300], 2200, 2300)

In [None]:
#get_forum_comms(topics_json[2300:2400], 2300, 2400)

In [None]:
#get_forum_comms(topics_json[2400:2500], 2400, 2500)

In [None]:
#get_forum_comms(topics_json[2500:2600], 2500, 2600)

In [None]:
#get_forum_comms(topics_json[2600:2700], 2600, 2700)

In [None]:
#get_forum_comms(topics_json[2700:2800], 2700, 2800)

In [None]:
#get_forum_comms(topics_json[2800:2900], 2800, 2900)

In [None]:
#get_forum_comms(topics_json[2900:3000], 2900, 3000)

In [None]:
#get_forum_comms(topics_json[3000:3100], 3000, 3100)

In [None]:
#get_forum_comms(topics_json[3100:3200], 3100, 3200)

In [None]:
#get_forum_comms(topics_json[3200:3300], 3200, 3300)

In [None]:
#get_forum_comms(topics_json[3300:3400], 3300, 3400)

In [None]:
#get_forum_comms(topics_json[3400:3500], 3400, 3500)

In [None]:
#get_forum_comms(topics_json[3500:3600], 3500, 3600)

In [None]:
#get_forum_comms(topics_json[3600:3700], 3600, 3700)

In [None]:
#get_forum_comms(topics_json[3700:3800], 3700, 3800)

In [None]:
#get_forum_comms(topics_json[3800:3900], 3800, 3900)

In [None]:
#get_forum_comms(topics_json[3900:4000], 3900, 4000)

In [None]:
#get_forum_comms(topics_json[4000:4100], 4000, 4100)

In [None]:
#get_forum_comms(topics_json[4100:4200], 4100, 4200)

In [None]:
#get_forum_comms(topics_json[4200:4300], 4200, 4300)

In [None]:
#get_forum_comms(topics_json[4300:4400], 4300, 4400)

In [None]:
#get_forum_comms(topics_json[4400:4500], 4400, 4500)

In [None]:
#get_forum_comms(topics_json[4500:4600], 4500, 4600)

In [None]:
#get_forum_comms(topics_json[4600:4700], 4600, 4700)

In [None]:
#get_forum_comms(topics_json[4700:4800], 4700, 4800)

In [None]:
#get_forum_comms(topics_json[4800:4900], 4800, 4900)

In [None]:
#get_forum_comms(topics_json[4900:4993], 4900, 4993)