This document discusses the preprocessing steps I took to create the CSV files.

In [3]:
from bs4 import BeautifulSoup
from requests import get
import io
import math
import time
import dateparser


# Endpoint of the survey search page and the number of results requested per page.
base_url = "https://thegradcafe.com/survey/index.php"
per_page = 250


def generate_url(search_string, page_num=1):
    """Build the results-page URL for a survey search.

    Args:
        search_string: Free-text query. Runs of whitespace are collapsed and
            each gap becomes a single ``+`` in the query string.
        page_num: Page number to request (defaults to 1).

    Returns:
        The full URL as a string, using the module-level ``base_url`` and
        ``per_page`` values.
    """
    # Local import so this cell does not depend on the notebook's import cell.
    from urllib.parse import quote_plus

    # Percent-encode the query so characters such as '&', '+' or '#' cannot
    # break out of the "q" parameter; plain words are emitted unchanged.
    query = quote_plus(" ".join(search_string.split()))
    return "{}?q={}&t=a&o=&pp={}&p={}".format(base_url, query, per_page, page_num)

# Send a browser-like User-Agent with the request
# (presumably the site rejects the default client string -- unconfirmed).
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/5'}
# requests waits indefinitely unless a timeout is given; bound the wait so a
# stalled connection cannot hang the notebook.
response = get(generate_url("njit"), headers=headers, timeout=30)
# Stop here with a clear HTTP error rather than trying to parse an error page.
response.raise_for_status()

html_soup = BeautifulSoup(response.text, 'html.parser')
table = html_soup.find_all('table', class_='results narrow-table')
# Drop the first <tr> of the results table (presumably the header row).
table_rows = table[0].find_all('tr')[1:]

After generating the appropriate links and obtaining an HTML table from the website, the table needs to be parsed and cleaned
to obtain the necessary data. Displayed below is one row of the table obtained from thegradcafe.com.

In [7]:
# Take the first data row and keep a handle on its third <td> cell.
dat = table_rows[0]
stats = dat.find_all('td')[2]
# Bare expression: the notebook displays the row's HTML as the cell output.
dat

<tr class="row0" onmouseout="hideControlsBox(this);" onmouseover="showControlsBox(this,584557);"><td class="instcol">New Jersey Institute Of Technology ( NJIT )</td><td>Business Data Science, PhD (F18)</td><td><span class="dRejected">Rejected</span> via E-mail on 13 Mar 2018 <a class="extinfo" href="#"><span><strong>Undergrad GPA</strong>: n/a<br/><strong>GRE General (V/Q/W)</strong>: 160/151/4.00<br/><strong>GRE Subject</strong>: n/a<br/></span>♦</a></td><td>I</td><td class="datecol">15 Mar 2018</td><td><ul class="control"><li class="controlspam"></li><li>cu*ulative GPA: 2.8
ME GPA: 3.91

</li></ul></td></tr>

First, I deal with the most important and most complicated part of the table (the third column). This column includes important data
such as the poster's GPA, GRE scores, acceptance status, etc.
This is tricky since the information in this column is not consistent. The poster may not include his or her statistics. They
may not even post an acceptance status (they might just be informing others about interviews, or asking questions).

In [61]:
# Re-select the first row and its third <td> (the decision/statistics cell).
dat = table_rows[0]
stats = dat.find_all('td')[2]

In [62]:
# Bare expression: display the raw HTML of the third cell.
stats

<td><span class="dRejected">Rejected</span> via E-mail on 13 Mar 2018 <a class="extinfo" href="#"><span><strong>Undergrad GPA</strong>: n/a<br/><strong>GRE General (V/Q/W)</strong>: 160/151/4.00<br/><strong>GRE Subject</strong>: n/a<br/></span>♦</a></td>

In [63]:
# Whitespace-split text of the cell: <status> via <method> on <day> <month> <year> ...
tokens = stats.text.split()
status, notified = tokens[0], tokens[2]

# A status starting with 'Wait' occupies two tokens, so glue them together
# and read the notification method one position further along.
if status == 'Wait':
    status, notified = tokens[0] + tokens[1], tokens[3]

# Locate the date by its ' on ' marker instead of a fixed token position.
index = stats.text.index(' on ')
tokens = stats.text[index:].split()

# tokens[0] is 'on'; the next three tokens are day, month and year.
date = "-".join(tokens[1:4])

In [64]:
# Display the parsed decision fields; the trailing note lists the
# dateparser calls that would split the date into year, month and day.
status, notified, date  #dateparser.parse(date).year, dateparser.parse(date).month, dateparser.parse(date).day

('Rejected', 'E-mail', '13-Mar-2018')

This part extracts GPA and GRE scores if they exist. The values may still be n/a.

In [8]:
# Start from 'n/a' so all four names exist even when the post has no
# statistics link or the GRE value is not a V/Q/W triple.
gpa, greV, greQ, greW = 'n/a', 'n/a', 'n/a', 'n/a'

ext_info = stats.find('a')
if ext_info is not None:
    # Children of the inner <span> repeat as: <strong>label</strong>,
    # ': value', <br/>. Index 1 is the GPA text, index 4 the GRE General text.
    fields = ext_info.find('span').contents

    raw_gpa = fields[1]
    for c in ": ":
        raw_gpa = raw_gpa.replace(c, "")
    gpa = raw_gpa

    raw_gre = fields[4]
    for c in ": ":
        raw_gre = raw_gre.replace(c, "")
    # A missing GRE is 'n/a', which would split into two pieces and break a
    # three-way unpack; only accept a real V/Q/W triple.
    scores = raw_gre.split('/')
    if len(scores) == 3:
        greV, greQ, greW = scores

These values are only relevant for posts that contain them.

In [9]:
# Display the extracted GPA and the three GRE General (V/Q/W) parts.
gpa,greV,greQ,greW

('n/a', '160', '151', '4.00')

Now we process the other columns of the table one by one
<br>
Column 1:

In [67]:
# First cell holds the institution; swap commas for spaces in its text.
name = dat.find_all('td')[0].text.replace(',', ' ')
print(name)
# Keep only what precedes the first '(' (drops the parenthesised abbreviation).
name = name.partition('(')[0]
name

New Jersey Institute Of Technology ( NJIT )


'New Jersey Institute Of Technology '

Column 2:

In [160]:
# Split on commas but rely only on the first and last pieces, so a program
# name that itself contains a comma does not break the unpacking.
vals = dat.find_all('td')[1].text.split(',')
major = vals[0]
# The last piece looks like ' PhD (F18)': degree text, then '(' and the term.
degree, semester = vals[-1].split('(')
# Keep only the leading term letter, e.g. 'F' from 'F18)'.
semester = semester[0]
major, degree,semester

('Mechanical Engineering', ' Masters ', 'F')

Column 4:
<br>
This column indicates if the poster is a graduate from an American University or not.
<br><br>
codes: A: American; U: International, with US degree; I: International, without US degree; O: Other; ?: Unknown

In [168]:
# Fourth cell: the one-character code from the list above.
nationality = dat.find_all('td')[3].text
nationality

'A'

Column 5:
This column indicates the date on which the entry was posted.

In [173]:
# Fifth cell: join the whitespace-separated parts with '-',
# e.g. '15 Mar 2018' -> '15-Mar-2018'.
post_date = '-'.join(dat.find_all('td')[4].text.split())
post_date

'11-Feb-2018'

Column 6:
<br>
This column is simply the poster's comment. It may contain anything that the poster wanted to say, and it is common
for people to post more details about their application, such as the number of publications, letters of recommendation, etc.
<br>
It may be valuable to extract some information from this raw text later on.

In [200]:
# Sixth cell: the poster's free-text comment.
comment = dat.find_all('td')[5].text
bad_chars = '\n,'
# A single translate pass maps every unwanted character to a space.
comment = comment.translate(str.maketrans(bad_chars, ' ' * len(bad_chars)))
comment

'Did anyone else check the website and it says "admitted" with no other explanation?'

In [184]:
# Display every field extracted so far, in the order process_row returns them.
name, major, degree,semester, status, notified, date, gpa,greV,greQ,greW, nationality, post_date, comment

('New Jersey Institute Of Technology ',
 'Mechanical Engineering',
 ' Masters ',
 'F',
 'Other',
 'Website',
 '1-Feb-2018',
 '3.50',
 '150',
 '168',
 '3.00',
 'A',
 '11-Feb-2018',
 'Did anyone else check the website and it says "admitted" with no other explanation?')

This function will be used to process each row I obtain from the website.

In [201]:
# Placeholder used for every field a row does not provide.
_MISSING = 'n/a'


def _parse_program(text):
    """Split '<major>, <degree> (<term>)' into (major, degree, term letter)."""
    pieces = text.split(',')
    major = pieces[0]
    degree = semester = _MISSING
    if len(pieces) > 1:
        # Only the last comma-separated piece carries the degree and term,
        # so extra commas inside the program name are harmless.
        tail = pieces[-1]
        if '(' in tail:
            degree, _, term = tail.rpartition('(')
            semester = term[:1] or _MISSING
        else:
            degree = tail
    return major, degree, semester


def _parse_decision(text):
    """Split '<status> via <method> on <d> <Mon> <yyyy> ...' into its parts."""
    flat = ' '.join(text.split())
    tokens = flat.split()

    status = tokens[0] if tokens else _MISSING
    if status == 'Wait' and len(tokens) > 1:
        # The two-word status is stored glued together.
        status += tokens[1]

    # Anchor on the ' via ' / ' on ' markers instead of fixed token positions,
    # so a two-word status or notification method cannot shift later fields.
    before_on, on_sep, after_on = flat.partition(' on ')
    _, via_sep, method = before_on.partition(' via ')
    notified = method if via_sep and method else _MISSING
    date = '-'.join(after_on.split()[:3]) if on_sep else _MISSING
    return status, notified, date or _MISSING


def _parse_scores(ext_info):
    """Read (gpa, greV, greQ, greW) from the statistics link, if present."""
    gpa = greV = greQ = greW = _MISSING
    if ext_info is None:
        return gpa, greV, greQ, greW

    span = ext_info.find('span')
    # Children of the <span> repeat as: <strong>label</strong>, ': value',
    # <br/>. Index 1 is the GPA text, index 4 the GRE General text.
    contents = span.contents if span is not None else []
    strip_separators = str.maketrans('', '', ': ')

    if len(contents) > 1:
        gpa = str(contents[1]).translate(strip_separators)
    if len(contents) > 4:
        parts = str(contents[4]).translate(strip_separators).split('/')
        # A missing GRE reads 'n/a', which splits into two pieces; only a
        # real V/Q/W triple is unpacked.
        if len(parts) == 3:
            greV, greQ, greW = parts
    return gpa, greV, greQ, greW


def process_row(dat):
    """Flatten one results-table row into a list of string fields.

    Args:
        dat: A parsed ``<tr>`` element (BeautifulSoup tag) whose first six
            ``<td>`` cells are, in order: institution, program, decision and
            statistics, nationality code, post date, comment.

    Returns:
        A list of 14 strings: ``[name, major, degree, semester, status,
        notified, date, gpa, greV, greQ, greW, nationality, post_date,
        comment]``. Program, decision and score fields the row does not
        provide are ``'n/a'``. Commas are replaced by spaces in ``name`` and
        removed, together with newlines, from ``comment``.

    Raises:
        IndexError: If the row has fewer than six ``<td>`` cells.
    """
    cells = dat.find_all('td')

    # Column 1: institution, without commas and without the parenthesised part.
    name = cells[0].text.replace(',', ' ').split('(')[0]

    # Column 2: program, degree and term.
    major, degree, semester = _parse_program(cells[1].text)

    # Column 3: decision, notification method, decision date and scores.
    stats = cells[2]
    status, notified, date = _parse_decision(stats.text)
    gpa, greV, greQ, greW = _parse_scores(stats.find('a'))

    # Column 4:
    nationality = cells[3].text

    # Column 5:
    post_date = '-'.join(cells[4].text.split())

    # Column 6: drop newlines and commas from the free-text comment.
    comment = cells[5].text.translate(str.maketrans('', '', '\n,'))

    return [name, major, degree, semester, status, notified, date, gpa, greV, greQ, greW, nationality, post_date,
            comment]


In [203]:
# Run the function on the first two scraped rows and print the resulting lists.
print(process_row(table_rows[0]))
print(process_row(table_rows[1]))

['New Jersey Institute Of Technology ', 'Business Data Science', ' PhD ', 'F', 'Rejected', 'E-mail', '13-Mar-2018', 'n/a', '160', '151', '4.00', 'I', '15-Mar-2018', 'cu*ulative GPA: 2.8\rME GPA: 3.91\r\r']
['New Jersey Institute Of Technology ', 'Mathematical Sciences', ' PhD ', 'F', 'Accepted', 'E-mail', '21-Feb-2018', 'n/a', 'n/a', 'n/a', 'n/a', 'A', '28-Feb-2018', '']
