# Webscraping the Michigan Corpus of Upper-Level Student Papers (MICUSP)

The Michigan Corpus of Upper-Level Student Papers is a public repository of advanced, high-scoring student work in 16 disciplines. This code allows researchers to scrape all 829 papers from MICUSP using BeautifulSoup and download them as text files labeled by Paper ID. 

Browse MICUSP here: https://elicorpora.info/main

In [None]:
#Import packages
import urllib
from bs4 import BeautifulSoup as bsoup
import bleach
import pandas as pd

#Install certificate (if needed)
#NOTE: this swaps the default HTTPS context factory for the unverified one,
#so HTTPS requests that use the default context no longer verify certificates.
#Only run this cell if requests fail with a certificate error.
import ssl 
ssl._create_default_https_context = ssl._create_unverified_context

In [None]:
#Set working directory
import os

#Move into the folder where the scraped text files should be saved
#(edit this path to match your own machine)
os.chdir('/Users/megankane/Documents/Corpora/MICUSP/MICUSP_NEW')

#Read the directory back only after changing it, so the printed path
#confirms where the files will actually be written
path = os.getcwd()
print(path)

In [None]:
#Use BeautifulSoup to scrape the text of a single paper in MICUSP

#Import the request submodule explicitly: a bare "import urllib" does not
#guarantee that urllib.request has been loaded
import urllib.request

#Assign text url to variable
u = 'https://elicorpora.info/view?pid=BIO.G0.15.1'

#Open the url and read the raw HTML; the with-block closes the connection afterwards
with urllib.request.urlopen(u) as response:
    uf = response.read()

#Turn the HTML into a soup object
#Naming the parser keeps the result the same on every machine and avoids
#BeautifulSoup's "no parser was explicitly specified" warning
su = bsoup(uf, 'html.parser')

#Inspect soup object
su

In [None]:
#Load the paper metadata from the MICUSP download link into a DataFrame
metadata_url = 'https://elicorpora.info/browse?mode=download&start=1&sort=dept&direction=asc'
papers = pd.read_csv(metadata_url)

#Preview the first rows
papers.head()

In [None]:
#Pull the metadata we need out of the table as plain Python lists
#Files are labeled by paper ID; the other columns below can be retrieved the same way
id_column = papers['PAPER ID']
pids = id_column.tolist()

#papertype = papers['PAPER TYPE'].tolist()
#discipline = papers['DISCIPLINE'].tolist()
#titles = papers['TITLE'].tolist()

In [None]:
#Every paper page shares this url prefix; the paper ID is appended to it
url_base =  'https://elicorpora.info/view?pid='

#Build the list of page urls, one per paper ID, in the same order as pids
urls = [url_base + pid for pid in pids]
urls

In [None]:
#Iterate through each text page and scrape with beautifulsoup

#Import the request submodule explicitly: a bare "import urllib" does not
#guarantee that urllib.request has been loaded
import urllib.request

#Download the raw HTML of every page, closing each connection once it has been read
files = []
for item in urls:
    with urllib.request.urlopen(item) as response:
        files.append(response.read())

#Parse every page into a soup object
#Naming the parser keeps the result the same on every machine and avoids
#BeautifulSoup's "no parser was explicitly specified" warning
soups = [bsoup(f, 'html.parser') for f in files]

In [None]:
#Check length of soups: there is one soup per downloaded page
#(should be 829 if every paper in the corpus was scraped)
len(soups)

In [None]:
#Isolate paragraphs of each essay (do not want headers, bibliographies, other metadata)
#Each entry holds all <p> tags found in one page, in the same order as soups
#find_all is the current method name; findAll is the legacy Beautiful Soup 3 spelling
paragraphs = [soup.find_all('p') for soup in soups]

In [None]:
#Define a function to get the text of each paragraph and join together
def get_paragraph_text(text):
    """Return the text of every paragraph in *text* as one string.

    text: an iterable of objects that each provide a getText() method,
    such as the <p> tags collected from one page.

    The paragraph texts are joined with a single space, in their original
    order. An empty iterable gives an empty string.
    """
    return ' '.join(paragraph.getText() for paragraph in text)

In [None]:
#Collapse each paper's paragraphs into one string and collect the results in essays
essays = [get_paragraph_text(paper_paragraphs) for paper_paragraphs in paragraphs]

#One essay per paper is expected
len(essays)

In [None]:
#Write each essay to its own file named by paper ID
#essays was built from pids in order, so the two lists line up position by position;
#stop early if they do not, rather than saving essays under the wrong ID
if len(pids) != len(essays):
    raise ValueError("Number of paper IDs and essays do not match")

#Pair every essay with its own paper ID so each one gets a separate file
for pid, essay in zip(pids, essays):
    #utf-8 so non-ASCII characters are saved the same way on every platform;
    #the with-block closes the file even if the write fails
    with open(pid + ".txt", 'w', encoding='utf-8') as f:
        f.write(essay)