In [0]:
import pandas as pd
from bs4 import BeautifulSoup
import os
import numpy as np


# **College decisions for fall 2020**
Src: https://www.chronicle.com/article/Here-s-a-List-of-Colleges-/248626

In [0]:
# Load the list of colleges' fall-2020 plans from the local CSV export.
data = pd.read_csv(filepath_or_buffer="collegeFall2020.csv")

In [0]:
# Cleaning html tags
def getText(x):
  """Return the visible text of an HTML fragment.

  Parameters
  ----------
  x : str, bytes or any
      Markup to strip. Values that are not text (for example the NaN/None
      that pandas uses for missing cells) are returned unchanged rather
      than being handed to the parser, which expects markup.

  Returns
  -------
  The text content of ``x`` with the tags removed, or ``x`` itself when it
  is not a string/bytes value.
  """
  # Guard first so missing cells survive a Series.apply() untouched.
  if not isinstance(x, (str, bytes)):
    return x
  return BeautifulSoup(x, 'html5lib').text

In [0]:
# Strip the HTML markup from every value of the "Category" column.
data["Category"] = data["Category"].map(getText)

In [0]:
# Save the frame with tag-free categories (the row index is written too, pandas' default).
data.to_csv(path_or_buf="college_fall2020.csv")

In [30]:
# Preview the first five rows (head() defaults to n=5).
data.head()

Unnamed: 0,Institution,Control,State,Category
0,Abilene Christian University,Private,TX,In-person
1,Academy of Art University,Private,CA,In-person
2,Adelphi University,Private,NY,Hybrid model
3,Adrian College,Private,MI,In-person
4,Agnes Scott College,Private,GA,In-person


In [29]:
# Shortening the names of the categories
# All labels are rewritten in a single replace() pass; values that are not keys of the
# mapping are left as they are.
# To inspect the labels that are present, run in its own cell:
#   data.groupby('Category')['Institution'].nunique()
CATEGORY_SHORT_NAMES = {
    'Planning for in-person': 'In-person',
    'Proposing a hybrid model': 'Hybrid model',
    'Planning for online': 'Online',
    'Waiting to decide': 'Undecided',
    'Considering a range of scenarios': 'Considering options',
}
data['Category'] = data['Category'].replace(CATEGORY_SHORT_NAMES)
data.head(5)

Unnamed: 0,Institution,Control,State,Category
0,Abilene Christian University,Private,TX,In-person
1,Academy of Art University,Private,CA,In-person
2,Adelphi University,Private,NY,Hybrid model
3,Adrian College,Private,MI,In-person
4,Agnes Scott College,Private,GA,In-person


In [0]:
# Write the frame with the shortened category labels as a comma-separated file.
data.to_csv(path_or_buf='Colleges_fall20.csv', sep=',')

# **NYT COVID-19**
Src: https://github.com/nytimes/covid-19-data

In [0]:
# Download the New York Times county-level case/death series directly from GitHub.
# NOTE(review): this URL points at the moving `master` branch, so the content can
# differ from one run to the next.
covid_data = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv"
)

In [114]:
# Preview the first five rows (head() defaults to n=5).
covid_data.head()

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061.0,1,0
1,2020-01-22,Snohomish,Washington,53061.0,1,0
2,2020-01-23,Snohomish,Washington,53061.0,1,0
3,2020-01-24,Cook,Illinois,17031.0,1,0
4,2020-01-24,Snohomish,Washington,53061.0,1,0


In [0]:
# Keep only the rows dated 2020-06-12. reset_index() without drop=True renumbers the
# rows and keeps the previous labels in a new "index" column.
is_snapshot_day = covid_data["date"] == "2020-06-12"
latest_covid = covid_data.loc[is_snapshot_day].reset_index()

In [0]:
# Save the single-day snapshot next to the notebook.
latest_covid.to_csv(path_or_buf="ny_covid_Jun12.csv")

# **School closure in spring 2020**
Src: https://github.com/jessejanderson/covid19schools 

In [0]:
# Download the spring-2020 school closure table directly from GitHub.
# NOTE(review): this URL points at the moving `master` branch, so the content can
# differ from one run to the next.
closure = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/jessejanderson/covid19schools/master/covid19schools.csv"
)

In [0]:
# Split "start → end" into two text columns.
# Unpacking the `.str` accessor relies on iterating over it, which pandas 2.0 removed,
# so split() is asked for a two-column frame instead. The pattern also consumes the
# blanks around the arrow, and n=1 caps the result at two pieces.
date_parts = (
    closure["Overall affected dates"]
    .str.split(r"\s*→\s*", n=1, expand=True)
    .reindex(columns=[0, 1])  # both columns exist even when no row contains an arrow
)
closure["affected_starting"] = date_parts[0]
closure["affected_ending"] = date_parts[1]

In [27]:
# Convert both boundary columns to datetime values, then preview the result.
for date_col in ("affected_starting", "affected_ending"):
    closure[date_col] = pd.to_datetime(closure[date_col])
closure.head()

Unnamed: 0,Name,State,Overall affected dates,No class (include SB),Online-only,No campus housing,Source,IPEDS - ID,Latitude,Longitude,Notes,IPEDS,IPEDS - State,URL,affected_starting,affected_ending
0,Alabama A&M University,AL,"Mar 16, 2020 → Apr 30, 2020","Mar 22, 2020 → Mar 29, 2020","Mar 16, 2020 → Apr 30, 2020",,https://www.aamu.edu/about/inside-aamu/news/aa...,100654.0,34.783368,-86.568502,,https://www.notion.so/Alabama-A-M-University-e...,AL,www.aamu.edu/,2020-03-16,2020-04-30
1,Alabama State University,AL,"Mar 12, 2020 → Jun 30, 2020",,,,https://www.alabamanews.net/2020/03/12/asu-clo...,100724.0,32.364317,-86.295677,,https://www.notion.so/Alabama-State-University...,AL,www.alasu.edu,2020-03-12,2020-06-30
2,Auburn University,AL,"Mar 16, 2020 → Apr 10, 2020",,,,https://ocm.auburn.edu/newsroom/news_articles/...,100858.0,32.599378,-85.488258,,https://www.notion.so/Auburn-University-acb48d...,AL,www.auburn.edu,2020-03-16,2020-04-10
3,Bevill State Community College,AL,"Mar 17, 2020 → May 05, 2020","Mar 21, 2020 → Mar 29, 2020","Mar 30, 2020 → May 05, 2020",,https://www.bscc.edu/coronavirus https://...,102429.0,33.836929,-87.266406,graduation ceremony postponed - TBD,https://www.notion.so/Bevill-State-Community-C...,AL,www.bscc.edu,2020-03-17,2020-05-05
4,Bishop State Community College,AL,"Mar 17, 2020 → May 12, 2020","Mar 30, 2020 → Apr 05, 2020","Mar 17, 2020 → May 12, 2020",,https://www.bishop.edu/news/coronavirus h...,102030.0,30.693972,-88.056982,graduation ceremony postponed - TBD,https://www.notion.so/Bishop-State-Community-C...,AL,www.bishop.edu,2020-03-17,2020-05-12


In [0]:
# Save the closure table, including the two parsed date columns.
closure.to_csv(path_or_buf="closure_spring20.csv")

# **Population estimates**
Src: https://www.ers.usda.gov/data-products/county-level-data-sets/download-data.aspx

In [0]:
# Read the population estimates, skipping the first two lines of the file, and keep
# only the FIPS code, the 2019 estimate and the area name.
population_columns = ["FIPStxt", "POP_ESTIMATE_2019", "Area_Name"]
population_counties = pd.read_csv("PopulationEstimates.csv", skiprows=2)[population_columns]

In [0]:
# Remove the first row of the table.
# NOTE(review): this rebinds the same name, so running the cell again drops one more row.
first_row_label = population_counties.index[0]
population_counties = population_counties.drop(index=first_row_label)

In [0]:
# Save the trimmed population table.
population_counties.to_csv(path_or_buf="population_counties2019.csv")

In [126]:
# Show the trimmed population table (pandas abbreviates long frames in the output).
population_counties

Unnamed: 0,FIPStxt,POP_ESTIMATE_2019,Area_Name
1,1000,4903185,Alabama
2,1001,55869,Autauga County
3,1003,223234,Baldwin County
4,1005,24686,Barbour County
5,1007,22394,Bibb County
...,...,...,...
3268,72145,50023,"Vega Baja Municipio, Puerto Rico"
3269,72147,8386,"Vieques Municipio, Puerto Rico"
3270,72149,21372,"Villalba Municipio, Puerto Rico"
3271,72151,32282,"Yabucoa Municipio, Puerto Rico"


# **American university data**
Src: https://www.kaggle.com/sumithbhongale/american-university-data-ipeds-dataset

In [0]:
# Load the IPEDS university dataset from the local CSV and preview five rows.
df = pd.read_csv(filepath_or_buffer="UniversityData.csv")
df.head()

In [37]:
# Select interesting columns
columns = ['ID number', 'Name', 'County name', 'Longitude location of institution', 'Latitude location of institution', 
           'Total  enrollment', 'Control of institution', 'Degree of urbanization (Urban-centric locale)', 'Carnegie Classification 2010: Basic', 
           'Percent of first-time undergraduates - out-of-state', 'Percent of first-time undergraduates - foreign countries',
           'Endowment assets (year end) per FTE enrollment (FASB)']
# .copy() makes `aud` an independent frame; without it, later column assignments on
# `aud` write into a slice of `df` and pandas raises SettingWithCopyWarning.
aud = df[columns].copy()
aud.head(5)

Unnamed: 0,ID number,Name,County name,Longitude location of institution,Latitude location of institution,Total enrollment,Control of institution,Degree of urbanization (Urban-centric locale),Carnegie Classification 2010: Basic,Percent of first-time undergraduates - out-of-state,Percent of first-time undergraduates - foreign countries,Endowment assets (year end) per FTE enrollment (FASB)
0,100654,Alabama A & M University,Madison County,-86.568502,34.783368,5020.0,Public,City: Midsize,Master's Colleges and Universities (larger pro...,,,
1,100663,University of Alabama at Birmingham,Jefferson County,-86.80917,33.50223,18568.0,Public,City: Midsize,Research Universities (very high research acti...,13.0,1.0,
2,100690,Amridge University,Montgomery County,-86.17401,32.362609,631.0,Private not-for-profit,City: Midsize,Baccalaureate Colleges--Arts & Sciences,,,302.0
3,100706,University of Alabama in Huntsville,Madison County,-86.63842,34.722818,7376.0,Public,City: Midsize,Research Universities (very high research acti...,14.0,4.0,
4,100724,Alabama State University,Montgomery County,-86.295677,32.364317,6075.0,Public,City: Midsize,Master's Colleges and Universities (larger pro...,37.0,4.0,


In [39]:
# rename categories
# Collapse "Private not-for-profit" into "Private". assign() returns a new frame, so the
# write never lands on a slice of `df` and no SettingWithCopyWarning is raised.
# To list the labels beforehand, run in its own cell:
#   aud.groupby('Control of institution')['ID number'].nunique()
aud = aud.assign(**{
    'Control of institution': aud['Control of institution'].replace('Private not-for-profit', 'Private')
})
aud

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,ID number,Name,County name,Longitude location of institution,Latitude location of institution,Total enrollment,Control of institution,Degree of urbanization (Urban-centric locale),Carnegie Classification 2010: Basic,Percent of first-time undergraduates - out-of-state,Percent of first-time undergraduates - foreign countries,Endowment assets (year end) per FTE enrollment (FASB)
0,100654,Alabama A & M University,Madison County,-86.568502,34.783368,5020.0,Public,City: Midsize,Master's Colleges and Universities (larger pro...,,,
1,100663,University of Alabama at Birmingham,Jefferson County,-86.809170,33.502230,18568.0,Public,City: Midsize,Research Universities (very high research acti...,13.0,1.0,
2,100690,Amridge University,Montgomery County,-86.174010,32.362609,631.0,Private,City: Midsize,Baccalaureate Colleges--Arts & Sciences,,,302.0
3,100706,University of Alabama in Huntsville,Madison County,-86.638420,34.722818,7376.0,Public,City: Midsize,Research Universities (very high research acti...,14.0,4.0,
4,100724,Alabama State University,Montgomery County,-86.295677,32.364317,6075.0,Public,City: Midsize,Master's Colleges and Universities (larger pro...,37.0,4.0,
...,...,...,...,...,...,...,...,...,...,...,...,...
1529,451671,University of South Florida-Sarasota-Manatee,Manatee County,-82.562951,27.391766,1877.0,Public,Suburb: Large,Master's Colleges and Universities (medium pro...,2.0,2.0,
1530,454184,The Kingâ€™s College,New York County,-74.012348,40.706861,516.0,Private,City: Large,Baccalaureate Colleges--Arts & Sciences,,,935.0
1531,454582,Ottawa University-Online,Franklin County,-95.263775,38.602692,458.0,Private,Town: Distant,Baccalaureate Colleges--Diverse Fields,,,20863.0
1532,455770,Providence Christian College,Los Angeles County,-118.118491,34.172750,66.0,Private,City: Midsize,Baccalaureate Colleges--Arts & Sciences,,,350.0


In [41]:
# Rename columns
# Map the verbose headers to short names; columns that are not listed keep their name.
# NOTE(review): 'Carnegie Classifation' is misspelled, but later cells use this exact
# spelling, so it has to be changed everywhere at once or not at all.
short_column_names = {
    'ID number': 'Id',
    'County name': 'County',
    'Longitude location of institution': 'Longitude',
    'Latitude location of institution': 'Latitude',
    'Control of institution': 'Control',
    'Degree of urbanization (Urban-centric locale)': 'Urbanization',
    'Percent of first-time undergraduates - out-of-state': 'out-of-state',
    'Percent of first-time undergraduates - foreign countries': 'foreign',
    'Endowment assets (year end) per FTE enrollment (FASB)': 'Endowment',
    'Carnegie Classification 2010: Basic': 'Carnegie Classifation',
}
aud = aud.rename(columns=short_column_names)
aud.head()

Unnamed: 0,Id,Name,County,Longitude,Latitude,Total enrollment,Control,Urbanization,Carnegie Classifation,out-of-state,foreign,Endowment
0,100654,Alabama A & M University,Madison County,-86.568502,34.783368,5020.0,Public,City: Midsize,Master's Colleges and Universities (larger pro...,,,
1,100663,University of Alabama at Birmingham,Jefferson County,-86.80917,33.50223,18568.0,Public,City: Midsize,Research Universities (very high research acti...,13.0,1.0,
2,100690,Amridge University,Montgomery County,-86.17401,32.362609,631.0,Private,City: Midsize,Baccalaureate Colleges--Arts & Sciences,,,302.0
3,100706,University of Alabama in Huntsville,Madison County,-86.63842,34.722818,7376.0,Public,City: Midsize,Research Universities (very high research acti...,14.0,4.0,
4,100724,Alabama State University,Montgomery County,-86.295677,32.364317,6075.0,Public,City: Midsize,Master's Colleges and Universities (larger pro...,37.0,4.0,


In [43]:
# Rename urbanization categories
# Each detailed locale label is folded into its coarse group in one replace() pass;
# labels that are not listed are left unchanged.
# To list the labels beforehand, run in its own cell:
#   aud.groupby('Urbanization')['Id'].nunique()
URBANIZATION_GROUPS = {
    'City': ['City: Large', 'City: Midsize', 'City: Small'],
    'Rural': ['Rural: Distant', 'Rural: Fringe', 'Rural: Remote'],
    'Suburb': ['Suburb: Large', 'Suburb: Midsize', 'Suburb: Small'],
    'Town': ['Town: Distant', 'Town: Fringe', 'Town: Remote'],
}
# Invert {group: [labels]} into the {label: group} lookup that replace() expects.
urbanization_lookup = {
    label: group
    for group, labels in URBANIZATION_GROUPS.items()
    for label in labels
}
aud['Urbanization'] = aud['Urbanization'].replace(urbanization_lookup)
aud.head(5)

Unnamed: 0,Id,Name,County,Longitude,Latitude,Total enrollment,Control,Urbanization,Carnegie Classifation,out-of-state,foreign,Endowment
0,100654,Alabama A & M University,Madison County,-86.568502,34.783368,5020.0,Public,City,Master's Colleges and Universities (larger pro...,,,
1,100663,University of Alabama at Birmingham,Jefferson County,-86.80917,33.50223,18568.0,Public,City,Research Universities (very high research acti...,13.0,1.0,
2,100690,Amridge University,Montgomery County,-86.17401,32.362609,631.0,Private,City,Baccalaureate Colleges--Arts & Sciences,,,302.0
3,100706,University of Alabama in Huntsville,Madison County,-86.63842,34.722818,7376.0,Public,City,Research Universities (very high research acti...,14.0,4.0,
4,100724,Alabama State University,Montgomery County,-86.295677,32.364317,6075.0,Public,City,Master's Colleges and Universities (larger pro...,37.0,4.0,


In [45]:
# Rename Carnegie Classification categories
# Each detailed classification is folded into its coarse group in one replace() pass;
# classifications that are not listed are left unchanged.
# To list the labels beforehand, run in its own cell:
#   aud.groupby('Carnegie Classifation')['Id'].nunique()
# NOTE(review): 'Baccalaureatte' is misspelled; it is kept as-is because the value is
# written to the output CSV and consumers may already match on it.
CARNEGIE_GROUPS = {
    "Master's": [
        "Master's Colleges and Universities (larger programs)",
        "Master's Colleges and Universities (medium programs)",
        "Master's Colleges and Universities (smaller programs)",
    ],
    'Research': [
        'Research Universities (high research activity)',
        'Research Universities (very high research activity)',
        'Doctoral/Research Universities',
    ],
    'Baccalaureatte': [
        'Baccalaureate Colleges--Arts & Sciences',
        'Baccalaureate Colleges--Diverse Fields',
    ],
}
# Invert {group: [labels]} into the {label: group} lookup that replace() expects.
carnegie_lookup = {
    label: group
    for group, labels in CARNEGIE_GROUPS.items()
    for label in labels
}
aud['Carnegie Classifation'] = aud['Carnegie Classifation'].replace(carnegie_lookup)
aud.head(5)

Unnamed: 0,Id,Name,County,Longitude,Latitude,Total enrollment,Control,Urbanization,Carnegie Classifation,out-of-state,foreign,Endowment
0,100654,Alabama A & M University,Madison County,-86.568502,34.783368,5020.0,Public,City,Master's,,,
1,100663,University of Alabama at Birmingham,Jefferson County,-86.80917,33.50223,18568.0,Public,City,Research,13.0,1.0,
2,100690,Amridge University,Montgomery County,-86.17401,32.362609,631.0,Private,City,Baccalaureatte,,,302.0
3,100706,University of Alabama in Huntsville,Madison County,-86.63842,34.722818,7376.0,Public,City,Research,14.0,4.0,
4,100724,Alabama State University,Montgomery County,-86.295677,32.364317,6075.0,Public,City,Master's,37.0,4.0,


In [0]:
# Write the cleaned university table as a comma-separated file.
aud.to_csv(path_or_buf='UniversityData_clean.csv', sep=',')

# **American university rankings (top 150)**
Src: https://www.kaggle.com/peterpenner445/american-university-rankings-top-150

In [96]:
# Load the top-150 ranking table and preview five rows.
# NOTE(review): this rebinds `df`, which previously held the university dataset.
df = pd.read_csv(filepath_or_buffer="SchoolRankings.csv")
df.head()

Unnamed: 0,Institution,AR,Location,Price,SAT
0,Massachusetts Institute of Technology,7% Acceptance Rate,"Cambridge, MA","$22,230 Net Price",1490-1570 SAT Range
1,Stanford University,5% Acceptance Rate,"Stanford, CA","$16,562 Net Price",1390-1540 SAT Range
2,Harvard University,5% Acceptance Rate,"Cambridge, MA","$17,030 Net Price",1460-1590 SAT Range
3,Yale University,7% Acceptance Rate,"New Haven, CT","$18,053 Net Price",1460-1580 SAT Range
4,Princeton University,6% Acceptance Rate,"Princeton, NJ","$16,302 Net Price",1430-1570 SAT Range


In [101]:
# Keep the institution name and location; reset_index() copies the row position
# into an "index" column.
ranking_columns = ['Institution', 'Location']
ranking = df.loc[:, ranking_columns].reset_index()
ranking.head()

Unnamed: 0,index,Institution,Location
0,0,Massachusetts Institute of Technology,"Cambridge, MA"
1,1,Stanford University,"Stanford, CA"
2,2,Harvard University,"Cambridge, MA"
3,3,Yale University,"New Haven, CT"
4,4,Princeton University,"Princeton, NJ"


In [102]:
# Break "City, ST" into separate City and State columns.
# n is passed by keyword because positional use is deprecated in newer pandas, and each
# piece is stripped so State reads "MA" rather than " MA".
location_parts = ranking["Location"].str.split(",", n=1, expand=True)
location_parts.columns = ["City", "State"]
city_state = (
    location_parts
    .apply(lambda part: part.str.strip())
    .reset_index(drop=True)  # fresh 0..n-1 labels, like a frame built from a plain list
    .reset_index()           # expose those labels as an "index" column
)
city_state

Unnamed: 0,index,City,State
0,0,Cambridge,MA
1,1,Stanford,CA
2,2,Cambridge,MA
3,3,New Haven,CT
4,4,Princeton,NJ
...,...,...,...
145,145,Portland,OR
146,146,Norman,OK
147,147,Denver,CO
148,148,San Diego,CA


In [103]:
# Attach City/State to each institution by row position. Passing the index values as
# `on` adds a `key_0` column and suffixes the two "index" columns as index_x / index_y.
ranking = ranking.merge(right=city_state, on=ranking.index, how='inner')
ranking

Unnamed: 0,key_0,index_x,Institution,Location,index_y,City,State
0,0,0,Massachusetts Institute of Technology,"Cambridge, MA",0,Cambridge,MA
1,1,1,Stanford University,"Stanford, CA",1,Stanford,CA
2,2,2,Harvard University,"Cambridge, MA",2,Cambridge,MA
3,3,3,Yale University,"New Haven, CT",3,New Haven,CT
4,4,4,Princeton University,"Princeton, NJ",4,Princeton,NJ
...,...,...,...,...,...,...,...
145,145,145,University of Portland,"Portland, OR",145,Portland,OR
146,146,146,University of Oklahoma,"Norman, OK",146,Norman,OK
147,147,147,University of Denver,"Denver, CO",147,Denver,CO
148,148,148,University of San Diego,"San Diego, CA",148,San Diego,CA


In [104]:
# Keep the useful columns and expose the left-hand row position as "Rank".
# NOTE(review): Rank starts at 0 for the first institution; confirm whether a
# 1-based rank is wanted.
kept_columns = ['index_x', 'Institution', 'City', 'State']
ranking = ranking[kept_columns].rename(columns={'index_x': 'Rank'})
ranking

Unnamed: 0,Rank,Institution,City,State
0,0,Massachusetts Institute of Technology,Cambridge,MA
1,1,Stanford University,Stanford,CA
2,2,Harvard University,Cambridge,MA
3,3,Yale University,New Haven,CT
4,4,Princeton University,Princeton,NJ
...,...,...,...,...
145,145,University of Portland,Portland,OR
146,146,University of Oklahoma,Norman,OK
147,147,University of Denver,Denver,CO
148,148,University of San Diego,San Diego,CA


In [0]:
# Write the cleaned ranking table as a comma-separated file.
ranking.to_csv(path_or_buf='SchoolRanking_clean.csv', sep=',')