In [1]:
import requests
import pandas as pd

In [2]:
response = requests.get('https://www.forbes.com/forbesapi/org/top-colleges/2021/position/true.json?limit=1000&fields=organizationName,academics,state,financialAid,rank,medianBaseSalary,campusSetting,studentPopulation,organization,description')

responseObject = response.json()

responseData = responseObject['organizationList']['organizationsLists']

data_main = pd.DataFrame(responseData)

#### Adding 'organization' to main table

In [3]:
# add organization
organization = [ x['organization'] for x in responseData]
data_main = pd.concat([data_main, pd.DataFrame(organization)], axis=1)

#### Add single 'Primary_Key' column to main table

In [4]:
# add naturalId column
data_main['Primary_Key'] = [ x['organization']['naturalId'].split('/')[-1] for x in responseData]

#### Adding 'organization/geoLocation' to main table

In [5]:

# add geoLocation columns
rows = []
for organizationsLists in responseData:
    row = {}    
    # row['Primary_Key'] = organizationsLists['organization']['naturalId'].split('/')[-1]
    try:
        row['latitude'] = organizationsLists['organization']['geoLocation']['latitude']
    except:
        pass
    try:
        row['longitude']= organizationsLists['organization']['geoLocation']['longitude']
    except:
        pass
    rows.append(row)
data_main = pd.concat([data_main, pd.DataFrame(rows)], axis=1)

#### Adding 'Academics' to main table

In [6]:
academics = [ # items in the 'academics' key to be unpacked
    'attendanceStatus',
    'firstToSecondYearRetention',
    'overallGraduationRates',
    'enrollmentByGender',
    'graduationRateByGender',
    'enrollmentByRace',
    'graduationRateByRace']

def restructure(list_of_dictionaries,name): # change the shape of the json
    row = {}
    for item in list_of_dictionaries:
        key_value = [value for key,value in item.items()]
        key_name = name + '_' + key_value[0]
        row[key_name] = key_value[1]
    return row

def dataFrame_of_Item(item): # build a data frame from all json's 
    list_of_rows = []
    for organization in responseData:
        # create reshaped json
        graduationRateByRace = restructure(organization['academics'][item],item)
        # add primary_key number
        # graduationRateByRace['PrimaryKey'] = organization['organization']['naturalId'].split('/')[-1]
        # append to list
        list_of_rows.append(graduationRateByRace)
    return pd.DataFrame(list_of_rows)

# add date frames to main data
df_ls = []

for item in academics:
    df_ls.append(dataFrame_of_Item(item))

academics_unpacked = pd.concat(df_ls, axis=1)

data_main = pd.concat([data_main,academics_unpacked], axis=1)



#### Adding 'organization/socialNetworks' to main table

In [7]:
def restructure(list_of_dictionaries): # input: list of dictionaries | output: structured dictionary
    row = {}
    for item in list_of_dictionaries:
        key_value = [value for key,value in item.items()]
        key_name =key_value[0]
        row[key_name] = key_value[1]
    return row

def dataFrame_of_Item(col_name,item,TF): # input: name of dict to unpack | output: data frame
    list_of_rows = []
    for organization in responseData:
        # create json
        dictionary = {}
        try:
            # create reshaped json
            dictionary = dictionary | restructure(organization[col_name][item])
        except:
            pass
        # add primary_key number
        if TF:
            dictionary['Primary_Key'] = organization['organization']['naturalId'].split('/')[-1]
        # append to list
        list_of_rows.append(dictionary)
    return pd.DataFrame(list_of_rows)

df_socialNetworks = dataFrame_of_Item('organization','socialNetworks',True)

data_main = data_main.merge(df_socialNetworks, on='Primary_Key')

#### Adding 'financialAid' to main table

In [8]:
financialAid = [
    'grantAidByType',
    'avgGrantAidByType',
    'loansByType',
    'avgLoansByType']

def restructure(list_of_dictionaries): # input: list of dictionaries | output: structured dictionary
    row = {}
    for item in list_of_dictionaries:
        key_value = [value for key,value in item.items()]
        key_name =key_value[0]
        row[key_name] = key_value[1]
    return row

def dataFrame_of_Item(col_name,item,TF): # input: name of dict to unpack | output: data frame
    list_of_rows = []
    for organization in responseData:
        # create json
        dictionary = {}
        try:
            # create reshaped json
            dictionary = dictionary | restructure(organization[col_name][item])
        except:
            pass
        # add primary_key number
        if TF:
            dictionary['Primary_Key'] = organization['organization']['naturalId'].split('/')[-1]
        # append to list
        list_of_rows.append(dictionary)
    return pd.DataFrame(list_of_rows)

# add date frames to main data
df_ls = []

for item in financialAid:
    df_ls.append(dataFrame_of_Item('financialAid',item,True))

financialAid_unpacked = pd.concat(df_ls, axis=1)

financialAid_unpacked = financialAid_unpacked.loc[:, ~financialAid_unpacked.columns.duplicated()]

data_main = data_main.merge(financialAid_unpacked)

data_main = data_main.loc[:, ~data_main.columns.duplicated()]

drop_columns = ['organization','academics','financialAid','listImages','geoLocation','visible','relatedVisible','imageExists','socialNetworks','collegeMedia']

data_main.drop(drop_columns, axis=1, inplace=True)

In [9]:
data_main[['latitude', 'longitude']]

Unnamed: 0,latitude,longitude
0,37.869236,-122.258393
1,41.314042,-72.923425
2,40.349855,-74.659119
3,37.431370,-122.168924
4,40.806515,-73.961288
...,...,...
595,47.750994,-117.415201
596,43.527114,-96.736267
597,43.313226,-91.799646
598,41.985318,-91.657250


# ------------------ TEST AREA ------------------

# WHAT TO DO:
* ~~unpack 'organization'~~
    * ~~unpack 'geoLocation'~~
    * ~~unpack 'socialNetworks'~~
* ~~unpack 'academics'~~
    * ~~unpack 'attendanceStatus'~~
    * ~~unpack 'firstToSecondYearRetention'~~
    * ~~unpack 'overallGraduationRates'~~
    * ~~unpack 'enrollmentByGender'~~
    * ~~unpack 'graduationRateByGender'~~
    * ~~unpack 'enrollmentByRace'~~
    * ~~unpack 'graduationRateByRace'~~
* ~~unpack 'financialAid'~~
    * ~~grantAidByType~~
    * ~~avgGrantAidByType~~
    * ~~loansByType~~
    * ~~avgLoansByType~~
* ~~drop unnecessary columns~~

In [None]:
data_main.shape


In [None]:
col_to_drop = [
       'organization',
       'academics',
       'financialAid',
       'listImages',
       'visible',
       'relatedVisible',
       'imageExists',
       'recentContentCount',
       'country',
       'collegeMedia',
       'landscapeImage',
       'industries',
       'embargo',
       'image',
       'industry',
       'ceoName',
       'ceoTitle',
       'premiumProfile',
       'employees',
       'portraitImage',
       'naturalId',
       'geoLocation',
       'uri',
       'uris',
       'socialNetworks',
       'placeUri'
       ]

data_main.drop(col_to_drop, axis=1, inplace=True)


In [None]:

data_main.to_csv('University_Data.csv')

In [None]:
pd.options.display.max_columns = None
data_main

In [None]:
pd.options.display.max_columns = None
data_main.head()

In [None]:
data_main[data_main.organizationName == 'Brigham Young University']

In [None]:
data_main[data_main.organizationName.str.contains('Brigham')]

In [None]:
pd.options.display.max_rows = 100
# data_main.sort_values(by='enrollmentByGender_enrollmentFemale', ascending=False)

In [None]:
data_main.columns

In [10]:
pd.options.display.max_rows = None

# data_main[data_main.].dropna(subset=['latitude'])

In [42]:
data_main.state.unique()

array(['CA', 'CT', 'NJ', 'NY', 'MA', 'PA', 'IL', 'NH', 'NC', 'TN', 'DC',
       'MI', 'TX', 'FL', 'RI', 'WA', 'VA', 'MO', 'GA', 'ME', 'MD', 'IN',
       'VT', 'UT', 'WI', 'CO', 'MN', 'OH', 'SC', 'DE', 'IA', 'LA', 'AZ',
       'OK', 'OR', 'AL', 'KS', 'AR', 'NE', 'KY', 'MS', 'NV', 'WV', 'ND',
       'NM', 'MT', 'HI', 'WY', 'ID', 'SD', 'AK'], dtype=object)

In [53]:
data_main.longitude.dropna().max()

85.5016

In [56]:
data_main[data_main.longitude == 85.5016][['latitude','longitude']]

Unnamed: 0,latitude,longitude
303,36.1628,85.5016


In [57]:
data_main[data_main.longitude == 85.5016]

Unnamed: 0,description,rank,organizationName,state,studentPopulation,campusSetting,medianBaseSalary,naturalId,name,uri,...,YouTube,federalGrant,pellGrant,otherFederalGrant,stateLocalGrant,institutionalGrant,anyGrant,anyLoan,federalLoan,nonFederalLoan
303,Tennessee Tech is a midsize public research un...,304,Tennessee Technological University,TN,10140,Rural,92000.0,fred/college/820,Tennessee Technological University,tennessee-technological-university,...,https://www.youtube.com/user/ttunews,37.0,36.0,6.0,90.0,48.0,7777.0,50.0,49.0,6.0


In [12]:
pd.options.display.max_columns = None
pd.options.display.max_rows = 10

In [14]:
data_main.head(3)

Unnamed: 0,description,rank,organizationName,state,studentPopulation,campusSetting,medianBaseSalary,naturalId,name,uri,webSite,phoneNumber,recentContentCount,uris,shortUri,squareImage,city,country,region,yearFounded,stateCode,placeUri,landscapeImage,industries,embargo,image,industry,ceoName,ceoTitle,parentOrganization,premiumProfile,employees,portraitImage,Primary_Key,latitude,longitude,attendanceStatus_partTime,attendanceStatus_fullTime,firstToSecondYearRetention_fullTime,firstToSecondYearRetention_partTime,overallGraduationRates_4,overallGraduationRates_6,enrollmentByGender_enrollmentMale,enrollmentByGender_enrollmentFemale,enrollmentByGender_AgeUnder18,enrollmentByGender_Age18to24,enrollmentByGender_Age25to64,enrollmentByGender_Age65andOver,graduationRateByGender_graduationMale,graduationRateByGender_graduationFemale,enrollmentByRace_americanIndian,enrollmentByRace_asian,enrollmentByRace_hawaiianPacific,enrollmentByRace_africanAmerican,enrollmentByRace_hispanic,enrollmentByRace_white,enrollmentByRace_twoRaces,enrollmentByRace_unknown,enrollmentByRace_alien,graduationRateByRace_americanIndian,graduationRateByRace_asian,graduationRateByRace_hawaiianPacific,graduationRateByRace_africanAmerican,graduationRateByRace_hispanic,graduationRateByRace_white,graduationRateByRace_twoRaces,graduationRateByRace_unknown,graduationRateByRace_alien,Twitter,Facebook,LinkedIn,Instagram,YouTube,federalGrant,pellGrant,otherFederalGrant,stateLocalGrant,institutionalGrant,anyGrant,anyLoan,federalLoan,nonFederalLoan
0,One of the top public universities in the coun...,1,"University of California, Berkeley",CA,43185,Urban,138800.0,fred/college/64,"University of California, Berkeley",university-of-california-berkeley,http://www.berkeley.edu,(510) 642-6000,0.0,[university-of-california-berkeley],http://onforb.es/MvI8Zf,//specials-images.forbesimg.com/imageserve/5d5...,Berkeley,United States,West,1868.0,CA,,,,,,,,,,,,,64,37.869236,-122.258393,7.0,93.0,97.0,72.0,76.0,93.0,48.0,52.0,782.0,28810.0,1752.0,4.0,91.0,94.0,0.2,30.2,0.1,2.1,14.1,26.7,5.4,4.0,17.2,100.0,97.0,75.0,76.0,84.0,92.0,92.0,96.0,92.0,https://twitter.com/UCBerkeley,https://www.facebook.com/UCBerkeley,https://www.linkedin.com/edu/university-of-cal...,http://instagram.com/ucberkeleyofficial/,http://www.berkeley.edu/tour/,23.0,27.0,13.0,31.0,51.0,19126.0,23.0,23.0,12.0
1,"The second oldest Ivy League institution, Yale...",2,Yale University,CT,13609,Urban,141300.0,fred/college/10,Yale University,yale-university,http://www.yale.edu,203-432-4771,,[yale-university],http://onforb.es/NirARu,https://specials-images.forbesimg.com/imageser...,New Haven,United States,Northeast,1701.0,CT,ct/new-haven,,,,,,,,,,,,10,41.314042,-72.923425,1.0,99.0,99.0,0.0,88.0,97.0,48.0,52.0,114.0,5947.0,30.0,1.0,97.0,98.0,0.3,15.1,0.1,6.0,10.4,41.2,5.1,1.1,20.7,100.0,97.0,0.0,97.0,97.0,98.0,96.0,100.0,95.0,https://twitter.com/yale,https://www.facebook.com/YaleUniversity,https://www.linkedin.com/edu/school?id=18043&t...,http://instagram.com/yale,http://admissions.yale.edu/virtual-tour,20.0,18.0,9.0,,56.0,55827.0,6.0,6.0,
2,Princeton is a leading private research univer...,3,Princeton University,NJ,8419,Suburban,150500.0,fred/college/2,Princeton University,princeton-university,http://www.princeton.edu,609-258-3000,,[princeton-university],http://onforb.es/NirwkP,http://specials-images.forbesimg.com/imageserv...,Princeton,United States,Northeast,1746.0,NJ,nj/mercer-county,,,,,,,,,,,,2,40.349855,-74.659119,1.0,99.0,98.0,0.0,90.0,98.0,54.0,46.0,145.0,5239.0,38.0,0.0,97.0,99.0,0.1,17.1,0.1,5.9,9.0,37.1,4.3,3.3,23.1,100.0,99.0,100.0,97.0,97.0,98.0,92.0,96.0,98.0,https://twitter.com/princeton,https://www.facebook.com/PrincetonU,http://www.linkedin.com/edu/princeton-universi...,http://instagram.com/Princeton_University,http://www.princeton.edu/~oktour/virtualtour/,21.0,19.0,4.0,2.0,62.0,52188.0,10.0,4.0,8.0


In [19]:
[
# 'description',
 'rank',
#  'organizationName',
 'state',
 'studentPopulation',
#  'campusSetting',
 'medianBaseSalary',
#  'naturalId',
#  'name',
#  'uri',
 'webSite',
 'phoneNumber',
#  'recentContentCount',
#  'uris',
#  'shortUri',
 'squareImage',
 'city',
#  'country',
 'region',
#  'yearFounded',
 'stateCode',
#  'placeUri',
#  'landscapeImage',
#  'industries',
#  'embargo',
#  'image',
#  'industry',
#  'ceoName',
#  'ceoTitle',
#  'parentOrganization',
#  'premiumProfile',
#  'employees',
#  'portraitImage',
#  'Primary_Key',
 'latitude',
 'longitude',
#  'attendanceStatus_partTime',
#  'attendanceStatus_fullTime',
#  'firstToSecondYearRetention_fullTime',
#  'firstToSecondYearRetention_partTime',
#  'overallGraduationRates_4',
#  'overallGraduationRates_6',.
 'enrollmentByGender_enrollmentMale',
 'enrollmentByGender_enrollmentFemale',
#  'enrollmentByGender_AgeUnder18',
#  'enrollmentByGender_Age18to24',
#  'enrollmentByGender_Age25to64',
#  'enrollmentByGender_Age65andOver',
 'graduationRateByGender_graduationMale',
 'graduationRateByGender_graduationFemale',
#  'enrollmentByRace_americanIndian',
#  'enrollmentByRace_asian',
#  'enrollmentByRace_hawaiianPacific',
#  'enrollmentByRace_africanAmerican',
#  'enrollmentByRace_hispanic',
#  'enrollmentByRace_white',
#  'enrollmentByRace_twoRaces',
#  'enrollmentByRace_unknown',
#  'enrollmentByRace_alien',
#  'graduationRateByRace_americanIndian',
#  'graduationRateByRace_asian',
#  'graduationRateByRace_hawaiianPacific',
#  'graduationRateByRace_africanAmerican',
#  'graduationRateByRace_hispanic',
#  'graduationRateByRace_white',
#  'graduationRateByRace_twoRaces',
#  'graduationRateByRace_unknown',
#  'graduationRateByRace_alien',
 'Twitter',
 'Facebook',
 'LinkedIn',
 'Instagram',
 'YouTube',
 'federalGrant'
#  'pellGrant',
#  'otherFederalGrant',
#  'stateLocalGrant',
#  'institutionalGrant',
#  'anyGrant',
#  'anyLoan',
#  'federalLoan',
#  'nonFederalLoan'
 ]

['rank',
 'state',
 'studentPopulation',
 'medianBaseSalary',
 'webSite',
 'phoneNumber',
 'squareImage',
 'city',
 'region',
 'stateCode',
 'latitude',
 'longitude',
 'enrollmentByGender_enrollmentMale',
 'enrollmentByGender_enrollmentFemale',
 'graduationRateByGender_graduationMale',
 'graduationRateByGender_graduationFemale',
 'Twitter',
 'Facebook',
 'LinkedIn',
 'Instagram',
 'YouTube',
 'federalGrant']