In [1]:
import requests
import pandas as pd

In [2]:
response = requests.get('https://www.forbes.com/forbesapi/org/top-colleges/2021/position/true.json?limit=1000&fields=organizationName,academics,state,financialAid,rank,medianBaseSalary,campusSetting,studentPopulation,organization,description')

responseObject = response.json()

responseData = responseObject['organizationList']['organizationsLists']

data_main = pd.DataFrame(responseData)

#### Adding 'organization' to main table

In [3]:
# add organization
organization = [ x['organization'] for x in responseData]
data_main = pd.concat([data_main, pd.DataFrame(organization)], axis=1)

#### Add single 'Primary_Key' column to main table

In [4]:
# add naturalId column
data_main['Primary_Key'] = [ x['organization']['naturalId'].split('/')[-1] for x in responseData]

#### Adding 'organization/geoLocation' to main table

In [5]:

# add geoLocation columns
rows = []
for organizationsLists in responseData:
    row = {}    
    # row['Primary_Key'] = organizationsLists['organization']['naturalId'].split('/')[-1]
    try:
        row['latitude'] = organizationsLists['organization']['geoLocation']['latitude']
    except:
        pass
    try:
        row['longitude']= organizationsLists['organization']['geoLocation']['longitude']
    except:
        pass
    rows.append(row)
data_main = pd.concat([data_main, pd.DataFrame(rows)], axis=1)

#### Adding 'Academics' to main table

In [6]:
academics_df = pd.DataFrame(responseData)['academics'].apply(pd.Series)[['type','studentFacultyRatio','undergraduatePopulation']]
data_main = data_main.merge(academics_df, left_on='studentPopulation',right_on='undergraduatePopulation')

academics = [ # items in the 'academics' key to be unpacked
    'attendanceStatus',
    'firstToSecondYearRetention',
    'overallGraduationRates',
    'enrollmentByGender',
    'graduationRateByGender',
    'enrollmentByRace',
    'graduationRateByRace']

def restructure(list_of_dictionaries,name): # change the shape of the json
    row = {}
    for item in list_of_dictionaries:
        key_value = [value for key,value in item.items()]
        key_name = name + '_' + key_value[0]
        row[key_name] = key_value[1]
    return row

def dataFrame_of_Item(item): # build a data frame from all json's 
    list_of_rows = []
    for organization in responseData:
        # create reshaped json
        graduationRateByRace = restructure(organization['academics'][item],item)
        # add primary_key number
        # graduationRateByRace['PrimaryKey'] = organization['organization']['naturalId'].split('/')[-1]
        # append to list
        list_of_rows.append(graduationRateByRace)
    return pd.DataFrame(list_of_rows)

# add date frames to main data
df_ls = []
for item in academics:
    df_ls.append(dataFrame_of_Item(item))

academics_unpacked = pd.concat(df_ls, axis=1)

data_main = pd.concat([data_main,academics_unpacked], axis=1)



#### Adding 'organization/socialNetworks' to main table

In [7]:
def restructure(list_of_dictionaries): # input: list of dictionaries | output: structured dictionary
    row = {}
    for item in list_of_dictionaries:
        key_value = [value for key,value in item.items()]
        key_name =key_value[0]
        row[key_name] = key_value[1]
    return row

def dataFrame_of_Item(col_name,item,TF): # input: name of dict to unpack | output: data frame
    list_of_rows = []
    for organization in responseData:
        # create json
        dictionary = {}
        try:
            # create reshaped json
            dictionary = dictionary | restructure(organization[col_name][item])
        except:
            pass
        # add primary_key number
        if TF:
            dictionary['Primary_Key'] = organization['organization']['naturalId'].split('/')[-1]
        # append to list
        list_of_rows.append(dictionary)
    return pd.DataFrame(list_of_rows)

df_socialNetworks = dataFrame_of_Item('organization','socialNetworks',True)

data_main = data_main.merge(df_socialNetworks, on='Primary_Key')

#### Adding 'financialAid' to main table

In [8]:
financialAid = [
    'grantAidByType',
    'avgGrantAidByType',
    'loansByType',
    'avgLoansByType']

def restructure(list_of_dictionaries): # input: list of dictionaries | output: structured dictionary
    row = {}
    for item in list_of_dictionaries:
        key_value = [value for key,value in item.items()]
        key_name =key_value[0]
        row[key_name] = key_value[1]
    return row

def dataFrame_of_Item(col_name,item,TF): # input: name of dict to unpack | output: data frame
    list_of_rows = []
    for organization in responseData:
        # create json
        dictionary = {}
        try:
            # create reshaped json
            dictionary = dictionary | restructure(organization[col_name][item])
        except:
            pass
        # add primary_key number
        if TF:
            dictionary['Primary_Key'] = organization['organization']['naturalId'].split('/')[-1]
        # append to list
        list_of_rows.append(dictionary)
    return pd.DataFrame(list_of_rows)

# add date frames to main data
df_ls = []

for item in financialAid:
    df_ls.append(dataFrame_of_Item('financialAid',item,True))

financialAid_unpacked = pd.concat(df_ls, axis=1)

financialAid_unpacked = financialAid_unpacked.loc[:, ~financialAid_unpacked.columns.duplicated()]

data_main = data_main.merge(financialAid_unpacked)

data_main = data_main.loc[:, ~data_main.columns.duplicated()]

drop_columns = ['organization','academics','financialAid','listImages','geoLocation','visible','relatedVisible','imageExists','socialNetworks','collegeMedia']

data_main.drop(drop_columns, axis=1, inplace=True)

In [None]:
data_main[['latitude', 'longitude']]

# ------------------ TEST AREA ------------------

# WHAT TO DO:
* ~~unpack 'organization'~~
    * ~~unpack 'geoLocation'~~
    * ~~unpack 'socialNetworks'~~
* ~~unpack 'academics'~~
    * ~~unpack 'attendanceStatus'~~
    * ~~unpack 'firstToSecondYearRetention'~~
    * ~~unpack 'overallGraduationRates'~~
    * ~~unpack 'enrollmentByGender'~~
    * ~~unpack 'graduationRateByGender'~~
    * ~~unpack 'enrollmentByRace'~~
    * ~~unpack 'graduationRateByRace'~~
* ~~unpack 'financialAid'~~
    * ~~grantAidByType~~
    * ~~avgGrantAidByType~~
    * ~~loansByType~~
    * ~~avgLoansByType~~
* ~~drop unnecessary columns~~

In [None]:
data_main.shape


In [None]:
col_to_drop = [
       'organization',
       'academics',
       'financialAid',
       'listImages',
       'visible',
       'relatedVisible',
       'imageExists',
       'recentContentCount',
       'country',
       'collegeMedia',
       'landscapeImage',
       'industries',
       'embargo',
       'image',
       'industry',
       'ceoName',
       'ceoTitle',
       'premiumProfile',
       'employees',
       'portraitImage',
       'naturalId',
       'geoLocation',
       'uri',
       'uris',
       'socialNetworks',
       'placeUri'
       ]

data_main.drop(col_to_drop, axis=1, inplace=True)


In [None]:

data_main.to_csv('University_Data.csv')

In [None]:
pd.options.display.max_columns = None
data_main

In [None]:
pd.options.display.max_columns = None
data_main.head()

In [None]:
data_main[data_main.organizationName == 'Brigham Young University']

In [None]:
data_main[data_main.organizationName.str.contains('Brigham')]

In [None]:
pd.options.display.max_rows = 100
# data_main.sort_values(by='enrollmentByGender_enrollmentFemale', ascending=False)

In [None]:
data_main.columns

In [None]:
pd.options.display.max_rows = None

# data_main[data_main.].dropna(subset=['latitude'])

In [None]:
data_main.state.unique()

In [None]:
data_main.longitude.dropna().max()

In [None]:
data_main[data_main.longitude == 85.5016][['latitude','longitude']]

In [None]:
data_main[data_main.longitude == 85.5016]

In [10]:
pd.options.display.max_columns = None
pd.options.display.max_rows = 10

In [11]:
data_main.head(3)

Unnamed: 0,description,rank,organizationName,state,studentPopulation,campusSetting,medianBaseSalary,naturalId,name,uri,webSite,phoneNumber,recentContentCount,uris,shortUri,squareImage,city,country,region,yearFounded,stateCode,placeUri,landscapeImage,industries,embargo,image,industry,ceoName,ceoTitle,parentOrganization,premiumProfile,employees,portraitImage,Primary_Key,latitude,longitude,type,studentFacultyRatio,undergraduatePopulation,attendanceStatus_partTime,attendanceStatus_fullTime,firstToSecondYearRetention_fullTime,firstToSecondYearRetention_partTime,overallGraduationRates_4,overallGraduationRates_6,enrollmentByGender_enrollmentMale,enrollmentByGender_enrollmentFemale,enrollmentByGender_AgeUnder18,enrollmentByGender_Age18to24,enrollmentByGender_Age25to64,enrollmentByGender_Age65andOver,graduationRateByGender_graduationMale,graduationRateByGender_graduationFemale,enrollmentByRace_americanIndian,enrollmentByRace_asian,enrollmentByRace_hawaiianPacific,enrollmentByRace_africanAmerican,enrollmentByRace_hispanic,enrollmentByRace_white,enrollmentByRace_twoRaces,enrollmentByRace_unknown,enrollmentByRace_alien,graduationRateByRace_americanIndian,graduationRateByRace_asian,graduationRateByRace_hawaiianPacific,graduationRateByRace_africanAmerican,graduationRateByRace_hispanic,graduationRateByRace_white,graduationRateByRace_twoRaces,graduationRateByRace_unknown,graduationRateByRace_alien,Twitter,Facebook,LinkedIn,Instagram,YouTube,federalGrant,pellGrant,otherFederalGrant,stateLocalGrant,institutionalGrant,anyGrant,anyLoan,federalLoan,nonFederalLoan
0,"A top liberal arts school, Amherst is located ...",16.0,Amherst College,MA,1839.0,Suburban,127100.0,fred/college/3,Amherst College,amherst-college,http://https://www.amherst.edu,413-542-2000,0.0,[amherst-college],http://onforb.es/MvHZF3,http://specials-images.forbesimg.com/imageserv...,Amherst,United States,Northeast,1821.0,MA,ma/springfield,,,,,,,,,,,,3,42.370772,-72.533204,Private,7,1839.0,7.0,93.0,97.0,72.0,76.0,93.0,48.0,52.0,782.0,28810.0,1752.0,4.0,91.0,94.0,0.2,30.2,0.1,2.1,14.1,26.7,5.4,4.0,17.2,100.0,97.0,75.0,76.0,84.0,92.0,92.0,96.0,92.0,https://twitter.com/AmherstCollege,https://www.facebook.com/amherstcollege,http://us.linkedin.com/company/amherst-college,http://statigr.am/AmherstCollege,https://www.amherst.edu/aboutamherst/visiting/...,30.0,24.0,18.0,4.0,62.0,57627.0,18.0,14.0,7.0
1,"Pomona College in Claremont, CA. is one of the...",19.0,Pomona College,CA,1637.0,Suburban,121900.0,fred/college/17,Pomona College,pomona-college,http://www.pomona.edu,(909) 621-8134,0.0,[pomona-college],http://onforb.es/MvHPO9,https://specials-images.forbesimg.com/imageser...,Claremont,United States,West,1887.0,CA,,,,,,,,,,,,,17,34.096849,-117.712684,Private,7,1637.0,1.0,99.0,99.0,0.0,88.0,97.0,48.0,52.0,114.0,5947.0,30.0,1.0,97.0,98.0,0.3,15.1,0.1,6.0,10.4,41.2,5.1,1.1,20.7,100.0,97.0,0.0,97.0,97.0,98.0,96.0,100.0,95.0,https://twitter.com/pomonacollege,https://www.facebook.com/pomonacollege,http://www.linkedin.com/edu/pomona-college-19928,http://instagram.com/pomonacollege,www.pomona.edu/tours,20.0,23.0,13.0,7.0,51.0,53449.0,20.0,13.0,13.0
2,"The United States Military Academy, located in...",29.0,United States Military Academy,NY,4457.0,Rural,146300.0,fred/college/4,United States Military Academy,united-states-military-academy,http://https://www.westpoint.edu/,845-938-4041,0.0,[united-states-military-academy],http://onforb.es/MvHZVR,//specials-images.forbesimg.com/imageserve/5d5...,West Point,United States,Northeast,1802.0,NY,ny/poughkeepsie,,,,,,,,,,,,4,41.395221,-73.955162,Public,7,4457.0,1.0,99.0,98.0,0.0,90.0,98.0,54.0,46.0,145.0,5239.0,38.0,0.0,97.0,99.0,0.1,17.1,0.1,5.9,9.0,37.1,4.3,3.3,23.1,100.0,99.0,100.0,97.0,97.0,98.0,92.0,96.0,98.0,https://twitter.com/westpoint_usma,http://www.facebook.com/WestPointUSMA,https://www.linkedin.com/edu/united-states-mil...,http://instagram.com/wpaog,,,,,,,,,,


In [12]:
[x for x in data_main.columns]

['description',
 'rank',
 'organizationName',
 'state',
 'studentPopulation',
 'campusSetting',
 'medianBaseSalary',
 'naturalId',
 'name',
 'uri',
 'webSite',
 'phoneNumber',
 'recentContentCount',
 'uris',
 'shortUri',
 'squareImage',
 'city',
 'country',
 'region',
 'yearFounded',
 'stateCode',
 'placeUri',
 'landscapeImage',
 'industries',
 'embargo',
 'image',
 'industry',
 'ceoName',
 'ceoTitle',
 'parentOrganization',
 'premiumProfile',
 'employees',
 'portraitImage',
 'Primary_Key',
 'latitude',
 'longitude',
 'type',
 'studentFacultyRatio',
 'undergraduatePopulation',
 'attendanceStatus_partTime',
 'attendanceStatus_fullTime',
 'firstToSecondYearRetention_fullTime',
 'firstToSecondYearRetention_partTime',
 'overallGraduationRates_4',
 'overallGraduationRates_6',
 'enrollmentByGender_enrollmentMale',
 'enrollmentByGender_enrollmentFemale',
 'enrollmentByGender_AgeUnder18',
 'enrollmentByGender_Age18to24',
 'enrollmentByGender_Age25to64',
 'enrollmentByGender_Age65andOver',
 'gr

In [13]:
columns_keep = [
# 'description',
 'rank',
 'organizationName',
 'state',
 'studentPopulation',
#  'campusSetting',
 'medianBaseSalary',
#  'naturalId',
#  'name',
#  'uri',
 'webSite',
 'phoneNumber',
#  'recentContentCount',
#  'uris',
#  'shortUri',
 'squareImage',
 'city',
#  'country',
 'region',
#  'yearFounded',
 'stateCode',
#  'placeUri',
#  'landscapeImage',
#  'industries',
#  'embargo',
#  'image',
#  'industry',
#  'ceoName',
#  'ceoTitle',
#  'parentOrganization',
#  'premiumProfile',
#  'employees',
#  'portraitImage',
#  'Primary_Key',
 'latitude',
 'longitude',
 'type',
 'studentFacultyRatio',
 'undergraduatePopulation',
#  'attendanceStatus_partTime',
#  'attendanceStatus_fullTime',
#  'firstToSecondYearRetention_fullTime',
#  'firstToSecondYearRetention_partTime',
#  'overallGraduationRates_4',
#  'overallGraduationRates_6',.
 'enrollmentByGender_enrollmentMale',
 'enrollmentByGender_enrollmentFemale',
#  'enrollmentByGender_AgeUnder18',
#  'enrollmentByGender_Age18to24',
#  'enrollmentByGender_Age25to64',
#  'enrollmentByGender_Age65andOver',
 'graduationRateByGender_graduationMale',
 'graduationRateByGender_graduationFemale',
#  'enrollmentByRace_americanIndian',
#  'enrollmentByRace_asian',
#  'enrollmentByRace_hawaiianPacific',
#  'enrollmentByRace_africanAmerican',
#  'enrollmentByRace_hispanic',
#  'enrollmentByRace_white',
#  'enrollmentByRace_twoRaces',
#  'enrollmentByRace_unknown',
#  'enrollmentByRace_alien',
#  'graduationRateByRace_americanIndian',
#  'graduationRateByRace_asian',
#  'graduationRateByRace_hawaiianPacific',
#  'graduationRateByRace_africanAmerican',
#  'graduationRateByRace_hispanic',
#  'graduationRateByRace_white',
#  'graduationRateByRace_twoRaces',
#  'graduationRateByRace_unknown',
#  'graduationRateByRace_alien',
 'Twitter',
 'Facebook',
 'LinkedIn',
 'Instagram',
 'YouTube',
 'federalGrant'
#  'pellGrant',
#  'otherFederalGrant',
#  'stateLocalGrant',
#  'institutionalGrant',
#  'anyGrant',
#  'anyLoan',
#  'federalLoan',
#  'nonFederalLoan'
 ]

In [15]:
for e in columns_keep:
    print(e)

rank
organizationName
state
studentPopulation
medianBaseSalary
webSite
phoneNumber
squareImage
city
region
stateCode
latitude
longitude
type
studentFacultyRatio
undergraduatePopulation
enrollmentByGender_enrollmentMale
enrollmentByGender_enrollmentFemale
graduationRateByGender_graduationMale
graduationRateByGender_graduationFemale
Twitter
Facebook
LinkedIn
Instagram
YouTube
federalGrant


In [40]:
url = 'https://byui.kuali.co/api/v1/catalog/courses/6102e778ef84b869ba4eb375?q='

response = requests.get(url)

responseObject = response.json()

df = pd.DataFrame(responseObject)

df = pd.concat([df, df.subjectCode.apply(pd.Series)], axis=1).drop(['subjectCode','__passedCatalogQuery','_score'], axis=1)

df.columns = [
    'catalogCourseId',
    'dateStart',
    'pid',
    'id',
    'title',
    'catalogActivationDate',
    'name',
    'description',
    'subjectCode-id',
    'linkedGroup'
]

data = df

In [49]:
def str_num(word):
    for letter in word:
        if letter.isdigit():
            idx = word.index(letter)
            code = word[idx:]
            break
    return code

In [52]:
df.catalogCourseId.apply(lambda x: str_num(x))

0        100
1        180
2        201
3        202
4        205
        ... 
2094     329
2095     338
2096     350
2097     480
2098    490R
Name: catalogCourseId, Length: 2099, dtype: object

In [63]:
# catalogBYUI

import requests
import pandas as pd

url_display = 'https://www.byui.edu/catalog#/courses'

url = 'https://byui.kuali.co/api/v1/catalog/courses/6102e778ef84b869ba4eb375?q='

response = requests.get(url)

responseObject = response.json()

df = pd.DataFrame(responseObject)

df = pd.concat([df, df.subjectCode.apply(pd.Series)], axis=1).drop(['subjectCode','__passedCatalogQuery','_score'], axis=1)

def str_num(word):
    for letter in word:
        if letter.isdigit():
            idx = word.index(letter)
            code = word[idx:]
            break
    return code

df['course-id'] = df.__catalogCourseId.apply(lambda x: str_num(x))

df.columns = [
    'catalogCourseId',
    'dateStart',
    'pid',
    'id',
    'title',
    'catalogActivationDate',
    'name',
    'description',
    'subjectCode-id',
    'linkedGroup',
    'course-id'
]

data = df

# ----------------------- TEST -----------------------

script_1 = '''
url = 'https://byui.kuali.co/api/v1/catalog/courses/6102e778ef84b869ba4eb375?q='

response = requests.get(url)

responseObject = response.json()

df = pd.DataFrame(responseObject)

df = pd.concat([df, df.subjectCode.apply(pd.Series)], axis=1).drop(['subjectCode','__passedCatalogQuery','_score'], axis=1)

df.columns = [
    'catalogCourseId',
    'dateStart',
    'pid',
    'id',
    'title',
    'catalogActivationDate',
    'name',
    'description',
    'subjectCode-id',
    'linkedGroup'
]

data = df
'''

In [64]:
df

Unnamed: 0,catalogCourseId,dateStart,pid,id,title,catalogActivationDate,name,description,subjectCode-id,linkedGroup,course-id
0,ACCTG100,2019-01-01,414Six2j-,5a6a3fb5449c7c2e00c517af,Introduction to Accounting,2018-12-21,ACCTG,Accounting,45beaa83-7d3c-4760-bdb1-6813f94d4097,578be6e1aeb3767907f3b4e8,100
1,ACCTG180,2020-01-01,V1g4Sol3sW,5da0e7a7783602240012c9cb,Survey of Accounting,2019-12-01,ACCTG,Accounting,45beaa83-7d3c-4760-bdb1-6813f94d4097,578be6e1aeb3767907f3b4e8,180
2,ACCTG201,2020-01-01,EkrHil3oZ,5da7a52d25c43024004548bf,Financial Accounting,2019-12-01,ACCTG,Accounting,45beaa83-7d3c-4760-bdb1-6813f94d4097,578be6e1aeb3767907f3b4e8,201
3,ACCTG202,2020-01-01,Vy8rsxhiW,5da8a5c19a654a2400d2eeb3,Managerial Accounting,2019-12-01,ACCTG,Accounting,45beaa83-7d3c-4760-bdb1-6813f94d4097,578be6e1aeb3767907f3b4e8,202
4,ACCTG205,2022-01-01,NkxUrjehs-,6202ca0aaa637727f9b984da,Accounting Software,2021-12-01,ACCTG,Accounting,45beaa83-7d3c-4760-bdb1-6813f94d4097,578be6e1aeb3767907f3b4e8,205
...,...,...,...,...,...,...,...,...,...,...,...
2094,WELD329,2022-01-01,4kIpgbho-,6042c80977e1c400270d89bb,"Welding Codes, Certification & Inspection",2021-12-01,WELD,Welding,29ee52d3-030c-4823-a922-fe830800138c,59d3a033b0e9130001e8a713,329
2095,WELD338,2017-01-01,N1g86g-hib,b17972a7-c38f-4b33-9a7b-3c08f8d9f86c,Welding Automation,2017-01-01,WELD,Welding,29ee52d3-030c-4823-a922-fe830800138c,59d3a033b0e9130001e8a713,338
2096,WELD350,2022-01-01,EJWLTlW2jZ,6063af60b548f800261b9638,Physical Metallurgy,2021-12-01,WELD,Welding,29ee52d3-030c-4823-a922-fe830800138c,59d3a033b0e9130001e8a713,350
2097,WELD480,2022-01-01,EkG86l-3s-,6042c4c8639c000028829e4a,Welding Fabrication,2021-12-01,WELD,Welding,29ee52d3-030c-4823-a922-fe830800138c,59d3a033b0e9130001e8a713,480
