# In this notebook, I extract university coordinates from the hd2023.xlsx dataset. The states considered are MA, NY, CT, NH, and RI.

### The five types of universities from CCIHE2021-PublicData.xlsx considered here are:
- R1 and R2 universities (basic2021 = 15 or 16)
- Public universities (control = 1)
- Private not-for-profit (control = 2)
- Land Grant (landgrnt = 1; 1 is yes, 2 is no)
- STEM (stem_rsd is not missing; the value may be zero or nonzero)

In [1]:
import pandas as pd
import numpy as np

In [2]:
# States covered by the extraction: the home state comes first, then its neighbours.
home_state = ['MA']
nearby_state_arr = ['NY', 'CT', 'NH', 'RI']
state_name_abbrev_arr = [*home_state, *nearby_state_arr]

# hd2023 workbook: the source of the institution coordinates.
hd2023_data = pd.read_excel('data/hd2023.xlsx')
# CCIHE2021 public data workbook: only the 'Data' sheet is loaded.
CCIHE2021_data = pd.read_excel(
    'data/CCIHE2021-PublicData.xlsx',
    sheet_name='Data',
)

In [3]:
'''
(1) R1 and R2 universities
'''
file_name_str = 'R1R2'
header_name = 'R1R2'

# Columns kept from the CCIHE2021 sheet and from the hd2023 sheet respectively.
ccihe_columns = ['unitid', 'name', 'city', 'basic2021', 'control', 'landgrnt', 'stem_rsd', 'anenr1920', 'rooms']
hd_columns = ['UNITID', 'INSTNM', 'CITY', 'COUNTYNM', 'LONGITUD', 'LATITUDE']
output_dir = 'data/MA_AP_performance/hd2023_coordinate'

# Per-state frames (sorted by institution name) collected for the combined sheet.
data_temp = []
for state_name_abbrev in state_name_abbrev_arr:

    # CCIHE2021 rows of this state whose basic2021 code is 15 or 16 (R1 / R2).
    in_state = CCIHE2021_data['stabbr'] == state_name_abbrev
    is_selected = CCIHE2021_data['basic2021'].isin([15, 16])
    CCIHE2021_data_state = (
        CCIHE2021_data.loc[in_state & is_selected, ccihe_columns]
        .sort_values(by='unitid')
        .reset_index(drop=True)
    )

    # Lookup keyed by unit id. Copying the CCIHE columns over by row position would
    # attach enrollment / room figures to the wrong institution whenever a selected
    # CCIHE institution has no row in hd2023, so values are matched on the id instead.
    ccihe_by_unitid = CCIHE2021_data_state.drop_duplicates(subset='unitid').set_index('unitid')

    # .copy() so the new columns are added to an independent frame, not a view of hd2023_data.
    hd2023_data_state = hd2023_data.loc[hd2023_data['STABBR'] == state_name_abbrev, hd_columns].copy()
    hd2023_data_state[header_name] = hd2023_data_state['UNITID'].isin(CCIHE2021_data_state['unitid'])
    hd2023_data_state = hd2023_data_state[hd2023_data_state[header_name]]
    hd2023_data_state = hd2023_data_state.sort_values(by='UNITID').reset_index(drop=True)

    hd2023_data_state['Annual enrollment'] = hd2023_data_state['UNITID'].map(ccihe_by_unitid['anenr1920'])
    hd2023_data_state['Number of dorm beds'] = hd2023_data_state['UNITID'].map(ccihe_by_unitid['rooms'])

    hd2023_data_state.insert(4, 'State', [state_name_abbrev] * hd2023_data_state.shape[0])
    hd2023_data_state.to_excel(f'{output_dir}/hd2023_{file_name_str}_data_{state_name_abbrev}.xlsx', index=False)

    # Keep the frame in memory for the combined sheet; re-reading the file just written
    # is redundant and lets read_excel reinterpret cell text (e.g. "NA") as missing.
    data_temp.append(hd2023_data_state.sort_values(by='INSTNM'))

#### concat all states data into one sheet
combined_states_data = pd.concat(data_temp, ignore_index=True)
combined_states_data.to_excel(f'{output_dir}/hd2023_{file_name_str}_data_{home_state[0]}_allnearby.xlsx', index=False)

In [4]:
'''
(2) Public universities
'''
file_name_str = 'public_university'
header_name = 'Public university'

# Columns kept from the CCIHE2021 sheet and from the hd2023 sheet respectively.
ccihe_columns = ['unitid', 'name', 'city', 'basic2021', 'control', 'landgrnt', 'stem_rsd', 'anenr1920', 'rooms']
hd_columns = ['UNITID', 'INSTNM', 'CITY', 'COUNTYNM', 'LONGITUD', 'LATITUDE']
output_dir = 'data/MA_AP_performance/hd2023_coordinate'

# Per-state frames (sorted by institution name) collected for the combined sheet.
data_temp = []
for state_name_abbrev in state_name_abbrev_arr:

    # CCIHE2021 rows of this state whose control code is 1 (public).
    in_state = CCIHE2021_data['stabbr'] == state_name_abbrev
    is_selected = CCIHE2021_data['control'].isin([1])
    CCIHE2021_data_state = (
        CCIHE2021_data.loc[in_state & is_selected, ccihe_columns]
        .sort_values(by='unitid')
        .reset_index(drop=True)
    )

    # Lookup keyed by unit id. Copying the CCIHE columns over by row position would
    # attach enrollment / room figures to the wrong institution whenever a selected
    # CCIHE institution has no row in hd2023, so values are matched on the id instead.
    ccihe_by_unitid = CCIHE2021_data_state.drop_duplicates(subset='unitid').set_index('unitid')

    # .copy() so the new columns are added to an independent frame, not a view of hd2023_data.
    hd2023_data_state = hd2023_data.loc[hd2023_data['STABBR'] == state_name_abbrev, hd_columns].copy()
    hd2023_data_state[header_name] = hd2023_data_state['UNITID'].isin(CCIHE2021_data_state['unitid'])
    hd2023_data_state = hd2023_data_state[hd2023_data_state[header_name]]
    hd2023_data_state = hd2023_data_state.sort_values(by='UNITID').reset_index(drop=True)

    hd2023_data_state['Annual enrollment'] = hd2023_data_state['UNITID'].map(ccihe_by_unitid['anenr1920'])
    hd2023_data_state['Number of dorm beds'] = hd2023_data_state['UNITID'].map(ccihe_by_unitid['rooms'])

    hd2023_data_state.insert(4, 'State', [state_name_abbrev] * hd2023_data_state.shape[0])
    hd2023_data_state.to_excel(f'{output_dir}/hd2023_{file_name_str}_data_{state_name_abbrev}.xlsx', index=False)

    # Keep the frame in memory for the combined sheet; re-reading the file just written
    # is redundant and lets read_excel reinterpret cell text (e.g. "NA") as missing.
    data_temp.append(hd2023_data_state.sort_values(by='INSTNM'))

#### concat all states data into one sheet
combined_states_data = pd.concat(data_temp, ignore_index=True)
combined_states_data.to_excel(f'{output_dir}/hd2023_{file_name_str}_data_{home_state[0]}_allnearby.xlsx', index=False)

In [5]:
'''
(3) Private not-for-profit
'''
file_name_str = 'private_notforprofit'
header_name = 'Private not-for-profit'

# Columns kept from the CCIHE2021 sheet and from the hd2023 sheet respectively.
ccihe_columns = ['unitid', 'name', 'city', 'basic2021', 'control', 'landgrnt', 'stem_rsd', 'anenr1920', 'rooms']
hd_columns = ['UNITID', 'INSTNM', 'CITY', 'COUNTYNM', 'LONGITUD', 'LATITUDE']
output_dir = 'data/MA_AP_performance/hd2023_coordinate'

# Per-state frames (sorted by institution name) collected for the combined sheet.
data_temp = []
for state_name_abbrev in state_name_abbrev_arr:

    # CCIHE2021 rows of this state whose control code is 2 (private not-for-profit).
    in_state = CCIHE2021_data['stabbr'] == state_name_abbrev
    is_selected = CCIHE2021_data['control'].isin([2])
    CCIHE2021_data_state = (
        CCIHE2021_data.loc[in_state & is_selected, ccihe_columns]
        .sort_values(by='unitid')
        .reset_index(drop=True)
    )

    # Lookup keyed by unit id. Copying the CCIHE columns over by row position would
    # attach enrollment / room figures to the wrong institution whenever a selected
    # CCIHE institution has no row in hd2023, so values are matched on the id instead.
    ccihe_by_unitid = CCIHE2021_data_state.drop_duplicates(subset='unitid').set_index('unitid')

    # .copy() so the new columns are added to an independent frame, not a view of hd2023_data.
    hd2023_data_state = hd2023_data.loc[hd2023_data['STABBR'] == state_name_abbrev, hd_columns].copy()
    hd2023_data_state[header_name] = hd2023_data_state['UNITID'].isin(CCIHE2021_data_state['unitid'])
    hd2023_data_state = hd2023_data_state[hd2023_data_state[header_name]]
    hd2023_data_state = hd2023_data_state.sort_values(by='UNITID').reset_index(drop=True)

    hd2023_data_state['Annual enrollment'] = hd2023_data_state['UNITID'].map(ccihe_by_unitid['anenr1920'])
    hd2023_data_state['Number of dorm beds'] = hd2023_data_state['UNITID'].map(ccihe_by_unitid['rooms'])

    hd2023_data_state.insert(4, 'State', [state_name_abbrev] * hd2023_data_state.shape[0])
    hd2023_data_state.to_excel(f'{output_dir}/hd2023_{file_name_str}_data_{state_name_abbrev}.xlsx', index=False)

    # Keep the frame in memory for the combined sheet; re-reading the file just written
    # is redundant and lets read_excel reinterpret cell text (e.g. "NA") as missing.
    data_temp.append(hd2023_data_state.sort_values(by='INSTNM'))

#### concat all states data into one sheet
combined_states_data = pd.concat(data_temp, ignore_index=True)
combined_states_data.to_excel(f'{output_dir}/hd2023_{file_name_str}_data_{home_state[0]}_allnearby.xlsx', index=False)

In [6]:
'''
(4) Land Grant
'''
file_name_str = 'Land_Grant'
header_name = 'Land Grant'

# Columns kept from the CCIHE2021 sheet and from the hd2023 sheet respectively.
ccihe_columns = ['unitid', 'name', 'city', 'basic2021', 'control', 'landgrnt', 'stem_rsd', 'anenr1920', 'rooms']
hd_columns = ['UNITID', 'INSTNM', 'CITY', 'COUNTYNM', 'LONGITUD', 'LATITUDE']
output_dir = 'data/MA_AP_performance/hd2023_coordinate'

# Per-state frames (sorted by institution name) collected for the combined sheet.
data_temp = []
for state_name_abbrev in state_name_abbrev_arr:

    # CCIHE2021 rows of this state whose landgrnt code is 1 (land-grant: yes).
    in_state = CCIHE2021_data['stabbr'] == state_name_abbrev
    is_selected = CCIHE2021_data['landgrnt'].isin([1])
    CCIHE2021_data_state = (
        CCIHE2021_data.loc[in_state & is_selected, ccihe_columns]
        .sort_values(by='unitid')
        .reset_index(drop=True)
    )

    # Lookup keyed by unit id. Copying the CCIHE columns over by row position would
    # attach enrollment / room figures to the wrong institution whenever a selected
    # CCIHE institution has no row in hd2023, so values are matched on the id instead.
    ccihe_by_unitid = CCIHE2021_data_state.drop_duplicates(subset='unitid').set_index('unitid')

    # .copy() so the new columns are added to an independent frame, not a view of hd2023_data.
    hd2023_data_state = hd2023_data.loc[hd2023_data['STABBR'] == state_name_abbrev, hd_columns].copy()
    hd2023_data_state[header_name] = hd2023_data_state['UNITID'].isin(CCIHE2021_data_state['unitid'])
    hd2023_data_state = hd2023_data_state[hd2023_data_state[header_name]]
    hd2023_data_state = hd2023_data_state.sort_values(by='UNITID').reset_index(drop=True)

    hd2023_data_state['Annual enrollment'] = hd2023_data_state['UNITID'].map(ccihe_by_unitid['anenr1920'])
    hd2023_data_state['Number of dorm beds'] = hd2023_data_state['UNITID'].map(ccihe_by_unitid['rooms'])

    hd2023_data_state.insert(4, 'State', [state_name_abbrev] * hd2023_data_state.shape[0])
    hd2023_data_state.to_excel(f'{output_dir}/hd2023_{file_name_str}_data_{state_name_abbrev}.xlsx', index=False)

    # Keep the frame in memory for the combined sheet; re-reading the file just written
    # is redundant and lets read_excel reinterpret cell text (e.g. "NA") as missing.
    data_temp.append(hd2023_data_state.sort_values(by='INSTNM'))

#### concat all states data into one sheet
combined_states_data = pd.concat(data_temp, ignore_index=True)
combined_states_data.to_excel(f'{output_dir}/hd2023_{file_name_str}_data_{home_state[0]}_allnearby.xlsx', index=False)

In [7]:
'''
(5) STEM
'''
file_name_str = 'STEM'
header_name = 'STEM'

# Columns kept from the CCIHE2021 sheet and from the hd2023 sheet respectively.
ccihe_columns = ['unitid', 'name', 'city', 'basic2021', 'control', 'landgrnt', 'stem_rsd', 'anenr1920', 'rooms']
hd_columns = ['UNITID', 'INSTNM', 'CITY', 'COUNTYNM', 'LONGITUD', 'LATITUDE']
output_dir = 'data/MA_AP_performance/hd2023_coordinate'

# Per-state frames (sorted by institution name) collected for the combined sheet.
data_temp = []
for state_name_abbrev in state_name_abbrev_arr:

    # CCIHE2021 rows of this state that have any stem_rsd value recorded
    # (zero counts as recorded; only missing values are excluded).
    in_state = CCIHE2021_data['stabbr'] == state_name_abbrev
    is_selected = CCIHE2021_data['stem_rsd'].notna()
    CCIHE2021_data_state = (
        CCIHE2021_data.loc[in_state & is_selected, ccihe_columns]
        .sort_values(by='unitid')
        .reset_index(drop=True)
    )

    # Lookup keyed by unit id. Copying the CCIHE columns over by row position would
    # attach enrollment / room figures to the wrong institution whenever a selected
    # CCIHE institution has no row in hd2023, so values are matched on the id instead.
    ccihe_by_unitid = CCIHE2021_data_state.drop_duplicates(subset='unitid').set_index('unitid')

    # .copy() so the new columns are added to an independent frame, not a view of hd2023_data.
    hd2023_data_state = hd2023_data.loc[hd2023_data['STABBR'] == state_name_abbrev, hd_columns].copy()
    hd2023_data_state[header_name] = hd2023_data_state['UNITID'].isin(CCIHE2021_data_state['unitid'])
    hd2023_data_state = hd2023_data_state[hd2023_data_state[header_name]]
    hd2023_data_state = hd2023_data_state.sort_values(by='UNITID').reset_index(drop=True)

    hd2023_data_state['Annual enrollment'] = hd2023_data_state['UNITID'].map(ccihe_by_unitid['anenr1920'])
    hd2023_data_state['Number of dorm beds'] = hd2023_data_state['UNITID'].map(ccihe_by_unitid['rooms'])

    hd2023_data_state.insert(4, 'State', [state_name_abbrev] * hd2023_data_state.shape[0])
    hd2023_data_state.to_excel(f'{output_dir}/hd2023_{file_name_str}_data_{state_name_abbrev}.xlsx', index=False)

    # Keep the frame in memory for the combined sheet; re-reading the file just written
    # is redundant and lets read_excel reinterpret cell text (e.g. "NA") as missing.
    data_temp.append(hd2023_data_state.sort_values(by='INSTNM'))

#### concat all states data into one sheet
combined_states_data = pd.concat(data_temp, ignore_index=True)
combined_states_data.to_excel(f'{output_dir}/hd2023_{file_name_str}_data_{home_state[0]}_allnearby.xlsx', index=False)