# In this notebook, I calculate various inverse-distance quantities and combine all the data into one Excel file.

In [10]:
import pandas as pd
import numpy as np
from geopy import distance

In [11]:
def _read_sorted_table(xlsx_path, sort_column):
    """Read an Excel sheet, order its rows by *sort_column* and renumber the index from 0."""
    table = pd.read_excel(xlsx_path)
    table = table.sort_values(by=sort_column)
    return table.reset_index(drop=True)


## district coordinates, ordered by district code
MA_district_coordinates = _read_sorted_table('data/MA_AP_performance/MA_district_coordinates.xlsx', 'District Code')

## one university table per category, each ordered by institution name
uni_coord_R1R2 = _read_sorted_table('data/MA_AP_performance/hd2023_coordinate/hd2023_R1R2_data_MA_allnearby.xlsx', 'INSTNM')
uni_coord_Public = _read_sorted_table('data/MA_AP_performance/hd2023_coordinate/hd2023_Public_university_data_MA_allnearby.xlsx', 'INSTNM')
uni_coord_Private_nfp = _read_sorted_table('data/MA_AP_performance/hd2023_coordinate/hd2023_private_notforprofit_data_MA_allnearby.xlsx', 'INSTNM')
uni_coord_Land_Grant = _read_sorted_table('data/MA_AP_performance/hd2023_coordinate/hd2023_Land_Grant_data_MA_allnearby.xlsx', 'INSTNM')
uni_coord_STEM = _read_sorted_table('data/MA_AP_performance/hd2023_coordinate/hd2023_STEM_data_MA_allnearby.xlsx', 'INSTNM')


###########################

def _first_lat_lon(table, key_column, key):
    """Return ``[latitude, longitude]`` from the first row of *table* whose *key_column* equals *key*.

    Raises:
        IndexError: if no row of *table* matches *key*.
    """
    ## select the columns in (lat, lon) order, which is the order handed to geopy below
    matches = table.loc[table[key_column] == key, ['LATITUDE', 'LONGITUD']].to_numpy()
    if len(matches) == 0:
        raise IndexError('no row with {} == {!r}'.format(key_column, key))
    return matches[0]


def _district_university_miles(district_code, university_name, university_table):
    """Return the geopy distance in miles between a district and a university of *university_table*."""
    district_coor = _first_lat_lon(MA_district_coordinates, 'District Code', district_code)
    university_coor = _first_lat_lon(university_table, 'INSTNM', university_name)
    return distance.distance(district_coor, university_coor).miles    ## return unit is miles


def distance_district_univeristy_R1R2(district_code, university_name):
    """Distance in miles from the district to the named university in ``uni_coord_R1R2``."""
    return _district_university_miles(district_code, university_name, uni_coord_R1R2)


def distance_district_univeristy_Public(district_code, university_name):
    """Distance in miles from the district to the named university in ``uni_coord_Public``."""
    return _district_university_miles(district_code, university_name, uni_coord_Public)


def distance_district_univeristy_Private_nfp(district_code, university_name):
    """Distance in miles from the district to the named university in ``uni_coord_Private_nfp``."""
    return _district_university_miles(district_code, university_name, uni_coord_Private_nfp)


def distance_district_univeristy_Land_Grant(district_code, university_name):
    """Distance in miles from the district to the named university in ``uni_coord_Land_Grant``."""
    return _district_university_miles(district_code, university_name, uni_coord_Land_Grant)


def distance_district_univeristy_STEM(district_code, university_name):
    """Distance in miles from the district to the named university in ``uni_coord_STEM``."""
    return _district_university_miles(district_code, university_name, uni_coord_STEM)


###########################

## softening scale of the inverse-distance weights, in miles
epsilon_soften_factor = 10.0


def inverse_distance_weight(distance_miles):
    """Return the softened inverse-distance weight ``1 / (distance_miles / epsilon_soften_factor + 1)``.

    The weight is 1 at zero distance and 0.5 at ``epsilon_soften_factor`` miles.
    """
    softened_distance = distance_miles / epsilon_soften_factor + 1
    return 1.0 / softened_distance

def enrollment_inverse_distance_weight(distance_miles, num_annual_enrollment):
    """Return the softened inverse-distance weight scaled by the annual enrollment.

    ``num_annual_enrollment`` is converted with ``int()`` before it is used.
    """
    enrollment = int(num_annual_enrollment)
    softened_distance = distance_miles / epsilon_soften_factor + 1
    return enrollment * 1.0 / softened_distance

def dormbed_inverse_distance_weight(distance_miles, num_dorm_bed):
    """Return the softened inverse-distance weight scaled by the number of dorm beds.

    ``num_dorm_bed`` is converted with ``int()`` before it is used.
    """
    dorm_beds = int(num_dorm_bed)
    softened_distance = distance_miles / epsilon_soften_factor + 1
    return dorm_beds * 1.0 / softened_distance

# The following cell takes a long time (~13 min) to run.

In [14]:
#### For each school year 2018..2022: read that year's AP performance sheet, add the
#### inverse-distance columns for every university category, attach population, median
#### household income and per-pupil expenditure, and save one Excel file per year.

def _university_records(uni_table):
    """Return ``[(name, annual enrollment, number of dorm beds), ...]`` for *uni_table*.

    One record is produced per row, in row order; the two numbers are read from
    the first row carrying that institution name.
    """
    records = []
    for university_name in uni_table['INSTNM'].tolist():
        rows = uni_table[uni_table['INSTNM'] == university_name]
        records.append((university_name,
                        rows['Annual enrollment'].values[0],
                        rows['Number of dorm beds'].values[0]))
    return records


def _inverse_distance_sums(district_code, records, distance_fn):
    """Sum the plain, enrollment-weighted and dorm-bed-weighted weights of one district over *records*."""
    total_val = enll_total_val = dorm_total_val = 0
    for university_name, num_annual_enrollment, num_dorm_bed in records:
        distance_ij = distance_fn(district_code, university_name)
        total_val      += inverse_distance_weight(distance_ij)
        enll_total_val += enrollment_inverse_distance_weight(distance_ij, num_annual_enrollment)
        dorm_total_val += dormbed_inverse_distance_weight(distance_ij, num_dorm_bed)
    return total_val, enll_total_val, dorm_total_val


def _lookup_by_district(target, table, column):
    """Return *column* of *table* matched to the rows of *target* through 'District Code'.

    Matching on the code (instead of relying on both tables having the same row
    order and length) keeps the values attached to the right district even when
    a code occurs more than once in either table.
    """
    values = table.drop_duplicates(subset='District Code').set_index('District Code')[column]
    return target['District Code'].map(values)


def _dollars_to_float(value):
    """Convert a cell such as '$12,345.67' (or an already numeric cell) to float."""
    if isinstance(value, str):
        return float(value.replace('$', '').replace(',', '').strip())
    return float(value)


#### These two tables do not depend on the year: read them once instead of once per year.
income_all_years = pd.read_excel('data/MA_AP_performance/MA_median_household_income.xlsx')
population_all_years = pd.read_excel('data/MA_AP_performance/MA_district_population.xlsx')

#### (label used in the output column names, per-university records, distance function)
university_categories = [
    ('R1R2',        _university_records(uni_coord_R1R2),        distance_district_univeristy_R1R2),
    ('Public',      _university_records(uni_coord_Public),      distance_district_univeristy_Public),
    ('Private nfp', _university_records(uni_coord_Private_nfp), distance_district_univeristy_Private_nfp),
    ('Land Grant',  _university_records(uni_coord_Land_Grant),  distance_district_univeristy_Land_Grant),
    ('STEM',        _university_records(uni_coord_STEM),        distance_district_univeristy_STEM),
]

#### The sums depend only on (category, district), not on the year, so each one is
#### computed the first time it is needed and reused for the remaining years.
cached_sums = {label: {} for label, _, _ in university_categories}

for year in range(18, 22 + 1):

    MA_expenditure_year = pd.read_excel('data/MA_AP_performance/PerPupilExpenditures/PerPupilExpenditures_20%s.xlsx' % year)

    #### keep only the districts that are present in every input table
    MA_AP_performance_year = pd.read_excel('data/MA_AP_performance/AP_performance_18_22.xlsx', sheet_name='20%s-%s' % (year, year + 1))
    for required_table in (MA_district_coordinates, income_all_years, population_all_years, MA_expenditure_year):
        MA_AP_performance_year = MA_AP_performance_year[MA_AP_performance_year['District Code'].isin(required_table['District Code'])]
    MA_AP_performance_year = MA_AP_performance_year.sort_values(by='District Code').reset_index(drop=True)

    #### inverse-distance columns, one group of three per university category
    district_codes = MA_AP_performance_year['District Code'].tolist()
    for label, records, distance_fn in university_categories:
        category_cache = cached_sums[label]
        district_sums = []
        for district_code in district_codes:
            if district_code not in category_cache:
                category_cache[district_code] = _inverse_distance_sums(district_code, records, distance_fn)
            district_sums.append(category_cache[district_code])

        MA_AP_performance_year['Inverse Distance %s' % label] = [sums[0] for sums in district_sums]
        MA_AP_performance_year['Enrollment Inverse Distance %s' % label] = [sums[1] for sums in district_sums]
        MA_AP_performance_year['Dorm Bed Inverse Distance %s' % label] = [sums[2] for sums in district_sums]

    #### delete redundant columns in the AP_performance excel sheet
    MA_all_data_year = MA_AP_performance_year.drop(columns=['Score=1', 'Score=2', 'Score=3', 'Score=4', 'Score=5', '% Score 1-2'])
    MA_all_data_year.insert(2, 'Year', [2000 + year] * MA_all_data_year.shape[0])
    MA_all_data_year = MA_all_data_year.sort_values(by='District Code').reset_index(drop=True)

    #### Add a column for district population
    MA_district_population = population_all_years[population_all_years['District Code'].isin(MA_all_data_year['District Code'])]
    MA_district_population = MA_district_population.sort_values(by='District Code').reset_index(drop=True)
    MA_all_data_year['Population'] = _lookup_by_district(MA_all_data_year, MA_district_population, 'Population')

    #### Add a column for district median household income
    MA_median_income = income_all_years[income_all_years['District Code'].isin(MA_all_data_year['District Code'])]
    MA_median_income = MA_median_income.sort_values(by='District Code').reset_index(drop=True)
    MA_all_data_year['Median Household Income'] = _lookup_by_district(MA_all_data_year, MA_median_income, 'Median Household Income')

    #### Add a column for district total expenditure per pupil
    MA_expenditure_year = MA_expenditure_year[MA_expenditure_year['District Code'].isin(MA_all_data_year['District Code'])]
    MA_expenditure_year = MA_expenditure_year.sort_values(by='District Code').reset_index(drop=True)
    expenditure_raw = _lookup_by_district(MA_all_data_year, MA_expenditure_year, 'Total Expenditures per Pupil')
    MA_all_data_year['Total Expenditures per Pupil'] = expenditure_raw.map(_dollars_to_float)

    #### save this year's data in an excel file
    MA_all_data_year.to_excel('data/MA_AP_performance/AP_data_combined/AP_data_combined_20%s.xlsx' % year, index=False)

In [15]:
#### Load the per-year combined files (each ordered by district code), keyed by two-digit year.
yearly_tables = {}
for two_digit_year in range(18, 22 + 1):
    yearly_path = 'data/MA_AP_performance/AP_data_combined/AP_data_combined_20%s.xlsx' % two_digit_year
    yearly_tables[two_digit_year] = pd.read_excel(yearly_path).sort_values(by='District Code')

data18, data19, data20, data21, data22 = (yearly_tables[yr] for yr in range(18, 22 + 1))

#### Stack all years into one table with a fresh 0..n-1 index.
combined_data_18_22 = pd.concat(list(yearly_tables.values()), ignore_index=True)

#### One workbook: the stacked table first, then one sheet per school year.
with pd.ExcelWriter("data/MA_AP_performance/AP_data_combined_18_22.xlsx") as writer:
    combined_data_18_22.to_excel(writer, sheet_name="2018-22", index=False)
    for two_digit_year, yearly_table in yearly_tables.items():
        yearly_sheet = "20%s-%s" % (two_digit_year, two_digit_year + 1)
        yearly_table.to_excel(writer, sheet_name=yearly_sheet, index=False)