# Working scripts for correcting data and saving it in a pandas-loadable format

In [None]:
import pandas as pd
import numpy as np


## Carnegie Data

In [None]:
def gimmeCarnegieFull(path='data/CCIHE2021-PublicData.xlsx', sheet_name='Data'):
    """Load the full Carnegie classification workbook sheet as a DataFrame.

    No column selection or recoding happens here: the sheet is returned
    exactly as pandas reads it.
    County information should be applied post-loading where appropriate.

    Parameters
    ----------
    path : str
        Location of the Carnegie Excel workbook. Defaults to the 2021
        public data file under ``data/``.
    sheet_name : str
        Worksheet to read. Defaults to ``'Data'``.

    Returns
    -------
    pandas.DataFrame
        Every row and column of the requested sheet.
    """
    # Carnegie university classification as a dataframe
    return pd.read_excel(path, sheet_name=sheet_name)

### Desired Carnegie Keys
name, city, stabbr, basic2021, enrprofile2021, sector, control, hbcu, hsi, msi, womens, selindex, rooms, ugtenr20, tribal

### Necessary reclassification/joining
tribal: 2 (no) should be 0 (no)

hbcu: 2 (no) should be 0 (no)

basic2021:
-2: Remove values, not classified
1, 2, 3, 4, 5, 6, 7, 8, 9, 14: Associates (**0**)
10, 11, 12, 13, 24, 25, 26, 27, 28, 29, 30, 31, 32: Professional (**1**)
15, 16: R-schools (**2**)
17: Doctoral (**3**)
18, 19, 20: Masters (**4**)
21, 22, 23: Bachelors (**5**)

In [None]:
# Load every Carnegie column, then keep only the ones used downstream.
carnegie_full = gimmeCarnegieFull()
needed_information = [
    'name', 'city', 'stabbr', 'basic2021', 'enrprofile2021', 'sector',
    'control', 'hbcu', 'hsi', 'msi', 'womens', 'selindex', 'rooms',
    'ugtenr20', 'tribal',
]
# .copy() makes carnegie_partial an independent frame, so the in-place .loc
# recoding applied to it afterwards is not flagged by pandas as a write to a
# slice of carnegie_full (SettingWithCopyWarning).
carnegie_partial = carnegie_full[needed_information].copy()

In [None]:
# In the hbcu and tribal columns the value 2 means "no"; rewrite it as 0.
for flag_column in ('hbcu', 'tribal'):
    coded_as_no = carnegie_partial[flag_column] == 2
    carnegie_partial.loc[coded_as_no, flag_column] = 0

In [None]:
# Basic classification grouping.
# Drop rows whose basic2021 code is -2. The .copy() makes the result an
# independent frame so the recode below does not write to a slice.
carnegie_partial = carnegie_partial[carnegie_partial['basic2021'] != -2].copy()

# Raw basic2021 codes belonging to each group.
associates = [1, 2, 3, 4, 5, 6, 7, 8, 9, 14]
professional = [10, 11, 12, 13, 24, 25, 26, 27, 28, 29, 30, 31, 32]
rschools = [15, 16]
doctoral = [17]
masters = [18, 19, 20]
bachelors = [21, 22, 23]

# Group code -> raw codes: 0 Associates, 1 Professional, 2 R-schools,
# 3 Doctoral, 4 Masters, 5 Bachelors.
basic2021_groups = {
    0: associates,
    1: professional,
    2: rschools,
    3: doctoral,
    4: masters,
    5: bachelors,
}
basic2021_recode = {
    raw_code: group_code
    for group_code, raw_codes in basic2021_groups.items()
    for raw_code in raw_codes
}

# Recode every row in one pass against its ORIGINAL code. The group codes
# (0-5) overlap the raw codes (1-5), so recoding in place one group at a time
# only gives the right answer for one particular statement order; a single
# lookup does not depend on order. Codes absent from the table are left
# unchanged.
carnegie_partial['basic2021'] = carnegie_partial['basic2021'].map(
    lambda code: basic2021_recode.get(code, code)
)

In [None]:
# Write the reduced Carnegie table to CSV; the row index is not saved.
carnegie_partial.to_csv(
    path_or_buf='data/CCIHE2021-PublicData_limited.csv',
    index=False,
)

## Georgia Data

In [None]:
# Download locations of the yearly GA files, one CSV per year.
url_2019 = 'https://download.gosa.ga.gov/2019/AP_2019_FEB_24_2020.csv'
url_2020 = 'https://download.gosa.ga.gov/2020/AP_2020_JUN_21_2021.csv'
url_2021 = 'https://download.gosa.ga.gov/2021/AP_2021_Dec062021.csv'
url_2022 = 'https://download.gosa.ga.gov/2022/AP_2022_Apr102023.csv'
url_2023 = 'https://download.gosa.ga.gov/2023/AP_2022-23_2024-04-02_14_14_37.csv'

# Read each file into its own frame, in year order.
ga_2019, ga_2020, ga_2021, ga_2022, ga_2023 = map(
    pd.read_csv,
    (url_2019, url_2020, url_2021, url_2022, url_2023),
)

In [None]:
# Show each year's column names so header differences between files are visible.
for yearly_frame in (ga_2019, ga_2020, ga_2021, ga_2022, ga_2023):
    print(yearly_frame.keys())

In [None]:
# Add a 'Year' column to every yearly frame so rows stay identifiable once
# the frames are combined.
for yr, yearly_frame in (
    (2019, ga_2019),
    (2020, ga_2020),
    (2021, ga_2021),
    (2022, ga_2022),
    (2023, ga_2023),
):
    yearly_frame['Year'] = yr


In [None]:
# Rename the 2023 column to 'NOTESTS_3ORHIGHER', the name the later cells
# select on (presumably the header used by the earlier years' files; confirm
# against the printed keys).
ga_2023 = ga_2023.rename(
    {'NUMBER_TESTS_3_OR_HIGHER': 'NOTESTS_3ORHIGHER'},
    axis='columns',
)

In [None]:
# Stack the five yearly frames row-wise into a single frame.
years = [ga_2019, ga_2020, ga_2021, ga_2022, ga_2023]
ga_total = pd.concat(objs=years)

In [None]:
# Display the column names of the combined frame.
ga_total.columns

### kept and new information GA

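Kept: ['SCHOOL_DSTRCT_NM', 'INSTN_NUMBER', 'TEST_CMPNT_TYP_NM', 'NUMBER_TESTS_TAKEN', 'NOTESTS_3ORHIGHER'] (`INSTN_NUMBER` is used only to select the `SCHOOL_ALL` rows; it is not written to the output file)
New: ['Year', 'County', 'Pass Rate']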

All non-county data is dropped, except Atlanta Public Schools, which is merged into Fulton County

Rows whose test counts are reported as 'TFS' are dropped

School specific data is dropped

In [None]:
# Columns carried forward from the combined GA frame.
kept_keys = [
    'SCHOOL_DSTRCT_NM',
    'TEST_CMPNT_TYP_NM',
    'NUMBER_TESTS_TAKEN',
    'NOTESTS_3ORHIGHER',
    'Year',
]
# Keep only the rows flagged 'SCHOOL_ALL' in INSTN_NUMBER, and only the
# columns listed above.
all_schools_rows = ga_total['INSTN_NUMBER'] == 'SCHOOL_ALL'
ga_partial = ga_total.loc[all_schools_rows, kept_keys]


In [None]:
# Atlanta fixer: fold the Atlanta rows into Fulton County by summing the test
# counts per (Year, test component).
count_columns = ['NUMBER_TESTS_TAKEN', 'NOTESTS_3ORHIGHER']

# NOTE(review): these are substring matches, so any other district whose name
# contains 'Atlanta' or 'Fulton' is also folded in; confirm that is intended.
temp1 = ga_partial[ga_partial['SCHOOL_DSTRCT_NM'].str.contains('Atlanta', na=False)]
temp2 = ga_partial[ga_partial['SCHOOL_DSTRCT_NM'].str.contains('Fulton', na=False)]
atlanta_fulton = pd.concat([temp1, temp2], ignore_index=True)

# The count columns are still text at this point (they are only cast to float
# in the missing-data step), and summing text concatenates it
# ('500' + '1000' -> '5001000'). Convert to numbers first; non-numeric
# entries such as 'TFS' become NaN.
for count_column in count_columns:
    atlanta_fulton[count_column] = pd.to_numeric(
        atlanta_fulton[count_column], errors='coerce'
    )

# skipna=False: if either district's count is unavailable, the combined total
# is unknown, so it stays NaN (and is removed by the later dropna) instead of
# silently becoming a partial sum.
temp3 = (
    atlanta_fulton
    .groupby(['Year', 'TEST_CMPNT_TYP_NM'])[count_columns]
    .agg(lambda counts: counts.sum(skipna=False))
    .reset_index()
)
temp3['SCHOOL_DSTRCT_NM'] = 'Fulton County'

# Replace the original Fulton County rows with the merged ones.
ga_partial = ga_partial[ga_partial['SCHOOL_DSTRCT_NM'] != 'Fulton County']
ga_partial = pd.concat([ga_partial, temp3])

In [None]:
# Missing data fixer: remove incomplete rows, then rows whose counts are the
# text marker 'TFS', and finally store the counts as floats.
ga_partial = ga_partial.dropna()
for count_column in ('NOTESTS_3ORHIGHER', 'NUMBER_TESTS_TAKEN'):
    # na=False treats non-text entries as "does not contain the marker".
    is_marked_tfs = ga_partial[count_column].str.contains('TFS', na=False)
    ga_partial = ga_partial[~is_marked_tfs]
ga_partial = ga_partial.astype(
    {'NUMBER_TESTS_TAKEN': 'float', 'NOTESTS_3ORHIGHER': 'float'}
)


In [None]:
# Pass rate = tests scored 3 or higher divided by tests taken.
tests_passed = ga_partial['NOTESTS_3ORHIGHER']
tests_taken = ga_partial['NUMBER_TESTS_TAKEN']
ga_partial['Pass Rate'] = tests_passed.div(tests_taken)

In [None]:
# Counties
# Keep only districts whose name contains 'County'. The .copy() makes the
# result an independent frame, so adding the new column below is not flagged
# by pandas as a write to a slice (SettingWithCopyWarning).
ga_partial = ga_partial[ga_partial['SCHOOL_DSTRCT_NM'].str.contains('County')].copy()
# County name = the part of the district name before the first ' County'.
ga_partial['County'] = ga_partial['SCHOOL_DSTRCT_NM'].str.split(' County', n=1, expand=True)[0]

In [None]:
# Write the county-level GA table to CSV; the row index is not saved.
ga_partial.to_csv(
    path_or_buf='data/GA_2019-23_counties.csv',
    index=False,
)