# Notebook 1: Cleaning & Exporting Raw Data

# 1.0 Project Dependencies

## 1.1 Pip Installs

## 1.3 Import Libraries

In [1]:
import pandas as pd
import numpy as np

## 1.3 Import Datasets

### 1.3a Import District Expenses Data

In [2]:
# Read in the expense data
expense_df = pd.read_excel('current_expense.xlsx', skiprows=10)

# Cleaning the expense data
expense_df = expense_df.rename(columns={'CDS':'District Code',
                                        'District': 'District Name',
                                        'Current\nExpense ADA': 'Expense ADA',
                                        'Current\nExpense per ADA': 'Expense per ADA'})
expense_df = expense_df.drop(['CO'], axis=1)

# Display the cleaned expense data
display(expense_df.shape)
display(expense_df.head())
# display(expense_df)

(933, 6)

Unnamed: 0,District Code,District Name,EDP 365,Expense ADA,Expense per ADA,LEA Type
0,61119,Alameda Unified,155094800.0,8567.86,18101.93,Unified
1,61127,Albany City Unified,61490900.0,3435.41,17899.14,Unified
2,61143,Berkeley Unified,220550800.0,8572.17,25728.7,Unified
3,61150,Castro Valley Unified,142491300.0,8991.52,15847.3,Unified
4,61168,Emery Unified,15863000.0,554.7,28597.44,Unified


### 1.3b Import District Demographic Data

In [3]:
# Read in the district data
district_df = pd.read_excel('district_areas.xlsx')

# Cleaning the district data
district_df['District Code'] = district_df['District Code'].astype(str).str[-5:].astype('int64')
district_df = district_df.drop(['Year',
                                    'DistrctAreaSqMi',
                                    'Shape__Area',
                                    'Shape__Length',
                                    'OBJECTID',
                                    'US Congress District',
                                    'CA Senate District',
                                    'CA Assembly District',
                                    'Update Notes'], axis=1)

# Display the cleaned district data
display(district_df.shape)
display(district_df.head())
# display(district_df)

(938, 45)

Unnamed: 0,Fed ID,District Code,CDS Code,County Name,District Name,District Type,Grade Low,Grade High,Grade Low Census,Grade High Census,...,Foster,Foster (%),Homeless,Homeless (%),Migrant,Migrant (%),Students with Disabilities,Students with Disabilities (%),Socioeconomically Disadvantaged,Socioeconomically Disadvantaged (%)
0,601770,61119,1611190000000,Alameda,Alameda Unified,Unified,KG,12,KG,12,...,27,0.3,91,0.9,0,0.0,1286,12.2,4035,38.2
1,601860,61127,1611270000000,Alameda,Albany City Unified,Unified,KG,12,KG,12,...,1,0.0,24,0.7,0,0.0,320,9.0,1122,31.4
2,604740,61143,1611430000000,Alameda,Berkeley Unified,Unified,KG,12,KG,12,...,32,0.4,150,1.7,0,0.0,1092,12.0,2508,27.6
3,607800,61150,1611500000000,Alameda,Castro Valley Unified,Unified,KG,12,KG,12,...,5,0.1,135,1.4,0,0.0,1048,11.0,3686,38.8
4,612630,61168,1611680000000,Alameda,Emery Unified,Unified,KG,12,KG,12,...,1,0.2,13,2.2,0,0.0,75,12.5,327,54.5


# 2.0 Data Cleaning

### 2.1 Check for Column Types & NaN Values

In [4]:
display(expense_df.info())
display(district_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 933 entries, 0 to 932
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   District Code    933 non-null    int64  
 1   District Name    933 non-null    object 
 2   EDP 365          933 non-null    float64
 3   Expense ADA      933 non-null    float64
 4   Expense per ADA  933 non-null    float64
 5   LEA Type         933 non-null    object 
dtypes: float64(3), int64(1), object(2)
memory usage: 43.9+ KB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 938 entries, 0 to 937
Data columns (total 45 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Fed ID                               938 non-null    int64  
 1   District Code                        938 non-null    int64  
 2   CDS Code                             938 non-null    int64  
 3   County Name                          938 non-null    object 
 4   District Name                        938 non-null    object 
 5   District Type                        938 non-null    object 
 6   Grade Low                            938 non-null    object 
 7   Grade High                           938 non-null    int64  
 8   Grade Low Census                     938 non-null    object 
 9   Grade High Census                    938 non-null    int64  
 10  Assistance Status                    930 non-null    object 
 11  Locale                          

None

### 2.2 Determining District Differences Between `expense_df` and `district_df`

---



In [5]:
# Correcting the code to ensure len(codes_excluded) is 933

# Get unique district codes from both dataframes
expense_codes = expense_df['District Code'].unique()
district_codes = district_df['District Code'].unique()

# Find codes that are in district(938) and NOT in expense(933)
codes_excluded_from_expense = [code for code in district_codes if code not in expense_codes]

# Find codes that are in expense(933) and NOT in district(938)
codes_excluded_from_district = [code for code in expense_codes if code not in district_codes]

# Check excluded codes between two datasets
print(f'Number of Values not in Expense: {len(codes_excluded_from_expense)} \nList: {codes_excluded_from_expense}')
print(district_df[district_df['District Code'].isin(codes_excluded_from_expense)]['District Name'].tolist())
print(f'Number of Values not in District: {len(codes_excluded_from_district)} \nList: {codes_excluded_from_district}')
print(expense_df[expense_df['District Code'].isin(codes_excluded_from_district)]['District Name'].tolist())

Number of Values not in Expense: 10 
List: [65557, 65599, 69815, 69823, 70854, 70862, 70912, 70920, 71167, 71175]
['Arena Union Elementary', 'Point Arena Joint Union High', 'Santa Cruz City Elementary', 'Santa Cruz City High', 'Petaluma City Elementary', 'Petaluma Joint Union High', 'Santa Rosa Elementary', 'Santa Rosa High', 'Modesto City Elementary', 'Modesto City High']
Number of Values not in District: 5 
List: [76349, 40261, 40246, 40253, 40717]
['Arena Union Elementary/Point Arena Joint Union High ', 'Santa Cruz City Elementary/High', 'Petaluma City Elementary/Joint Union High', 'Santa Rosa City Schools', 'Modesto City Schools']


### 2.3 Consolidating Non-Unifed Districts in `district_df` into Unified Districts, as shown in `expense_df`

During EDA, we identified 10 districts in our District DF dataset that were not included in our Expense DF dataset.

>`['Arena Union Elementary', 'Point Arena Joint Union High', 'Santa Cruz City Elementary', 'Santa Cruz City High', 'Petaluma City Elementary', 'Petaluma Joint Union High', 'Santa Rosa Elementary', 'Santa Rosa High', 'Modesto City Elementary', 'Modesto City High']`

Likewise, 5 districts in our Expense DF dataset were not in our District DF dataset.

>`['Arena Union Elementary/Point Arena Joint Union High ', 'Santa Cruz City Elementary/High', 'Petaluma City Elementary/Joint Union High', 'Santa Rosa City Schools', 'Modesto City Schools']`

After comparing the two lists, we found the following:

In [6]:
district_table_dict = {
    'Arena Union Elementary': 'Arena Union Elementary/Point Arena Joint Union High ',
    'Point Arena Joint Union High': 'Arena Union Elementary/Point Arena Joint Union High ',
    'Santa Cruz City Elementary': 'Santa Cruz City Elementary/High',
    'Santa Cruz City High': 'Santa Cruz City Elementary/High',
    'Petaluma City Elementary': 'Petaluma City Elementary/Joint Union High',
    'Petaluma Joint Union High': 'Petaluma City Elementary/Joint Union High',
    'Santa Rosa Elementary': 'Santa Rosa City Schools',
    'Santa Rosa High': 'Santa Rosa City Schools',
    'Modesto City Elementary': 'Modesto City Schools',
    'Modesto City High': 'Modesto City Schools'
}

dict_df = (
    pd.DataFrame(list(district_table_dict.items()), columns=['Original Districts', 'Unified District Name'])
    .groupby('Unified District Name')['Original Districts']
    .apply(lambda x: ', '.join(x))
    .reset_index()
)

display(dict_df)

Unnamed: 0,Unified District Name,Original Districts
0,Arena Union Elementary/Point Arena Joint Union...,"Arena Union Elementary, Point Arena Joint Unio..."
1,Modesto City Schools,"Modesto City Elementary, Modesto City High"
2,Petaluma City Elementary/Joint Union High,"Petaluma City Elementary, Petaluma Joint Union..."
3,Santa Cruz City Elementary/High,"Santa Cruz City Elementary, Santa Cruz City High"
4,Santa Rosa City Schools,"Santa Rosa Elementary, Santa Rosa High"


### 2.4 Plan to Maintain Fidelity with 5 Unified Districts

Given that the Expense data used only the Unified District Names, we decided to do the following:

1. Manually identify all 10 districts in the District DF dataframe
2. Locales were the same across unified districts, so we kept them as is.
3. Sum the values for the following district values:
>`['Enroll Total',  'Enroll Charter', 'Enroll Non Charter']`
4. Sum the values for all:
>`['Race & Ethnicity', 'English Learners', 'Foster, 'Homeless', 'Migrant', 'Students with Disabilities', and 'Socioeconomically Disadvantaged']` columns for Count values only.
5. Recalculate the percentages for all values in **Step 4**.

Ultimately, our final merged dataset will **only** contain the Unified District Names for these 10 specific districts.

### 2.5 Manually Merging 10 Districts into 5 Unified Districts & Merging District Expense & District Areas Datasets

In [7]:
print('Unique District Codes in district_df: ', district_df['District Code'].nunique())
print('Unique District Codes in expense_df: ', expense_df['District Code'].nunique())
print('\n')
print('5 Codes Excluded from district_df: ', codes_excluded_from_district)
print('10 Codes Excluded from expense_df: ',codes_excluded_from_expense)

Unique District Codes in district_df:  938
Unique District Codes in expense_df:  933


5 Codes Excluded from district_df:  [76349, 40261, 40246, 40253, 40717]
10 Codes Excluded from expense_df:  [65557, 65599, 69815, 69823, 70854, 70862, 70912, 70920, 71167, 71175]


#### Step 1: Creating a Dictionary of District Codes to be Unified

In [8]:
# Key:    Un-unified District Code to be replaced
# Value:  UnifiedDistrict Code to replace with
dict_of_codes = {
    65557: 76349,
    65599: 76349,
    69815: 40261,
    69823: 40261,
    70854: 40246,
    70862: 40246,
    70912: 40253,
    70920: 40253,
    71167: 40717,
    71175: 40717
}

district_df = district_df.copy()

# Replace 10 District Codes in `district_df` with 5 Unified District Codes
district_df['District Code'] = district_df['District Code'].replace(dict_of_codes)

print("Unique Codes after Code replacement:", district_df['District Code'].nunique())                #933
print("Unique Names in district_df before Name replace: ", district_df['District Name'].nunique())   #927
print("Unique Names in expense_df before Name replace: ", expense_df['District Name'].nunique())     #919

Unique Codes after Code replacement: 933
Unique Names in district_df before Name replace:  927
Unique Names in expense_df before Name replace:  919


This tells us 2 things:
1. *Number of District Codes* > *Number of District Names*
2. `district_df` (927) **has 8 more names** than `expense_df` does (919)

3. `district_df`: 933 – 927 = 6 “collisions” (situations where 2+ codes share the same name)

4. `expense_df`: 933 – 919 = 14 collisions

So *Number of Codes* > *Number of Names* makes sense

#### Step 2: Determine Different District Names in Expense & District DataFrames

In [9]:
names_in_district = set(district_df['District Name'].unique())
names_in_expense  = set(expense_df['District Name'].unique())

only_in_district = names_in_district - names_in_expense
only_in_expense  = names_in_expense - names_in_district

print("Names only in district_df:", len(only_in_district))
print(sorted(list(only_in_district)))
print("Names only in expense_df:", len(only_in_expense))
print(sorted(list(only_in_expense)))

# Now we know that:
# district_df  uses  shorter names
# expense_df   uses  longer names

Names only in district_df: 92
['Alisal Union', 'Arena Union Elementary', 'Arvin Union', 'Bakersfield City', 'Bellevue Union', 'Bolinas-Stinson Union', 'Brentwood Union', 'Cajon Valley Union', 'Cambrian', 'Campbell Union', 'Castaic Union', 'Chatom Union', 'Chualar Union', 'Columbia Union', 'Cupertino Union', 'El Monte City', 'Escondido Union', 'Eureka City Schools', 'Eureka Union', 'Evergreen Union', 'Hacienda la Puente Unified', 'Hawthorne', 'Hollister', 'Kenwood', 'Keyes Union', 'King City Union', 'Lakeside Joint', 'Lakeside Union', 'Latrobe', 'Lemon Grove', 'Lennox', 'Livingston Union', 'Los Nietos', 'Lowell Joint', 'Luther Burbank', 'McCabe Union Elementary', 'McFarland Unified', 'McKinleyville Union Elementary', 'McKittrick Elementary', 'Meadows Union', 'Menifee Union', 'Modesto City Elementary', 'Modesto City High', 'Moreland', 'Mountain View Whisman', 'Newhall', 'Nicasio', 'Nuview Union', 'Ocean View', 'Old Adobe Union', 'Ontario-Montclair', 'Orange Center', 'Orinda Union', 'Oro 

To summarize this information:
1. There are 2+1+1+1+1+1+1+1+1+1 = 11 "extra" codes
2. `expense_df` names are more descriptive than `district_df` names
3. This makes sense because `district_df` puts descriptive elementary/middle/high names in the `District Type` column

Therefore, we should use `expense_df`'s District Names as our "standardized district names".

Since we've already established *#Codes* > *#Names*, we know that some names are used for multiple codes. In other words, `District Names` are **non-unique**.

So we should use:
>District Code = Unique Authoritative Key

>District Name = Non-Unique, Descriptive Key

#### Step 3: Determine Non-Unique District Names

In [10]:
dupes = (
    district_df.groupby('District Name')['District Code']
    .nunique()
    .sort_values(ascending=False)
)

print(dupes[dupes > 1])

District Name
Jefferson Elementary         3
Washington Unified           2
Pacific Union Elementary     2
Lakeside Union Elementary    2
Ocean View                   2
Liberty Elementary           2
Junction Elementary          2
Hope Elementary              2
Pioneer Union Elementary     2
Mountain View Elementary     2
Name: District Code, dtype: int64


This means that there *11* non-unique District Names. In other words, the same District Name(str) is tied to multiple Codes!

For example, "Jefferson Elementary" District Name is tied to 3 District Codes (and Counties): San Benito, San Joaquin, & San Mateo

However, `district_df` shows us `County Name`, which is important for visualization labels.
>`expense_df` does not show us County Name, so identical `District Names` in `expense_df` will give us confusing visualizations.

Therefore, going forward:
1. **For Analysis & Grouping**: We should always reference to each row of data by `District Code` because codes are unique.
2. **For Visualizations**: We should use the new `District Label` column, which will be a unique concatenation of "District Name (County)".

#### Step 4: Concatenate Unified Districts

We've renamed the Codes to the Unified District Codes.

So let's first take care of the data for these 10 rows before we do our final merge.

In [11]:
expense_data = expense_df.copy()
district_data = district_df.copy()

print(expense_data.shape)
print(district_data.shape)

dupes = district_data['District Code'].value_counts()
print(dupes[dupes > 1])

(933, 6)
(938, 45)
District Code
76349    2
40261    2
40253    2
40246    2
40717    2
Name: count, dtype: int64


#### Step 5: Pair Up Unified Districts & Ascertain dtypes

In [12]:
unified_list = [76349, 40261, 40246, 40253, 40717]

unified_pair_1 = district_data[district_data['District Code'] == 76349]
unified_pair_2 = district_data[district_data['District Code'] == 40261]
unified_pair_3 = district_data[district_data['District Code'] == 40246]
unified_pair_4 = district_data[district_data['District Code'] == 40253]
unified_pair_5 = district_data[district_data['District Code'] == 40717]

unified_pair_list = [unified_pair_1, unified_pair_2, unified_pair_3, unified_pair_4, unified_pair_5]

display(unified_pair_1)
print(unified_pair_1.dtypes)    # Checked: All columns with numeric values are int64 or float64
district_table_dict

Unnamed: 0,Fed ID,District Code,CDS Code,County Name,District Name,District Type,Grade Low,Grade High,Grade Low Census,Grade High Census,...,Foster,Foster (%),Homeless,Homeless (%),Migrant,Migrant (%),Students with Disabilities,Students with Disabilities (%),Socioeconomically Disadvantaged,Socioeconomically Disadvantaged (%)
349,603090,76349,23655570000000,Mendocino,Arena Union Elementary,Elementary,KG,12,KG,8,...,1,0.3,16,5.6,0,0.0,37,12.9,160,55.7
353,631230,76349,23655990000000,Mendocino,Point Arena Joint Union High,High,09,12,09,12,...,3,2.1,25,17.5,2,1.4,27,18.9,83,58.0


Fed ID                                   int64
District Code                            int64
CDS Code                                 int64
County Name                             object
District Name                           object
District Type                           object
Grade Low                               object
Grade High                               int64
Grade Low Census                        object
Grade High Census                        int64
Assistance Status                       object
Locale                                  object
Enroll Total                             int64
Enroll Charter                           int64
Enroll Non Charter                       int64
African American                         int64
African American (%)                   float64
American Indian                          int64
American Indian (%)                    float64
Asian                                    int64
Asian (%)                              float64
Filipino     

{'Arena Union Elementary': 'Arena Union Elementary/Point Arena Joint Union High ',
 'Point Arena Joint Union High': 'Arena Union Elementary/Point Arena Joint Union High ',
 'Santa Cruz City Elementary': 'Santa Cruz City Elementary/High',
 'Santa Cruz City High': 'Santa Cruz City Elementary/High',
 'Petaluma City Elementary': 'Petaluma City Elementary/Joint Union High',
 'Petaluma Joint Union High': 'Petaluma City Elementary/Joint Union High',
 'Santa Rosa Elementary': 'Santa Rosa City Schools',
 'Santa Rosa High': 'Santa Rosa City Schools',
 'Modesto City Elementary': 'Modesto City Schools',
 'Modesto City High': 'Modesto City Schools'}

#### Step 6: Categorize Columns

We need to determine what to do with each of the columns for each Unified Pair.

In [13]:
keep_as_is_columns = [
    'District Code',
    'County Name',
    'District Type',
    'Grade Low',
    'Grade High',
    'Grade Low Census',
    'Grade High Census',
    'Assistance Status'
]

delete_columns = [
    'Fed ID',
    'CDS Code'
]

count_columns = [
    'Enroll Total',
    'Enroll Charter',
    'Enroll Non Charter',
    'African American',
    'American Indian',
    'Asian',
    'Filipino',
    'Hispanic',
    'Pacific Islander',
    'White',
    'Two or More Races',
    'Not Reported',
    'English Learner',
    'Foster',
    'Homeless',
    'Migrant',
    'Students with Disabilities',
    'Socioeconomically Disadvantaged']

percent_columns_dict = {
    'African American (%)': 'African American',
    'American Indian (%)': 'American Indian',
    'Asian (%)': 'Asian',
    'Filipino (%)': 'Filipino',
    'Hispanic (%)': 'Hispanic',
    'Pacific Islander (%)': 'Pacific Islander',
    'White (%)': 'White',
    'Two or More Races (%)': 'Two or More Races',
    'Not Reported (%)': 'Not Reported',
    'English Learner (%)': 'English Learner',
    'Foster (%)': 'Foster',
    'Homeless (%)': 'Homeless',
    'Migrant (%)': 'Migrant',
    'Students with Disabilities (%)': 'Students with Disabilities',
    'Socioeconomically Disadvantaged (%)': 'Socioeconomically Disadvantaged'
}

#### Step 7: Function to Merge Pair

# **Note for Cleaner Code:**
## **Revise function below to take in additional parameters (args) for function (local variables) Convention: ALL CAPS for constants (global variables)**

In [14]:
def merge_unified_pairs(unified_pair): #, keep_cols, delete_cols, count_cols, percent_cols, district_table_dict)
    # Delete Unneeded Columns
    merge_pair = unified_pair.drop(columns=delete_columns)

    # Sum Count Columns
    count_sums = merge_pair[count_columns].sum()      # count_sums is a Series

    # Calculate Percent Columns
    for percent_column, cnt_column in percent_columns_dict.items():
        count_sums[percent_column] = (count_sums[cnt_column] / count_sums['Enroll Total']) * 100

    # Convert Series to DataFrame
    merge_pair = count_sums.to_frame().T     # Transpose to convert from Column to Row

    # Add the "Keep As-Is" columns
    merge_pair[keep_as_is_columns] = unified_pair[keep_as_is_columns].iloc[0].values

    # Add District Name from expense_df's name, which was used in Unified Pair
    merge_pair['District Name'] = unified_pair['District Name'].iloc[0]

    for non_unified_name, unified_name in district_table_dict.items():
        if non_unified_name in unified_pair['District Name'].values:
            merge_pair['District Name'] = unified_name

    # Make shared District Names unique by adding County Name
    # We will create a new column, District Label
    merge_pair['District Label'] = (
        merge_pair['District Name'].astype(str)
        + " (" + unified_pair['County Name'].iloc[0] + ")"
    )

    return merge_pair

#### Step 8: Run Function for All Unified Pairs

In [15]:
# Run function for all unified pairs

merged_pair_1 = merge_unified_pairs(unified_pair_1)
merged_pair_2 = merge_unified_pairs(unified_pair_2)
merged_pair_3 = merge_unified_pairs(unified_pair_3)
merged_pair_4 = merge_unified_pairs(unified_pair_4)
merged_pair_5 = merge_unified_pairs(unified_pair_5)

display(merged_pair_1)
display(merged_pair_2)
display(merged_pair_3)
display(merged_pair_4)
display(merged_pair_5)

Unnamed: 0,Enroll Total,Enroll Charter,Enroll Non Charter,African American,American Indian,Asian,Filipino,Hispanic,Pacific Islander,White,...,District Code,County Name,District Type,Grade Low,Grade High,Grade Low Census,Grade High Census,Assistance Status,District Name,District Label
0,430.0,66.0,364.0,0.0,28.0,1.0,0.0,245.0,1.0,137.0,...,76349,Mendocino,Elementary,KG,12,KG,8,General Assistance,Arena Union Elementary/Point Arena Joint Union...,Arena Union Elementary/Point Arena Joint Union...


Unnamed: 0,Enroll Total,Enroll Charter,Enroll Non Charter,African American,American Indian,Asian,Filipino,Hispanic,Pacific Islander,White,...,District Code,County Name,District Type,Grade Low,Grade High,Grade Low Census,Grade High Census,Assistance Status,District Name,District Label
0,6272.0,116.0,6156.0,72.0,11.0,141.0,29.0,2537.0,13.0,2963.0,...,40261,Santa Cruz,Elementary,KG,5,KG,5,General Assistance,Santa Cruz City Elementary/High,Santa Cruz City Elementary/High (Santa Cruz)


Unnamed: 0,Enroll Total,Enroll Charter,Enroll Non Charter,African American,American Indian,Asian,Filipino,Hispanic,Pacific Islander,White,...,District Code,County Name,District Type,Grade Low,Grade High,Grade Low Census,Grade High Census,Assistance Status,District Name,District Label
0,7388.0,1325.0,6063.0,82.0,48.0,177.0,45.0,2593.0,28.0,3923.0,...,40246,Sonoma,Elementary,KG,12,KG,6,,Petaluma City Elementary/Joint Union High,Petaluma City Elementary/Joint Union High (Son...


Unnamed: 0,Enroll Total,Enroll Charter,Enroll Non Charter,African American,American Indian,Asian,Filipino,Hispanic,Pacific Islander,White,...,District Code,County Name,District Type,Grade Low,Grade High,Grade Low Census,Grade High Census,Assistance Status,District Name,District Label
0,14798.0,1938.0,12860.0,265.0,89.0,586.0,145.0,8709.0,116.0,4113.0,...,40253,Sonoma,Elementary,KG,8,KG,6,,Santa Rosa City Schools,Santa Rosa City Schools (Sonoma)


Unnamed: 0,Enroll Total,Enroll Charter,Enroll Non Charter,African American,American Indian,Asian,Filipino,Hispanic,Pacific Islander,White,...,District Code,County Name,District Type,Grade Low,Grade High,Grade Low Census,Grade High Census,Assistance Status,District Name,District Label
0,31431.0,1754.0,29677.0,771.0,87.0,1378.0,259.0,21296.0,155.0,4999.0,...,40717,Stanislaus,Elementary,KG,12,KG,8,Differentiated Assistance,Modesto City Schools,Modesto City Schools (Stanislaus)


#### Step 9: Add `District Label` Column to Rest of `district_data`

In [16]:
# Add `District Label` Column to rest of `district_data`

district_data['District Label'] = (
    district_data['District Name'].astype(str)
    + " (" + district_data['County Name'] + ")"
)

district_data.head()

Unnamed: 0,Fed ID,District Code,CDS Code,County Name,District Name,District Type,Grade Low,Grade High,Grade Low Census,Grade High Census,...,Foster (%),Homeless,Homeless (%),Migrant,Migrant (%),Students with Disabilities,Students with Disabilities (%),Socioeconomically Disadvantaged,Socioeconomically Disadvantaged (%),District Label
0,601770,61119,1611190000000,Alameda,Alameda Unified,Unified,KG,12,KG,12,...,0.3,91,0.9,0,0.0,1286,12.2,4035,38.2,Alameda Unified (Alameda)
1,601860,61127,1611270000000,Alameda,Albany City Unified,Unified,KG,12,KG,12,...,0.0,24,0.7,0,0.0,320,9.0,1122,31.4,Albany City Unified (Alameda)
2,604740,61143,1611430000000,Alameda,Berkeley Unified,Unified,KG,12,KG,12,...,0.4,150,1.7,0,0.0,1092,12.0,2508,27.6,Berkeley Unified (Alameda)
3,607800,61150,1611500000000,Alameda,Castro Valley Unified,Unified,KG,12,KG,12,...,0.1,135,1.4,0,0.0,1048,11.0,3686,38.8,Castro Valley Unified (Alameda)
4,612630,61168,1611680000000,Alameda,Emery Unified,Unified,KG,12,KG,12,...,0.2,13,2.2,0,0.0,75,12.5,327,54.5,Emery Unified (Alameda)


### Step 10: Add Newly Unified Rows to `district_data` & Drop Old, Non-unified Rows

In [17]:
# Add All Newly Unified/Merged Row to `district_data` & Drop Old Non-Unified Rows
# Use .isin() to filter out the rows with specific District Codes

#filter out existing district code data with ~
district_data = district_data[~district_data['District Code'].isin([76349, 40261, 40246, 40253, 40717])]

district_data = pd.concat([district_data, merged_pair_1, merged_pair_2, merged_pair_3, merged_pair_4, merged_pair_5], ignore_index=True)

print(district_data.shape) #933
display(district_data.head())

(933, 46)


Unnamed: 0,Fed ID,District Code,CDS Code,County Name,District Name,District Type,Grade Low,Grade High,Grade Low Census,Grade High Census,...,Foster (%),Homeless,Homeless (%),Migrant,Migrant (%),Students with Disabilities,Students with Disabilities (%),Socioeconomically Disadvantaged,Socioeconomically Disadvantaged (%),District Label
0,601770.0,61119,1611190000000.0,Alameda,Alameda Unified,Unified,KG,12,KG,12,...,0.3,91.0,0.9,0.0,0.0,1286.0,12.2,4035.0,38.2,Alameda Unified (Alameda)
1,601860.0,61127,1611270000000.0,Alameda,Albany City Unified,Unified,KG,12,KG,12,...,0.0,24.0,0.7,0.0,0.0,320.0,9.0,1122.0,31.4,Albany City Unified (Alameda)
2,604740.0,61143,1611430000000.0,Alameda,Berkeley Unified,Unified,KG,12,KG,12,...,0.4,150.0,1.7,0.0,0.0,1092.0,12.0,2508.0,27.6,Berkeley Unified (Alameda)
3,607800.0,61150,1611500000000.0,Alameda,Castro Valley Unified,Unified,KG,12,KG,12,...,0.1,135.0,1.4,0.0,0.0,1048.0,11.0,3686.0,38.8,Castro Valley Unified (Alameda)
4,612630.0,61168,1611680000000.0,Alameda,Emery Unified,Unified,KG,12,KG,12,...,0.2,13.0,2.2,0.0,0.0,75.0,12.5,327.0,54.5,Emery Unified (Alameda)


#### Step 11: Combine `district_data` with `expense data`

In [18]:
# Use Merge
district_and_expenses = district_data.merge(expense_data, how='inner', on='District Code')

district_and_expenses.head()

Unnamed: 0,Fed ID,District Code,CDS Code,County Name,District Name_x,District Type,Grade Low,Grade High,Grade Low Census,Grade High Census,...,Students with Disabilities,Students with Disabilities (%),Socioeconomically Disadvantaged,Socioeconomically Disadvantaged (%),District Label,District Name_y,EDP 365,Expense ADA,Expense per ADA,LEA Type
0,601770.0,61119,1611190000000.0,Alameda,Alameda Unified,Unified,KG,12,KG,12,...,1286.0,12.2,4035.0,38.2,Alameda Unified (Alameda),Alameda Unified,155094800.0,8567.86,18101.93,Unified
1,601860.0,61127,1611270000000.0,Alameda,Albany City Unified,Unified,KG,12,KG,12,...,320.0,9.0,1122.0,31.4,Albany City Unified (Alameda),Albany City Unified,61490900.0,3435.41,17899.14,Unified
2,604740.0,61143,1611430000000.0,Alameda,Berkeley Unified,Unified,KG,12,KG,12,...,1092.0,12.0,2508.0,27.6,Berkeley Unified (Alameda),Berkeley Unified,220550800.0,8572.17,25728.7,Unified
3,607800.0,61150,1611500000000.0,Alameda,Castro Valley Unified,Unified,KG,12,KG,12,...,1048.0,11.0,3686.0,38.8,Castro Valley Unified (Alameda),Castro Valley Unified,142491300.0,8991.52,15847.3,Unified
4,612630.0,61168,1611680000000.0,Alameda,Emery Unified,Unified,KG,12,KG,12,...,75.0,12.5,327.0,54.5,Emery Unified (Alameda),Emery Unified,15863000.0,554.7,28597.44,Unified


### 2.6 Drop `District Name_x` (from District DF) & Use `District Name_y` (from Expense DF) as Authoritative District Name

### 2.7 Cleaning the 'Locale' column

In [19]:
# Change the data type of the 'Locale' column in the DataFrame to string
district_and_expenses['Locale'] = district_and_expenses['Locale'].astype(str)

# Display the previous DataFrame 'Locale' column value counts
print(district_and_expenses['Locale'].value_counts())

# Create a function that will update the 'Locale' value for each row in the DataFrame to not include the locale
# code (ex. 21) or location subtype (ex. Large)
def assign_location_type(locale):
  if locale == 'nan':
    return 'Not Reported'
  else:
    return locale.split()[2].strip(',')

# Apply the 'assign_location_type' function to update each value in the 'Locale' column
district_and_expenses['Locale'] = district_and_expenses['Locale'].apply(assign_location_type)

# Display the new DataFrame 'Locale' column value counts
print(district_and_expenses['Locale'].value_counts())

Locale
21 - Suburban, Large      228
41 - Rural, Fringe        146
42 - Rural, Distant       136
32 - Town, Distant         67
43 - Rural, Remote         60
31 - Town, Fringe          57
11 - City, Large           49
12 - City, Midsize         43
13 - City, Small           41
22 - Suburban, Midsize     38
33 - Town, Remote          33
23 - Suburban, Small       30
nan                         5
Name: count, dtype: int64
Locale
Rural           342
Suburban        296
Town            157
City            133
Not Reported      5
Name: count, dtype: int64


In [20]:
district_and_expenses = district_and_expenses.drop(columns=['District Name_x']).rename(columns={'District Name_y':'District Name'})

district_and_expenses.columns
district_and_expenses['District Code'].nunique()

933

### 2.8 Looking into Outliers
### Discrepancies between Total Enrollment & ADA

In [21]:
# Explore Total Enrollment vs. ADA and filter outliers
# (|ADA - Enrollment| / ADA >= 50)
# 5000% Difference

district_and_expenses_no_out = district_and_expenses[
    ~((abs(district_and_expenses['Expense ADA'] - district_and_expenses['Enroll Total']) / district_and_expenses['Expense ADA'] >= 50))]

district_and_expenses['Decimal Difference'] = (abs(district_and_expenses['Expense ADA'] - district_and_expenses['Enroll Total']) / district_and_expenses['Expense ADA'])

district_and_expenses_no_out.shape

# Look at dropped values
district_and_expenses_out = district_and_expenses[
    ((abs(district_and_expenses['Expense ADA'] - district_and_expenses['Enroll Total']) / district_and_expenses['Expense ADA'] >= 50))]

display(district_and_expenses_out)

Unnamed: 0,Fed ID,District Code,CDS Code,County Name,District Type,Grade Low,Grade High,Grade Low Census,Grade High Census,Assistance Status,...,Students with Disabilities (%),Socioeconomically Disadvantaged,Socioeconomically Disadvantaged (%),District Label,District Name,EDP 365,Expense ADA,Expense per ADA,LEA Type,Decimal Difference
192,623820.0,63628,15636280000000.0,Kern,Unified,KG,12,KG,12,Differentiated Assistance,...,10.1,7075.0,48.7,Maricopa Unified (Kern),Maricopa Unified,7810924.74,281.45,27752.44,Unified,50.643276
530,628950.0,67827,36678270000000.0,San Bernardino,Elementary,KG,12,KG,6,General Assistance,...,13.4,4794.0,83.8,Oro Grande (San Bernardino),Oro Grande Elementary,11112406.38,79.93,139026.73,Elementary,70.612661
552,610710.0,68049,37680490000000.0,San Diego,Elementary,KG,12,KG,8,General Assistance,...,10.5,4395.0,37.7,Dehesa Elementary (San Diego),Dehesa Elementary,4231109.69,227.51,18597.47,Elementary,50.281262
578,637680.0,68403,37684030000000.0,San Diego,Elementary,KG,12,KG,8,General Assistance,...,14.4,2644.0,68.1,Spencer Valley Elementary (San Diego),Spencer Valley Elementary,1594837.91,44.88,35535.6,Elementary,85.564171
596,627030.0,68627,39686270000000.0,San Joaquin,Elementary,KG,12,KG,8,General Assistance,...,12.7,4379.0,63.6,New Jerusalem Elementary (San Joaquin),New Jerusalem Elementary,5146020.92,13.45,382603.79,Elementary,511.118959


After looking through the output data, we noticed that `New Jerusalem Elementary` had a proportional difference of ~51000% between its `Expense ADA` (Count Day Attendance) & `Total Enrollment`.

Due to this extreme discrepancy, we decided to omit `New Jerusalmen Elementary` from our dataset.

In [22]:
# Drop 'New Jerusalem Elementary' data
district_and_expenses = district_and_expenses[district_and_expenses['District Name'] != 'New Jerusalem Elementary']

district_and_expenses.shape

(932, 51)

# 3.0 Exporting Clean Data

To organize our workflow, we decided to export our clean data to a CSV for easy reference.

The CSV file will be saved in our Shared Google folder, `STUDENT-milestone_1`.

In [26]:
# Write district_and_expenses to CSV for export into

district_and_expenses.to_csv('district_and_expenses.csv', index=False)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=246b06f0-3e45-45e3-acef-efea2bae7701' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>