In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pandas_profiling
from scipy import stats
import numpy as np
import math

In [2]:
# Load the school-level graduation-rate table from CSV.
grad_rates = pd.read_csv('./data/18_19_Grad_Rate_School.csv')

In [3]:
# Load the per-school demographics workbook.
demo = pd.read_excel('./data/18_19_Demographics.xlsx')

In [4]:
# Load the per-school finance workbook.
finance = pd.read_excel('./data/18_19_Finance.xlsx')

### Exploring Graduation Rates Dataset

In [5]:
# Preview the raw graduation-rate table.
grad_rates

Unnamed: 0,year,system,system_name,school,school_name,subgroup,grad_cohort,grad_count,grad_rate
0,2019,10,Anderson County,2,Anderson County High School,All Students,280,269,96.1
1,2019,10,Anderson County,2,Anderson County High School,Asian,4,*,*
2,2019,10,Anderson County,2,Anderson County High School,Black or African American,2,*,*
3,2019,10,Anderson County,2,Anderson County High School,Black/Hispanic/Native American,4,*,*
4,2019,10,Anderson County,2,Anderson County High School,Economically Disadvantaged,80,72,90
...,...,...,...,...,...,...,...,...,...
7131,2019,985,Achievement School District,8140,Hillcrest High School,Non-Homeless,138,86,62.3
7132,2019,985,Achievement School District,8140,Hillcrest High School,Non-Migrant,149,90,60.4
7133,2019,985,Achievement School District,8140,Hillcrest High School,Non-Students with Disabilities,115,74,64.3
7134,2019,985,Achievement School District,8140,Hillcrest High School,Students with Disabilities,34,16,47.1


In [6]:
# Check column dtypes and non-null counts.
grad_rates.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7136 entries, 0 to 7135
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   year         7136 non-null   int64 
 1   system       7136 non-null   int64 
 2   system_name  7136 non-null   object
 3   school       7136 non-null   int64 
 4   school_name  7136 non-null   object
 5   subgroup     7136 non-null   object
 6   grad_cohort  7136 non-null   int64 
 7   grad_count   7136 non-null   object
 8   grad_rate    7136 non-null   object
dtypes: int64(4), object(5)
memory usage: 501.9+ KB


In [7]:
# Give the key columns the same names the demographics table uses,
# so the tables can later be joined on them.
grad_id_names = {'system': 'DISTRICT_ID', 'school': 'SCHOOL_ID'}
grad_rates = grad_rates.rename(columns=grad_id_names)
grad_rates.columns

Index(['year', 'DISTRICT_ID', 'system_name', 'SCHOOL_ID', 'school_name',
       'subgroup', 'grad_cohort', 'grad_count', 'grad_rate'],
      dtype='object')

In [8]:
# List the distinct subgroup labels present in the data.
grad_rates.subgroup.unique()

array(['All Students', 'Asian', 'Black or African American',
       'Black/Hispanic/Native American', 'Economically Disadvantaged',
       'Female', 'Hispanic', 'Homeless', 'Male',
       'Non-Black/Hispanic/Native American',
       'Non-Economically Disadvantaged',
       'Non-English Learners/Transitional 1-4', 'Non-Homeless',
       'Non-Migrant', 'Non-Students with Disabilities',
       'Students with Disabilities', 'White',
       'American Indian or Alaska Native',
       'English Learners with Transitional 1-4',
       'Native Hawaiian or Other Pacific Islander', 'Migrant'],
      dtype=object)

In [9]:
# Keep only the whole-school rows; the per-subgroup rows are discarded.
is_all_students = grad_rates['subgroup'] == 'All Students'
grad_rates = grad_rates[is_all_students]

In [10]:
# Retain just the two key columns and the graduation rate, as an independent copy.
grad_columns = ['DISTRICT_ID', 'SCHOOL_ID', 'grad_rate']
grad_rates = grad_rates.loc[:, grad_columns].copy()

In [11]:
# Inspect the distinct grad_rate values before converting them to numbers.
grad_rates.grad_rate.unique()

array(['96.1', '95.2', '*', '92.1', '98.2', '88.7', '91.4', '**', '94.3',
       '93.3', '92.8', '98.8', '94.5', '92.3', '92.7', '95.8', '87.5',
       '90', '91.1', '92.2', '91', '97.7', '97.3', '91.5', '83.7', '87.3',
       '88.9', '91.2', '94', '91.8', '95.9', '90.7', '97.8', '92.6',
       '92.5', '90.8', '96.4', '94.8', '78.5', '77.8', '72.8', '84.5',
       '74.7', '83.3', '82.1', '88.3', '20', '77.2', '80.4', '64', '69.3',
       '66.9', '85.5', '76.8', '83.8', '88.4', '83.9', '90.3', '93.1',
       '91.3', '93.2', '94.4', '95', '96.7', '88.1', '98.6', '84.9',
       '97.9', '95.5', '90.5', '93.7', '94.7', '93.6', '88.5', '97.4',
       '90.6', '93.9', '83.2', '94.1', '88.8', '85.9', '85.8', '63.1',
       '85.7', '89.9', '77.9', '96.5', '92', '95.3', '95.1', '94.2',
       '94.6', '91.7', '98.5', '96.3', '81.1', '97.1', '93.8', '82',
       '85.6', '93.4', '18.8', '90.1', '72', '93.5', '99', '96.6', '98.4',
       '95.6', '92.9', '93', '97', '90.9', '89.5', '91.9', '81.2', '87

In [12]:
# Drop rows whose grad_rate is a '*' / '**' placeholder rather than a number.
# .copy() makes the filtered result an independent frame, so the column
# assignments in the following cells do not raise SettingWithCopyWarning.
grad_rates = grad_rates.loc[~grad_rates.grad_rate.isin(['*', '**'])].copy()

In [13]:
# Parse the rate strings as floats and treat both identifiers as categorical labels.
grad_rates = grad_rates.astype({
    'grad_rate': float,
    'DISTRICT_ID': 'category',
    'SCHOOL_ID': 'category',
})

In [14]:
# Rescale from a 0-100 percentage to a 0-1 fraction, rounded to two decimals.
grad_rates['grad_rate'] = grad_rates['grad_rate'].div(100).round(2)

In [15]:
# Preview the cleaned graduation-rate table.
grad_rates

Unnamed: 0,DISTRICT_ID,SCHOOL_ID,grad_rate
0,10,2,0.96
17,10,25,0.95
46,12,35,0.92
66,20,13,0.98
85,20,20,0.89
...,...,...,...
7051,985,45,0.12
7069,985,50,0.11
7084,985,8055,0.80
7101,985,8065,0.73


In [16]:
# Confirm the dtypes and row count after cleaning.
grad_rates.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 333 entries, 0 to 7118
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   DISTRICT_ID  333 non-null    category
 1   SCHOOL_ID    333 non-null    category
 2   grad_rate    333 non-null    float64 
dtypes: category(2), float64(1)
memory usage: 18.4 KB


#### Summary

The grad_rates dataset contains per-school information on high school graduation rates in the state of Tennessee for the 2018-2019 school year. 

The only information I would need from this table is the overall graduation rate per school for all students. We drop the rest of the rows.

- Discovered two placeholder values (`*` and `**`) used in place of a graduation rate. Dropped the rows containing them.

- Renamed system and school to a name that corresponds with what they actually refer to.

- Changed the grad_rate, system, and school to an appropriate type.

- Converted grad_rate from a percentage (0-100) to a decimal fraction (0-1), rounded to two decimal places.

### Exploring Demographics Dataset

In [17]:
# Preview the raw demographics table.
demo

Unnamed: 0,SCHOOL_YEAR,DISTRICT_ID,DISTRICT_NAME,SCHOOL_ID,SCHOOL_NAME,GRADES_SERVED,SAFE_SCHOOL,AVERAGE_DAILY_MEMBERSHIP,TOTAL,FEMALE,...,ASIAN_FEMALE,ASIAN_MALE,HAWAIIAN_PACISLD_FEMALE,HAWAIIAN_PACISLD_MALE,HISPANIC_FEMALE,HISPANIC_MALE,NATIVE_AMERICAN_FEMALE,NATIVE_AMERICAN_MALE,WHITE_FEMALE,WHITE_MALE
0,2018-19,985,Achievement School District,8050,Aspire Coleman,Grades PK-7,SAFE SCHOOL,607.0,612,315,...,3,2,0,0,14,10,0,0,2,6
1,2018-19,985,Achievement School District,8025,Aspire Hanley Elementary,Grades PK-5,SAFE SCHOOL,530.0,544,281,...,0,0,0,1,3,5,0,0,0,0
2,2018-19,985,Achievement School District,8024,Aspire Hanley Middle School,Grades 6-8,SAFE SCHOOL,238.0,239,112,...,0,0,0,0,2,1,0,0,0,2
3,2018-19,985,Achievement School District,8005,Brick Church: A LEAD Public School,Grades 5-8,SAFE SCHOOL,310.0,305,137,...,1,1,0,0,14,21,3,0,8,11
4,2018-19,985,Achievement School District,8010,Cornerstone Prep - Lester Campus,Grades PK-5,SAFE SCHOOL,380.0,375,189,...,1,0,0,0,10,6,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1751,2018-19,950,Wilson County School District,60,Watertown High School,Grades 9-12,SAFE SCHOOL,558.0,571,263,...,1,1,0,2,3,12,0,1,247,272
1752,2018-19,950,Wilson County School District,95,Watertown Middle School,Grades 6-8,SAFE SCHOOL,301.0,300,133,...,1,0,0,0,2,8,0,0,123,152
1753,2018-19,950,Wilson County School District,65,West Elementary,Grades K-5,SAFE SCHOOL,799.0,794,362,...,15,8,0,1,24,23,1,3,288,362
1754,2018-19,950,Wilson County School District,33,West Wilson Middle School,Grades 6-8,SAFE SCHOOL,1408.0,1408,688,...,28,36,0,2,50,56,1,1,532,554


In [18]:
# Columns carried forward: the two keys, the enrolment total, and the
# per-group student counts. The order here fixes the column order of `demo`,
# which later label-based slices ('ECONOMICALLY_DISADVANTAGED':) depend on.
demo_columns = [
    'DISTRICT_ID',
    'SCHOOL_ID',
    'TOTAL',
    'ECONOMICALLY_DISADVANTAGED',
    'FEMALE', 'HISPANIC_FEMALE', 'ASIAN_FEMALE', 'AFRICAN_AMERICAN_FEMALE', 'WHITE_FEMALE',
    'MALE', 'AFRICAN_AMERICAN_MALE', 'WHITE_MALE', 'ASIAN_MALE', 'HISPANIC_MALE',
    'AFRICAN_AMERICAN',
    'ASIAN',
    'HISPANIC',
    'WHITE',
    'STUDENTS_WITH_DISABILITIES',
    'NATIVE_AMERICAN',
    'HAWAIIAN_PACISLD',
    'HAWAIIAN_PACISLD_FEMALE', 'HAWAIIAN_PACISLD_MALE',
    'NATIVE_AMERICAN_FEMALE', 'NATIVE_AMERICAN_MALE',
]
demo = demo.loc[:, demo_columns].copy()

In [19]:
# Check dtypes and non-null counts of the selected demographic columns.
demo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1756 entries, 0 to 1755
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   DISTRICT_ID                 1756 non-null   int64
 1   SCHOOL_ID                   1756 non-null   int64
 2   TOTAL                       1756 non-null   int64
 3   ECONOMICALLY_DISADVANTAGED  1756 non-null   int64
 4   FEMALE                      1756 non-null   int64
 5   HISPANIC_FEMALE             1756 non-null   int64
 6   ASIAN_FEMALE                1756 non-null   int64
 7   AFRICAN_AMERICAN_FEMALE     1756 non-null   int64
 8   WHITE_FEMALE                1756 non-null   int64
 9   MALE                        1756 non-null   int64
 10  AFRICAN_AMERICAN_MALE       1756 non-null   int64
 11  WHITE_MALE                  1756 non-null   int64
 12  ASIAN_MALE                  1756 non-null   int64
 13  HISPANIC_MALE               1756 non-null   int64
 14  AFRICAN_

In [20]:
# Treat both identifiers as categorical labels rather than numbers.
demo = demo.astype({'DISTRICT_ID': 'category', 'SCHOOL_ID': 'category'})

In [21]:
# Turn every count column from ECONOMICALLY_DISADVANTAGED onwards into a
# fraction of the school's TOTAL enrolment (row-wise division).
# Assigning through demo[columns] replaces the integer columns with new float
# columns. Writing the float results into the existing int64 columns via
# `.loc[:, ...] = ...` depends on silent upcasting, which newer pandas
# releases deprecate.
share_columns = demo.loc[:, 'ECONOMICALLY_DISADVANTAGED':].columns.tolist()
demo[share_columns] = demo[share_columns].div(demo.TOTAL, axis=0)

In [22]:
# Preview the demographics table after dividing the counts by TOTAL.
demo

Unnamed: 0,DISTRICT_ID,SCHOOL_ID,TOTAL,ECONOMICALLY_DISADVANTAGED,FEMALE,HISPANIC_FEMALE,ASIAN_FEMALE,AFRICAN_AMERICAN_FEMALE,WHITE_FEMALE,MALE,...,ASIAN,HISPANIC,WHITE,STUDENTS_WITH_DISABILITIES,NATIVE_AMERICAN,HAWAIIAN_PACISLD,HAWAIIAN_PACISLD_FEMALE,HAWAIIAN_PACISLD_MALE,NATIVE_AMERICAN_FEMALE,NATIVE_AMERICAN_MALE
0,985,8050,612,0.712418,0.514706,0.022876,0.004902,0.483660,0.003268,0.485294,...,0.008170,0.039216,0.013072,0.120915,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000
1,985,8025,544,0.788603,0.516544,0.005515,0.000000,0.511029,0.000000,0.483456,...,0.000000,0.014706,0.000000,0.108456,0.000000,0.001838,0.00000,0.001838,0.000000,0.000000
2,985,8024,239,0.769874,0.468619,0.008368,0.000000,0.460251,0.000000,0.531381,...,0.000000,0.012552,0.008368,0.150628,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000
3,985,8005,305,0.747541,0.449180,0.045902,0.003279,0.363934,0.026230,0.550820,...,0.006557,0.114754,0.062295,0.190164,0.009836,0.000000,0.00000,0.000000,0.009836,0.000000
4,985,8010,375,0.797333,0.504000,0.026667,0.002667,0.474667,0.000000,0.496000,...,0.002667,0.042667,0.005333,0.181333,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1751,950,60,571,0.185639,0.460595,0.005254,0.001751,0.021016,0.432574,0.539405,...,0.003503,0.026270,0.908932,0.145359,0.001751,0.003503,0.00000,0.003503,0.000000,0.001751
1752,950,95,300,0.263333,0.443333,0.006667,0.003333,0.023333,0.410000,0.556667,...,0.003333,0.033333,0.916667,0.136667,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000
1753,950,65,794,0.110831,0.455919,0.030227,0.018892,0.042821,0.362720,0.544081,...,0.028967,0.059194,0.818640,0.138539,0.005038,0.001259,0.00000,0.001259,0.001259,0.003778
1754,950,33,1408,0.102273,0.488636,0.035511,0.019886,0.054688,0.377841,0.511364,...,0.045455,0.075284,0.771307,0.101562,0.001420,0.001420,0.00000,0.001420,0.000710,0.000710


In [23]:
# Show the min / quartile / max rows of the summary statistics for each fraction column.
fraction_summary = demo.loc[:, 'ECONOMICALLY_DISADVANTAGED':].describe()
fraction_summary.loc['min':'max']

Unnamed: 0,ECONOMICALLY_DISADVANTAGED,FEMALE,HISPANIC_FEMALE,ASIAN_FEMALE,AFRICAN_AMERICAN_FEMALE,WHITE_FEMALE,MALE,AFRICAN_AMERICAN_MALE,WHITE_MALE,ASIAN_MALE,...,ASIAN,HISPANIC,WHITE,STUDENTS_WITH_DISABILITIES,NATIVE_AMERICAN,HAWAIIAN_PACISLD,HAWAIIAN_PACISLD_FEMALE,HAWAIIAN_PACISLD_MALE,NATIVE_AMERICAN_FEMALE,NATIVE_AMERICAN_MALE
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.260766,0.469611,0.012825,0.0,0.016058,0.171333,0.495418,0.017608,0.187754,0.0,...,0.002241,0.027643,0.362357,0.10968,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.376848,0.486809,0.026971,0.003918,0.050489,0.368442,0.513171,0.052445,0.38961,0.003695,...,0.007851,0.055152,0.767334,0.138889,0.002667,0.0,0.0,0.0,0.000773,0.000907
75%,0.508127,0.504582,0.058627,0.010992,0.176346,0.435295,0.530389,0.184029,0.46163,0.010753,...,0.021108,0.118865,0.90415,0.169791,0.005476,0.002482,0.000898,0.001251,0.002717,0.002875
max,0.955224,1.0,0.387464,0.232955,0.91,0.675676,1.0,0.943182,0.769231,0.224966,...,0.454545,0.778533,1.0,1.0,0.063694,0.068966,0.051724,0.017241,0.02459,0.0625


In [24]:
# Remove the groups that are not carried into the analysis
# (the notebook summary describes them as containing very few students).
unused_groups = [
    'ASIAN', 'ASIAN_MALE', 'ASIAN_FEMALE',
    'STUDENTS_WITH_DISABILITIES',
    'NATIVE_AMERICAN',
    'HAWAIIAN_PACISLD', 'HAWAIIAN_PACISLD_FEMALE', 'HAWAIIAN_PACISLD_MALE',
    'NATIVE_AMERICAN_FEMALE', 'NATIVE_AMERICAN_MALE',
]
demo = demo.drop(columns=unused_groups)

In [25]:
# Confirm the remaining columns and their dtypes.
demo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1756 entries, 0 to 1755
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype   
---  ------                      --------------  -----   
 0   DISTRICT_ID                 1756 non-null   category
 1   SCHOOL_ID                   1756 non-null   category
 2   TOTAL                       1756 non-null   int64   
 3   ECONOMICALLY_DISADVANTAGED  1756 non-null   float64 
 4   FEMALE                      1756 non-null   float64 
 5   HISPANIC_FEMALE             1756 non-null   float64 
 6   AFRICAN_AMERICAN_FEMALE     1756 non-null   float64 
 7   WHITE_FEMALE                1756 non-null   float64 
 8   MALE                        1756 non-null   float64 
 9   AFRICAN_AMERICAN_MALE       1756 non-null   float64 
 10  WHITE_MALE                  1756 non-null   float64 
 11  HISPANIC_MALE               1756 non-null   float64 
 12  AFRICAN_AMERICAN            1756 non-null   float64 
 13  HISPANIC          

#### Summary

The demographics data set contains information on the number of students in each listed demographic group.

- Converted the number of students per group to a fraction (0-1) of that school's total population.

- Dropped columns for groups that contained very few students. Left with these subgroups: 
        ['ECONOMICALLY_DISADVANTAGED', 'FEMALE', 'HISPANIC_FEMALE',
       'AFRICAN_AMERICAN_FEMALE', 'WHITE_FEMALE', 'MALE',
       'AFRICAN_AMERICAN_MALE', 'WHITE_MALE', 'HISPANIC_MALE',
       'AFRICAN_AMERICAN', 'HISPANIC', 'WHITE']

- Converted the school and district IDs to more appropriate types.



### Exploring Financial Dataset

In [26]:
# Preview the raw finance table.
finance

Unnamed: 0,District ID,School ID,Total School Per Pupil Expenditures
0,10.0,2.0,9171.371439
1,10.0,5.0,10321.694494
2,10.0,10.0,12914.574920
3,10.0,15.0,11044.536400
4,10.0,20.0,9565.777693
...,...,...,...
1748,985.0,8135.0,10672.163548
1749,985.0,8140.0,10769.016889
1750,986.0,8005.0,6621.930607
1751,986.0,8015.0,9036.184478


In [27]:
# Check dtypes and non-null counts.
finance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1753 entries, 0 to 1752
Data columns (total 3 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   District ID                          1752 non-null   float64
 1   School ID                            1752 non-null   float64
 2   Total School Per Pupil Expenditures  1753 non-null   float64
dtypes: float64(3)
memory usage: 41.2 KB


In [28]:
# Use the same key names as the other tables and a shorter name for the spend column.
finance_names = {
    'District ID': 'DISTRICT_ID',
    'School ID': 'SCHOOL_ID',
    'Total School Per Pupil Expenditures': 'Expend_per_pupil',
}
finance = finance.rename(columns=finance_names)
finance.columns

Index(['DISTRICT_ID', 'SCHOOL_ID', 'Expend_per_pupil'], dtype='object')

In [29]:
# Treat both identifiers as categorical labels.
# NOTE(review): these columns are float64 at this point (see the .info() output
# above), so the categories are float-valued (10.0) rather than integers as in
# the other tables - confirm the later merge on these keys behaves as intended.
finance = finance.astype({'DISTRICT_ID': 'category', 'SCHOOL_ID': 'category'})

In [30]:
# Discard every row that has a missing value in any column; keep an independent copy.
finance = finance.dropna(axis=0, how='any').copy()

In [31]:
# Round the per-pupil expenditure to two decimals (cents).
# Series.round is vectorised, replacing the per-element Python lambda.
finance['Expend_per_pupil'] = finance['Expend_per_pupil'].round(2)

In [32]:
# Preview the cleaned finance table.
finance

Unnamed: 0,DISTRICT_ID,SCHOOL_ID,Expend_per_pupil
0,10.0,2.0,9171.37
1,10.0,5.0,10321.69
2,10.0,10.0,12914.57
3,10.0,15.0,11044.54
4,10.0,20.0,9565.78
...,...,...,...
1747,985.0,8130.0,11058.70
1748,985.0,8135.0,10672.16
1749,985.0,8140.0,10769.02
1750,986.0,8005.0,6621.93


In [33]:
# Confirm the dtypes and row count after cleaning.
finance.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1752 entries, 0 to 1751
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   DISTRICT_ID       1752 non-null   category
 1   SCHOOL_ID         1752 non-null   category
 2   Expend_per_pupil  1752 non-null   float64 
dtypes: category(2), float64(1)
memory usage: 64.2 KB


#### Summary
The finance dataset contains various data about finances. I am only interested in the total expenditure per pupil. I deleted the columns I was not interested in using Excel.

- Renamed school,district id
- Renamed Total School Per Pupil Expenditure to a simpler/more useable name
- Changed ids to more appropriate types
- Dropped any rows that contained nulls.
- Rounded for nicer looking numbers

### Merge Datasets

In [34]:
# Inner-join the three tables on the (school, district) key pair:
# first graduation rates with demographics, then the result with finance.
merge_keys = ['SCHOOL_ID', 'DISTRICT_ID']
grad_demo = grad_rates.merge(demo, how='inner', on=merge_keys)
grad_demo_fin = grad_demo.merge(finance, how='inner', on=merge_keys)

In [35]:
# Preview the merged table.
grad_demo_fin

Unnamed: 0,DISTRICT_ID,SCHOOL_ID,grad_rate,TOTAL,ECONOMICALLY_DISADVANTAGED,FEMALE,HISPANIC_FEMALE,AFRICAN_AMERICAN_FEMALE,WHITE_FEMALE,MALE,AFRICAN_AMERICAN_MALE,WHITE_MALE,HISPANIC_MALE,AFRICAN_AMERICAN,HISPANIC,WHITE,Expend_per_pupil
0,10,2,0.96,1075,0.247442,0.482791,0.008372,0.002791,0.465116,0.517209,0.007442,0.500465,0.001860,0.010233,0.010233,0.965581,9171.37
1,10,25,0.95,1120,0.339286,0.451786,0.010714,0.020536,0.409821,0.548214,0.019643,0.505357,0.016071,0.040179,0.026786,0.915179,9523.76
2,12,35,0.92,1406,0.199858,0.491465,0.044097,0.070413,0.351351,0.508535,0.085349,0.346373,0.046942,0.155761,0.091038,0.697724,12546.31
3,20,13,0.98,481,0.193347,0.442827,0.037422,0.016632,0.382536,0.557173,0.027027,0.482328,0.037422,0.043659,0.074844,0.864865,9106.97
4,20,20,0.89,1506,0.371846,0.490704,0.169323,0.086321,0.228420,0.509296,0.083001,0.255644,0.160691,0.169323,0.330013,0.484064,7418.07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
322,985,45,0.12,174,0.494253,0.586207,0.028736,0.522989,0.022989,0.413793,0.339080,0.022989,0.045977,0.862069,0.074713,0.045977,8025.96
323,985,50,0.11,173,0.531792,0.554913,0.028902,0.520231,0.005780,0.445087,0.421965,0.005780,0.017341,0.942197,0.046243,0.011561,8876.53
324,985,8055,0.80,520,0.682692,0.494231,0.005769,0.482692,0.000000,0.505769,0.492308,0.005769,0.005769,0.975000,0.011538,0.005769,9794.46
325,985,8065,0.73,513,0.647173,0.477583,0.005848,0.471735,0.000000,0.522417,0.499025,0.013645,0.009747,0.970760,0.015595,0.013645,9890.77


In [36]:
# Check dtypes and non-null counts of the merged table.
grad_demo_fin.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 327 entries, 0 to 326
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   DISTRICT_ID                 327 non-null    object 
 1   SCHOOL_ID                   327 non-null    object 
 2   grad_rate                   327 non-null    float64
 3   TOTAL                       327 non-null    int64  
 4   ECONOMICALLY_DISADVANTAGED  327 non-null    float64
 5   FEMALE                      327 non-null    float64
 6   HISPANIC_FEMALE             327 non-null    float64
 7   AFRICAN_AMERICAN_FEMALE     327 non-null    float64
 8   WHITE_FEMALE                327 non-null    float64
 9   MALE                        327 non-null    float64
 10  AFRICAN_AMERICAN_MALE       327 non-null    float64
 11  WHITE_MALE                  327 non-null    float64
 12  HISPANIC_MALE               327 non-null    float64
 13  AFRICAN_AMERICAN            327 non

In [37]:
# The identifiers were only needed for joining; remove them now that the merge is done.
key_columns = ['SCHOOL_ID', 'DISTRICT_ID']
grad_demo_fin = grad_demo_fin.drop(columns=key_columns)

#### Summary
Merged the three data sets into one table.

In [38]:
# Abbreviate the race/gender column names.
# NOTE(review): 'H_Female' is mixed-case unlike its upper-case siblings; it is
# kept as-is here because the exported CSV carries this name - confirm before changing.
short_names = {
    'HISPANIC_FEMALE': 'H_Female',
    'AFRICAN_AMERICAN_FEMALE': 'AA_FEMALE',
    'WHITE_FEMALE': 'W_FEMALE',
    'AFRICAN_AMERICAN_MALE': 'AA_MALE',
    'WHITE_MALE': 'W_MALE',
    'HISPANIC_MALE': 'H_MALE',
    'AFRICAN_AMERICAN': 'AA',
    'HISPANIC': 'H',
    'WHITE': 'W',
}
grad_demo_fin = grad_demo_fin.rename(columns=short_names)

In [39]:
# Keep only the rows in which every column lies within Z_SCORE_LIMIT standard
# deviations of that column's mean.
# The boolean mask is applied directly. The previous mask -> .index -> .loc
# round-trip selected the same rows only while the index is unique; with
# duplicate index labels it would have returned repeated rows.
Z_SCORE_LIMIT = 2.5
within_limit = (np.abs(stats.zscore(grad_demo_fin)) < Z_SCORE_LIMIT).all(axis=1)
grad_demo_fin = grad_demo_fin.loc[within_limit]

In [40]:
# Write the filtered table to CSV without the index column.
grad_demo_fin.to_csv('./data/grad_demo_fin.csv', index=False)

In [41]:
# Preview the final table that was written to disk.
grad_demo_fin

Unnamed: 0,grad_rate,TOTAL,ECONOMICALLY_DISADVANTAGED,FEMALE,H_Female,AA_FEMALE,W_FEMALE,MALE,AA_MALE,W_MALE,H_MALE,AA,H,W,Expend_per_pupil
0,0.96,1075,0.247442,0.482791,0.008372,0.002791,0.465116,0.517209,0.007442,0.500465,0.001860,0.010233,0.010233,0.965581,9171.37
1,0.95,1120,0.339286,0.451786,0.010714,0.020536,0.409821,0.548214,0.019643,0.505357,0.016071,0.040179,0.026786,0.915179,9523.76
2,0.92,1406,0.199858,0.491465,0.044097,0.070413,0.351351,0.508535,0.085349,0.346373,0.046942,0.155761,0.091038,0.697724,12546.31
3,0.98,481,0.193347,0.442827,0.037422,0.016632,0.382536,0.557173,0.027027,0.482328,0.037422,0.043659,0.074844,0.864865,9106.97
5,0.91,534,0.237828,0.488764,0.056180,0.020599,0.408240,0.511236,0.022472,0.423221,0.059925,0.043071,0.116105,0.831461,8279.85
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318,0.97,2220,0.071171,0.487838,0.018018,0.044595,0.407207,0.512162,0.050450,0.420270,0.024775,0.095045,0.042793,0.827477,7519.21
319,0.96,571,0.185639,0.460595,0.005254,0.021016,0.432574,0.539405,0.035026,0.476357,0.021016,0.056042,0.026270,0.908932,10360.40
320,0.97,1935,0.109044,0.473902,0.027390,0.050646,0.375711,0.526098,0.047028,0.430491,0.033592,0.097674,0.060982,0.806202,7995.72
324,0.80,520,0.682692,0.494231,0.005769,0.482692,0.000000,0.505769,0.492308,0.005769,0.005769,0.975000,0.011538,0.005769,9794.46


In [42]:
# Build a pandas-profiling report of the final table and save it as HTML.
# The output directory is './data/' (lower-case) to match every other path in
# this notebook; './Data/' is a different directory on case-sensitive filesystems.
profile_report = grad_demo_fin.profile_report(html={'style': {'full_width': True}})
profile_report.to_file("./data/Grad_Demo_Profile.html")

HBox(children=(HTML(value='Summarize dataset'), FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(HTML(value='Generate report structure'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Render HTML'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Export report to file'), FloatProgress(value=0.0, max=1.0), HTML(value='')))


