# Predicting Teacher Turnover

__Meaghan Ross__

Flatiron School Capstone

### Data Preparation

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats as stats
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
ls data/

SPR_SY1415_School_Metric_Scores_20160208.xlsx
SPR_SY1516_School_Metric_Scores_20170203.xlsx
SPR_SY1617_School_Metric_Scores_20180206.xlsx
SPR_SY1718_School_Metric_Scores_20190129.xlsx
SPR_SY1819_School_Metric_Scores_20200127.xlsx
employee_information_April2015.csv
employee_information_April2016.csv
employee_information_April2017.csv
employee_information_April2018.csv
employee_information_April2019.csv
employee_information_April2020.csv
employee_information_April2021.csv


#### Teacher Data

Load in the data from the 2018 and 2019 employee information files.

In [3]:
teacher_df_2016 = pd.read_csv('data/employee_information_April2016.csv')
teacher_df_2017 = pd.read_csv('data/employee_information_April2017.csv')
teacher_df_2018 = pd.read_csv('data/employee_information_April2018.csv')
teacher_df_2019 = pd.read_csv('data/employee_information_April2019.csv')

In [4]:
teacher_df_2016.head()

Unnamed: 0,LAST_NAME,FIRST_NAME,PAY_RATE_TYPE,PAY_RATE,TITLE_DESCRIPTION,HOME_ORGANIZATION,HOME_ORGANIZATION_DESCRIPTION,ORGANIZATION_LEVEL,TYPE_OF_REPRESENTATION,GENDER,RUN_DATE
0,AARON,ANDREA,SALARIED,31261,"GENERAL CLEANER, 8 HOURS",4300,"HESTON, EDWARD SCHOOL",ELEMENTARY SCHOOL,LOCAL 1201,F,4/1/2016
1,AARON,PEGGY,SALARIED,9349,"STUDENT CLIMATE STAFF,4 HOURS",6360,ROOSEVELT ELEMENTARY SCHOOL,ELEMENTARY SCHOOL,LOCAL 634,F,4/1/2016
2,ABARY,RODNEY,SALARIED,76461,SCHOOL NURSE,2370,"MCDANIEL, DELAPLAINE SCHOOL",ELEMENTARY SCHOOL,PFT-TEACHER,M,4/1/2016
3,ABATE,JO-ANN,HOURLY,48,TEACHER-EXTRA CURR/STAFF DEVEL,9EW0,NON-PUBLIC PROGRAMS,NON ADMINISTRATIVE OFFICE,PFT-TEACHER,F,4/1/2016
4,ABAYOMI-IGE,OLABIMPE,SALARIED,76461,"TEACHER,SPEC EDUCATION",6100,"LEEDS, MORRIS E. MIDDLE SCHOOL",MIDDLE SCHOOL,PFT-TEACHER,F,4/1/2016


In [5]:
teacher_df_2017.head()

Unnamed: 0,LAST_NAME,FIRST_NAME,PAY_RATE_TYPE,PAY_RATE,TITLE_DESCRIPTION,HOME_ORGANIZATION,HOME_ORGANIZATION_DESCRIPTION,ORGANIZATION_LEVEL,TYPE_OF_REPRESENTATION,GENDER,RUN_DATE
0,AARON,ANDREA,SALARIED,31261,"GENERAL CLEANER, 8 HOURS",4300,"HESTON, EDWARD SCHOOL",ELEMENTARY SCHOOL,LOCAL 1201,F,4/1/2017
1,AARON,PEGGY,SALARIED,11949,"STUDENT CLIMATE STAFF,5 HOURS",6360,ROOSEVELT ELEMENTARY SCHOOL,ELEMENTARY SCHOOL,LOCAL 634,F,4/1/2017
2,ABARY,RODNEY,SALARIED,76461,SCHOOL NURSE,2370,"MCDANIEL, DELAPLAINE SCHOOL",ELEMENTARY SCHOOL,PFT-TEACHER,M,4/1/2017
3,ABATE,JO-ANN,HOURLY,39,TEACHER-EXTRA CURR/STAFF DEVEL,9EW0,NON-PUBLIC PROGRAMS,NON ADMINISTRATIVE OFFICE,PFT-TEACHER,F,4/1/2017
4,ABAYOMI-IGE,OLABIMPE,SALARIED,76461,"TEACHER,SPEC EDUCATION",5070,PARKWAY-NORTHWEST HIGH SCHOOL,HIGH SCHOOL,PFT-TEACHER,F,4/1/2017


In [6]:
teacher_df_2018.head()

Unnamed: 0,LAST_NAME,FIRST_NAME,PAY_RATE_TYPE,PAY_RATE,TITLE_DESCRIPTION,HOME_ORGANIZATION,HOME_ORGANIZATION_DESCRIPTION,ORGANIZATION_LEVEL,TYPE_OF_REPRESENTATION,GENDER,RUN_DATE
0,AARON,ANDREA,SALARIED,32199,"GENERAL CLEANER, 8 HOURS",4300,"HESTON, EDWARD SCHOOL",ELEMENTARY SCHOOL,LOCAL 1201,F,4/1/2018
1,AARON,PEGGY,SALARIED,10084,"STUDENT CLIMATE STAFF,4 HOURS",6360,ROOSEVELT ELEMENTARY SCHOOL,ELEMENTARY SCHOOL,LOCAL 634,F,4/1/2018
2,ABARY,RODNEY,SALARIED,76461,SCHOOL NURSE,2370,"MCDANIEL, DELAPLAINE SCHOOL",ELEMENTARY SCHOOL,PFT-TEACHER,M,4/1/2018
3,ABATE,JO-ANN,HOURLY,39,TEACHER-EXTRA CURR/STAFF DEVEL,9EW0,NON-PUBLIC PROGRAMS,NON ADMINISTRATIVE OFFICE,PFT-TEACHER,F,4/1/2018
4,ABAYOMI-IGE,OLABIMPE,SALARIED,90051,"TEACHER,SPEC EDUCATION",5070,PARKWAY-NORTHWEST HIGH SCHOOL,HIGH SCHOOL,PFT-TEACHER,F,4/1/2018


In [7]:
teacher_df_2019.head()

Unnamed: 0,LAST_NAME,FIRST_NAME,PAY_RATE_TYPE,PAY_RATE,TITLE_DESCRIPTION,HOME_ORGANIZATION,HOME_ORGANIZATION_DESCRIPTION,ORGANIZATION_LEVEL,TYPE_OF_REPRESENTATION,GENDER,RUN_DATE
0,AARAS,YOUSRA,SALARIED,10383,"STUDENT CLIMATE STAFF,4 HOURS",2620,ACADEMY AT PALUMBO,HIGH SCHOOL,LOCAL 634,F,4/1/2019
1,AARON,ANDREA,SALARIED,33165,"GENERAL CLEANER, 8 HOURS",4300,"HESTON, EDWARD SCHOOL",ELEMENTARY SCHOOL,LOCAL 1201,F,4/1/2019
2,AARON,ATIA,SALARIED,12979,"STUDENT CLIMATE STAFF,5 HOURS",2050,SCIENCE LEADERSHIP ACADEMY MS,MIDDLE SCHOOL,LOCAL 634,F,4/1/2019
3,AARON,PEGGY,SALARIED,10383,"STUDENT CLIMATE STAFF,4 HOURS",7130,"WAGNER, GEN. LOUIS MIDDLE SCH.",MIDDLE SCHOOL,LOCAL 634,F,4/1/2019
4,ABARA,BERNADINE,SALARIED,7788,"STUDENT CLIMATE STAFF,3 HOURS",8310,"MOORE, J. HAMPTON SCHOOL",ELEMENTARY SCHOOL,LOCAL 634,F,4/1/2019


In [8]:
teacher_df_2016.shape

(17562, 11)

In [9]:
teacher_df_2017.shape

(18353, 11)

In [10]:
teacher_df_2018.shape

(19181, 11)

In [11]:
teacher_df_2019.shape

(20151, 11)

In order to find out the teacher turnover after the 2018 school year, we will merge the 2018 personnel files with the 2019 personnel files using a left join to identify all of the employees of the Philadelphia school system in 2018. The join will use the employee's first and last name as well as their school name since we want to identify teachers who remained in the same school the following year. If there are null values in 2019, then it means the employee did not return for the following school year.

In [12]:
teacher_turnover_2017 = pd.merge(teacher_df_2016, teacher_df_2017, 
                                 on=['LAST_NAME', 'FIRST_NAME', 'HOME_ORGANIZATION'], 
                                 how='left', suffixes=('_2016', '_2017'))

In [13]:
teacher_turnover_2018 = pd.merge(teacher_df_2017, teacher_df_2018, 
                                 on=['LAST_NAME', 'FIRST_NAME', 'HOME_ORGANIZATION'], 
                                 how='left', suffixes=('_2017', '_2018'))

In [14]:
teacher_turnover_2019 = pd.merge(teacher_df_2018, teacher_df_2019, 
                                 on=['LAST_NAME', 'FIRST_NAME', 'HOME_ORGANIZATION'], 
                                 how='left', suffixes=('_2018', '_2019'))

In [15]:
teacher_turnover_2017.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17725 entries, 0 to 17724
Data columns (total 19 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   LAST_NAME                           17724 non-null  object 
 1   FIRST_NAME                          17725 non-null  object 
 2   PAY_RATE_TYPE_2016                  17725 non-null  object 
 3   PAY_RATE_2016                       17725 non-null  int64  
 4   TITLE_DESCRIPTION_2016              17725 non-null  object 
 5   HOME_ORGANIZATION                   17725 non-null  object 
 6   HOME_ORGANIZATION_DESCRIPTION_2016  17725 non-null  object 
 7   ORGANIZATION_LEVEL_2016             17725 non-null  object 
 8   TYPE_OF_REPRESENTATION_2016         17724 non-null  object 
 9   GENDER_2016                         17725 non-null  object 
 10  RUN_DATE_2016                       17725 non-null  object 
 11  PAY_RATE_TYPE_2017                  13114

In [16]:
teacher_turnover_2018.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18457 entries, 0 to 18456
Data columns (total 19 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   LAST_NAME                           18456 non-null  object 
 1   FIRST_NAME                          18457 non-null  object 
 2   PAY_RATE_TYPE_2017                  18457 non-null  object 
 3   PAY_RATE_2017                       18457 non-null  int64  
 4   TITLE_DESCRIPTION_2017              18457 non-null  object 
 5   HOME_ORGANIZATION                   18457 non-null  object 
 6   HOME_ORGANIZATION_DESCRIPTION_2017  18457 non-null  object 
 7   ORGANIZATION_LEVEL_2017             18457 non-null  object 
 8   TYPE_OF_REPRESENTATION_2017         18447 non-null  object 
 9   GENDER_2017                         18457 non-null  object 
 10  RUN_DATE_2017                       18457 non-null  object 
 11  PAY_RATE_TYPE_2018                  14170

In [17]:
teacher_turnover_2019.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19279 entries, 0 to 19278
Data columns (total 19 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   LAST_NAME                           19278 non-null  object 
 1   FIRST_NAME                          19279 non-null  object 
 2   PAY_RATE_TYPE_2018                  19279 non-null  object 
 3   PAY_RATE_2018                       19279 non-null  int64  
 4   TITLE_DESCRIPTION_2018              19279 non-null  object 
 5   HOME_ORGANIZATION                   19279 non-null  object 
 6   HOME_ORGANIZATION_DESCRIPTION_2018  19279 non-null  object 
 7   ORGANIZATION_LEVEL_2018             19279 non-null  object 
 8   TYPE_OF_REPRESENTATION_2018         18962 non-null  object 
 9   GENDER_2018                         19279 non-null  object 
 10  RUN_DATE_2018                       19279 non-null  object 
 11  PAY_RATE_TYPE_2019                  14258

The merged data frame includes all employes of the School District of Philadelphia. Let's look at identifying those who are labeled as teachers.

In [18]:
teacher_turnover_2017['TITLE_DESCRIPTION_2017'].value_counts()

TEACHER,FULL TIME                5193
TEACHER,SPEC EDUCATION           1002
ONE TO ONE ASST, SPECIAL ED       586
CLASSROOM ASST,SP ED,SV HND       508
STUDENT CLIMATE STAFF,4 HOURS     297
                                 ... 
STRATEGY ANALYST I                  1
ENVIRONMENTAL MANAGER               1
SYSTEMS ANALYST                     1
OPERATIONS TRAINEE                  1
DIRECTOR, FISCAL SERVICES           1
Name: TITLE_DESCRIPTION_2017, Length: 464, dtype: int64

In [19]:
teacher_turnover_2018['TITLE_DESCRIPTION_2017'].value_counts()

TEACHER,FULL TIME               6779
TEACHER,SPEC EDUCATION          1356
ONE TO ONE ASST, SPECIAL ED      959
CLASSROOM ASST,SP ED,SV HND      622
GENERAL CLEANER, 8 HOURS         561
                                ... 
DIR,EDUCATIONAL TECHNOLOGY         1
DISCIPLINARY HEARING OFFICER       1
FOOD SVCS MENU SPECIALIST          1
ERP SPECIALIST                     1
COMPENSATION SPECIALIST            1
Name: TITLE_DESCRIPTION_2017, Length: 566, dtype: int64

In [20]:
teacher_turnover_2019['TITLE_DESCRIPTION_2018'].value_counts()

TEACHER,FULL TIME                 6789
SPECIAL EDUCATION ASSISTANT       1848
TEACHER,SPEC EDUCATION            1389
GENERAL CLEANER, 8 HOURS           558
FOOD SVCS ASSISTANT                428
                                  ... 
DISCIPLINARY HEARING OFFICER         1
FOOD SVCS MENU SPECIALIST            1
MANAGER, CAPITAL FINANCIAL SVC       1
EX DIR PARTNERSHIP SUP & DEVEL       1
EX DIR,HEALTH,SAFETY,NUTRI,PE        1
Name: TITLE_DESCRIPTION_2018, Length: 591, dtype: int64

We are going to create a new DataFrame that only includes those employees identified as `TEACHER,FULL TIME` or `TEACHER,SPEC EDUCATION` as the remaining employee types are outside of the classroom or support staff who do not need to be liscenced educators.

In [21]:
teacher_turnover_2017 = teacher_turnover_2017.loc[(teacher_turnover_2017['TITLE_DESCRIPTION_2016']=='TEACHER,FULL TIME')|
                           (teacher_turnover_2017['TITLE_DESCRIPTION_2016']=='TEACHER,SPEC EDUCATION')]


In [22]:
teacher_turnover_2018 = teacher_turnover_2018.loc[(teacher_turnover_2018['TITLE_DESCRIPTION_2017']=='TEACHER,FULL TIME')|
                           (teacher_turnover_2018['TITLE_DESCRIPTION_2017']=='TEACHER,SPEC EDUCATION')]


In [23]:
teacher_turnover_2019 = teacher_turnover_2019.loc[(teacher_turnover_2019['TITLE_DESCRIPTION_2018']=='TEACHER,FULL TIME')|
                           (teacher_turnover_2019['TITLE_DESCRIPTION_2018']=='TEACHER,SPEC EDUCATION')]


In [24]:
teacher_turnover_2017.isna().sum()

LAST_NAME                                1
FIRST_NAME                               0
PAY_RATE_TYPE_2016                       0
PAY_RATE_2016                            0
TITLE_DESCRIPTION_2016                   0
HOME_ORGANIZATION                        0
HOME_ORGANIZATION_DESCRIPTION_2016       0
ORGANIZATION_LEVEL_2016                  0
TYPE_OF_REPRESENTATION_2016              0
GENDER_2016                              0
RUN_DATE_2016                            0
PAY_RATE_TYPE_2017                    1789
PAY_RATE_2017                         1789
TITLE_DESCRIPTION_2017                1789
HOME_ORGANIZATION_DESCRIPTION_2017    1789
ORGANIZATION_LEVEL_2017               1789
TYPE_OF_REPRESENTATION_2017           1789
GENDER_2017                           1789
RUN_DATE_2017                         1789
dtype: int64

In [25]:
teacher_turnover_2018.isna().sum()

LAST_NAME                                1
FIRST_NAME                               0
PAY_RATE_TYPE_2017                       0
PAY_RATE_2017                            0
TITLE_DESCRIPTION_2017                   0
HOME_ORGANIZATION                        0
HOME_ORGANIZATION_DESCRIPTION_2017       0
ORGANIZATION_LEVEL_2017                  0
TYPE_OF_REPRESENTATION_2017              0
GENDER_2017                              0
RUN_DATE_2017                            0
PAY_RATE_TYPE_2018                    1664
PAY_RATE_2018                         1664
TITLE_DESCRIPTION_2018                1664
HOME_ORGANIZATION_DESCRIPTION_2018    1664
ORGANIZATION_LEVEL_2018               1664
TYPE_OF_REPRESENTATION_2018           1664
GENDER_2018                           1664
RUN_DATE_2018                         1664
dtype: int64

In [26]:
teacher_turnover_2019.isna().sum()

LAST_NAME                                1
FIRST_NAME                               0
PAY_RATE_TYPE_2018                       0
PAY_RATE_2018                            0
TITLE_DESCRIPTION_2018                   0
HOME_ORGANIZATION                        0
HOME_ORGANIZATION_DESCRIPTION_2018       0
ORGANIZATION_LEVEL_2018                  0
TYPE_OF_REPRESENTATION_2018              0
GENDER_2018                              0
RUN_DATE_2018                            0
PAY_RATE_TYPE_2019                    1593
PAY_RATE_2019                         1593
TITLE_DESCRIPTION_2019                1593
HOME_ORGANIZATION_DESCRIPTION_2019    1593
ORGANIZATION_LEVEL_2019               1593
TYPE_OF_REPRESENTATION_2019           1593
GENDER_2019                           1593
RUN_DATE_2019                         1593
dtype: int64

Looking at the null values now, we can see there is one teacher with a null value for their last name, so we will fill it with 'None'. The additional null values are all in the columns from 2019 indicating that those are the teachers who left after the 2018 school year, and we want to keep those null values for now.

In [27]:
teacher_turnover_2017.loc[:,['LAST_NAME']]=teacher_turnover_2017['LAST_NAME'].fillna(value='None')

In [28]:
teacher_turnover_2018.loc[:,['LAST_NAME']]=teacher_turnover_2018['LAST_NAME'].fillna(value='None')

In [29]:
teacher_turnover_2019.loc[:,['LAST_NAME']]=teacher_turnover_2019['LAST_NAME'].fillna(value='None')

In [30]:
teacher_turnover_2017.isna().sum()

LAST_NAME                                0
FIRST_NAME                               0
PAY_RATE_TYPE_2016                       0
PAY_RATE_2016                            0
TITLE_DESCRIPTION_2016                   0
HOME_ORGANIZATION                        0
HOME_ORGANIZATION_DESCRIPTION_2016       0
ORGANIZATION_LEVEL_2016                  0
TYPE_OF_REPRESENTATION_2016              0
GENDER_2016                              0
RUN_DATE_2016                            0
PAY_RATE_TYPE_2017                    1789
PAY_RATE_2017                         1789
TITLE_DESCRIPTION_2017                1789
HOME_ORGANIZATION_DESCRIPTION_2017    1789
ORGANIZATION_LEVEL_2017               1789
TYPE_OF_REPRESENTATION_2017           1789
GENDER_2017                           1789
RUN_DATE_2017                         1789
dtype: int64

In [31]:
teacher_turnover_2018.isna().sum()

LAST_NAME                                0
FIRST_NAME                               0
PAY_RATE_TYPE_2017                       0
PAY_RATE_2017                            0
TITLE_DESCRIPTION_2017                   0
HOME_ORGANIZATION                        0
HOME_ORGANIZATION_DESCRIPTION_2017       0
ORGANIZATION_LEVEL_2017                  0
TYPE_OF_REPRESENTATION_2017              0
GENDER_2017                              0
RUN_DATE_2017                            0
PAY_RATE_TYPE_2018                    1664
PAY_RATE_2018                         1664
TITLE_DESCRIPTION_2018                1664
HOME_ORGANIZATION_DESCRIPTION_2018    1664
ORGANIZATION_LEVEL_2018               1664
TYPE_OF_REPRESENTATION_2018           1664
GENDER_2018                           1664
RUN_DATE_2018                         1664
dtype: int64

In [32]:
teacher_turnover_2019.isna().sum()

LAST_NAME                                0
FIRST_NAME                               0
PAY_RATE_TYPE_2018                       0
PAY_RATE_2018                            0
TITLE_DESCRIPTION_2018                   0
HOME_ORGANIZATION                        0
HOME_ORGANIZATION_DESCRIPTION_2018       0
ORGANIZATION_LEVEL_2018                  0
TYPE_OF_REPRESENTATION_2018              0
GENDER_2018                              0
RUN_DATE_2018                            0
PAY_RATE_TYPE_2019                    1593
PAY_RATE_2019                         1593
TITLE_DESCRIPTION_2019                1593
HOME_ORGANIZATION_DESCRIPTION_2019    1593
ORGANIZATION_LEVEL_2019               1593
TYPE_OF_REPRESENTATION_2019           1593
GENDER_2019                           1593
RUN_DATE_2019                         1593
dtype: int64

#### School Data

Load in the data from the 2018 School Progress Reports from the School District of Philadelphia. The school ratings from the 2018 school year would likely influence teacher turnover and whether a teacher returns or not for the 2019 school year.

In [33]:
school_df_2016 = pd.read_excel('data/SPR_SY1516_School_Metric_Scores_20170203.xlsx', sheet_name='SY2015-2016 SPR')
school_df_2016.head()

Unnamed: 0,School,SRC School ID,Report,Rpt Type Long,Street Address,City,State,Zip Code,Phone Number,Fax Number,...,FAFSA Tier,Student Survey College & Career Score,Student Survey College & Career Pts Earn,Student Survey College & Career Pts Poss,Student Survey College & Career Pct Earn,Student Survey College & Career Tier,Teach Effect Distinguished Score,Teach Effect Instruction Score,Teacher Attendance Score,Student Survey Teaching Score
0,John Bartram High School,101,HS,High School,2401 S. 67th St.,Philadelphia,PA,19142,215-492-6450,215-492-6117,...,INTERVENE,22,0.22,1,22,INTERVENE,7,10,56,45
1,West Philadelphia High School,102,HS,High School,4901 Chestnut St.,Philadelphia,PA,19139,215-471-2902,215-471-6402,...,WATCH,31,0.31,1,31,WATCH,9,40,65,44
2,High School of the Future,103,HS,High School,4021 Parkside Ave.,Philadelphia,PA,19104,215-823-5500,215-823-5504,...,WATCH,27,0.27,1,27,WATCH,0,0,53,46
3,Paul Robeson High School for Human Services,105,HS,High School,4125 Ludlow St.,Philadelphia,PA,19104,215-823-8207,215-823-8252,...,REINFORCE,38,0.38,1,38,WATCH,15,18,76,56
4,William L. Sayre High School,110,HS,High School,5800 Walnut St.,Philadelphia,PA,19139,215-471-2904,215-471-3486,...,INTERVENE,29,0.29,1,29,WATCH,4,6,57,46


In [34]:
school_df_2017 = pd.read_excel('data/SPR_SY1617_School_Metric_Scores_20180206.xlsx', sheet_name='SY2016-2017 SPR')
school_df_2017.head()

Unnamed: 0,School,SRC School ID,Report,Rpt Type Long,Street Address,City,State,Zip Code,Phone Number,Fax Number,...,FAFSA Tier,Student Survey College & Career Score,Student Survey College & Career Pts Earn,Student Survey College & Career Pts Poss,Student Survey College & Career Pct Earn,Student Survey College & Career Tier,Teach Effect Distinguished Score,Teach Effect Instruction Score,Teacher Attendance Score,Student Survey Teaching Score
0,John Bartram High School,101,HS,High School,2401 S 67th St,Philadelphia,PA,19142,215-400-8100,215-400-8101,...,WATCH,19,0.19,1,19,INTERVENE,15,21,56,47
1,West Philadelphia High School,102,HS,High School,4901 Chestnut St,Philadelphia,PA,19139,215-400-7900,215-400-7901,...,WATCH,27,0.27,1,27,WATCH,11,29,56,42
2,High School of the Future,103,HS,High School,4021 Parkside Ave,Philadelphia,PA,19104,215-400-7790,215-400-7791,...,REINFORCE,0,0.0,1,0,INTERVENE,16,13,50,0
3,Paul Robeson High School for Human Services,105,HS,High School,4125 Ludlow St,Philadelphia,PA,19104,215-400-7780,215-400-7781,...,MODEL,27,0.27,1,27,WATCH,15,15,61,55
4,William L. Sayre High School,110,HS,High School,5800 Walnut St,Philadelphia,PA,19139,215-400-7800,215-400-7801,...,WATCH,24,0.24,1,24,INTERVENE,6,10,53,46


In [35]:
school_df_2018 = pd.read_excel('data/SPR_SY1718_School_Metric_Scores_20190129.xlsx', sheet_name='SPR SY2017-2018')
school_df_2018.head()

Unnamed: 0,School,SRC School ID,ULCS Code,Report,Rpt Type Long,Street Address,City,State,Zip Code,Phone Number,...,FAFSA Tier,Student Survey College & Career Score,Student Survey College & Career Pts Earn,Student Survey College & Career Pts Poss,Student Survey College & Career Pct Earn,Student Survey College & Career Tier,Teach Effect Distinguished Score,Teach Effect Instruction Score,Teacher Attendance Score,Student Survey Teaching Score
0,John Bartram High School,101,1010,HS,High School,2401 S. 67th St.,Philadelphia,PA,19142,215-400-8100,...,INTERVENE,33,0.33,1,33,WATCH,Data Not Available,Data Not Available,52,43
1,West Philadelphia High School,102,1020,HS,High School,4901 Chestnut St.,Philadelphia,PA,19139,215-400-7900,...,WATCH,0,0.0,1,0,INTERVENE,Data Not Available,Data Not Available,49,0
2,High School of the Future,103,1030,HS,High School,4021 Parkside Ave.,Philadelphia,PA,19104,215-400-7790,...,REINFORCE,40,0.4,1,40,WATCH,Data Not Available,Data Not Available,56,36
3,Paul Robeson High School for Human Services,105,1050,HS,High School,4125 Ludlow St.,Philadelphia,PA,19104,215-400-7780,...,MODEL,39,0.39,1,39,WATCH,Data Not Available,Data Not Available,68,53
4,William L. Sayre High School,110,1100,HS,High School,5800 Walnut St.,Philadelphia,PA,19139,215-400-7800,...,INTERVENE,0,0.0,1,0,INTERVENE,Data Not Available,Data Not Available,71,0


In [36]:
merge_code = school_df_2018[['SRC School ID','ULCS Code']]

In [37]:
school_df_2016['SRC School ID'] = school_df_2016['SRC School ID'].astype(str)
school_df_2017['SRC School ID'] = school_df_2017['SRC School ID'].astype(str)
merge_code['SRC School ID'] = merge_code['SRC School ID'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merge_code['SRC School ID'] = merge_code['SRC School ID'].astype(str)


In [38]:
merge_code.head()

Unnamed: 0,SRC School ID,ULCS Code
0,101,1010
1,102,1020
2,103,1030
3,105,1050
4,110,1100


In [39]:
school_df_2016 = pd.merge(school_df_2016, merge_code, 
                             left_on='SRC School ID', 
                             right_on='SRC School ID',
                             how='left')

In [40]:
school_df_2017 = pd.merge(school_df_2017, merge_code, 
                             left_on='SRC School ID', 
                             right_on='SRC School ID',
                             how='left')

In [41]:
school_df_2016.head()

Unnamed: 0,School,SRC School ID,Report,Rpt Type Long,Street Address,City,State,Zip Code,Phone Number,Fax Number,...,Student Survey College & Career Score,Student Survey College & Career Pts Earn,Student Survey College & Career Pts Poss,Student Survey College & Career Pct Earn,Student Survey College & Career Tier,Teach Effect Distinguished Score,Teach Effect Instruction Score,Teacher Attendance Score,Student Survey Teaching Score,ULCS Code
0,John Bartram High School,101,HS,High School,2401 S. 67th St.,Philadelphia,PA,19142,215-492-6450,215-492-6117,...,22,0.22,1,22,INTERVENE,7,10,56,45,1010.0
1,West Philadelphia High School,102,HS,High School,4901 Chestnut St.,Philadelphia,PA,19139,215-471-2902,215-471-6402,...,31,0.31,1,31,WATCH,9,40,65,44,1020.0
2,High School of the Future,103,HS,High School,4021 Parkside Ave.,Philadelphia,PA,19104,215-823-5500,215-823-5504,...,27,0.27,1,27,WATCH,0,0,53,46,1030.0
3,Paul Robeson High School for Human Services,105,HS,High School,4125 Ludlow St.,Philadelphia,PA,19104,215-823-8207,215-823-8252,...,38,0.38,1,38,WATCH,15,18,76,56,1050.0
4,William L. Sayre High School,110,HS,High School,5800 Walnut St.,Philadelphia,PA,19139,215-471-2904,215-471-3486,...,29,0.29,1,29,WATCH,4,6,57,46,1100.0


In [42]:
school_df_2017.head()

Unnamed: 0,School,SRC School ID,Report,Rpt Type Long,Street Address,City,State,Zip Code,Phone Number,Fax Number,...,Student Survey College & Career Score,Student Survey College & Career Pts Earn,Student Survey College & Career Pts Poss,Student Survey College & Career Pct Earn,Student Survey College & Career Tier,Teach Effect Distinguished Score,Teach Effect Instruction Score,Teacher Attendance Score,Student Survey Teaching Score,ULCS Code
0,John Bartram High School,101,HS,High School,2401 S 67th St,Philadelphia,PA,19142,215-400-8100,215-400-8101,...,19,0.19,1,19,INTERVENE,15,21,56,47,1010.0
1,West Philadelphia High School,102,HS,High School,4901 Chestnut St,Philadelphia,PA,19139,215-400-7900,215-400-7901,...,27,0.27,1,27,WATCH,11,29,56,42,1020.0
2,High School of the Future,103,HS,High School,4021 Parkside Ave,Philadelphia,PA,19104,215-400-7790,215-400-7791,...,0,0.0,1,0,INTERVENE,16,13,50,0,1030.0
3,Paul Robeson High School for Human Services,105,HS,High School,4125 Ludlow St,Philadelphia,PA,19104,215-400-7780,215-400-7781,...,27,0.27,1,27,WATCH,15,15,61,55,1050.0
4,William L. Sayre High School,110,HS,High School,5800 Walnut St,Philadelphia,PA,19139,215-400-7800,215-400-7801,...,24,0.24,1,24,INTERVENE,6,10,53,46,1100.0


The School Progress Reports contain many features, including the way some of the features are calculated and features that are specific to a certain type of school and are not reported across all schools. For the purpose of our model, we want to include information that is reported for all schools including features regarding student achievement, school climate, and progress from prior years.

In [43]:
school_col_keep = ['School', 'ULCS Code', 'Report', 'Turnaround Model', 'Enrollment',
                   'Grades Served', 'Admissions Type', 'Overall Score', 'Overall Tier', 
                   'Ach Score', 'Ach Tier', 'Prog Score', 'Prog Tier', 'Clim Score', 
                   'Clim Tier', 'Attendance (95%+) Score', 'Attendance (95%+) Tier', 
                   'Retention Score', 'Retention Tier', 'ISS Score', 'ISS Tier', 'OSS Score',
                   'OSS Tier', 'Student Survey Climate Score', 'Student Survey Climate Tier', 
                   'Parent Survey Climate Score', 'Parent Survey Climate Tier', 
                   'Parent Survey Participation Score', 'Parent Survey Participation Tier', 
                   'Teacher Attendance Score','Student Survey Teaching Score']

In [44]:
school_df_2016=school_df_2016[school_col_keep]

In [45]:
school_df_2017=school_df_2017[school_col_keep]

In [46]:
school_df_2018=school_df_2018[school_col_keep]

In [47]:
school_df_2016.head()

Unnamed: 0,School,ULCS Code,Report,Turnaround Model,Enrollment,Grades Served,Admissions Type,Overall Score,Overall Tier,Ach Score,...,OSS Score,OSS Tier,Student Survey Climate Score,Student Survey Climate Tier,Parent Survey Climate Score,Parent Survey Climate Tier,Parent Survey Participation Score,Parent Survey Participation Tier,Teacher Attendance Score,Student Survey Teaching Score
0,John Bartram High School,1010.0,HS,,710,'9-12',Neighborhood,9,INTERVENE,4,...,63,INTERVENE,53,REINFORCE,0,INTERVENE,3,INTERVENE,56,45
1,West Philadelphia High School,1020.0,HS,Promise Academy,535,'9-12',Neighborhood,14,INTERVENE,0,...,79,WATCH,62,REINFORCE,81,MODEL,27,WATCH,65,44
2,High School of the Future,1030.0,HS,,518,'9-12',Citywide,24,INTERVENE,2,...,82,WATCH,63,REINFORCE,0,INTERVENE,6,INTERVENE,53,46
3,Paul Robeson High School for Human Services,1050.0,HS,,293,'9-12',Citywide,49,WATCH,6,...,97,MODEL,65,REINFORCE,94,MODEL,20,WATCH,76,56
4,William L. Sayre High School,1100.0,HS,,509,'9-12',Neighborhood,8,INTERVENE,0,...,87,REINFORCE,56,REINFORCE,0,INTERVENE,1,INTERVENE,57,46


In [48]:
school_df_2017.head()

Unnamed: 0,School,ULCS Code,Report,Turnaround Model,Enrollment,Grades Served,Admissions Type,Overall Score,Overall Tier,Ach Score,...,OSS Score,OSS Tier,Student Survey Climate Score,Student Survey Climate Tier,Parent Survey Climate Score,Parent Survey Climate Tier,Parent Survey Participation Score,Parent Survey Participation Tier,Teacher Attendance Score,Student Survey Teaching Score
0,John Bartram High School,1010.0,HS,,601,'9-12',Neighborhood,14,INTERVENE,0,...,83,WATCH,54,REINFORCE,0,INTERVENE,4,INTERVENE,56,47
1,West Philadelphia High School,1020.0,HS,Turnaround,483,'9-12',Neighborhood,22,INTERVENE,0,...,77,INTERVENE,59,REINFORCE,79,MODEL,22,WATCH,56,42
2,High School of the Future,1030.0,HS,,476,'9-12',Citywide,23,INTERVENE,3,...,93,MODEL,0,INTERVENE,0,INTERVENE,3,INTERVENE,50,0
3,Paul Robeson High School for Human Services,1050.0,HS,,297,'9-12',Citywide,41,WATCH,0,...,96,MODEL,69,REINFORCE,0,INTERVENE,6,INTERVENE,61,55
4,William L. Sayre High School,1100.0,HS,,492,'9-12',Neighborhood,8,INTERVENE,0,...,87,REINFORCE,56,REINFORCE,0,INTERVENE,9,INTERVENE,53,46


In [49]:
school_df_2018.head()

Unnamed: 0,School,ULCS Code,Report,Turnaround Model,Enrollment,Grades Served,Admissions Type,Overall Score,Overall Tier,Ach Score,...,OSS Score,OSS Tier,Student Survey Climate Score,Student Survey Climate Tier,Parent Survey Climate Score,Parent Survey Climate Tier,Parent Survey Participation Score,Parent Survey Participation Tier,Teacher Attendance Score,Student Survey Teaching Score
0,John Bartram High School,1010,HS,,561,'9-12',Neighborhood,13,INTERVENE,0,...,82,WATCH,52,REINFORCE,0,INTERVENE,2,INTERVENE,52,43
1,West Philadelphia High School,1020,HS,Turnaround,478,'9-12',Neighborhood,8,INTERVENE,0,...,76,INTERVENE,0,INTERVENE,0,INTERVENE,8,INTERVENE,49,0
2,High School of the Future,1030,HS,,488,'9-12',Citywide,25,WATCH,0,...,87,REINFORCE,55,REINFORCE,0,INTERVENE,2,INTERVENE,56,36
3,Paul Robeson High School for Human Services,1050,HS,,299,'9-12',Citywide,44,WATCH,3,...,95,MODEL,64,REINFORCE,82,MODEL,13,INTERVENE,68,53
4,William L. Sayre High School,1100,HS,,425,'9-12',Neighborhood,8,INTERVENE,0,...,82,WATCH,0,INTERVENE,0,INTERVENE,8,INTERVENE,71,0


When checking for null values, the `Turnaround Model` has null values meaning the school was not identified to participate in a turnaround model for the 2017-2018 school year. The null values will be filled with 'None'.

In [50]:
school_df_2016.isna().sum()

School                                 0
ULCS Code                             14
Report                                 0
Turnaround Model                     340
Enrollment                             0
Grades Served                          0
Admissions Type                        0
Overall Score                          0
Overall Tier                           0
Ach Score                              0
Ach Tier                               0
Prog Score                             0
Prog Tier                              0
Clim Score                             0
Clim Tier                              0
Attendance (95%+) Score                0
Attendance (95%+) Tier                 0
Retention Score                        0
Retention Tier                         0
ISS Score                              0
ISS Tier                               0
OSS Score                              0
OSS Tier                               0
Student Survey Climate Score           0
Student Survey C

In [51]:
school_df_2017.isna().sum()

School                                 0
ULCS Code                              3
Report                                 0
Turnaround Model                     328
Enrollment                             0
Grades Served                          0
Admissions Type                        0
Overall Score                          0
Overall Tier                           0
Ach Score                              0
Ach Tier                               0
Prog Score                             0
Prog Tier                              0
Clim Score                             0
Clim Tier                              0
Attendance (95%+) Score                0
Attendance (95%+) Tier                 0
Retention Score                        0
Retention Tier                         0
ISS Score                              0
ISS Tier                               0
OSS Score                              0
OSS Tier                               0
Student Survey Climate Score           0
Student Survey C

In [52]:
school_df_2018.isna().sum()

School                                 0
ULCS Code                              0
Report                                 0
Turnaround Model                     272
Enrollment                             0
Grades Served                          0
Admissions Type                        0
Overall Score                          0
Overall Tier                           0
Ach Score                              0
Ach Tier                               0
Prog Score                             0
Prog Tier                              0
Clim Score                             0
Clim Tier                              0
Attendance (95%+) Score                0
Attendance (95%+) Tier                 0
Retention Score                        0
Retention Tier                         0
ISS Score                              0
ISS Tier                               0
OSS Score                              0
OSS Tier                               0
Student Survey Climate Score           0
Student Survey C

In [53]:
school_df_2016.loc[:,['Turnaround Model']]=school_df_2016['Turnaround Model'].fillna(value='None')

In [54]:
school_df_2017.loc[:,['Turnaround Model']]=school_df_2017['Turnaround Model'].fillna(value='None')

In [55]:
school_df_2018.loc[:,['Turnaround Model']]=school_df_2018['Turnaround Model'].fillna(value='None')

In [56]:
school_df_2016.isna().sum()

School                                0
ULCS Code                            14
Report                                0
Turnaround Model                      0
Enrollment                            0
Grades Served                         0
Admissions Type                       0
Overall Score                         0
Overall Tier                          0
Ach Score                             0
Ach Tier                              0
Prog Score                            0
Prog Tier                             0
Clim Score                            0
Clim Tier                             0
Attendance (95%+) Score               0
Attendance (95%+) Tier                0
Retention Score                       0
Retention Tier                        0
ISS Score                             0
ISS Tier                              0
OSS Score                             0
OSS Tier                              0
Student Survey Climate Score          0
Student Survey Climate Tier           0


In [57]:
school_df_2017.isna().sum()

School                               0
ULCS Code                            3
Report                               0
Turnaround Model                     0
Enrollment                           0
Grades Served                        0
Admissions Type                      0
Overall Score                        0
Overall Tier                         0
Ach Score                            0
Ach Tier                             0
Prog Score                           0
Prog Tier                            0
Clim Score                           0
Clim Tier                            0
Attendance (95%+) Score              0
Attendance (95%+) Tier               0
Retention Score                      0
Retention Tier                       0
ISS Score                            0
ISS Tier                             0
OSS Score                            0
OSS Tier                             0
Student Survey Climate Score         0
Student Survey Climate Tier          0
Parent Survey Climate Sco

In [58]:
school_df_2018.isna().sum()

School                               0
ULCS Code                            0
Report                               0
Turnaround Model                     0
Enrollment                           0
Grades Served                        0
Admissions Type                      0
Overall Score                        0
Overall Tier                         0
Ach Score                            0
Ach Tier                             0
Prog Score                           0
Prog Tier                            0
Clim Score                           0
Clim Tier                            0
Attendance (95%+) Score              0
Attendance (95%+) Tier               0
Retention Score                      0
Retention Tier                       0
ISS Score                            0
ISS Tier                             0
OSS Score                            0
OSS Tier                             0
Student Survey Climate Score         0
Student Survey Climate Tier          0
Parent Survey Climate Sco

### Merging Teacher Data with School Data

In order to merge the individual teacher data with the school progress report data, we will need to merge on the school code as the school name is not written the same between files. The school code is consistent, however, in the teacher data it is stored as an object when it is numerical in the school data.

We will turn the `HOME_ORGANIZATION` code into a numeric and those that are numerical will be coerced to a null value. The staff with the non-numeric codes are assigned to locations that are not actual school buildings such as code `9KT0` represents the Office of High School Support.

Then, we will drop the null values from the `HOME ORGANIZATION`.

In [59]:
teacher_turnover_2017.loc[:,['HOME_ORGANIZATION']] = pd.to_numeric(teacher_turnover_2017['HOME_ORGANIZATION'], 
                                                        errors='coerce')

In [60]:
teacher_turnover_2018.loc[:,['HOME_ORGANIZATION']] = pd.to_numeric(teacher_turnover_2018['HOME_ORGANIZATION'], 
                                                        errors='coerce')

In [61]:
teacher_turnover_2019.loc[:,['HOME_ORGANIZATION']] = pd.to_numeric(teacher_turnover_2019['HOME_ORGANIZATION'], 
                                                        errors='coerce')

In [62]:
teacher_turnover_2017.dropna(subset = ['HOME_ORGANIZATION'], inplace=True)

In [63]:
teacher_turnover_2018.dropna(subset = ['HOME_ORGANIZATION'], inplace=True)

In [64]:
teacher_turnover_2019.dropna(subset = ['HOME_ORGANIZATION'], inplace=True)

In [65]:
teacher_turnover_2017.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7882 entries, 4 to 17724
Data columns (total 19 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   LAST_NAME                           7882 non-null   object 
 1   FIRST_NAME                          7882 non-null   object 
 2   PAY_RATE_TYPE_2016                  7882 non-null   object 
 3   PAY_RATE_2016                       7882 non-null   int64  
 4   TITLE_DESCRIPTION_2016              7882 non-null   object 
 5   HOME_ORGANIZATION                   7882 non-null   float64
 6   HOME_ORGANIZATION_DESCRIPTION_2016  7882 non-null   object 
 7   ORGANIZATION_LEVEL_2016             7882 non-null   object 
 8   TYPE_OF_REPRESENTATION_2016         7882 non-null   object 
 9   GENDER_2016                         7882 non-null   object 
 10  RUN_DATE_2016                       7882 non-null   object 
 11  PAY_RATE_TYPE_2017                  6102 n

In [66]:
teacher_turnover_2018.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8050 entries, 4 to 18455
Data columns (total 19 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   LAST_NAME                           8050 non-null   object 
 1   FIRST_NAME                          8050 non-null   object 
 2   PAY_RATE_TYPE_2017                  8050 non-null   object 
 3   PAY_RATE_2017                       8050 non-null   int64  
 4   TITLE_DESCRIPTION_2017              8050 non-null   object 
 5   HOME_ORGANIZATION                   8050 non-null   float64
 6   HOME_ORGANIZATION_DESCRIPTION_2017  8050 non-null   object 
 7   ORGANIZATION_LEVEL_2017             8050 non-null   object 
 8   TYPE_OF_REPRESENTATION_2017         8050 non-null   object 
 9   GENDER_2017                         8050 non-null   object 
 10  RUN_DATE_2017                       8050 non-null   object 
 11  PAY_RATE_TYPE_2018                  6393 n

In [67]:
teacher_turnover_2019.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8085 entries, 4 to 19277
Data columns (total 19 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   LAST_NAME                           8085 non-null   object 
 1   FIRST_NAME                          8085 non-null   object 
 2   PAY_RATE_TYPE_2018                  8085 non-null   object 
 3   PAY_RATE_2018                       8085 non-null   int64  
 4   TITLE_DESCRIPTION_2018              8085 non-null   object 
 5   HOME_ORGANIZATION                   8085 non-null   float64
 6   HOME_ORGANIZATION_DESCRIPTION_2018  8085 non-null   object 
 7   ORGANIZATION_LEVEL_2018             8085 non-null   object 
 8   TYPE_OF_REPRESENTATION_2018         8085 non-null   object 
 9   GENDER_2018                         8085 non-null   object 
 10  RUN_DATE_2018                       8085 non-null   object 
 11  PAY_RATE_TYPE_2019                  6521 n

Now we can merge the teacher data with the school data using what is referred to as `HOME_ORGANIZATION` in the teacher data and the `ULCS Code` for the school data.

In [68]:
full_turnover_2017= pd.merge(teacher_turnover_2017, school_df_2016, 
                             left_on='HOME_ORGANIZATION', 
                             right_on='ULCS Code',
                             how='left')

In [69]:
full_turnover_2018= pd.merge(teacher_turnover_2018, school_df_2017, 
                             left_on='HOME_ORGANIZATION', 
                             right_on='ULCS Code',
                             how='left')

In [70]:
full_turnover_2019= pd.merge(teacher_turnover_2019, school_df_2018, 
                             left_on='HOME_ORGANIZATION', 
                             right_on='ULCS Code',
                             how='left')

In [71]:
full_turnover_2017.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8281 entries, 0 to 8280
Data columns (total 50 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   LAST_NAME                           8281 non-null   object 
 1   FIRST_NAME                          8281 non-null   object 
 2   PAY_RATE_TYPE_2016                  8281 non-null   object 
 3   PAY_RATE_2016                       8281 non-null   int64  
 4   TITLE_DESCRIPTION_2016              8281 non-null   object 
 5   HOME_ORGANIZATION                   8281 non-null   float64
 6   HOME_ORGANIZATION_DESCRIPTION_2016  8281 non-null   object 
 7   ORGANIZATION_LEVEL_2016             8281 non-null   object 
 8   TYPE_OF_REPRESENTATION_2016         8281 non-null   object 
 9   GENDER_2016                         8281 non-null   object 
 10  RUN_DATE_2016                       8281 non-null   object 
 11  PAY_RATE_TYPE_2017                  6399 no

In [72]:
full_turnover_2018.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8521 entries, 0 to 8520
Data columns (total 50 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   LAST_NAME                           8521 non-null   object 
 1   FIRST_NAME                          8521 non-null   object 
 2   PAY_RATE_TYPE_2017                  8521 non-null   object 
 3   PAY_RATE_2017                       8521 non-null   int64  
 4   TITLE_DESCRIPTION_2017              8521 non-null   object 
 5   HOME_ORGANIZATION                   8521 non-null   float64
 6   HOME_ORGANIZATION_DESCRIPTION_2017  8521 non-null   object 
 7   ORGANIZATION_LEVEL_2017             8521 non-null   object 
 8   TYPE_OF_REPRESENTATION_2017         8521 non-null   object 
 9   GENDER_2017                         8521 non-null   object 
 10  RUN_DATE_2017                       8521 non-null   object 
 11  PAY_RATE_TYPE_2018                  6777 no

In [73]:
full_turnover_2019.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8252 entries, 0 to 8251
Data columns (total 50 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   LAST_NAME                           8252 non-null   object 
 1   FIRST_NAME                          8252 non-null   object 
 2   PAY_RATE_TYPE_2018                  8252 non-null   object 
 3   PAY_RATE_2018                       8252 non-null   int64  
 4   TITLE_DESCRIPTION_2018              8252 non-null   object 
 5   HOME_ORGANIZATION                   8252 non-null   float64
 6   HOME_ORGANIZATION_DESCRIPTION_2018  8252 non-null   object 
 7   ORGANIZATION_LEVEL_2018             8252 non-null   object 
 8   TYPE_OF_REPRESENTATION_2018         8252 non-null   object 
 9   GENDER_2018                         8252 non-null   object 
 10  RUN_DATE_2018                       8252 non-null   object 
 11  PAY_RATE_TYPE_2019                  6663 no

In [74]:
full_turnover_2017[full_turnover_2017.filter(like='School').isnull().any(1)]

Unnamed: 0,LAST_NAME,FIRST_NAME,PAY_RATE_TYPE_2016,PAY_RATE_2016,TITLE_DESCRIPTION_2016,HOME_ORGANIZATION,HOME_ORGANIZATION_DESCRIPTION_2016,ORGANIZATION_LEVEL_2016,TYPE_OF_REPRESENTATION_2016,GENDER_2016,...,OSS Score,OSS Tier,Student Survey Climate Score,Student Survey Climate Tier,Parent Survey Climate Score,Parent Survey Climate Tier,Parent Survey Participation Score,Parent Survey Participation Tier,Teacher Attendance Score,Student Survey Teaching Score
0,ABAYOMI-IGE,OLABIMPE,SALARIED,76461,"TEACHER,SPEC EDUCATION",6100.0,"LEEDS, MORRIS E. MIDDLE SCHOOL",MIDDLE SCHOOL,PFT-TEACHER,F,...,,,,,,,,,,
6,ABDUL-WAKEEL,AMIRA,SALARIED,49615,"TEACHER,FULL TIME",6100.0,"LEEDS, MORRIS E. MIDDLE SCHOOL",MIDDLE SCHOOL,PFT-TEACHER,F,...,,,,,,,,,,
33,ADAMS,CA-TISHA,SALARIED,83382,"TEACHER,FULL TIME",4394.0,PRATT CENTER HEAD START,EARLY CHILDHOOD,PFT-TEACHER,F,...,,,,,,,,,,
51,ADERSON,CRYSTAL,SALARIED,60453,"TEACHER,FULL TIME",1330.0,"HUEY, SAMUEL B. SCHOOL",ELEMENTARY SCHOOL,PFT-TEACHER,F,...,,,,,,,,,,
91,ALBUCK,STEPHANIE,SALARIED,76461,"TEACHER,FULL TIME",3190.0,CROSSROADS @ HUNTING PARK,TRANSITION / OVERAGE SCHOOL,PFT-TEACHER,F,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8138,WRIGHT,AMY,SALARIED,67789,"TEACHER,FULL TIME",2242.0,BREGY HEAD START,EARLY CHILDHOOD,PFT- PRE K,F,...,,,,,,,,,,
8181,YELLOCK,MONICA,SALARIED,64045,"TEACHER,FULL TIME",1330.0,"HUEY, SAMUEL B. SCHOOL",ELEMENTARY SCHOOL,PFT-TEACHER,F,...,,,,,,,,,,
8195,YORKONIS,TRACY,SALARIED,62368,"TEACHER,FULL TIME",1475.0,HAVERFORD CTR BRIGHT FUTURES,EARLY CHILDHOOD,PFT- PRE K,F,...,,,,,,,,,,
8218,YUSIBOVA,GALINA,SALARIED,65121,"TEACHER,FULL TIME",5600.0,KENSINGTON URBAN EDUCATION,HIGH SCHOOL,PFT-TEACHER,F,...,,,,,,,,,,


In [75]:
full_turnover_2018[full_turnover_2018.filter(like='School').isnull().any(1)]

Unnamed: 0,LAST_NAME,FIRST_NAME,PAY_RATE_TYPE_2017,PAY_RATE_2017,TITLE_DESCRIPTION_2017,HOME_ORGANIZATION,HOME_ORGANIZATION_DESCRIPTION_2017,ORGANIZATION_LEVEL_2017,TYPE_OF_REPRESENTATION_2017,GENDER_2017,...,OSS Score,OSS Tier,Student Survey Climate Score,Student Survey Climate Tier,Parent Survey Climate Score,Parent Survey Climate Tier,Parent Survey Participation Score,Parent Survey Participation Tier,Teacher Attendance Score,Student Survey Teaching Score
19,ABT,DEBBIE,SALARIED,78376,"TEACHER,FULL TIME",6392.0,STEEL SCHOOL HEAD START,EARLY CHILDHOOD,PFT- PRE K,F,...,,,,,,,,,,
82,ALBERTI,JOE,SALARIED,79313,"TEACHER,FULL TIME",2050.0,SCIENCE LEADERSHIP ACADEMY MS,MIDDLE SCHOOL,PFT-TEACHER,M,...,,,,,,,,,,
118,ALLEN,HEATHER,SALARIED,49615,"TEACHER,FULL TIME",6392.0,STEEL SCHOOL HEAD START,EARLY CHILDHOOD,PFT- PRE K,F,...,,,,,,,,,,
123,ALLMAN,ELIZABETH,SALARIED,51113,"TEACHER,FULL TIME",4399.0,WRIGHT HEAD START,EARLY CHILDHOOD,PFT- PRE K,F,...,,,,,,,,,,
182,ANGELINI,CHRISTOPHER,SALARIED,62368,"TEACHER,FULL TIME",8690.0,CROSSROADS ACCELERATED ACADEMY,TRANSITION / OVERAGE SCHOOL,PFT-TEACHER,M,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8379,WRIGHT,AMY,SALARIED,67789,"TEACHER,FULL TIME",2242.0,BREGY HEAD START,EARLY CHILDHOOD,PFT- PRE K,F,...,,,,,,,,,,
8435,YORKONIS,TRACY,SALARIED,62368,"TEACHER,FULL TIME",1475.0,HAVERFORD CTR BRIGHT FUTURES,EARLY CHILDHOOD,PFT- PRE K,F,...,,,,,,,,,,
8454,YU,JUYEON,SALARIED,90051,"TEACHER,FULL TIME",6400.0,WIDENER MEMORIAL SCHOOL,ELEMENTARY SCHOOL,PFT-TEACHER,F,...,,,,,,,,,,
8467,ZAJDEL,HEATHER,SALARIED,62368,"TEACHER,FULL TIME",8460.0,PHILA LEARNING ACADEMY-SOUTH,TRANSITION / OVERAGE SCHOOL,PFT-TEACHER,F,...,,,,,,,,,,


In [76]:
full_turnover_2019[full_turnover_2019.filter(like='School').isnull().any(1)]

Unnamed: 0,LAST_NAME,FIRST_NAME,PAY_RATE_TYPE_2018,PAY_RATE_2018,TITLE_DESCRIPTION_2018,HOME_ORGANIZATION,HOME_ORGANIZATION_DESCRIPTION_2018,ORGANIZATION_LEVEL_2018,TYPE_OF_REPRESENTATION_2018,GENDER_2018,...,OSS Score,OSS Tier,Student Survey Climate Score,Student Survey Climate Tier,Parent Survey Climate Score,Parent Survey Climate Tier,Parent Survey Participation Score,Parent Survey Participation Tier,Teacher Attendance Score,Student Survey Teaching Score
14,ABRAHAM-CUFF,NAFHRAH,SALARIED,48527,"TEACHER,SPEC EDUCATION",6400.0,WIDENER MEMORIAL SCHOOL,ELEMENTARY SCHOOL,PFT-TEACHER,F,...,,,,,,,,,,
18,ABT,DEBBIE,SALARIED,78376,"TEACHER,FULL TIME",6392.0,STEEL SCHOOL HEAD START,EARLY CHILDHOOD,PFT- PRE K,F,...,,,,,,,,,,
113,ALLEN,HEATHER,SALARIED,52196,"TEACHER,FULL TIME",6202.0,"DAY, ANNA B. HEAD START",EARLY CHILDHOOD,PFT- PRE K,F,...,,,,,,,,,,
114,ALLEN,MAYA,SALARIED,45359,"TEACHER,FULL TIME",6341.0,PENNELL HEAD START,EARLY CHILDHOOD,PFT-TEACHER,F,...,,,,,,,,,,
117,ALLMAN,ELIZABETH,SALARIED,59532,"TEACHER,FULL TIME",4399.0,WRIGHT HEAD START,EARLY CHILDHOOD,PFT- PRE K,F,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8167,YORKONIS,TRACY,SALARIED,67789,"TEACHER,FULL TIME",1475.0,HAVERFORD CTR BRIGHT FUTURES,EARLY CHILDHOOD,PFT- PRE K,F,...,,,,,,,,,,
8185,YU,JUYEON,SALARIED,90051,"TEACHER,FULL TIME",6400.0,WIDENER MEMORIAL SCHOOL,ELEMENTARY SCHOOL,PFT-TEACHER,F,...,,,,,,,,,,
8197,ZAJDEL,HEATHER,SALARIED,90051,"TEACHER,FULL TIME",8460.0,PHILA LEARNING ACADEMY-SOUTH,TRANSITION / OVERAGE SCHOOL,PFT-TEACHER,F,...,,,,,,,,,,
8213,ZELNER,KATHLEEN,SALARIED,69623,"TEACHER,FULL TIME",8272.0,HOLME HEAD START,EARLY CHILDHOOD,PFT- PRE K,F,...,,,,,,,,,,


After the merge, we can see that the teachers with no school listed are those from primarily Pre-K or Transition/Overage Schools where we don't have School Progress Report data for those schools. We will drop those ~250 rows to focus on teachers in Philadelphia schools that participated in the 2017-2018 School Progress Report.

In [77]:
full_turnover_2017.dropna(subset = ['School'], inplace=True)

In [78]:
full_turnover_2018.dropna(subset = ['School'], inplace=True)

In [79]:
full_turnover_2019.dropna(subset = ['School'], inplace=True)

In [80]:
full_turnover_2017.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7942 entries, 1 to 8280
Data columns (total 50 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   LAST_NAME                           7942 non-null   object 
 1   FIRST_NAME                          7942 non-null   object 
 2   PAY_RATE_TYPE_2016                  7942 non-null   object 
 3   PAY_RATE_2016                       7942 non-null   int64  
 4   TITLE_DESCRIPTION_2016              7942 non-null   object 
 5   HOME_ORGANIZATION                   7942 non-null   float64
 6   HOME_ORGANIZATION_DESCRIPTION_2016  7942 non-null   object 
 7   ORGANIZATION_LEVEL_2016             7942 non-null   object 
 8   TYPE_OF_REPRESENTATION_2016         7942 non-null   object 
 9   GENDER_2016                         7942 non-null   object 
 10  RUN_DATE_2016                       7942 non-null   object 
 11  PAY_RATE_TYPE_2017                  6219 no

In [81]:
full_turnover_2018.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8253 entries, 0 to 8520
Data columns (total 50 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   LAST_NAME                           8253 non-null   object 
 1   FIRST_NAME                          8253 non-null   object 
 2   PAY_RATE_TYPE_2017                  8253 non-null   object 
 3   PAY_RATE_2017                       8253 non-null   int64  
 4   TITLE_DESCRIPTION_2017              8253 non-null   object 
 5   HOME_ORGANIZATION                   8253 non-null   float64
 6   HOME_ORGANIZATION_DESCRIPTION_2017  8253 non-null   object 
 7   ORGANIZATION_LEVEL_2017             8253 non-null   object 
 8   TYPE_OF_REPRESENTATION_2017         8253 non-null   object 
 9   GENDER_2017                         8253 non-null   object 
 10  RUN_DATE_2017                       8253 non-null   object 
 11  PAY_RATE_TYPE_2018                  6566 no

In [82]:
full_turnover_2019.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7984 entries, 0 to 8251
Data columns (total 50 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   LAST_NAME                           7984 non-null   object 
 1   FIRST_NAME                          7984 non-null   object 
 2   PAY_RATE_TYPE_2018                  7984 non-null   object 
 3   PAY_RATE_2018                       7984 non-null   int64  
 4   TITLE_DESCRIPTION_2018              7984 non-null   object 
 5   HOME_ORGANIZATION                   7984 non-null   float64
 6   HOME_ORGANIZATION_DESCRIPTION_2018  7984 non-null   object 
 7   ORGANIZATION_LEVEL_2018             7984 non-null   object 
 8   TYPE_OF_REPRESENTATION_2018         7984 non-null   object 
 9   GENDER_2018                         7984 non-null   object 
 10  RUN_DATE_2018                       7984 non-null   object 
 11  PAY_RATE_TYPE_2019                  6450 no

We want to only keep teachers who had a salary in 2018, so we filter out the 57 teachers listed as having a salary equal to 0.

In [83]:
full_turnover_2017 = full_turnover_2017[full_turnover_2017['PAY_RATE_2016']>0]

In [84]:
full_turnover_2018 = full_turnover_2018[full_turnover_2018['PAY_RATE_2017']>0]

In [85]:
full_turnover_2019 = full_turnover_2019[full_turnover_2019['PAY_RATE_2018']>0]

In [86]:
full_turnover_2017.shape

(7902, 50)

In [87]:
full_turnover_2018.shape

(8193, 50)

In [88]:
full_turnover_2019.shape

(7927, 50)

We also want to create a dummy column that indicates whether a teacher is new to teaching or not using their salary information. New to teaching will be those teachers who are qualified for the Teacher Induction program:
>"Our Induction program provides one year of support that prioritizes skill development, personal reflection, and professional networking. Our model provides teachers with general pedagogical strategies that are designed for teachers in years 0-3." from https://www.philasd.org/teachingandlearning/professional-development/induction/

That means we will use cutoffs up to Step 3 from the Philadelphia Federation of Teachers' [Salary Schedule](https://jobs.philasd.org/wp-content/uploads/sites/47/2018/08/PFT-Salary-Schedules.pdf) from 2018 to indicate new to teaching. 


In [89]:
#create new column and set all values to 0
full_turnover_2017['NEW_TEACHER']=0

In [90]:
#create new column and set all values to 0
full_turnover_2018['NEW_TEACHER']=0

In [91]:
#create new column and set all values to 0
full_turnover_2019['NEW_TEACHER']=0

In [92]:
full_turnover_2017.head()

Unnamed: 0,LAST_NAME,FIRST_NAME,PAY_RATE_TYPE_2016,PAY_RATE_2016,TITLE_DESCRIPTION_2016,HOME_ORGANIZATION,HOME_ORGANIZATION_DESCRIPTION_2016,ORGANIZATION_LEVEL_2016,TYPE_OF_REPRESENTATION_2016,GENDER_2016,...,OSS Tier,Student Survey Climate Score,Student Survey Climate Tier,Parent Survey Climate Score,Parent Survey Climate Tier,Parent Survey Participation Score,Parent Survey Participation Tier,Teacher Attendance Score,Student Survey Teaching Score,NEW_TEACHER
1,ABBOTT,JOYCE,SALARIED,76461,"TEACHER,FULL TIME",1290.0,"HAMILTON, ANDREW SCHOOL",ELEMENTARY SCHOOL,PFT-TEACHER,F,...,REINFORCE,53,REINFORCE,82.0,MODEL,11.0,INTERVENE,55,55,0
2,ABDALLAH,JUWAYRIYAH,SALARIED,46694,"TEACHER,FULL TIME",1470.0,"LOCKE, ALAIN SCHOOL",ELEMENTARY SCHOOL,PFT-TEACHER,F,...,INTERVENE,55,REINFORCE,77.0,MODEL,12.0,INTERVENE,55,56,0
3,ABDEL-JALIL,GHADEER,SALARIED,45359,"TEACHER,FULL TIME",7440.0,"TAYLOR, BAYARD SCHOOL",ELEMENTARY SCHOOL,PFT-TEACHER,F,...,WATCH,49,WATCH,0.0,INTERVENE,2.0,INTERVENE,70,55,0
4,ABDUL BASIT,BARBARA,SALARIED,67706,"TEACHER,FULL TIME",7530.0,"ROWEN, WILLIAM SCHOOL",ELEMENTARY SCHOOL,PFT-TEACHER,F,...,REINFORCE,60,REINFORCE,0.0,INTERVENE,6.0,INTERVENE,68,69,0
5,ABDUL-LATEEF,VILLIA,SALARIED,48945,"TEACHER,FULL TIME",1010.0,"BARTRAM, JOHN HIGH SCHOOL",HIGH SCHOOL,PFT-TEACHER,F,...,INTERVENE,53,REINFORCE,0.0,INTERVENE,3.0,INTERVENE,56,45,0


In [93]:
full_turnover_2019.head()

Unnamed: 0,LAST_NAME,FIRST_NAME,PAY_RATE_TYPE_2018,PAY_RATE_2018,TITLE_DESCRIPTION_2018,HOME_ORGANIZATION,HOME_ORGANIZATION_DESCRIPTION_2018,ORGANIZATION_LEVEL_2018,TYPE_OF_REPRESENTATION_2018,GENDER_2018,...,OSS Tier,Student Survey Climate Score,Student Survey Climate Tier,Parent Survey Climate Score,Parent Survey Climate Tier,Parent Survey Participation Score,Parent Survey Participation Tier,Teacher Attendance Score,Student Survey Teaching Score,NEW_TEACHER
0,ABAYOMI-IGE,OLABIMPE,SALARIED,90051,"TEACHER,SPEC EDUCATION",5070.0,PARKWAY-NORTHWEST HIGH SCHOOL,HIGH SCHOOL,PFT-TEACHER,F,...,REINFORCE,48,WATCH,73.0,REINFORCE,18.0,INTERVENE,67,33,0
1,ABBOTT,JOYCE,SALARIED,76461,"TEACHER,FULL TIME",1290.0,"HAMILTON, ANDREW SCHOOL",ELEMENTARY SCHOOL,PFT-TEACHER,F,...,REINFORCE,0,INTERVENE,0.0,INTERVENE,1.0,INTERVENE,46,0,0
2,ABDUL-LATEEF,VILLIA,SALARIED,56531,"TEACHER,FULL TIME",1010.0,"BARTRAM, JOHN HIGH SCHOOL",HIGH SCHOOL,PFT-TEACHER,F,...,WATCH,52,REINFORCE,0.0,INTERVENE,2.0,INTERVENE,52,43,0
3,ABDULALEEM,MUHAMMAD,SALARIED,70564,"TEACHER,FULL TIME",6090.0,RANDOLPH TECHNICAL HIGH SCHOOL,CAREER AND TECHNICAL HIGH SCHL,PFT-TEACHER,M,...,REINFORCE,56,REINFORCE,0.0,INTERVENE,2.0,INTERVENE,64,36,0
4,ABDULLAH,AARON,SALARIED,46694,"TEACHER,FULL TIME",6360.0,ROOSEVELT ELEMENTARY SCHOOL,ELEMENTARY SCHOOL,PFT-TEACHER,M,...,INTERVENE,47,WATCH,63.0,REINFORCE,10.0,INTERVENE,58,51,0


In [94]:
full_turnover_2017['TYPE_OF_REPRESENTATION_2016'].value_counts()

PFT-TEACHER    7892
PFT- PRE K       10
Name: TYPE_OF_REPRESENTATION_2016, dtype: int64

In [95]:
full_turnover_2018['TYPE_OF_REPRESENTATION_2017'].value_counts()

PFT-TEACHER    8182
PFT- PRE K       11
Name: TYPE_OF_REPRESENTATION_2017, dtype: int64

In [96]:
full_turnover_2019['TYPE_OF_REPRESENTATION_2018'].value_counts()

PFT-TEACHER    7922
PFT- PRE K        5
Name: TYPE_OF_REPRESENTATION_2018, dtype: int64

In [97]:
conditions_2017 = [
    (full_turnover_2017['PAY_RATE_2016'] <= 53281)&(full_turnover_2017['TITLE_DESCRIPTION_2016'] == 'TEACHER,FULL TIME'),
    (full_turnover_2017['PAY_RATE_2016'] <= 54534)&(full_turnover_2017['TITLE_DESCRIPTION_2016'] == 'TEACHER,SPEC EDUCATION')
    ]
values = [1, 1]
full_turnover_2017['NEW_TEACHER'] = np.select(conditions_2017, values)

In [98]:
conditions_2018 = [
    (full_turnover_2018['PAY_RATE_2017'] <= 53281)&(full_turnover_2018['TITLE_DESCRIPTION_2017'] == 'TEACHER,FULL TIME'),
    (full_turnover_2018['PAY_RATE_2017'] <= 54534)&(full_turnover_2018['TITLE_DESCRIPTION_2017'] == 'TEACHER,SPEC EDUCATION')
    ]
values = [1, 1]
full_turnover_2018['NEW_TEACHER'] = np.select(conditions_2018, values)

In [99]:
conditions_2019 = [
    (full_turnover_2019['PAY_RATE_2018'] <= 53281)&(full_turnover_2019['TITLE_DESCRIPTION_2018'] == 'TEACHER,FULL TIME'),
    (full_turnover_2019['PAY_RATE_2018'] <= 54534)&(full_turnover_2019['TITLE_DESCRIPTION_2018'] == 'TEACHER,SPEC EDUCATION')
    ]
values = [1, 1]
full_turnover_2019['NEW_TEACHER'] = np.select(conditions_2019, values)

In [100]:
full_turnover_2017['NEW_TEACHER'].value_counts(normalize=True)

0    0.835864
1    0.164136
Name: NEW_TEACHER, dtype: float64

In [101]:
full_turnover_2018['NEW_TEACHER'].value_counts(normalize=True)

0    0.788234
1    0.211766
Name: NEW_TEACHER, dtype: float64

In [102]:
full_turnover_2019['NEW_TEACHER'].value_counts(normalize=True)

0    0.828561
1    0.171439
Name: NEW_TEACHER, dtype: float64

We will establish a `TURNOVER` column that will act as the target column for modeling purposes. To do this, if a record has non-null values from the 2019 dataset, then they will be marked as 0 indicating they were retained in their 2018 school. However, if a record has a null value in the 2019 dataset, they will be marked with a 1 indicating they turned over from their 2018 school.

In [103]:
full_turnover_2017['TURNOVER']=np.where(full_turnover_2017['PAY_RATE_2017'].isnull(), 1,0)

In [104]:
full_turnover_2018['TURNOVER']=np.where(full_turnover_2018['PAY_RATE_2018'].isnull(), 1,0)

In [105]:
full_turnover_2019['TURNOVER']=np.where(full_turnover_2019['PAY_RATE_2019'].isnull(), 1,0)

In [106]:
full_turnover_2017['TURNOVER'].value_counts(normalize=True)

0    0.783473
1    0.216527
Name: TURNOVER, dtype: float64

In [107]:
full_turnover_2018['TURNOVER'].value_counts(normalize=True)

0    0.797876
1    0.202124
Name: TURNOVER, dtype: float64

In [108]:
full_turnover_2019['TURNOVER'].value_counts(normalize=True)

0    0.810647
1    0.189353
Name: TURNOVER, dtype: float64

From this, we can see that nearly 20% of the public school teachers left their schools after the 2018 school year. This does indicate that there is a class imbalance at play that will need to be accounted for when modeling.

We still need to clean up some of the data before modeling. We can drop columns that will not be helpful for modeling, such as the teacher data from 2019 since it contains null values from those teachers that left their school.

In [109]:
full_turnover_2017.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7902 entries, 1 to 8280
Data columns (total 52 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   LAST_NAME                           7902 non-null   object 
 1   FIRST_NAME                          7902 non-null   object 
 2   PAY_RATE_TYPE_2016                  7902 non-null   object 
 3   PAY_RATE_2016                       7902 non-null   int64  
 4   TITLE_DESCRIPTION_2016              7902 non-null   object 
 5   HOME_ORGANIZATION                   7902 non-null   float64
 6   HOME_ORGANIZATION_DESCRIPTION_2016  7902 non-null   object 
 7   ORGANIZATION_LEVEL_2016             7902 non-null   object 
 8   TYPE_OF_REPRESENTATION_2016         7902 non-null   object 
 9   GENDER_2016                         7902 non-null   object 
 10  RUN_DATE_2016                       7902 non-null   object 
 11  PAY_RATE_TYPE_2017                  6191 no

In [110]:
full_turnover_2018.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8193 entries, 0 to 8520
Data columns (total 52 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   LAST_NAME                           8193 non-null   object 
 1   FIRST_NAME                          8193 non-null   object 
 2   PAY_RATE_TYPE_2017                  8193 non-null   object 
 3   PAY_RATE_2017                       8193 non-null   int64  
 4   TITLE_DESCRIPTION_2017              8193 non-null   object 
 5   HOME_ORGANIZATION                   8193 non-null   float64
 6   HOME_ORGANIZATION_DESCRIPTION_2017  8193 non-null   object 
 7   ORGANIZATION_LEVEL_2017             8193 non-null   object 
 8   TYPE_OF_REPRESENTATION_2017         8193 non-null   object 
 9   GENDER_2017                         8193 non-null   object 
 10  RUN_DATE_2017                       8193 non-null   object 
 11  PAY_RATE_TYPE_2018                  6537 no

In [111]:
full_turnover_2019.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7927 entries, 0 to 8251
Data columns (total 52 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   LAST_NAME                           7927 non-null   object 
 1   FIRST_NAME                          7927 non-null   object 
 2   PAY_RATE_TYPE_2018                  7927 non-null   object 
 3   PAY_RATE_2018                       7927 non-null   int64  
 4   TITLE_DESCRIPTION_2018              7927 non-null   object 
 5   HOME_ORGANIZATION                   7927 non-null   float64
 6   HOME_ORGANIZATION_DESCRIPTION_2018  7927 non-null   object 
 7   ORGANIZATION_LEVEL_2018             7927 non-null   object 
 8   TYPE_OF_REPRESENTATION_2018         7927 non-null   object 
 9   GENDER_2018                         7927 non-null   object 
 10  RUN_DATE_2018                       7927 non-null   object 
 11  PAY_RATE_TYPE_2019                  6426 no

In [112]:
cols_to_drop_2017 = ['PAY_RATE_TYPE_2017', 'PAY_RATE_2017','TITLE_DESCRIPTION_2017', 
                'HOME_ORGANIZATION_DESCRIPTION_2017', 'ORGANIZATION_LEVEL_2017',
                'TYPE_OF_REPRESENTATION_2017','GENDER_2017','RUN_DATE_2017', 'RUN_DATE_2016',
                'HOME_ORGANIZATION_DESCRIPTION_2016', 'PAY_RATE_TYPE_2016', 'TYPE_OF_REPRESENTATION_2016',
                'ULCS Code', 'Report']

In [113]:
cols_to_drop_2018 = ['PAY_RATE_TYPE_2018', 'PAY_RATE_2018','TITLE_DESCRIPTION_2018', 
                'HOME_ORGANIZATION_DESCRIPTION_2018', 'ORGANIZATION_LEVEL_2018',
                'TYPE_OF_REPRESENTATION_2018','GENDER_2018','RUN_DATE_2018', 'RUN_DATE_2017',
                'HOME_ORGANIZATION_DESCRIPTION_2017', 'PAY_RATE_TYPE_2017', 'TYPE_OF_REPRESENTATION_2017',
                'ULCS Code', 'Report']

In [114]:
cols_to_drop_2019 = ['PAY_RATE_TYPE_2019', 'PAY_RATE_2019','TITLE_DESCRIPTION_2019', 
                'HOME_ORGANIZATION_DESCRIPTION_2019', 'ORGANIZATION_LEVEL_2019',
                'TYPE_OF_REPRESENTATION_2019','GENDER_2019','RUN_DATE_2019', 'RUN_DATE_2018',
                'HOME_ORGANIZATION_DESCRIPTION_2018', 'PAY_RATE_TYPE_2018', 'TYPE_OF_REPRESENTATION_2018',
                'ULCS Code', 'Report']

In [115]:
full_turnover_2017.drop(cols_to_drop_2017, axis=1, inplace=True)

In [116]:
full_turnover_2018.drop(cols_to_drop_2018, axis=1, inplace=True)

In [117]:
full_turnover_2019.drop(cols_to_drop_2019, axis=1, inplace=True)

In [118]:
full_turnover_2017.head()

Unnamed: 0,LAST_NAME,FIRST_NAME,PAY_RATE_2016,TITLE_DESCRIPTION_2016,HOME_ORGANIZATION,ORGANIZATION_LEVEL_2016,GENDER_2016,School,Turnaround Model,Enrollment,...,Student Survey Climate Score,Student Survey Climate Tier,Parent Survey Climate Score,Parent Survey Climate Tier,Parent Survey Participation Score,Parent Survey Participation Tier,Teacher Attendance Score,Student Survey Teaching Score,NEW_TEACHER,TURNOVER
1,ABBOTT,JOYCE,76461,"TEACHER,FULL TIME",1290.0,ELEMENTARY SCHOOL,F,Andrew Hamilton School,,576.0,...,53,REINFORCE,82.0,MODEL,11.0,INTERVENE,55,55,0,0
2,ABDALLAH,JUWAYRIYAH,46694,"TEACHER,FULL TIME",1470.0,ELEMENTARY SCHOOL,F,Alain Locke School,,461.0,...,55,REINFORCE,77.0,MODEL,12.0,INTERVENE,55,56,1,1
3,ABDEL-JALIL,GHADEER,45359,"TEACHER,FULL TIME",7440.0,ELEMENTARY SCHOOL,F,Bayard Taylor School,,555.0,...,49,WATCH,0.0,INTERVENE,2.0,INTERVENE,70,55,1,1
4,ABDUL BASIT,BARBARA,67706,"TEACHER,FULL TIME",7530.0,ELEMENTARY SCHOOL,F,William Rowen School,,523.0,...,60,REINFORCE,0.0,INTERVENE,6.0,INTERVENE,68,69,0,1
5,ABDUL-LATEEF,VILLIA,48945,"TEACHER,FULL TIME",1010.0,HIGH SCHOOL,F,John Bartram High School,,710.0,...,53,REINFORCE,0.0,INTERVENE,3.0,INTERVENE,56,45,1,1


In [119]:
full_turnover_2018.head()

Unnamed: 0,LAST_NAME,FIRST_NAME,PAY_RATE_2017,TITLE_DESCRIPTION_2017,HOME_ORGANIZATION,ORGANIZATION_LEVEL_2017,GENDER_2017,School,Turnaround Model,Enrollment,...,Student Survey Climate Score,Student Survey Climate Tier,Parent Survey Climate Score,Parent Survey Climate Tier,Parent Survey Participation Score,Parent Survey Participation Tier,Teacher Attendance Score,Student Survey Teaching Score,NEW_TEACHER,TURNOVER
0,ABAYOMI-IGE,OLABIMPE,76461,"TEACHER,SPEC EDUCATION",5070.0,HIGH SCHOOL,F,Parkway Northwest High School,,246.0,...,60,REINFORCE,0.0,INTERVENE,3.0,INTERVENE,69,45,0,0
1,ABBOTT,JOYCE,76461,"TEACHER,FULL TIME",1290.0,ELEMENTARY SCHOOL,F,Andrew Hamilton School,,587.0,...,54,REINFORCE,80.0,MODEL,10.0,INTERVENE,42,60,0,0
2,ABDALLAH,JUWAYRIYAH,46694,"TEACHER,FULL TIME",1440.0,ELEMENTARY SCHOOL,F,Penrose School,,590.0,...,57,REINFORCE,0.0,INTERVENE,2.0,INTERVENE,51,51,1,1
3,ABDUL-LATEEF,VILLIA,48945,"TEACHER,FULL TIME",1030.0,HIGH SCHOOL,F,High School of the Future,,476.0,...,0,INTERVENE,0.0,INTERVENE,3.0,INTERVENE,50,0,1,1
4,ABDUL-WAKEEL,AMIRA,49615,"TEACHER,FULL TIME",6310.0,ELEMENTARY SCHOOL,F,John F. McCloskey School,,521.0,...,0,INTERVENE,0.0,INTERVENE,6.0,INTERVENE,59,0,1,1


In [120]:
full_turnover_2019.head()

Unnamed: 0,LAST_NAME,FIRST_NAME,PAY_RATE_2018,TITLE_DESCRIPTION_2018,HOME_ORGANIZATION,ORGANIZATION_LEVEL_2018,GENDER_2018,School,Turnaround Model,Enrollment,...,Student Survey Climate Score,Student Survey Climate Tier,Parent Survey Climate Score,Parent Survey Climate Tier,Parent Survey Participation Score,Parent Survey Participation Tier,Teacher Attendance Score,Student Survey Teaching Score,NEW_TEACHER,TURNOVER
0,ABAYOMI-IGE,OLABIMPE,90051,"TEACHER,SPEC EDUCATION",5070.0,HIGH SCHOOL,F,Parkway Northwest High School,,260.0,...,48,WATCH,73.0,REINFORCE,18.0,INTERVENE,67,33,0,0
1,ABBOTT,JOYCE,76461,"TEACHER,FULL TIME",1290.0,ELEMENTARY SCHOOL,F,Andrew Hamilton School,,532.0,...,0,INTERVENE,0.0,INTERVENE,1.0,INTERVENE,46,0,0,0
2,ABDUL-LATEEF,VILLIA,56531,"TEACHER,FULL TIME",1010.0,HIGH SCHOOL,F,John Bartram High School,,561.0,...,52,REINFORCE,0.0,INTERVENE,2.0,INTERVENE,52,43,0,1
3,ABDULALEEM,MUHAMMAD,70564,"TEACHER,FULL TIME",6090.0,CAREER AND TECHNICAL HIGH SCHL,M,A. Philip Randolph Career and Technical High S...,,493.0,...,56,REINFORCE,0.0,INTERVENE,2.0,INTERVENE,64,36,0,0
4,ABDULLAH,AARON,46694,"TEACHER,FULL TIME",6360.0,ELEMENTARY SCHOOL,M,Theodore Roosevelt School,Turnaround,525.0,...,47,WATCH,63.0,REINFORCE,10.0,INTERVENE,58,51,1,0


#### Putting it Together

In [None]:
full_turnover_2017

In [121]:
df = pd.DataFrame(np.vstack([full_turnover_2017, full_turnover_2018, full_turnover_2019]), 
                  columns=full_turnover_2017.columns)


In [122]:
df.head()

Unnamed: 0,LAST_NAME,FIRST_NAME,PAY_RATE_2016,TITLE_DESCRIPTION_2016,HOME_ORGANIZATION,ORGANIZATION_LEVEL_2016,GENDER_2016,School,Turnaround Model,Enrollment,...,Student Survey Climate Score,Student Survey Climate Tier,Parent Survey Climate Score,Parent Survey Climate Tier,Parent Survey Participation Score,Parent Survey Participation Tier,Teacher Attendance Score,Student Survey Teaching Score,NEW_TEACHER,TURNOVER
0,ABBOTT,JOYCE,76461,"TEACHER,FULL TIME",1290,ELEMENTARY SCHOOL,F,Andrew Hamilton School,,576,...,53,REINFORCE,82,MODEL,11,INTERVENE,55,55,0,0
1,ABDALLAH,JUWAYRIYAH,46694,"TEACHER,FULL TIME",1470,ELEMENTARY SCHOOL,F,Alain Locke School,,461,...,55,REINFORCE,77,MODEL,12,INTERVENE,55,56,1,1
2,ABDEL-JALIL,GHADEER,45359,"TEACHER,FULL TIME",7440,ELEMENTARY SCHOOL,F,Bayard Taylor School,,555,...,49,WATCH,0,INTERVENE,2,INTERVENE,70,55,1,1
3,ABDUL BASIT,BARBARA,67706,"TEACHER,FULL TIME",7530,ELEMENTARY SCHOOL,F,William Rowen School,,523,...,60,REINFORCE,0,INTERVENE,6,INTERVENE,68,69,0,1
4,ABDUL-LATEEF,VILLIA,48945,"TEACHER,FULL TIME",1010,HIGH SCHOOL,F,John Bartram High School,,710,...,53,REINFORCE,0,INTERVENE,3,INTERVENE,56,45,1,1


In [125]:
df.columns

Index(['LAST_NAME', 'FIRST_NAME', 'PAY_RATE_2016', 'TITLE_DESCRIPTION_2016',
       'HOME_ORGANIZATION', 'ORGANIZATION_LEVEL_2016', 'GENDER_2016', 'School',
       'Turnaround Model', 'Enrollment', 'Grades Served', 'Admissions Type',
       'Overall Score', 'Overall Tier', 'Ach Score', 'Ach Tier', 'Prog Score',
       'Prog Tier', 'Clim Score', 'Clim Tier', 'Attendance (95%+) Score',
       'Attendance (95%+) Tier', 'Retention Score', 'Retention Tier',
       'ISS Score', 'ISS Tier', 'OSS Score', 'OSS Tier',
       'Student Survey Climate Score', 'Student Survey Climate Tier',
       'Parent Survey Climate Score', 'Parent Survey Climate Tier',
       'Parent Survey Participation Score', 'Parent Survey Participation Tier',
       'Teacher Attendance Score', 'Student Survey Teaching Score',
       'NEW_TEACHER', 'TURNOVER'],
      dtype='object')

In [126]:
df.rename(columns={'PAY_RATE_2016': 'PAY_RATE',
                   'TITLE_DESCRIPTION_2016': 'TITLE_DESCRIPTION',
                   'ORGANIZATION_LEVEL_2016': 'ORGANIZATION_LEVEL',
                   'GENDER_2016': 'GENDER'}, inplace=True)


In [128]:
df['TURNOVER'].value_counts(normalize=True)

0    0.797352
1    0.202648
Name: TURNOVER, dtype: float64