## Policing in Schools
A look at whether school policing can predict graduation rates.

By Onel Abreu, Alexander Roche, Sabrina Sedovic

In [132]:
import pandas as pd
import pipeline as pipeline

school_data = pd.read_csv("school_data.csv")
school_data.head(10)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0_x,Unnamed: 0.1.1,Year,Dept/Unit Number,Job Title_Chief Safety & Security OFF,Job Title_Flex Team Security Officer,Job Title_Safety And Security Off,Job Title_School Security Officer,...,# of Unique Students Receiving OSS,% of Unique Students Receiving OSS,Average Length of OSS,# of Police Notifications,% of Misconducts Resulting in a Police Notification,Police Notifications per 100 Students,# of Unique Students Receiving Police Notification,% of Unique Students Receiving Police Notification,# of Students Expelled,Expulsions per 100 Students
0,0,0,1,384,2011,24101,0,0,0,2,...,,,,,,,,,,
1,1,1,2,898,2012,24101,0,0,0,2,...,5.0,1.6,3.4,1.0,9.1,0.32,1.0,0.3,0.0,0.0
2,2,2,3,1417,2013,24101,0,0,0,2,...,12.0,4.0,1.43,2.0,6.9,0.67,2.0,0.7,0.0,0.0
3,3,3,4,1969,2014,24101,0,0,0,2,...,5.0,1.8,1.8,5.0,38.5,1.78,5.0,1.8,0.0,0.0
4,4,4,5,2489,2015,24101,0,0,0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,5,5,10,450,2011,25151,0,0,0,2,...,,,,,,,,,,
6,6,6,11,965,2012,25151,0,0,0,2,...,31.0,3.4,2.0,1.0,1.0,0.11,1.0,0.1,0.0,0.0
7,7,7,12,1489,2013,25151,0,0,0,2,...,41.0,4.4,2.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,8,8,13,2041,2014,25151,0,0,0,2,...,18.0,1.9,2.42,1.0,1.6,0.11,1.0,0.1,0.0,0.0
9,9,9,14,2559,2015,25151,0,0,0,2,...,10.0,1.0,2.91,1.0,2.4,0.1,1.0,0.1,0.0,0.0


In [133]:
school_data = school_data.drop(columns = ["Unnamed: 0", "Unnamed: 0.1", "Unnamed: 0_x", "Unnamed: 0.1.1",
                                          "Dept/Unit Number", "Job Title_Chief Safety & Security OFF",
                                          "Job Title_Flex Team Security Officer",
                                          "Job Title_Career Counseling Manager",
                                          "Job Title_Director of Counseling",
                                          "Job Title_Director, School Counseling & Post Secondary Advising",
                                          "Job Title_HS Counseling Specialist", 
                                          "Job Title_K‐8 Counseling Specialist",
                                          "Job Title_Manager-Secdry Schl Counselng",
                                          "pos_name_0", "Unit_Number", "Notes", "Unnamed: 26", 
                                          "Unit Number", "Unnamed: 0_y", "Unit", "School_x", 
                                          "School Name_y", "School_y", "Total_y", 
                                          "School Year",'Status as of 2014','Status as of 2019'])

### Data Cleaning

In [134]:
indices = school_data[school_data["Expulsions per 100 Students"].isna() & (school_data["Year"] != 2011)].index
school_data.drop(indices, inplace=True)

 ### Initial Data Analysis

In [None]:
%load_ext autoreload
%autoreload 2
import yearsplit

In [135]:
school_data["Year"] = pd.to_datetime(school_data["Year"], format='%Y')

TimeBasedCV = yearsplit.TimeBasedCV()
splits = TimeBasedCV.split(school_data,  date_column="Year")

Train period: 2011-01-01 - 2012-05-01 , Test period 2012-11-01 - 2013-07-01 # train records 177 , # test records 80
Train period: 2011-09-01 - 2013-01-01 , Test period 2013-07-01 - 2014-03-01 # train records 83 , # test records 93
Train period: 2012-05-01 - 2013-09-01 , Test period 2014-03-01 - 2014-11-01 # train records 80 , # test records 0
Train period: 2013-01-01 - 2014-05-01 , Test period 2014-11-01 - 2015-07-01 # train records 173 , # test records 84


In [136]:
all_train = school_data.loc[splits[0][0] + splits[1][0] + splits[2][0] + splits[3][0], :]

test_1 = school_data.loc[splits[0][1], :]
test_2 = school_data.loc[splits[1][1], :]
test_3 = school_data.loc[splits[3][1], :]

all_train = pd.concat([train_1, train_2, train_3])
suspension_cols = ['# of Misconducts', '# of Group 1-2 Misconducts','# of Group 3-4 Misconducts', '# of Group 5-6 Misconducts',
       '# of Suspensions (includes ISS and OSS)','% of Misconducts Resulting in a Suspension\n(includes ISS and OSS)',
       '# of ISS', '% of Misconducts Resulting in an ISS', 'ISS per 100 Students', '# of Unique Students Receiving ISS',
       '% of Unique Students Receiving ISS', 'Average Length of ISS', '# of OSS', 
       '% of Misconducts Resulting in an OSS', 'OSS per 100 Students', '# of Unique Students Receiving OSS',
       '% of Unique Students Receiving OSS', 'Average Length of OSS', '# of Police Notifications',
       '% of Misconducts Resulting in a Police Notification', 'Police Notifications per 100 Students',
       '# of Unique Students Receiving Police Notification','% of Unique Students Receiving Police Notification',
       '# of Students Expelled', 'Expulsions per 100 Students']
all_train[suspension_cols] = all_train.groupby("School ID")[suspension_cols].transform(lambda x: x.fillna(x.median()))

train_1 = all_train.loc[splits[0][0], :]
train_2 = all_train.loc[splits[1][0] + splits[2][0], :]
train_3 = all_train.loc[splits[3][0], :]


In [137]:
test_1[test_1.isna().any(axis=1)][['Attendance', '5YR Grad Rate']]


Unnamed: 0,Attendance,5YR Grad Rate
2,94.0,
7,95.4,
12,95.8,
22,90.7,


In [138]:
test_2[test_2.isna().any(axis=1)][['Attendance', '5YR Grad Rate']]


Unnamed: 0,Attendance,5YR Grad Rate
3,94.0,
8,96.2,
13,96.1,
23,90.3,


In [139]:
test_3[test_3.isna().any(axis=1)][['Attendance', '5YR Grad Rate']]


Unnamed: 0,Attendance,5YR Grad Rate
4,95.3,
9,96.5,
14,96.1,
24,92.0,


In [140]:
train_1[train_1[['Attendance', '5YR Grad Rate']].isna().any(axis=1)][['Attendance', '5YR Grad Rate']]

Unnamed: 0,Attendance,5YR Grad Rate
0,94.8,
1,94.9,
1,94.9,
1,94.9,
1,94.9,
...,...,...
21,91.7,
105,,
232,,
258,,50.7


In [141]:
train_2[train_2.isna().any(axis=1)][['Attendance', '5YR Grad Rate']]

Unnamed: 0,Attendance,5YR Grad Rate
1,94.9,
1,94.9,
1,94.9,
1,94.9,
1,94.9,
...,...,...
22,90.7,
22,90.7,
22,90.7,
22,90.7,


### Pipeline

### Evaluation