## Policing in Schools
A look at whether school policing can predict graduation rates.

By Onel Abreu, Alexander Roche, Sabrina Sedovic

In [233]:
import pandas as pd

# Read the prepared school table from the working directory and preview it.
# The file name is relative, so the notebook must be run next to the CSV.
SCHOOL_DATA_CSV = "school_data.csv"
school_data = pd.read_csv(SCHOOL_DATA_CSV)

# Last expression of the cell: show the first ten rows.
school_data.head(n=10)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0_x,Unnamed: 0.1.1,Year,Dept/Unit Number,Job Title_Chief Safety & Security OFF,Job Title_Flex Team Security Officer,Job Title_Safety And Security Off,Job Title_School Security Officer,...,# of Unique Students Receiving OSS,% of Unique Students Receiving OSS,Average Length of OSS,# of Police Notifications,% of Misconducts Resulting in a Police Notification,Police Notifications per 100 Students,# of Unique Students Receiving Police Notification,% of Unique Students Receiving Police Notification,# of Students Expelled,Expulsions per 100 Students
0,0,0,1,384,2011,24101,0,0,0,2,...,,,,,,,,,,
1,1,1,2,898,2012,24101,0,0,0,2,...,5.0,1.6,3.4,1.0,9.1,0.32,1.0,0.3,0.0,0.0
2,2,2,3,1417,2013,24101,0,0,0,2,...,12.0,4.0,1.43,2.0,6.9,0.67,2.0,0.7,0.0,0.0
3,3,3,4,1969,2014,24101,0,0,0,2,...,5.0,1.8,1.8,5.0,38.5,1.78,5.0,1.8,0.0,0.0
4,4,4,5,2489,2015,24101,0,0,0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,5,5,10,450,2011,25151,0,0,0,2,...,,,,,,,,,,
6,6,6,11,965,2012,25151,0,0,0,2,...,31.0,3.4,2.0,1.0,1.0,0.11,1.0,0.1,0.0,0.0
7,7,7,12,1489,2013,25151,0,0,0,2,...,41.0,4.4,2.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,8,8,13,2041,2014,25151,0,0,0,2,...,18.0,1.9,2.42,1.0,1.6,0.11,1.0,0.1,0.0,0.0
9,9,9,14,2559,2015,25151,0,0,0,2,...,10.0,1.0,2.91,1.0,2.4,0.1,1.0,0.1,0.0,0.0


In [234]:
# Remove columns that are not used in the analysis. `drop` raises a KeyError if
# any label below is absent, so every name must match the CSV header exactly
# (note that the "K‐8" job title uses a Unicode hyphen, U+2010).
# NOTE(review): the preview of the raw frame also lists an "Unnamed: 0.2"
# column that is not removed here -- confirm whether it is a real column and
# whether it should be dropped as well.
school_data = school_data.drop(
    columns=[
        # "Unnamed" columns (presumably saved row indices -- TODO confirm)
        "Unnamed: 0",
        "Unnamed: 0.1",
        "Unnamed: 0_x",
        "Unnamed: 0.1.1",
        "Unnamed: 26",
        "Unnamed: 0_y",
        # unit identifiers
        "Dept/Unit Number",
        "Unit_Number",
        "Unit Number",
        "Unit",
        # job-title columns excluded from the analysis
        "Job Title_Chief Safety & Security OFF",
        "Job Title_Flex Team Security Officer",
        "Job Title_Career Counseling Manager",
        "Job Title_Director of Counseling",
        "Job Title_Director, School Counseling & Post Secondary Advising",
        "Job Title_HS Counseling Specialist",
        "Job Title_K‐8 Counseling Specialist",
        "Job Title_Manager-Secdry Schl Counselng",
        # remaining school-level and bookkeeping columns
        "pos_name_0",
        "Notes",
        "School_x",
        "School Name_y",
        "School_y",
        "Total_y",
        "School Year",
        "Status as of 2014",
        "Status as of 2019",
    ]
)

### Data Cleaning

In [235]:
# Rows lacking "Expulsions per 100 Students" are discarded, except rows for
# 2011, which are kept even when that value is missing.
no_expulsion_rate = school_data["Expulsions per 100 Students"].isna()
not_2011 = school_data["Year"] != 2011
indices = school_data[no_expulsion_rate & not_2011].index

school_data = (
    school_data
    .drop(index=indices)
    # Remove rows whose graduation rate is the single-space string ' ' ...
    .loc[lambda frame: frame["5YR Grad Rate"] != ' ']
    # ... and rows where it is missing altogether.
    .dropna(subset=['5YR Grad Rate'])
    # Renumber the rows from 0; the old labels come back as an "index" column,
    # which is thrown away immediately.
    .reset_index()
    .drop(columns="index")
    # The target must be numeric for the regression further down.
    .astype({'5YR Grad Rate': float})
)

### Initial Data Analysis

In [253]:
%load_ext autoreload
%autoreload 2
import yearsplit
import pipeline as pipeline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [237]:
# Turn the four-digit year into a timestamp; the split log prints its train and
# test periods as dates, so the date column is handed over in that form.
school_data = school_data.assign(
    Year=pd.to_datetime(school_data["Year"], format='%Y')
)

# Note: the splitter *instance* is bound to the name `TimeBasedCV`.
TimeBasedCV = yearsplit.TimeBasedCV()
# `splits` is indexed below as splits[fold][0] (train row labels) and
# splits[fold][1] (test row labels).
splits = TimeBasedCV.split(school_data, date_column="Year")

Train period: 2011-01-01 - 2012-01-01 , Test period 2012-01-01 - 2013-01-01 # train records 78 , # test records 74
Train period: 2012-01-01 - 2013-01-01 , Test period 2013-01-01 - 2014-01-01 # train records 74 , # test records 71
Train period: 2013-01-01 - 2014-01-01 , Test period 2014-01-01 - 2015-01-01 # train records 71 , # test records 88


In [238]:
# All training rows of the three folds, concatenated in fold order.
all_train = school_data.loc[
    [label for fold in range(3) for label in splits[fold][0]], :
]

# Snapshot the test rows BEFORE the imputation below. Per the split log, the
# 2012 and 2013 test periods are also training periods of later folds, so these
# copies keep their missing values while `school_data` itself gets filled in.
test_1, test_2, test_3 = (
    school_data.loc[splits[fold][1], :] for fold in range(3)
)

# Discipline-related columns whose gaps are imputed.
suspension_cols = [
    '# of Misconducts',
    '# of Group 1-2 Misconducts',
    '# of Group 3-4 Misconducts',
    '# of Group 5-6 Misconducts',
    '# of Suspensions (includes ISS and OSS)',
    '% of Misconducts Resulting in a Suspension\n(includes ISS and OSS)',
    '# of ISS',
    '% of Misconducts Resulting in an ISS',
    'ISS per 100 Students',
    '# of Unique Students Receiving ISS',
    '% of Unique Students Receiving ISS',
    'Average Length of ISS',
    '# of OSS',
    '% of Misconducts Resulting in an OSS',
    'OSS per 100 Students',
    '# of Unique Students Receiving OSS',
    '% of Unique Students Receiving OSS',
    'Average Length of OSS',
    '# of Police Notifications',
    '% of Misconducts Resulting in a Police Notification',
    'Police Notifications per 100 Students',
    '# of Unique Students Receiving Police Notification',
    '% of Unique Students Receiving Police Notification',
    '# of Students Expelled',
    'Expulsions per 100 Students',
]

# Fill each gap with the median of the same column for the same school, taken
# over that school's training rows. A school with no observed value in a column
# keeps its NaN there.
# NOTE(review): the median pools every training year of a school, so a 2011 row
# can be filled from that school's 2012-2013 values -- confirm this is intended.
school_data.loc[all_train.index, suspension_cols] = (
    all_train
    .groupby("School ID")[suspension_cols]
    .transform(lambda column: column.fillna(column.median()))
)

# Training frames are taken AFTER the imputation so that they see filled values.
train_1, train_2, train_3 = (
    school_data.loc[splits[fold][0], :] for fold in range(3)
)

In [239]:
# Rebuild the splits so that they only reference rows without any missing value.
#
# For every fold, the labels of incomplete rows are collected from the train and
# test frames and removed from the corresponding label lists in `splits`.
# The result has the same layout as `splits`: a tuple of
# (train_labels, test_labels) pairs, with the original label order preserved.
#
# The previous version compared `school_data.loc[x].name` with
# `school_data.iloc[y].name` for every pair of labels. Because the index was
# reset to 0..n-1 during cleaning, label and position coincide, so that test
# reduces to plain label equality; a set lookup gives the same answer without
# mixing label-based and position-based access and without building a row
# object for every comparison.
new_splits = []
for n, (train, test) in enumerate([(train_1, test_1), (train_2, test_2), (train_3, test_3)]):
    # Labels of rows that contain at least one NaN in any column.
    incomplete_train = set(train[train.isna().any(axis=1)].index)
    incomplete_test = set(test[test.isna().any(axis=1)].index)

    split_train = [label for label in splits[n][0] if label not in incomplete_train]
    split_test = [label for label in splits[n][1] if label not in incomplete_test]
    new_splits.append((split_train, split_test))
new_splits = tuple(new_splits)

In [240]:
# Every row label that survived the NaN filter -- training AND test labels of
# all three folds. Despite its name, `all_train` therefore also holds test rows.
all_train = school_data.loc[new_splits[0][0] + new_splits[1][0] + new_splits[2][0] +
                             new_splits[0][1] + new_splits[1][1] + new_splits[2][1], :]

# Training frames restricted to complete rows.
train_1 = school_data.loc[new_splits[0][0], :]
train_2 = school_data.loc[new_splits[1][0], :]
train_3 = school_data.loc[new_splits[2][0], :]

# Apply the cleaned test labels as well. They were computed from these very
# frames but never used to filter them, so the test frames handed to the model
# search could still contain incomplete rows. Selecting from the existing
# frames keeps their un-imputed values.
test_1 = test_1.loc[new_splits[0][1], :]
test_2 = test_2.loc[new_splits[1][1], :]
test_3 = test_3.loc[new_splits[2][1], :]

# Sanity check, placed last so the notebook actually displays it: any retained
# row that still has a missing value. Expected to be empty.
all_train[all_train.isna().any(axis=1)]

### Pipeline

In [275]:
from sklearn.model_selection import GridSearchCV

from sklearn import linear_model
from sklearn import metrics
from sklearn.metrics import make_scorer
from random import randint, uniform

# Candidate estimators, keyed by a short name.
models = {'lr': linear_model.LinearRegression()}

# One parameter dict per combination, in the order
# (True, True), (True, False), (False, True), (False, False).
# NOTE(review): the recorded scores are identical for copy_X=True and
# copy_X=False within each fold -- confirm whether it belongs in the grid.
params = {
    'lr': [
        dict(fit_intercept=with_intercept, copy_X=copy_inputs)
        for with_intercept in (True, False)
        for copy_inputs in (True, False)
    ]
}

# Built here but not handed to pipeline.gridsearch in this cell.
scoring = {'MSE': make_scorer(metrics.mean_squared_error)}

# Run every model/parameter combination on the three (train, test) folds with
# '5YR Grad Rate' as the target; the returned value is the cell's output.
pipeline.gridsearch(
    (
        (train_1, test_1),
        (train_2, test_2),
        (train_3, test_3),
    ),
    models,
    params,
    '5YR Grad Rate',
)

Training model: lr | {'fit_intercept': True, 'copy_X': True} 0
{'lr': {0: {0: 1124.5549240443377}}}
Training model: lr | {'fit_intercept': True, 'copy_X': False} 0
{'lr': {0: {0: 1124.5549240443377, 1: 1124.5549240443377}}}
Training model: lr | {'fit_intercept': False, 'copy_X': True} 0
{'lr': {0: {0: 1124.5549240443377, 1: 1124.5549240443377, 2: 1235.2138498516906}}}
Training model: lr | {'fit_intercept': False, 'copy_X': False} 0
{'lr': {0: {0: 1124.5549240443377, 1: 1124.5549240443377, 2: 1235.2138498516906, 3: 1235.2138498516906}}}
Training model: lr | {'fit_intercept': True, 'copy_X': True} 1
{'lr': {0: {0: 1124.5549240443377, 1: 1124.5549240443377, 2: 1235.2138498516906, 3: 1235.2138498516906}, 1: {0: 2.3013600852985204e+22}}}
Training model: lr | {'fit_intercept': True, 'copy_X': False} 1
{'lr': {0: {0: 1124.5549240443377, 1: 1124.5549240443377, 2: 1235.2138498516906, 3: 1235.2138498516906}, 1: {0: 2.3013600852985204e+22, 1: 2.3013600852985204e+22}}}
Training model: lr | {'fit_i

{'lr': {0: {0: 1124.5549240443377,
   1: 1124.5549240443377,
   2: 1235.2138498516906,
   3: 1235.2138498516906},
  1: {0: 2.3013600852985204e+22,
   1: 2.3013600852985204e+22,
   2: 2.9533571514515974e+17,
   3: 2.9533571514515974e+17},
  2: {0: 11522.276879726676,
   1: 11522.276879726676,
   2: 1073.013322071283,
   3: 1073.013322071283}}}

### Evaluation

In [256]:
# Fit a plain least-squares model on the first training fold.
lr = linear_model.LinearRegression()

# Features: everything except the date, the school name and the target itself.
features_1 = train_1.drop(columns=['Year', 'School Name_x', '5YR Grad Rate'])
target_1 = train_1['5YR Grad Rate']

# `fit` returns the estimator, which the notebook displays as the cell output.
lr.fit(features_1, target_1)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)