## Policing in Schools
A look at whether school policing can predict graduation rates.

By Onel Abreu, Alexander Roche, Sabrina Sedovic

In [47]:
import pandas as pd
import pipeline as pipeline

school_data = pd.read_csv("school_data.csv")
school_data.head(10)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0_x,Unnamed: 0.1.1,Year,Dept/Unit Number,Job Title_Chief Safety & Security OFF,Job Title_Flex Team Security Officer,Job Title_Safety And Security Off,Job Title_School Security Officer,...,# of Unique Students Receiving OSS,% of Unique Students Receiving OSS,Average Length of OSS,# of Police Notifications,% of Misconducts Resulting in a Police Notification,Police Notifications per 100 Students,# of Unique Students Receiving Police Notification,% of Unique Students Receiving Police Notification,# of Students Expelled,Expulsions per 100 Students
0,0,0,1,384,2011,24101,0,0,0,2,...,,,,,,,,,,
1,1,1,2,898,2012,24101,0,0,0,2,...,5.0,1.6,3.4,1.0,9.1,0.32,1.0,0.3,0.0,0.0
2,2,2,3,1417,2013,24101,0,0,0,2,...,12.0,4.0,1.43,2.0,6.9,0.67,2.0,0.7,0.0,0.0
3,3,3,4,1969,2014,24101,0,0,0,2,...,5.0,1.8,1.8,5.0,38.5,1.78,5.0,1.8,0.0,0.0
4,4,4,5,2489,2015,24101,0,0,0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,5,5,10,450,2011,25151,0,0,0,2,...,,,,,,,,,,
6,6,6,11,965,2012,25151,0,0,0,2,...,31.0,3.4,2.0,1.0,1.0,0.11,1.0,0.1,0.0,0.0
7,7,7,12,1489,2013,25151,0,0,0,2,...,41.0,4.4,2.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,8,8,13,2041,2014,25151,0,0,0,2,...,18.0,1.9,2.42,1.0,1.6,0.11,1.0,0.1,0.0,0.0
9,9,9,14,2559,2015,25151,0,0,0,2,...,10.0,1.0,2.91,1.0,2.4,0.1,1.0,0.1,0.0,0.0


In [48]:
school_data = school_data.drop(columns = ["Unnamed: 0", "Unnamed: 0.1", "Unnamed: 0_x", "Unnamed: 0.1.1",
                                          "Dept/Unit Number", "Job Title_Chief Safety & Security OFF",
                                          "Job Title_Flex Team Security Officer",
                                          "Job Title_Career Counseling Manager",
                                          "Job Title_Director of Counseling",
                                          "Job Title_Director, School Counseling & Post Secondary Advising",
                                          "Job Title_HS Counseling Specialist", 
                                          "Job Title_K‐8 Counseling Specialist",
                                          "Job Title_Manager-Secdry Schl Counselng",
                                          "pos_name_0", "Unit_Number", "Notes", "Unnamed: 26", 
                                          "Unit Number", "Unnamed: 0_y", "Unit", "School_x", 
                                          "School Name_y", "School_y", "Total_y", 
                                          "School Year",'Status as of 2014','Status as of 2019'])

### Data Cleaning

In [57]:
indices = school_data[school_data["Expulsions per 100 Students"].isna() & (school_data["Year"] != 2011)].index
school_data.drop(indices, inplace=True)
school_data.reset_index(inplace=True)

 ### Initial Data Analysis

In [51]:
%load_ext autoreload
%autoreload 2
import yearsplit

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [112]:
school_data["Year"] = pd.to_datetime(school_data["Year"], format='%Y')

TimeBasedCV = yearsplit.TimeBasedCV()
splits = TimeBasedCV.split(school_data,  date_column="Year")

Train period: 2012-01-01 - 2013-01-01 , Test period 2013-01-01 - 2014-01-01 # train records 83 , # test records 80
Train period: 2013-01-01 - 2014-01-01 , Test period 2014-01-01 - 2015-01-01 # train records 80 , # test records 93


In [113]:
splits

[([0,
   4,
   8,
   12,
   16,
   20,
   24,
   30,
   34,
   39,
   43,
   48,
   52,
   56,
   60,
   64,
   69,
   73,
   77,
   81,
   85,
   89,
   93,
   101,
   105,
   109,
   113,
   114,
   115,
   116,
   118,
   122,
   123,
   130,
   131,
   135,
   138,
   139,
   143,
   147,
   151,
   156,
   160,
   164,
   168,
   172,
   176,
   179,
   183,
   187,
   191,
   195,
   199,
   203,
   207,
   211,
   215,
   219,
   223,
   227,
   231,
   235,
   239,
   243,
   247,
   252,
   256,
   261,
   265,
   269,
   273,
   277,
   281,
   285,
   289,
   296,
   300,
   304,
   308,
   311,
   315,
   321,
   325],
  [1,
   5,
   9,
   13,
   17,
   21,
   25,
   31,
   35,
   40,
   44,
   49,
   53,
   57,
   61,
   65,
   70,
   74,
   78,
   82,
   86,
   90,
   94,
   97,
   102,
   106,
   110,
   117,
   119,
   124,
   125,
   132,
   140,
   144,
   148,
   152,
   157,
   161,
   165,
   169,
   173,
   177,
   180,
   184,
   188,
   192,
   196,
   200,
   2

In [116]:
all_train = school_data.iloc[splits[0][0] + splits[1][0], :]

test_1 = school_data.loc[splits[0][1], :]
test_2 = school_data.loc[splits[1][1], :]

suspension_cols = ['# of Misconducts', '# of Group 1-2 Misconducts','# of Group 3-4 Misconducts', '# of Group 5-6 Misconducts',
       '# of Suspensions (includes ISS and OSS)','% of Misconducts Resulting in a Suspension\n(includes ISS and OSS)',
       '# of ISS', '% of Misconducts Resulting in an ISS', 'ISS per 100 Students', '# of Unique Students Receiving ISS',
       '% of Unique Students Receiving ISS', 'Average Length of ISS', '# of OSS', 
       '% of Misconducts Resulting in an OSS', 'OSS per 100 Students', '# of Unique Students Receiving OSS',
       '% of Unique Students Receiving OSS', 'Average Length of OSS', '# of Police Notifications',
       '% of Misconducts Resulting in a Police Notification', 'Police Notifications per 100 Students',
       '# of Unique Students Receiving Police Notification','% of Unique Students Receiving Police Notification',
       '# of Students Expelled', 'Expulsions per 100 Students']
all_train[suspension_cols] = all_train.groupby("School ID")[suspension_cols].transform(lambda x: x.fillna(x.median()))

train_1 = all_train.loc[splits[0][0], :]
train_2 = all_train.loc[splits[1][0], :]


IndexError: list index out of range

In [55]:
test_1[test_1.isna().any(axis=1)][['Attendance', '5YR Grad Rate']]


Unnamed: 0,Attendance,5YR Grad Rate
2,94.0,
7,95.4,
12,95.8,
22,90.7,


In [138]:
test_2[test_2.isna().any(axis=1)][['Attendance', '5YR Grad Rate']]


Unnamed: 0,Attendance,5YR Grad Rate
3,94.0,
8,96.2,
13,96.1,
23,90.3,


In [139]:
test_3[test_3.isna().any(axis=1)][['Attendance', '5YR Grad Rate']]


Unnamed: 0,Attendance,5YR Grad Rate
4,95.3,
9,96.5,
14,96.1,
24,92.0,


In [140]:
train_1[train_1[['Attendance', '5YR Grad Rate']].isna().any(axis=1)][['Attendance', '5YR Grad Rate']]

Unnamed: 0,Attendance,5YR Grad Rate
0,94.8,
1,94.9,
1,94.9,
1,94.9,
1,94.9,
...,...,...
21,91.7,
105,,
232,,
258,,50.7


In [141]:
train_2[train_2.isna().any(axis=1)][['Attendance', '5YR Grad Rate']]

Unnamed: 0,Attendance,5YR Grad Rate
1,94.9,
1,94.9,
1,94.9,
1,94.9,
1,94.9,
...,...,...
22,90.7,
22,90.7,
22,90.7,
22,90.7,


### Pipeline

In [126]:
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model
from random import randint, uniform

lr = linear_model.LinearRegression()
params = {'fit_intercept':[True,False], 'normalize':[True,False], 'copy_X':[True, False]}

model = GridSearchCV(
    estimator = lr,
    param_grid = params,
    cv = splits,
    pre_dispatch='2*3',
    return_train_score = True,
    verbose=5,
    refit=True)

model.fit(school_data.drop(['Year', 'Expulsions per 100 Students', 'School Name_x', '5YR Grad Rate'], axis=1), 
          school_data['Expulsions per 100 Students'])
grid_model_result = pd.DataFrame(model.cv_results_)
results = grid_model_result[['param_copy_X', 'param_fit_intercept', 'param_normalize',
                     'mean_test_score']]
results.sort_values(["mean_test_score"], ascending=False)

Fitting 2 folds for each of 8 candidates, totalling 16 fits
[CV] copy_X=True, fit_intercept=True, normalize=True .................
[CV]  copy_X=True, fit_intercept=True, normalize=True, score=(train=0.957, test=0.030), total=   0.0s
[CV] copy_X=True, fit_intercept=True, normalize=True .................
[CV]  copy_X=True, fit_intercept=True, normalize=True, score=(train=0.944, test=-890.156), total=   0.0s
[CV] copy_X=True, fit_intercept=True, normalize=False ................
[CV]  copy_X=True, fit_intercept=True, normalize=False, score=(train=0.957, test=0.182), total=   0.0s
[CV] copy_X=True, fit_intercept=True, normalize=False ................
[CV]  copy_X=True, fit_intercept=True, normalize=False, score=(train=0.944, test=-890.156), total=   0.0s
[CV] copy_X=True, fit_intercept=False, normalize=True ................
[CV]  copy_X=True, fit_intercept=False, normalize=True, score=(train=0.957, test=0.183), total=   0.0s
[CV] copy_X=True, fit_intercept=False, normalize=True ............

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.1s remaining:    0.0s


[CV]  copy_X=False, fit_intercept=False, normalize=False, score=(train=0.939, test=-34.336), total=   0.0s


[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed:    0.2s finished


Unnamed: 0,param_copy_X,param_fit_intercept,param_normalize,mean_test_score
2,True,False,True,-17.076573
3,True,False,False,-17.076573
6,False,False,True,-17.076573
7,False,False,False,-17.076573
1,True,True,False,-444.987108
5,False,True,False,-444.987108
0,True,True,True,-445.063399
4,False,True,True,-445.063399


### Evaluation

In [45]:
school_data.select_dtypes(include=[object, bool])

Unnamed: 0,School Name_x,5YR Grad Rate
0,LOZANO,
1,LOZANO,
2,LOZANO,
3,LOZANO,
4,LOZANO,
...,...,...
439,TEAM HS,71.8
440,CHIARTS HS,88.8
441,CHIARTS HS,80.4
442,AUSTIN BUS & ENTRP HS,50.0
