# VADIR Data Cleaning Part 2.
Source: CSV file generated by VADIR_data_cleanup
        
#### TO DOs (carried over from last file):
* clean up uppercase/lowercase issues (school name and county)
* fix column name duplicates and typos (alcohol and drug possession, sex offenses)
* check that schools are consistently assigned the same district name
* figure out a way to handle the data that doesn't have county info (search for school's subsequent year data?)
* get lattitude and longitude (and addresses???)
* re order columns in some meaningful way
* start computing tallies of incidents with and without weapons
* check datatypes for columns to make sure they make sense.
* whats with the 5 missing values?


In [2]:
# Initial Imports
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
from matplotlib import pyplot as plt
% matplotlib inline

In [20]:
# read in saved dataframe
vadir_df = pd.read_csv("VADIR_incidents_2006-14.csv", dtype = object)

In [21]:
# rearrange columns so that School Demographic Info comes first
# County[12], District[15], School Name[46], BEDS code[8], School Year[48], Enrollment [18]
# Grade Organization[22], Need/Resource Category[33], School Type[47]
cols = vadir_df.columns.tolist()
new_order = [12, 15, 46, 48, 18, 8, 22, 33, 47] + list(range(1,8)) + [9, 19, 10, 11, 13, 14, 16, 17, 20, 21]
new_order += list(range(23, 33)) + list(range(34, 46)) + [49, 50, 51, 52]
cols = [cols[idx] for idx in new_order]
vadir_df = vadir_df[cols]

In [22]:
# rename 'False Alarm' to "Bomb Threat False Alarm"
vadir_df.rename(columns={'False Alarm':"Bomb Threat False Alarm"}, inplace=True)

In [23]:
# Check for and merge/delete duplicate column names
vadir_df.columns.tolist()

['County',
 'District',
 'School Name',
 'School Year',
 'Enrollment',
 'BEDS Code',
 'Grade Organization',
 'Need/Resource Category',
 'School Type',
 'Alcohol Possesion',
 'Alcohol Possession',
 'Arson',
 'Assault With Serious Physical Injury_nw',
 'Assault With Serious Physical Injury_ww',
 'Assault with Physical Injury_nw',
 'Assault with Physical Injury_ww',
 'Bomb Threat',
 'Bomb Threat False Alarm',
 'Burglary_nw',
 'Burglary_ww',
 'Criminal Mischief_nw',
 'Criminal Mischief_ww',
 'Drug Possesion',
 'Drug Possession',
 'Forcible Sex Offenses_nw',
 'Forcible Sex Offenses_ww',
 'Homicide_nw',
 'Homicide_ww',
 'Intimidation, Harassment, Menacing, or Bullying_nw',
 'Intimidation, Harassment, Menacing, or Bullying_ww',
 'Kidnapping_nw',
 'Kidnapping_ww',
 'Larceny, or Other Theft_nw',
 'Larceny, or Other Theft_ww',
 'Minor Altercations_nw',
 'Minor Altercations_ww',
 'Other Disruptive',
 'Other Disruptive Incidents',
 'Other Sex Offenses_nw',
 'Other Sex Offenses_ww',
 'Other Sex off

In [24]:
# Merge 'Alcohol Possesion' and 'Use Possession or Sale of Alcohol' into'Alcohol Possession' column
print("Dataframe Length: {}".format(len(vadir_df)), 
       "Missing Values for this column: {}\n".format((vadir_df['Alcohol Possession'].isnull()).sum()),
      "# of Values in first column to merge:{}\n".format((vadir_df['Alcohol Possesion'].notnull()).sum()),
      "# of Values in second:{}\n".format((vadir_df['Use Possession or Sale of Alcohol'].notnull()).sum()))
     
vadir_df['Alcohol Possession'] = vadir_df['Alcohol Possession'].combine_first(vadir_df['Alcohol Possesion'])
vadir_df['Alcohol Possession'] = vadir_df['Alcohol Possession'].combine_first(vadir_df['Use Possession or Sale of Alcohol'])
vadir_df.drop(['Alcohol Possesion', 'Use Possession or Sale of Alcohol'], axis=1, inplace=True)

print("...'Alcohol Possesion' and 'Use Possession or Sale of Alcohol' merged into'Alcohol Possession' column",
      "\n... which now has {} remaining missing values.".format((vadir_df['Alcohol Possession'].isnull()).sum()),
      " Former columns dropped.")

Dataframe Length: 14734 Missing Values for this column: 2956
 # of Values in first column to merge:1498
 # of Values in second:1453

...'Alcohol Possesion' and 'Use Possession or Sale of Alcohol' merged into'Alcohol Possession' column 
... which now has 5 remaining missing values.  Former columns dropped.


In [25]:
# Merge 'Drug Possesion' and 'Use Possession or Sale of Drugs' into'Drug Possession' column
print("Dataframe Length: {}".format(len(vadir_df)), 
       "Missing Values for this column: {}\n".format((vadir_df['Drug Possession'].isnull()).sum()),
      "# of Values in first column to merge:{}\n".format((vadir_df['Drug Possesion'].notnull()).sum()),
      "# of Values in second:{}\n".format((vadir_df['Use Possession or Sale of Drugs'].notnull()).sum()))
     
vadir_df['Drug Possession'] = vadir_df['Drug Possession'].combine_first(vadir_df['Drug Possesion'])
vadir_df['Drug Possession'] = vadir_df['Drug Possession'].combine_first(vadir_df['Use Possession or Sale of Drugs'])
vadir_df.drop(['Drug Possesion', 'Use Possession or Sale of Drugs'], axis=1, inplace=True)

print("...'Drug Possesion' and 'Use Possession or Sale of Drugs' merged into'Drug Possession' column",
      "\n... which now has {} remaining missing values.".format((vadir_df['Drug Possession'].isnull()).sum()),
      " Former columns dropped.")

Dataframe Length: 14734 Missing Values for this column: 2956
 # of Values in first column to merge:1498
 # of Values in second:1453

...'Drug Possesion' and 'Use Possession or Sale of Drugs' merged into'Drug Possession' column 
... which now has 5 remaining missing values.  Former columns dropped.


In [26]:
# Merge 'Other Disruptive' into 'Other Disruptive Incidents'
print("Dataframe Length: {}".format(len(vadir_df)), 
       "Missing Values for this column: {}\n".format((vadir_df['Other Disruptive Incidents'].isnull()).sum()),
      "# of Values in column to merge:{}\n".format((vadir_df['Other Disruptive'].notnull()).sum()))
     
vadir_df['Other Disruptive Incidents'] = vadir_df['Other Disruptive Incidents'].combine_first(vadir_df['Other Disruptive'])
vadir_df.drop(['Other Disruptive'], axis=1, inplace=True)

print("...'Other Disruptive' merged into 'Other Disruptive Incidents' column",
      "which now has {} remaining missing values.".format((vadir_df['Other Disruptive Incidents'].isnull()).sum()),
      "\n... Former column dropped.")

Dataframe Length: 14734 Missing Values for this column: 13281
 # of Values in column to merge:13275

...'Other Disruptive' merged into 'Other Disruptive Incidents' column which now has 6 remaining missing values. 
 Former column dropped.


In [27]:
# Merge'Other Sex offenses_nw' into'Other Sex Offenses_nw',
print("Dataframe Length: {}".format(len(vadir_df)), 
       "Missing Values for this column: {}\n".format((vadir_df['Other Sex Offenses_nw'].isnull()).sum()),
      "# of Values in column to merge:{}\n".format((vadir_df['Other Sex offenses_nw'].notnull()).sum()))
     
vadir_df['Other Sex Offenses_nw'] = vadir_df['Other Sex Offenses_nw'].combine_first(vadir_df['Other Sex offenses_nw'])
vadir_df.drop(['Other Sex offenses_nw'], axis=1, inplace=True)

print("...'Other Sex offenses_nw' merged into 'Other Sex Offenses_nw' column",
      "which now has {} remaining missing values.".format((vadir_df['Other Sex Offenses_nw'].isnull()).sum()),
      "\n... Former column dropped.")

Dataframe Length: 14734 Missing Values for this column: 1458
 # of Values in column to merge:1453

...'Other Sex offenses_nw' merged into 'Other Sex Offenses_nw' column which now has 5 remaining missing values. 
... Former column dropped.


In [28]:
# Merge'Other Sex offenses_ww' into'Other Sex Offenses_ww',
print("Dataframe Length: {}".format(len(vadir_df)), 
       "Missing Values for this column: {}\n".format((vadir_df['Other Sex Offenses_ww'].isnull()).sum()),
      "# of Values in column to merge:{}\n".format((vadir_df['Other Sex offenses_ww'].notnull()).sum()))
     
vadir_df['Other Sex Offenses_ww'] = vadir_df['Other Sex Offenses_ww'].combine_first(vadir_df['Other Sex offenses_ww'])
vadir_df.drop(['Other Sex offenses_ww'], axis=1, inplace=True)

print("...'Other Sex offenses_ww' merged into 'Other Sex Offenses_ww' column",
      "which now has {} remaining missing values.".format((vadir_df['Other Sex Offenses_ww'].isnull()).sum()),
      "\n... Former column dropped.")

Dataframe Length: 14734 Missing Values for this column: 1458
 # of Values in column to merge:1453

...'Other Sex offenses_ww' merged into 'Other Sex Offenses_ww' column which now has 5 remaining missing values. 
... Former column dropped.


In [29]:
# Take a look at the now cleaned column names
vadir_df.columns.tolist()

['County',
 'District',
 'School Name',
 'School Year',
 'Enrollment',
 'BEDS Code',
 'Grade Organization',
 'Need/Resource Category',
 'School Type',
 'Alcohol Possession',
 'Arson',
 'Assault With Serious Physical Injury_nw',
 'Assault With Serious Physical Injury_ww',
 'Assault with Physical Injury_nw',
 'Assault with Physical Injury_ww',
 'Bomb Threat',
 'Bomb Threat False Alarm',
 'Burglary_nw',
 'Burglary_ww',
 'Criminal Mischief_nw',
 'Criminal Mischief_ww',
 'Drug Possession',
 'Forcible Sex Offenses_nw',
 'Forcible Sex Offenses_ww',
 'Homicide_nw',
 'Homicide_ww',
 'Intimidation, Harassment, Menacing, or Bullying_nw',
 'Intimidation, Harassment, Menacing, or Bullying_ww',
 'Kidnapping_nw',
 'Kidnapping_ww',
 'Larceny, or Other Theft_nw',
 'Larceny, or Other Theft_ww',
 'Minor Altercations_nw',
 'Minor Altercations_ww',
 'Other Disruptive Incidents',
 'Other Sex Offenses_nw',
 'Other Sex Offenses_ww',
 'Reckless Endangerment_nw',
 'Reckless Endangerment_ww',
 'Riot_nw',
 'Riot_

In [None]:
# Merge Need/Resource Category and School Type?

In [30]:
# Save results to a 'clean' file.
vadir_df.to_csv("VADIR_clean.csv")

In [31]:
# Check that is saved
pd.read_csv("VADIR_clean.csv").head()

Unnamed: 0.1,Unnamed: 0,County,District,School Name,School Year,Enrollment,BEDS Code,Grade Organization,Need/Resource Category,School Type,...,Other Sex Offenses_nw,Other Sex Offenses_ww,Reckless Endangerment_nw,Reckless Endangerment_ww,Riot_nw,Riot_ww,Robbery_nw,Robbery_ww,Weapon Possession_nw,Weapon Possession_ww
0,0,Bronx,,Academic Leadership Charter School,2014,376,320700900000.0,Elementary,Charter School,Charter,...,0,0,0,0,0,0,0,0,0,0
1,1,Bronx,,American Dream Charter School,2014,81,320700900000.0,,Charter School,Charter,...,0,0,0,0,0,0,0,0,0,0
2,2,Bronx,,Brilla College Preparatory Charter School,2014,247,320700900000.0,Elementary,Charter School,Charter,...,0,0,0,0,0,0,0,0,0,0
3,3,Bronx,,Bronx Academy Of Promise Charter School,2014,581,320900900000.0,Elementary,Charter School,Charter,...,0,0,0,0,0,0,0,0,0,0
4,4,Bronx,,Bronx Charter School For Better Learning,2014,470,321100900000.0,Elementary,Charter School,Charter,...,0,0,9,0,0,0,0,0,0,0
