## Explore and Clean the Chicago Use-Of-Force Complaints Data
## Author: Mihir Bhaskar

### Data source: https://data.cpdp.co/data/bVBkzB/ (accessed on 21st November, 2021)

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [3]:
# Import data (downloaded from website linked above as Excel file)

## To-do: use Python's openpyxl library to filter the datasets before importing -> improve speed

complaints = pd.read_excel('data/uof_complaints_chicago.xlsx', sheet_name='Allegations')
comp_witness = pd.read_excel('data/uof_complaints_chicago.xlsx', sheet_name='Complaining Witnesses')
officers = pd.read_excel('data/uof_complaints_chicago.xlsx', sheet_name='Officer Profile')

## Dropping irrelevant columns
complaints.drop(['OfficeFirst', 'OfficerLast', 'AllegationCode', 'RecommendedFinding', 'RecommendedOutcome',
                'FinalFinding', 'FinalOutcome', ''], axis=1)

## Drop irrelevant columns, merge the info across the three datasets
print(complaints.info(), '\n', comp_witness.info(), '\n', officers.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57574 entries, 0 to 57573
Data columns (total 25 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   CRID                57574 non-null  object 
 1   OfficerID           57574 non-null  int64  
 2   OfficeFirst         57574 non-null  object 
 3   OfficerLast         57574 non-null  object 
 4   AllegationCode      57574 non-null  object 
 5   Category            57574 non-null  object 
 6   Allegation          57574 non-null  object 
 7   RecommendedFinding  20328 non-null  object 
 8   RecommendedOutcome  23328 non-null  float64
 9   FinalFinding        54106 non-null  object 
 10  FinalOutcome        55402 non-null  float64
 11  Finding             57574 non-null  object 
 12  Outcome             57574 non-null  object 
 13  Beat                23395 non-null  float64
 14  Location            25667 non-null  object 
 15  Add1                21952 non-null  object 
 16  Add2

In [11]:
# Filtering the data to only include complaints from 2015 onwards

## There are 8 rows where incident date (what we want to filter on) is missing
complaints['IncidentDate'].isnull().sum()

## Drop cases where incidentdate is missing - these are only 8 observations, and there is no other good way
## to tell when a complaint occured. The start date only refers to the start of the investigation, and this could be
## very different from the actual timing of the complaint.
complaints = complaints.dropna(subset=['IncidentDate'])

## Converting incident date to a date variable
complaints['Date']= pd.to_datetime(complaints['IncidentDate'])

## Keeping only complaints >= 2015
cmp = complaints[complaints['Date'] >= '2015-01-01']

cmp.head()

cmp[['Beat', 'Location', 'City']].head(100)

#cmp['Diff'] = np.where(cmp['RecommendedFinding'] == cmp['FinalFinding'], 1, 0)

#cmp['Diff'].mean()

Unnamed: 0,Beat,Location,City
55436,313.0,Public Way - Other,CHICAGO IL
55439,1112.0,Public Way - Other,CHICAGO IL
55440,1112.0,Public Way - Other,CHICAGO IL
55447,1112.0,Public Way - Other,CHICAGO IL
55453,932.0,Public Way - Other,CHICAGO IL 60609
...,...,...,...
55581,1724.0,Public Way - Other,CHICAGO IL 60618
55582,1724.0,Public Way - Other,CHICAGO IL 60618
55583,624.0,Public Way - Other,CHICAGO IL
55584,624.0,Public Way - Other,CHICAGO IL


In [35]:
## Testing the uniqueness of the IDs 



## The unique ID here is CRID and officerID
## Drop the unnecessary columns, merge the good columns from the other datasets, then describe the missingness and uniqueness 

cmp.describe()
cmp.info()

cmp.nunique(axis=0)

#cmp[cmp['CRID'] == '1090030']

cmp['uid'] = cmp['CRID'] + (cmp['OfficerID']).astype(str)

cmp.nunique(axis=0)


#dups = cmp[cmp.duplicated(['CRID'])].sort_values('CRID')

#dups.head(100)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1180 entries, 55436 to 57573
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   CRID                1180 non-null   object        
 1   OfficerID           1180 non-null   int64         
 2   OfficeFirst         1180 non-null   object        
 3   OfficerLast         1180 non-null   object        
 4   AllegationCode      1180 non-null   object        
 5   Category            1180 non-null   object        
 6   Allegation          1180 non-null   object        
 7   RecommendedFinding  101 non-null    object        
 8   RecommendedOutcome  224 non-null    float64       
 9   FinalFinding        782 non-null    object        
 10  FinalOutcome        287 non-null    float64       
 11  Finding             1180 non-null   object        
 12  Outcome             1180 non-null   object        
 13  Beat                1161 non-null   float64

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cmp['uid'] = cmp['CRID'] + (cmp['OfficerID']).astype(str)


CRID                   689
OfficerID             1035
OfficeFirst            464
OfficerLast            852
AllegationCode          50
Category                 1
Allegation              34
RecommendedFinding       4
RecommendedOutcome       4
FinalFinding             5
FinalOutcome            22
Finding                  6
Outcome                 23
Beat                   223
Location                50
Add1                   122
Add2                   426
City                   104
IncidentDate           521
StartDate              514
EndDate                245
InvestigatorName        24
InvestigatorRank         0
Latitude               590
Longitude              588
Date                   521
uid                   1180
dtype: int64

In [None]:
# Checking for 