In [1]:
# DEPENDENCIES
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as st
from datetime import datetime

# INPUT LOCATION
# Relative path to the crime CSV.
path_crime_data = "Data/crime_data.csv"

# LOAD THE RAW CRIME TABLE AND PREVIEW IT
raw_crime_data = pd.read_csv(filepath_or_buffer=path_crime_data)
raw_crime_data

Unnamed: 0,Website Region,WAPOL_Hierarchy_order_Lvl1,WAPOL_Hierarchy_Lvl1,WAPOL_Hierarchy_order_Lvl2,WAPOL_Hierarchy_Lvl2,Period,Year,Key,MonthYear,COUNT_of_Offnc_ID
0,Armadale District,1.1,Murder,1,Homicide,1-Jan-07,2006-07,Armadale DistrictMurderHomicide12007,12007,
1,Armadale District,1.1,Murder,1,Homicide,1-Feb-07,2006-07,Armadale DistrictMurderHomicide22007,22007,
2,Armadale District,1.1,Murder,1,Homicide,1-Mar-07,2006-07,Armadale DistrictMurderHomicide32007,32007,1.0
3,Armadale District,1.1,Murder,1,Homicide,1-Apr-07,2006-07,Armadale DistrictMurderHomicide42007,42007,
4,Armadale District,1.1,Murder,1,Homicide,1-May-07,2006-07,Armadale DistrictMurderHomicide52007,52007,
...,...,...,...,...,...,...,...,...,...,...
143095,Wheatbelt District,20.3,Breach of Police Order,20,Breach of Violence Restraint Order,1-Nov-19,2019-20,Wheatbelt DistrictBreach of Police OrderBreach...,112019,5.0
143096,Wheatbelt District,20.3,Breach of Police Order,20,Breach of Violence Restraint Order,1-Dec-19,2019-20,Wheatbelt DistrictBreach of Police OrderBreach...,122019,7.0
143097,Wheatbelt District,20.3,Breach of Police Order,20,Breach of Violence Restraint Order,1-Jan-20,2019-20,Wheatbelt DistrictBreach of Police OrderBreach...,12020,5.0
143098,Wheatbelt District,20.3,Breach of Police Order,20,Breach of Violence Restraint Order,1-Feb-20,2019-20,Wheatbelt DistrictBreach of Police OrderBreach...,22020,1.0


In [2]:
# DROP IRRELEVANT COLUMNS
# drop() returns a new frame instead of mutating in place, and errors="ignore"
# keeps this cell re-runnable (a plain `del` raises KeyError the second time).
raw_crime_data = raw_crime_data.drop(columns=["WAPOL_Hierarchy_order_Lvl1"], errors="ignore")

In [3]:
# Drop the level-2 hierarchy ordering column.
# errors="ignore" keeps this cell re-runnable (a plain `del` raises KeyError
# once the column is already gone).
raw_crime_data = raw_crime_data.drop(columns=["WAPOL_Hierarchy_order_Lvl2"], errors="ignore")

In [4]:
# Drop the source "Year" column (strings such as "2006-07" in the preview above).
# errors="ignore" keeps this cell re-runnable (a plain `del` raises KeyError
# once the column is already gone).
raw_crime_data = raw_crime_data.drop(columns=["Year"], errors="ignore")

In [5]:
# Drop the "Key" column.
# errors="ignore" keeps this cell re-runnable (a plain `del` raises KeyError
# once the column is already gone).
raw_crime_data = raw_crime_data.drop(columns=["Key"], errors="ignore")

In [6]:
# Drop the "MonthYear" column.
# errors="ignore" keeps this cell re-runnable (a plain `del` raises KeyError
# once the column is already gone).
raw_crime_data = raw_crime_data.drop(columns=["MonthYear"], errors="ignore")

In [7]:
# Discard every row that has a missing value in any column
# (dropna's defaults, written out explicitly).
raw_crime_data = raw_crime_data.dropna(axis=0, how="any")

In [8]:
# Map the original CSV headers onto reader-friendly column names.
friendly_column_names = {
    "Website Region": "District",
    "WAPOL_Hierarchy_Lvl1": "Offence Group",
    "WAPOL_Hierarchy_Lvl2": "Offence",
    "Period": "Offence Date",
    "COUNT_of_Offnc_ID": "Total Number of Offences",
}
raw_crime_data = raw_crime_data.rename(columns=friendly_column_names)
raw_crime_data

Unnamed: 0,District,Offence Group,Offence,Offence Date,Total Number of Offences
2,Armadale District,Murder,Homicide,1-Mar-07,1.0
26,Armadale District,Murder,Homicide,1-Mar-09,1.0
36,Armadale District,Murder,Homicide,1-Jan-10,2.0
37,Armadale District,Murder,Homicide,1-Feb-10,2.0
51,Armadale District,Murder,Homicide,1-Apr-11,1.0
...,...,...,...,...,...
143095,Wheatbelt District,Breach of Police Order,Breach of Violence Restraint Order,1-Nov-19,5.0
143096,Wheatbelt District,Breach of Police Order,Breach of Violence Restraint Order,1-Dec-19,7.0
143097,Wheatbelt District,Breach of Police Order,Breach of Violence Restraint Order,1-Jan-20,5.0
143098,Wheatbelt District,Breach of Police Order,Breach of Violence Restraint Order,1-Feb-20,1.0


In [9]:
# Show the current column names as a plain Python list.
raw_crime_data.columns.tolist()

['District',
 'Offence Group',
 'Offence',
 'Offence Date',
 'Total Number of Offences']

In [10]:
# Split "Offence Date" (values look like "1-Mar-07") on "-" into separate
# columns: 0 = day, 1 = month abbreviation, 2 = two-digit year.
new = raw_crime_data["Offence Date"].str.split("-", n=2, expand=True)
# Keep the month and year parts as their own columns on the main frame.
raw_crime_data["Month"], raw_crime_data["Year"] = new[1], new[2]
raw_crime_data

Unnamed: 0,District,Offence Group,Offence,Offence Date,Total Number of Offences,Month,Year
2,Armadale District,Murder,Homicide,1-Mar-07,1.0,Mar,07
26,Armadale District,Murder,Homicide,1-Mar-09,1.0,Mar,09
36,Armadale District,Murder,Homicide,1-Jan-10,2.0,Jan,10
37,Armadale District,Murder,Homicide,1-Feb-10,2.0,Feb,10
51,Armadale District,Murder,Homicide,1-Apr-11,1.0,Apr,11
...,...,...,...,...,...,...,...
143095,Wheatbelt District,Breach of Police Order,Breach of Violence Restraint Order,1-Nov-19,5.0,Nov,19
143096,Wheatbelt District,Breach of Police Order,Breach of Violence Restraint Order,1-Dec-19,7.0,Dec,19
143097,Wheatbelt District,Breach of Police Order,Breach of Violence Restraint Order,1-Jan-20,5.0,Jan,20
143098,Wheatbelt District,Breach of Police Order,Breach of Violence Restraint Order,1-Feb-20,1.0,Feb,20


In [11]:
# "Offence Date" has been split into "Month" and "Year", so drop the original.
# errors="ignore" keeps this cell re-runnable (a plain `del` raises KeyError
# once the column is already gone).
raw_crime_data = raw_crime_data.drop(columns=["Offence Date"], errors="ignore")

In [12]:
# Cast the "Year" strings to integers so they can be compared numerically.
raw_crime_data = raw_crime_data.astype({"Year": int})

In [13]:
# Remove data prior to 2010.
# "Year" holds the two-digit year parsed from the offence date (7 for 2007,
# 10 for 2010), so `>= 10` keeps 2010 onwards.
# NOTE(review): this relies on every year being 20xx - confirm against the full CSV.
# The filtered frame must be assigned back; otherwise it is only displayed and
# raw_crime_data still contains the pre-2010 rows for every later cell.
# .copy() gives an independent frame so later column assignments do not
# trigger SettingWithCopyWarning.
raw_crime_data = raw_crime_data.loc[raw_crime_data["Year"] >= 10].copy()
raw_crime_data

Unnamed: 0,District,Offence Group,Offence,Total Number of Offences,Month,Year
36,Armadale District,Murder,Homicide,2.0,Jan,10
37,Armadale District,Murder,Homicide,2.0,Feb,10
51,Armadale District,Murder,Homicide,1.0,Apr,11
55,Armadale District,Murder,Homicide,1.0,Aug,11
60,Armadale District,Murder,Homicide,2.0,Jan,12
...,...,...,...,...,...,...
143095,Wheatbelt District,Breach of Police Order,Breach of Violence Restraint Order,5.0,Nov,19
143096,Wheatbelt District,Breach of Police Order,Breach of Violence Restraint Order,7.0,Dec,19
143097,Wheatbelt District,Breach of Police Order,Breach of Violence Restraint Order,5.0,Jan,20
143098,Wheatbelt District,Breach of Police Order,Breach of Violence Restraint Order,1.0,Feb,20


In [None]:
# TODO: group by offences
