In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import OneHotEncoder

import datetime
import calendar

In [2]:
# Load the raw 2020 arrests export into a DataFrame.
loc = "./data/Arrests_2020.csv"
df = pd.read_csv(filepath_or_buffer=loc)

In [3]:
# Keep only the columns used downstream and discard every row that is
# missing a value in at least one of them.
valid_cols = ["NHA_NAME", "sex", "age", "date_arr", "fel_misd"]
df = df.loc[:, valid_cols].dropna(axis=0, how="any")

In [4]:
# Format day, month, sex, fel_misd
#
# Vectorised replacement for the former row-by-row loop. The loop wrote its
# results through chained indexing (df.iloc[i]["sex"] = ...), which assigns
# into a temporary row object: whether the write reaches df depends on pandas
# internals, and it never does under copy-on-write. The loop also rebuilt
# list(df["date_arr"]) on every iteration, making it quadratic in row count.

# Only the leading "YYYY/MM/DD" part of each timestamp string is parsed.
arrest_dates = pd.to_datetime(df["date_arr"].str[:10], format="%Y/%m/%d")

# dt.dayofweek counts Monday as 0; shift so that Sunday=0 ... Saturday=6.
# Plain lists are kept so later cells can assign them positionally.
date_arr = ((arrest_dates.dt.dayofweek + 1) % 7).tolist()
# Zero-based month: January=0 ... December=11.
month_arr = (arrest_dates.dt.month - 1).tolist()

# Set fel_misd to [0,5]; any value not listed falls into catch-all code 5.
fel_misd_codes = {'P': 0, 'F': 1, 'M': 2, 'S': 3, 'C': 4}

df = df.assign(
    # Set sex to [0,1]: 'F' -> 0, every other value -> 1.
    sex=(df["sex"] != 'F').astype("int64"),
    fel_misd=df["fel_misd"].map(fel_misd_codes).fillna(5).astype("int64"),
)

df.head()

Unnamed: 0,NHA_NAME,sex,age,date_arr,fel_misd
12,Dodge Flower,1,21,2020/01/02 00:00:00+00,2
13,Dodge Flower,1,21,2020/01/02 00:00:00+00,2
15,West University,1,24,2020/01/03 00:00:00+00,2
16,West University,1,24,2020/01/03 00:00:00+00,2
17,Dodge Flower,1,21,2020/01/02 00:00:00+00,2


In [5]:
# Swap the raw timestamp column for the derived weekday and month codes.
df = df.drop(columns=["date_arr"]).assign(day=date_arr, month=month_arr)

# Sanity check: show the value range of each encoded column.
sex_col, fel_misd_col = df["sex"], df["fel_misd"]

print(f"Range of days: [{min(date_arr)}, {max(date_arr)}]")
print(f"Range of months: [{min(month_arr)},{max(month_arr)}]")
print(f"Range of sex: [{min(sex_col)},{max(sex_col)}]")
print(f"Range of fel_misd: [{min(fel_misd_col)},{max(fel_misd_col)}]")

Range of days: [0, 6]
Range of months: [0,11]
Range of sex: [0,1]
Range of fel_misd: [0,5]


In [6]:
# Tag every row with the year of the source file, then print its range.
df["year"] = [2020] * len(df)
year_col = df["year"]
print(f"Range of year: [{min(year_col)},{max(year_col)}]")

Range of year: [2020,2020]


In [7]:
# Put the columns in their final export order, ending with fel_misd.
df = df.loc[:, ["NHA_NAME", "sex", "age", "day", "month", "year", "fel_misd"]]
df.head()

Unnamed: 0,NHA_NAME,sex,age,day,month,year,fel_misd
12,Dodge Flower,1,21,4,0,2020,2
13,Dodge Flower,1,21,4,0,2020,2
15,West University,1,24,5,0,2020,2
16,West University,1,24,5,0,2020,2
17,Dodge Flower,1,21,4,0,2020,2


In [8]:
# Keep a reference to the cleaned 2020 frame under its own name; the cells
# that follow rebind `df` to the 2021 data.
df_1 = df

In [9]:
# Load the raw 2021 arrests export into a DataFrame.
loc = "./data/Arrests_2021.csv"
df = pd.read_csv(filepath_or_buffer=loc)

In [10]:
# Keep only the columns listed in valid_cols and discard every row that is
# missing a value in at least one of them.
df = df.loc[:, valid_cols].dropna(axis=0, how="any")

In [11]:
# Format day, month, sex, fel_misd
#
# Vectorised replacement for the former row-by-row loop. The loop wrote its
# results through chained indexing (df.iloc[i]["sex"] = ...), which assigns
# into a temporary row object: whether the write reaches df depends on pandas
# internals, and it never does under copy-on-write. The loop also rebuilt
# list(df["date_arr"]) on every iteration, making it quadratic in row count.

# Only the leading "YYYY/MM/DD" part of each timestamp string is parsed.
arrest_dates = pd.to_datetime(df["date_arr"].str[:10], format="%Y/%m/%d")

# dt.dayofweek counts Monday as 0; shift so that Sunday=0 ... Saturday=6.
# Plain lists are kept so later cells can assign them positionally.
date_arr = ((arrest_dates.dt.dayofweek + 1) % 7).tolist()
# Zero-based month: January=0 ... December=11.
month_arr = (arrest_dates.dt.month - 1).tolist()

# Set fel_misd to [0,5]; any value not listed falls into catch-all code 5.
fel_misd_codes = {'P': 0, 'F': 1, 'M': 2, 'S': 3, 'C': 4}

df = df.assign(
    # Set sex to [0,1]: 'F' -> 0, every other value -> 1.
    sex=(df["sex"] != 'F').astype("int64"),
    fel_misd=df["fel_misd"].map(fel_misd_codes).fillna(5).astype("int64"),
)

df.head()

Unnamed: 0,NHA_NAME,sex,age,date_arr,fel_misd
3,Eastside,1,15,2021/01/02 00:00:00+00,2
4,Eastside,1,15,2021/01/02 00:00:00+00,2
5,Rita Ranch,0,25,2021/01/02 20:05:12+00,2
6,Garden District,1,34,2021/01/02 00:00:00+00,1
8,Garden District,1,34,2021/01/02 00:00:00+00,1


In [12]:
# Swap the raw timestamp column for the derived weekday and month codes.
df = df.drop(columns=["date_arr"]).assign(day=date_arr, month=month_arr)

# Sanity check: show the value range of each encoded column.
sex_col, fel_misd_col = df["sex"], df["fel_misd"]

print(f"Range of days: [{min(date_arr)}, {max(date_arr)}]")
print(f"Range of months: [{min(month_arr)},{max(month_arr)}]")
print(f"Range of sex: [{min(sex_col)},{max(sex_col)}]")
print(f"Range of fel_misd: [{min(fel_misd_col)},{max(fel_misd_col)}]")

Range of days: [0, 6]
Range of months: [0,11]
Range of sex: [0,1]
Range of fel_misd: [0,5]


In [13]:
# Tag every row with the year of the source file, then print its range.
df["year"] = [2021] * len(df)
year_col = df["year"]
print(f"Range of year: [{min(year_col)},{max(year_col)}]")

Range of year: [2021,2021]


In [14]:
# Put the columns in their final export order, ending with fel_misd.
df = df.loc[:, ["NHA_NAME", "sex", "age", "day", "month", "year", "fel_misd"]]
df.head()

Unnamed: 0,NHA_NAME,sex,age,day,month,year,fel_misd
3,Eastside,1,15,6,0,2021,2
4,Eastside,1,15,6,0,2021,2
5,Rita Ranch,0,25,6,0,2021,2
6,Garden District,1,34,6,0,2021,1
8,Garden District,1,34,6,0,2021,1


In [15]:
# Write both cleaned frames to disk without the index column:
# df_1 holds the 2020 data, df the 2021 data.
for frame, path in (
    (df_1, './clean_data/Arrests_2020_cleaned.csv'),
    (df, './clean_data/Arrests_2021_cleaned.csv'),
):
    frame.to_csv(path, index=False)