---
# Clean Arrests Data
- This file takes in the raw CSV files from the TPD arrests site (arrests_2020 and arrests_2021) and
  extracts the relevant information based on the hypothesis.
- Uses the pandas and datetime modules

In [None]:
import pandas as pd
import datetime

---
## Relevant Columns

In [None]:
# Columns selected from the raw arrests table before cleaning; kept as a list
# so it can be used directly as a DataFrame column selector.
valid_cols = [
    "NHA_NAME",
    "sex",
    "age",
    "datetime_arr",  # timestamp text, later split into day / month / time
    "fel_misd",
]

---
## Function that Converts the Data into DF

In [None]:
def process_df(df, columns=None, hour_shift=7):
    """Reduce a raw arrests table to the analysis columns and split its timestamp.

    Rows with a missing value in any selected column are dropped. The
    ``datetime_arr`` column (text shaped like ``"YYYY/MM/DD HH:MM..."``) is
    removed and replaced by three derived columns, all computed from the
    timestamp *after* it has been moved back by ``hour_shift`` hours:

    * ``day``   - day of week, 0 = Sunday ... 6 = Saturday
    * ``month`` - zero-based month, 0 = January ... 11 = December
    * ``time``  - the shifted hour (not zero padded) immediately followed by
      the minutes text, e.g. ``"930"`` or ``"1705"``

    Unlike the earlier version, the date rolls back together with the hour:
    a record stamped 03:15 on a Sunday is reported as 20:15 on Saturday, not
    20:15 on Sunday.

    Args:
        df: Raw arrests DataFrame containing every selected column.
        columns: Column names to keep; must include ``"datetime_arr"``.
            Defaults to the module-level ``valid_cols``.
        hour_shift: Whole hours subtracted from the recorded hour. The default
            of 7 presumably converts UTC to Tucson local time - confirm
            against the data source.

    Returns:
        A new DataFrame with ``datetime_arr`` dropped and ``day``, ``month``
        and ``time`` appended. The input DataFrame is not modified.

    Raises:
        KeyError: If a selected column is missing from ``df``.
        ValueError: If a date does not match ``%Y/%m/%d`` or the hour is not
            an integer.
        IndexError: If a timestamp has no space-separated time part or no
            ``:`` separated minutes.
    """
    if columns is None:
        columns = valid_cols
    df = df[columns].dropna()

    days = []
    months = []
    times = []
    # Iterate the column directly instead of rebuilding a list on every row.
    for stamp in df["datetime_arr"]:
        parts = stamp.split(" ")  # [date text, time text, ...]
        date_text = parts[0].strip()
        clock = parts[1].strip().split(":")  # [hour, minutes, ...]

        # divmod yields both the wrapped hour (0-23) and how many whole days
        # the shift crossed (-1 when the hour wraps below midnight).
        day_shift, shifted_hour = divmod(int(clock[0]) - hour_shift, 24)
        shifted_date = (
            datetime.datetime.strptime(date_text, '%Y/%m/%d')
            + datetime.timedelta(days=day_shift)
        )

        times.append(f"{shifted_hour}{clock[1]}")
        # weekday() is Monday=0; rotate so that Sunday=0.
        days.append((shifted_date.weekday() + 1) % 7)
        months.append(shifted_date.month - 1)

    df = df.drop("datetime_arr", axis=1)
    df["day"] = days
    df["month"] = months
    df["time"] = times

    return df

---
## Create the 2020 DF

In [None]:
# Load the raw 2020 export and clean it in one step; the bare name on the
# last line makes the notebook display the resulting DataFrame.
arrests_2020 = process_df(pd.read_csv("./data/Arrests_2020.csv"))
arrests_2020

## Create the 2021 DF

In [None]:
# Load the raw 2021 export and clean it in one step; the bare name on the
# last line makes the notebook display the resulting DataFrame.
arrests_2021 = process_df(pd.read_csv("./data/Arrests_2021.csv"))
arrests_2021

## Save the DFs

In [None]:
# Write both cleaned tables to ./clean_data without the DataFrame index column.
arrests_2020.to_csv(
    path_or_buf='./clean_data/Arrests_2020_cleaned.csv',
    index=False,
)
arrests_2021.to_csv(
    path_or_buf='./clean_data/Arrests_2021_cleaned.csv',
    index=False,
)