---
# Clean All the Data 
- Extract the three TPD CSVs and get the relevant info
- Combine three dataframes after mapping the neighborhoods
- Save the CSV
- Depends on pandas and datetime modules

In [None]:
# When True, the per-stage cleaned frames (arrests per year, neighborhood
# table) are also written to ./clean_data as CSV files.
intermediate_write_out = True
# When True, each cell prints the dataframe (or list) it just produced.
print_debug = True

In [None]:
import pandas as pd
import datetime

---
## Arrests Cleaning
- Get the valid columns from the arrests data
- Use the process_df function to make the data interpretable
---
### Relevant Data Columns

In [None]:
# Columns of the raw arrests CSVs that process_df keeps; everything else is
# discarded before rows with missing values are dropped.
valid_cols = [
    "NHA_NAME",
    "sex",
    "age",
    "datetime_arr",
    "fel_misd",
]

---
### Function for Interpreting Arrests Data

In [None]:
def process_df(df, columns=None, hour_shift=7):
    """Reduce a raw arrests frame to the columns used downstream.

    Keeps only the requested columns, drops every row that has a missing
    value in any of them, and replaces the ``datetime_arr`` text column with
    three derived columns: ``day``, ``month`` and ``time``.

    Args:
        df: Raw arrests dataframe. It must contain every name in ``columns``,
            including ``datetime_arr`` whose values look like
            ``"YYYY/MM/DD HH:MM..."`` (date and time separated by a space).
        columns: Column names to keep. Defaults to the notebook-level
            ``valid_cols`` list when omitted.
        hour_shift: Number of hours subtracted from each timestamp before
            the derived columns are computed. NOTE(review): presumably the
            offset between the timestamps in the export and local time;
            confirm against the data source.

    Returns:
        A new dataframe without ``datetime_arr`` and with three extra columns:
        ``day`` (0-6, Sunday is 0), ``month`` (0-11, January is 0) and
        ``time`` (the shifted hour without zero padding followed by the
        minute text exactly as it appears in the source, e.g. ``"2015"``).

    Raises:
        ValueError: If a date or hour cannot be parsed.
        IndexError: If a timestamp has no time part or no minute part.
    """
    columns = valid_cols if columns is None else list(columns)
    df = df[columns].dropna()
    shift = datetime.timedelta(hours=hour_shift)

    day_arr = []
    month_arr = []
    proper_time_arr = []
    # Iterate the column directly; rebuilding a list of the whole column on
    # every iteration made the original loop quadratic.
    for stamp in df["datetime_arr"]:
        parts = stamp.split(" ")  # date text, then time text
        date_text = parts[0].strip()
        clock = parts[1].strip().split(":")

        # Shift the full timestamp, not just the hour: when the shifted hour
        # falls on the previous day, the weekday (and possibly the month)
        # must move back with it.
        moment = datetime.datetime.strptime(date_text, '%Y/%m/%d')
        moment += datetime.timedelta(hours=int(clock[0]))
        shifted = moment - shift

        proper_time_arr.append(f"{shifted.hour}{clock[1]}")
        # weekday() is Monday=0; rotate so that Sunday=0.
        day_arr.append((shifted.weekday() + 1) % 7)
        month_arr.append(shifted.month - 1)

    df = df.drop("datetime_arr", axis=1)
    df["day"] = day_arr
    df["month"] = month_arr
    df["time"] = proper_time_arr

    return df

---
### Find the DFs for Both Years

In [None]:
# Read the raw 2020 arrests export and pass it straight through process_df.
arrests_2020 = process_df(pd.read_csv("./data/Arrests_2020.csv"))

if print_debug:
    print(arrests_2020)

In [None]:
# Read the raw 2021 arrests export and pass it straight through process_df.
arrests_2021 = process_df(pd.read_csv("./data/Arrests_2021.csv"))

if print_debug:
    print(arrests_2021)

In [None]:
# Optionally persist the per-year cleaned arrests frames (row index omitted).
if intermediate_write_out:
    for _csv_path, _cleaned in (
        ('./clean_data/Arrests_2020_cleaned.csv', arrests_2020),
        ('./clean_data/Arrests_2021_cleaned.csv', arrests_2021),
    ):
        _cleaned.to_csv(_csv_path, index=False)

---
## Neighborhood Wealth Cleaning:
---
### Extracting Relevant Columns

In [None]:
# Load the neighborhood income table, index it by the NAME column, and keep
# only the three columns that are later mapped onto the arrests.
neighborhood = (
    pd.read_csv("./data/Neighborhood_Income.csv")
    .set_index('NAME')
    [["MEDHINC_CY", "WLTHINDXCY", "TOTHH_CY"]]
)

if print_debug:
    print(neighborhood)

In [None]:
# Optionally persist the trimmed neighborhood table; index=True keeps the
# NAME index as the first CSV column.
if intermediate_write_out:
    neighborhood.to_csv('./clean_data/neighborhood_cleaned.csv', index=True)

---
## Merging DFs:

In [None]:
# Stack the 2020 and 2021 arrests into one frame. ignore_index is not set, so
# each year's original row labels are carried over unchanged.
arrests_df = pd.concat([arrests_2020, arrests_2021])

if print_debug:
    print(arrests_df)

---
### Map the Wealth and Total Household to the Arrest Neighborhood

In [None]:
# Look up each arrest's neighborhood name in the income table (indexed by
# name) and copy the matching value into a new column of the same name.
# Neighborhood names with no match in the table come back as NaN.
for wealth_col in ("MEDHINC_CY", "WLTHINDXCY", "TOTHH_CY"):
    arrests_df[wealth_col] = arrests_df['NHA_NAME'].map(neighborhood[wealth_col])

# dropna() returns a new frame rather than modifying in place, so the result
# must be assigned back; otherwise the rows with unmatched neighborhoods stay
# in the dataset with NaN values.
arrests_df = arrests_df.dropna()

# Fix the column order used for the saved dataset.
arrests_df = arrests_df[["NHA_NAME","sex","age","day","month","MEDHINC_CY","WLTHINDXCY", "time", "TOTHH_CY", "fel_misd"]]

if print_debug:
    print(arrests_df)

---
### Remove the Neighborhood Name Column

In [None]:
# The neighborhood name was only needed as the lookup key; remove the column.
arrests_df = arrests_df.drop("NHA_NAME", axis=1)

---
## Finding Non-Numeric Age Indices

In [None]:
# Collect the positional (0-based) row numbers whose age value cannot be
# converted to an integer.
x = list(arrests_df["age"])
arr = []
for i, val in enumerate(x):
    try:
        int(val)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mark a bad age. A bare except would also
        # swallow unrelated errors such as KeyboardInterrupt.
        arr.append(i)

if print_debug:
    print(arr)

---
### Cleaning Age Column and Resetting Indices

In [None]:
# Renumber the rows 0..n-1 so the labels line up with the positional numbers
# collected in arr, then drop those rows (the ones with a non-numeric age).
row_numbers = list(range(len(arrests_df)))
df = arrests_df.set_axis(row_numbers).drop(index=arr)

if print_debug:
    print(df)

---
## Save DF

In [None]:
# Write the final merged dataset; the row index is not saved.
df.to_csv('./clean_data/fully_merged_data.csv', index=False)