## Data Merging and Cleaning

This file assembles two "master" dataframes for experimentation. The first dataframe contains every individual arrest, each paired with the weather information for the day it occurred. The second dataframe contains the number of arrests on each day, along with that day's mean weather conditions.

In [None]:
import pandas as pd

### Arrest Data

In [None]:
# Load the historic NYPD arrest records and preview the first rows.
nypd_historic = pd.read_csv('./input/NYPD_Arrests_Data__Historic_.csv')
nypd_historic.head()

In [None]:
# Summary statistics of the historic arrest table.
nypd_historic.describe()

In [None]:
# Load the year-to-date NYPD arrest records and preview the first rows.
nypd_current = pd.read_csv('./input/NYPD_Arrest_Data__Year_to_Date_.csv')
nypd_current.head()

In [None]:
# Summary statistics of the year-to-date arrest table.
nypd_current.describe()

In [None]:
# Distinct arrest dates in each file, added together. A date that appears in
# both files is counted twice by this sum.
len(nypd_current['ARREST_DATE'].unique()) + len(nypd_historic['ARREST_DATE'].unique())

In [None]:
# Total number of arrest rows across both files.
len(nypd_current['ARREST_DATE']) + len(nypd_historic['ARREST_DATE'])

In [None]:
# Stack the year-to-date and historic arrest tables into one dataframe.
# pd.concat is used instead of DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0. As with append, each frame keeps its
# original row labels (no ignore_index).
nypd = pd.concat([nypd_current, nypd_historic])
nypd.head()

In [None]:
# Row count of the combined arrest table.
len(nypd)

In [None]:
def format_date_nypd(date):
    """Convert an NYPD ``ARREST_DATE`` string into a ``YYYYMMDD`` key.

    The NYPD arrest exports store dates as ``MM/DD/YYYY``. The weather table
    in this notebook is keyed by ``%Y%m%d`` strings, so the parts must be
    reordered to year, month, day for the two tables to join on the same day.
    (The previous version emitted year, day, month.)

    Args:
        date: Arrest date string in ``MM/DD/YYYY`` form.

    Returns:
        The same date as an eight-character ``YYYYMMDD`` string.

    Raises:
        ValueError: If ``date`` does not have exactly three ``/``-separated
            parts.
    """
    month, day, year = date.split('/')
    # zfill keeps the key eight characters wide even if a value is not
    # zero-padded in the source file.
    return year + month.zfill(2) + day.zfill(2)

In [None]:
# Tag each arrest with its formatted date key, then total the numeric columns
# per day. numeric_only=True is spelled out because pandas 2.0 changed the
# default to False, which makes sum() try to add up the text columns as well.
# nypd_sum is only used below to count the distinct dates (len(nypd_sum)).
nypd['date'] = nypd['ARREST_DATE'].apply(format_date_nypd)
nypd_sum = nypd.groupby(['date']).sum(numeric_only=True)
nypd_sum.head()

In [None]:
# Number of distinct formatted arrest dates (one row per group).
len(nypd_sum)

### Weather Data
#### Hourly weather

In [None]:
# Load the weather file and keep only the timestamp plus the measurement
# columns that the rest of this notebook works with.
weath = pd.read_csv('./input/daily_new_york_data.csv')[[
    'dt',
    'temp', 'feels_like', 'temp_min', 'temp_max',
    'humidity',
    'wind_speed', 'wind_deg',
    'rain_1h', 'rain_3h',
    'snow_1h', 'snow_3h',
    'clouds_all',
]]
weath.head()

In [None]:
# Number of weather rows loaded.
len(weath)

In [None]:
# weath.dtypes

In [None]:
from datetime import datetime
from pytz import timezone # for timezone awareness

def format_date_weath(dt):
    """Return the New York calendar date of a Unix timestamp as ``YYYYMMDD``.

    Args:
        dt: Seconds since the Unix epoch (an absolute, UTC-based instant).

    Returns:
        The date of that instant in the ``America/New_York`` zone, formatted
        with ``%Y%m%d``.
    """
    # An epoch value is an absolute instant, so it is interpreted as UTC and
    # then converted to New York time. The previous version localized a naive
    # machine-local datetime and then handed that datetime object to
    # utcfromtimestamp(), which only accepts a number and so raised TypeError
    # on every call.
    local_moment = pd.Timestamp(dt, unit='s', tz='America/New_York')
    return local_moment.strftime('%Y%m%d')

In [None]:
# Derive a YYYYMMDD key from the Unix timestamp column in one vectorized pass.
# NOTE(review): this buckets observations by UTC calendar day, whereas
# format_date_weath (defined above, not called here) targets New York local
# time - confirm which day boundary is intended.
weath['date'] = pd.to_datetime(weath['dt'], unit='s').dt.strftime('%Y%m%d')
weath.head()

In [None]:
# Replace missing rain/snow readings with 0. This writes into `weath` itself.
precip_cols = ['rain_1h', 'rain_3h', 'snow_1h', 'snow_3h']
weath[precip_cols] = weath[precip_cols].fillna(value=0)

# Collapse the observations to one row per date: the lowest temp_min, the
# highest temp_max, and the mean of every other measurement.
daily_aggs = {
    'temp': 'mean',
    'feels_like': 'mean',
    'temp_min': 'min',
    'temp_max': 'max',
    'humidity': 'mean',
    'wind_speed': 'mean',
    'wind_deg': 'mean',
    'rain_1h': 'mean',
    'rain_3h': 'mean',
    'snow_1h': 'mean',
    'snow_3h': 'mean',
    'clouds_all': 'mean',
}
weath_mean = weath.groupby(['date']).agg(daily_aggs)

# Keep the date available as a regular column in addition to the index.
weath_mean['date'] = weath_mean.index
weath_mean.head()

In [None]:
# Column dtypes of the per-date weather table.
weath_mean.dtypes

In [None]:
# Number of distinct dates in the per-date weather table.
len(weath_mean)

### Daily crimes: each crime with weather data

This dataframe contains more detail, including each arrest's description and its location in the city, by borough. This dataset may or may not be used, depending on whether we have time to do extra fancy visualizations. Otherwise, the next dataframe, which shows daily summaries, is what we'll focus on first.

In [None]:
# Index every arrest by its date key so it lines up with the per-date weather
# table, which is indexed by the same kind of key.
daily_nypd = nypd.set_index(['date'])

# Left-join the weather onto the arrests, keep the three descriptive arrest
# columns plus the weather measurements (cloud cover is not carried over),
# lower-case the arrest column names, and discard any row that is missing a
# value - including arrests whose date found no weather match.
daily_crimes = (
    pd.merge(daily_nypd, weath_mean, how='left',
             left_index=True, right_index=True)[[
        'PD_DESC', 'OFNS_DESC', 'ARREST_BORO',
        'temp', 'feels_like', 'temp_min', 'temp_max',
        'humidity', 'wind_speed', 'wind_deg',
        'rain_1h', 'rain_3h', 'snow_1h', 'snow_3h',
    ]]
    .rename(columns={
        'PD_DESC': 'pd_desc',
        'OFNS_DESC': 'ofns_desc',
        'ARREST_BORO': 'arrest_boro',
    })
    .dropna()
)
daily_crimes.head()
# daily_crimes[760:810] # For checking that weather is matched with the correct date.

In [None]:
# Row count of the date-indexed arrest table, before the weather merge and
# before rows with missing values are dropped.
len(daily_nypd)

In [None]:
# daily_crimes.dtypes

In [None]:
# Rows remaining after the merge and dropna.
len(daily_crimes)

In [None]:
# NYPD single-letter borough codes and the display names used in the output.
_BOROUGH_NAMES = {
    'B': 'The Bronx',
    'K': 'Brooklyn',
    'M': 'Manhattan',
    'Q': 'Queens',
    'S': 'Staten Island',
}


def get_borough(b):
    """Translate an NYPD ``ARREST_BORO`` code into a borough name.

    Args:
        b: Single-letter borough code (``'B'``, ``'K'``, ``'M'``, ``'Q'`` or
            ``'S'``).

    Returns:
        The borough's display name, or ``'Unknown'`` for any other value.
        Previously every code other than B/K/M - including Staten Island's
        ``'S'`` - was reported as ``'Queens'``.
    """
    return _BOROUGH_NAMES.get(b, 'Unknown')

In [None]:
# Make the text columns presentation-ready: expand the borough codes to
# names, and turn both description columns into sentence case (first letter
# upper-case, the rest lower-case).
daily_crimes['arrest_boro'] = daily_crimes['arrest_boro'].map(get_borough)
daily_crimes['ofns_desc'] = daily_crimes['ofns_desc'].str.capitalize()
daily_crimes['pd_desc'] = daily_crimes['pd_desc'].str.capitalize()
daily_crimes.head()

In [None]:
# Row count after relabelling the text columns.
len(daily_crimes)

### Write to file

In [None]:
# Save the per-arrest table; the 'date' index is written as the first column.
daily_crimes.to_csv('./output/daily_crimes.csv')

### Daily means: day's arrest count and weather means

This dataframe will likely be our primary dataset, since it's giving us a day-by-day arrest count with the mean weather conditions for that day.

In [None]:
# Count the arrests on each date, then attach that date's weather summary.
# Rows with missing values were already dropped from daily_crimes, so counting
# the non-null 'pd_desc' entries counts every arrest row.
daily_mean = (
    daily_crimes.groupby(['date'])['pd_desc']
    .count()
    .to_frame(name='num_arrests')
    .merge(weath_mean, how='left', left_index=True, right_index=True)
)
daily_mean.head()

In [None]:
# Total arrests across all dates in the daily summary.
daily_mean['num_arrests'].sum()

In [None]:
# daily_mean.dtypes

In [None]:
# Number of dates that have at least one arrest row in daily_crimes.
len(daily_mean)

### Write to file

In [None]:
# Save the daily summary table.
# NOTE(review): daily_mean carries 'date' both as its index and as a regular
# column (copied in from weath_mean), so the CSV will contain two 'date'
# columns - confirm this is wanted.
daily_mean.to_csv('./output/daily_mean.csv')