# In [8]:  (notebook cell marker)
import pandas as pd
import numpy as np
from scipy import stats 
from sklearn.linear_model import LinearRegression
from geopy.distance import great_circle
import math



# Render floats with thousands separators and two decimals when pandas displays them.
pd.set_option('display.float_format', '{:,.2f}'.format)

# Read the arrest records; the file is decoded with the 'Latin5' codec.
df = pd.read_csv(
    'Arrest_Data_from_2010_to_Present.csv',
    encoding='Latin5',
)

# Bare inspection expressions: their values are discarded when this runs as a script.
df.dtypes
df.head()

# Parse the raw 'Arrest Date' text into timestamps.
df['Arrest Date2'] = pd.to_datetime(df['Arrest Date'])

# Keep the year as a *string* column: the rest of the script compares
# df['year'] against the literal '2018'.
df['year'] = df['Arrest Date2'].dt.year.astype(str)

# Explicit copy: columns are added to this subset below, and assigning into
# a filtered view of df triggers pandas' SettingWithCopyWarning.
year_2018 = df[df['year'].str.startswith('2018')].copy()

# Question 1: How many bookings of arrestees were made in 2018?

# Constant helper column; count() on it after a groupby yields rows per group.
year_2018['Total'] = 0
bookings_2018 = year_2018.groupby('year').count()[['Total']]
# Print explicitly: a bare expression shows nothing when run as a script.
print(bookings_2018)


# Question 2: How many bookings of arrestees were made in the area with the
# most arrests in 2018?

# value_counts() returns bookings per area sorted from largest to smallest,
# so the first entry is the busiest area together with its booking count.
# (The original referenced `.head` without calling it, which produced a
# bound-method object instead of data.)
area_bookings_2018 = year_2018['Area Name'].value_counts()
print(area_bookings_2018.head(1))


# Question 3: What is the 95% quantile of the age of the arrestee in 2018?
# Only consider the following charge groups for the analysis:
#   - Vehicle Theft
#   - Robbery
#   - Burglary
#   - Receive Stolen Property

theft_charge_groups = [
    'Vehicle Theft',
    'Robbery',
    'Burglary',
    'Receive Stolen Property',
]

# Keep only the rows of the requested charge groups and the two columns of interest.
qt_2018 = year_2018.loc[
    year_2018['Charge Group Description'].isin(theft_charge_groups),
    ['Charge Group Description', 'Age'],
]

# Take the quantile of the numeric 'Age' column only: calling quantile() on
# the whole frame would also try the text column, which raises on
# pandas >= 2.0 (numeric_only now defaults to False).
age_q95 = qt_2018['Age'].quantile(0.95)
print(np.around(age_q95, decimals=10))



# Question 4: There are differences between the average age of an arrestee
# for the various charge groups. Are these differences statistically
# significant? Calculate the Z-score of the average age for each charge
# group and report the largest absolute value among the calculated Z-scores.
#
#   - Only consider data for 2018.
#   - Do not consider "Pre-Delinquency" and "Non-Criminal Detention", as these
#     charge groups are reserved for minors.
#   - Exclude any arrests where the charge group description is not known.

# Rebuild the 2018 subset from df (independent copy, exact year match).
year_2018 = df[df['year'] == '2018'].copy()

minors_only_groups = ['Pre-Delinquency', 'Non-Criminal Detention']
charge_group = year_2018['Charge Group Description']
z_2018 = year_2018.loc[
    charge_group.notnull() & ~charge_group.isin(minors_only_groups)
]

# Average age per charge group; the Z-scores are taken over these group
# means, not over individual arrests (the original z-scored every arrest
# and selected a non-existent 'zscore' column, which raises KeyError).
mean_age_by_group = z_2018.groupby('Charge Group Description')['Age'].mean()

# NOTE(review): stats.zscore uses the population standard deviation (ddof=0)
# by default -- confirm this is the intended convention.
zscore_age = pd.Series(
    np.around(stats.zscore(mean_age_by_group.to_numpy()), decimals=10),
    index=mean_age_by_group.index,
    name='zscore_age',
)

# Largest absolute Z-score, shown with the charge group it belongs to.
print(zscore_age.abs().sort_values(ascending=False).head(1))
 

# Question 5: Felony arrest incidents have been dropping over the years.
# Using a trend line (linear estimation) for the data from 2010 to 2018
# (inclusive), what is the projected number of felony arrests in 2019?
# Round to the nearest integer. Note that the data set includes arrests for
# misdemeanors, felonies, etc.

# NOTE(review): assumes the CSV has an 'Arrest Type Code' column in which
# felonies are coded 'F' -- confirm against the data dictionary.
felony_arrests = df[df['Arrest Type Code'] == 'F']

# Work on derived Series instead of mutating df (the original re-indexed df
# in place and added columns to it through an alias).
felony_year = pd.to_datetime(felony_arrests['Arrest Date']).dt.year

# Restrict to the fitting window; rows with an unparseable date compare False.
in_window = felony_year.between(2010, 2018)
felonies_per_year = (
    felony_year[in_window].astype(int).value_counts().sort_index()
)

# Yearly felony counts that the trend line is fitted to.
ax = felonies_per_year.plot()
ax.set_frame_on(False)
ax.set_title('Projected number of felony arrests in 2019')

# Fit count ~ year with numeric years (the original passed year *strings*).
x = felonies_per_year.index.to_numpy().reshape((-1, 1))
y = felonies_per_year.to_numpy()
model = LinearRegression().fit(x, y)
r_sq = model.score(x, y)
print('coefficient of determination:', r_sq)
print('intercept:', model.intercept_)

# Extrapolate one year past the fitting window and round to a whole count.
y_pred = model.predict(np.array([[2019]]))
print('predicted response:', int(round(float(y_pred[0]))), sep='\n')


# Question 6: How many arrest incidents occurred within 2 km from the
# Bradbury Building in 2018? Use (34.050536, -118.247861) for the coordinates
# of the Bradbury Building. For simplicity, use the spherical Earth projected
# to a plane equation for calculating distances. Use the radius of the Earth
# as 6371 km. Note that some arrest records are missing location data and the
# location is listed as (0, 0). These records should not factor in the
# calculation.

EARTH_RADIUS_KM = 6371.0


def _planar_distance_km(lat, lon, origin, radius=EARTH_RADIUS_KM):
    """Return the distance in km from *origin* to the point(s) (lat, lon).

    Uses the "spherical Earth projected to a plane" approximation:
    ``radius * sqrt(dlat**2 + (cos(mean_lat) * dlon)**2)`` with angles in
    radians.

    Args:
        lat: Latitude(s) in degrees (scalar, array or Series).
        lon: Longitude(s) in degrees, same shape as *lat*.
        origin: ``(latitude, longitude)`` of the reference point, in degrees.
        radius: Sphere radius in km.

    Returns:
        Distance(s) in km, with the same shape as *lat*.
    """
    origin_lat, origin_lon = np.radians(origin[0]), np.radians(origin[1])
    lat_rad, lon_rad = np.radians(lat), np.radians(lon)
    d_lat = lat_rad - origin_lat
    d_lon = (lon_rad - origin_lon) * np.cos((lat_rad + origin_lat) / 2.0)
    return radius * np.sqrt(d_lat ** 2 + d_lon ** 2)


year_2018 = year_2018.loc[~year_2018['Location'].isnull()]
bradbury = (34.050536, -118.247861)
print(bradbury)

# Parse the "(lat, lon)" text with a regex instead of eval(): eval executes
# arbitrary expressions found in the data file. Rows that do not match end
# up as NaN and are excluded below.
coords = year_2018['Location'].astype(str).str.extract(
    r'\(?\s*([-+]?\d*\.?\d+)\s*,\s*([-+]?\d*\.?\d+)\s*\)?'
)
latitude = pd.to_numeric(coords[0], errors='coerce')
longitude = pd.to_numeric(coords[1], errors='coerce')

# (0, 0) is the placeholder for a missing location and must not be counted.
has_location = (
    latitude.notna()
    & longitude.notna()
    & ~((latitude == 0) & (longitude == 0))
)

# Compare the unrounded distance: rounding to one decimal first would admit
# records up to 2.05 km away.
distance_km = _planar_distance_km(latitude, longitude, bradbury)
arrests = year_2018.loc[
    has_location & (distance_km <= 2), ['Area Name', 'Location']
]

print(len(arrests))



# (stray notebook cell output) [0. 0. 1. 2.]
