In [187]:
import json
import pandas as pd
import numpy as np

import re

from sqlalchemy import create_engine
import psycopg2

import matplotlib.pyplot as plt

from datetime import datetime

In [188]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder

# importing libraries for Geographic and Interactive visualizations
from folium import Choropleth, Circle, Marker, Map
from folium.plugins import HeatMap, MarkerCluster
import plotly.express as px
import plotly.graph_objects as go
import math


# Boston Crime Data

In [189]:
bo_df = pd.read_csv('https://capstone-crime-bucket.s3.amazonaws.com/crime.csv', encoding='latin-1')

bo_df.head()

Unnamed: 0,INCIDENT_NUMBER,OFFENSE_CODE,OFFENSE_CODE_GROUP,OFFENSE_DESCRIPTION,DISTRICT,REPORTING_AREA,SHOOTING,OCCURRED_ON_DATE,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,Lat,Long,Location
0,I182070945,619,Larceny,LARCENY ALL OTHERS,D14,808,,2018-09-02 13:00:00,2018,9,Sunday,13,Part One,LINCOLN ST,42.357791,-71.139371,"(42.35779134, -71.13937053)"
1,I182070943,1402,Vandalism,VANDALISM,C11,347,,2018-08-21 00:00:00,2018,8,Tuesday,0,Part Two,HECLA ST,42.306821,-71.0603,"(42.30682138, -71.06030035)"
2,I182070941,3410,Towed,TOWED MOTOR VEHICLE,D4,151,,2018-09-03 19:27:00,2018,9,Monday,19,Part Three,CAZENOVE ST,42.346589,-71.072429,"(42.34658879, -71.07242943)"
3,I182070940,3114,Investigate Property,INVESTIGATE PROPERTY,D4,272,,2018-09-03 21:16:00,2018,9,Monday,21,Part Three,NEWCOMB ST,42.334182,-71.078664,"(42.33418175, -71.07866441)"
4,I182070938,3114,Investigate Property,INVESTIGATE PROPERTY,B3,421,,2018-09-03 21:05:00,2018,9,Monday,21,Part Three,DELHI ST,42.275365,-71.090361,"(42.27536542, -71.09036101)"


In [190]:
print("Total number of crimes in the dataset: {}".format(len(bo_df)))
bo_df.head()

Total number of crimes in the dataset: 319073


Unnamed: 0,INCIDENT_NUMBER,OFFENSE_CODE,OFFENSE_CODE_GROUP,OFFENSE_DESCRIPTION,DISTRICT,REPORTING_AREA,SHOOTING,OCCURRED_ON_DATE,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,Lat,Long,Location
0,I182070945,619,Larceny,LARCENY ALL OTHERS,D14,808,,2018-09-02 13:00:00,2018,9,Sunday,13,Part One,LINCOLN ST,42.357791,-71.139371,"(42.35779134, -71.13937053)"
1,I182070943,1402,Vandalism,VANDALISM,C11,347,,2018-08-21 00:00:00,2018,8,Tuesday,0,Part Two,HECLA ST,42.306821,-71.0603,"(42.30682138, -71.06030035)"
2,I182070941,3410,Towed,TOWED MOTOR VEHICLE,D4,151,,2018-09-03 19:27:00,2018,9,Monday,19,Part Three,CAZENOVE ST,42.346589,-71.072429,"(42.34658879, -71.07242943)"
3,I182070940,3114,Investigate Property,INVESTIGATE PROPERTY,D4,272,,2018-09-03 21:16:00,2018,9,Monday,21,Part Three,NEWCOMB ST,42.334182,-71.078664,"(42.33418175, -71.07866441)"
4,I182070938,3114,Investigate Property,INVESTIGATE PROPERTY,B3,421,,2018-09-03 21:05:00,2018,9,Monday,21,Part Three,DELHI ST,42.275365,-71.090361,"(42.27536542, -71.09036101)"


In [191]:
bo_df.dtypes

INCIDENT_NUMBER         object
OFFENSE_CODE             int64
OFFENSE_CODE_GROUP      object
OFFENSE_DESCRIPTION     object
DISTRICT                object
REPORTING_AREA          object
SHOOTING                object
OCCURRED_ON_DATE        object
YEAR                     int64
MONTH                    int64
DAY_OF_WEEK             object
HOUR                     int64
UCR_PART                object
STREET                  object
Lat                    float64
Long                   float64
Location                object
dtype: object

In [192]:
for col in bo_df.columns:
        print(col)

INCIDENT_NUMBER
OFFENSE_CODE
OFFENSE_CODE_GROUP
OFFENSE_DESCRIPTION
DISTRICT
REPORTING_AREA
SHOOTING
OCCURRED_ON_DATE
YEAR
MONTH
DAY_OF_WEEK
HOUR
UCR_PART
STREET
Lat
Long
Location


In [193]:
crimeByType = bo_df['OFFENSE_DESCRIPTION'].value_counts()
crimeByType

SICK/INJURED/MEDICAL - PERSON                  18783
INVESTIGATE PERSON                             18754
M/V - LEAVING SCENE - PROPERTY DAMAGE          16323
VANDALISM                                      15154
ASSAULT SIMPLE - BATTERY                       14791
                                               ...  
KILLING OF FELON BY POLICE                         1
AUTO THEFT - OUTSIDE - RECOVERED IN BOSTON         1
DRUGS - GLUE INHALATION                            1
ROBBERY - KNIFE - STREET                           1
DRUGS - POSS CLASS D - INTENT MFR DIST DISP        1
Name: OFFENSE_DESCRIPTION, Length: 244, dtype: int64

In [194]:
crimeByCode = bo_df['OFFENSE_CODE'].value_counts()
crimeByCode

3006    18783
3115    18754
3831    16323
1402    15154
802     14799
        ...  
404         1
112         1
2609        1
2606        1
1864        1
Name: OFFENSE_CODE, Length: 222, dtype: int64

In [195]:
bo_df.nunique()

INCIDENT_NUMBER        282517
OFFENSE_CODE              222
OFFENSE_CODE_GROUP         67
OFFENSE_DESCRIPTION       244
DISTRICT                   12
REPORTING_AREA            879
SHOOTING                    1
OCCURRED_ON_DATE       233229
YEAR                        4
MONTH                      12
DAY_OF_WEEK                 7
HOUR                       24
UCR_PART                    4
STREET                   4657
Lat                     18178
Long                    18178
Location                18194
dtype: int64

In [196]:
bo_df.DISTRICT.value_counts()

B2     49945
C11    42530
D4     41915
A1     35717
B3     35442
C6     23460
D14    20127
E13    17536
E18    17348
A7     13544
E5     13239
A15     6505
Name: DISTRICT, dtype: int64

In [197]:
bo_df.head()

Unnamed: 0,INCIDENT_NUMBER,OFFENSE_CODE,OFFENSE_CODE_GROUP,OFFENSE_DESCRIPTION,DISTRICT,REPORTING_AREA,SHOOTING,OCCURRED_ON_DATE,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,Lat,Long,Location
0,I182070945,619,Larceny,LARCENY ALL OTHERS,D14,808,,2018-09-02 13:00:00,2018,9,Sunday,13,Part One,LINCOLN ST,42.357791,-71.139371,"(42.35779134, -71.13937053)"
1,I182070943,1402,Vandalism,VANDALISM,C11,347,,2018-08-21 00:00:00,2018,8,Tuesday,0,Part Two,HECLA ST,42.306821,-71.0603,"(42.30682138, -71.06030035)"
2,I182070941,3410,Towed,TOWED MOTOR VEHICLE,D4,151,,2018-09-03 19:27:00,2018,9,Monday,19,Part Three,CAZENOVE ST,42.346589,-71.072429,"(42.34658879, -71.07242943)"
3,I182070940,3114,Investigate Property,INVESTIGATE PROPERTY,D4,272,,2018-09-03 21:16:00,2018,9,Monday,21,Part Three,NEWCOMB ST,42.334182,-71.078664,"(42.33418175, -71.07866441)"
4,I182070938,3114,Investigate Property,INVESTIGATE PROPERTY,B3,421,,2018-09-03 21:05:00,2018,9,Monday,21,Part Three,DELHI ST,42.275365,-71.090361,"(42.27536542, -71.09036101)"


In [198]:
bo_df.describe()

Unnamed: 0,OFFENSE_CODE,YEAR,MONTH,HOUR,Lat,Long
count,319073.0,319073.0,319073.0,319073.0,299074.0,299074.0
mean,2317.546956,2016.560586,6.609719,13.118205,42.214381,-70.908272
std,1185.285543,0.996344,3.273691,6.294205,2.159766,3.493618
min,111.0,2015.0,1.0,0.0,-1.0,-71.178674
25%,1001.0,2016.0,4.0,9.0,42.297442,-71.097135
50%,2907.0,2017.0,7.0,14.0,42.325538,-71.077524
75%,3201.0,2017.0,9.0,18.0,42.348624,-71.062467
max,3831.0,2018.0,12.0,23.0,42.395042,-1.0


In [199]:
group_crime = bo_df.groupby('OFFENSE_DESCRIPTION')
group_crime = bo_df['OFFENSE_DESCRIPTION'].value_counts()
group_crime

SICK/INJURED/MEDICAL - PERSON                  18783
INVESTIGATE PERSON                             18754
M/V - LEAVING SCENE - PROPERTY DAMAGE          16323
VANDALISM                                      15154
ASSAULT SIMPLE - BATTERY                       14791
                                               ...  
KILLING OF FELON BY POLICE                         1
AUTO THEFT - OUTSIDE - RECOVERED IN BOSTON         1
DRUGS - GLUE INHALATION                            1
ROBBERY - KNIFE - STREET                           1
DRUGS - POSS CLASS D - INTENT MFR DIST DISP        1
Name: OFFENSE_DESCRIPTION, Length: 244, dtype: int64

In [200]:
group_crime.describe()

count      244.000000
mean      1307.676230
std       3053.979367
min          1.000000
25%          6.750000
50%        188.000000
75%        963.000000
max      18783.000000
Name: OFFENSE_DESCRIPTION, dtype: float64

In [201]:
#find null values
for column in bo_df.columns:
    print(f"Column {column} has {bo_df[column].isnull().sum()} null values")

Column INCIDENT_NUMBER has 0 null values
Column OFFENSE_CODE has 0 null values
Column OFFENSE_CODE_GROUP has 0 null values
Column OFFENSE_DESCRIPTION has 0 null values
Column DISTRICT has 1765 null values
Column REPORTING_AREA has 0 null values
Column SHOOTING has 318054 null values
Column OCCURRED_ON_DATE has 0 null values
Column YEAR has 0 null values
Column MONTH has 0 null values
Column DAY_OF_WEEK has 0 null values
Column HOUR has 0 null values
Column UCR_PART has 90 null values
Column STREET has 10871 null values
Column Lat has 19999 null values
Column Long has 19999 null values
Column Location has 0 null values


In [202]:
# Drop null rows
bo_clean = bo_df.dropna()

In [203]:
bo_clean.drop(["INCIDENT_NUMBER", "OFFENSE_CODE_GROUP", "REPORTING_AREA", "SHOOTING", "MONTH", "DAY_OF_WEEK", "HOUR", "UCR_PART", "STREET", "DISTRICT"], axis = 1, inplace = True)
bo_clean.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,OFFENSE_CODE,OFFENSE_DESCRIPTION,OCCURRED_ON_DATE,YEAR,Lat,Long,Location
1295,413,ASSAULT - AGGRAVATED - BATTERY,2018-08-29 23:19:00,2018,42.250405,-71.131737,"(42.25040550, -71.13173740)"
1860,413,ASSAULT - AGGRAVATED - BATTERY,2018-08-27 22:56:00,2018,42.321042,-71.072153,"(42.32104169, -71.07215291)"
3259,111,"MURDER, NON-NEGLIGIENT MANSLAUGHTER",2018-08-22 15:00:00,2018,42.315689,-71.089514,"(42.31568920, -71.08951391)"
3260,2662,BALLISTICS EVIDENCE/FOUND,2018-08-22 15:00:00,2018,42.315689,-71.089514,"(42.31568920, -71.08951391)"
4107,413,ASSAULT - AGGRAVATED - BATTERY,2018-08-19 22:46:00,2018,42.378085,-71.057841,"(42.37808452, -71.05784115)"


In [204]:
#find null values
for column in bo_clean.columns:
    print(f"Column {column} has {bo_clean[column].isnull().sum()} null values")

Column OFFENSE_CODE has 0 null values
Column OFFENSE_DESCRIPTION has 0 null values
Column OCCURRED_ON_DATE has 0 null values
Column YEAR has 0 null values
Column Lat has 0 null values
Column Long has 0 null values
Column Location has 0 null values


In [205]:
bo_clean.head()

Unnamed: 0,OFFENSE_CODE,OFFENSE_DESCRIPTION,OCCURRED_ON_DATE,YEAR,Lat,Long,Location
1295,413,ASSAULT - AGGRAVATED - BATTERY,2018-08-29 23:19:00,2018,42.250405,-71.131737,"(42.25040550, -71.13173740)"
1860,413,ASSAULT - AGGRAVATED - BATTERY,2018-08-27 22:56:00,2018,42.321042,-71.072153,"(42.32104169, -71.07215291)"
3259,111,"MURDER, NON-NEGLIGIENT MANSLAUGHTER",2018-08-22 15:00:00,2018,42.315689,-71.089514,"(42.31568920, -71.08951391)"
3260,2662,BALLISTICS EVIDENCE/FOUND,2018-08-22 15:00:00,2018,42.315689,-71.089514,"(42.31568920, -71.08951391)"
4107,413,ASSAULT - AGGRAVATED - BATTERY,2018-08-19 22:46:00,2018,42.378085,-71.057841,"(42.37808452, -71.05784115)"


In [206]:
bo_clean["City"] = 1
bo_clean.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,OFFENSE_CODE,OFFENSE_DESCRIPTION,OCCURRED_ON_DATE,YEAR,Lat,Long,Location,City
1295,413,ASSAULT - AGGRAVATED - BATTERY,2018-08-29 23:19:00,2018,42.250405,-71.131737,"(42.25040550, -71.13173740)",1
1860,413,ASSAULT - AGGRAVATED - BATTERY,2018-08-27 22:56:00,2018,42.321042,-71.072153,"(42.32104169, -71.07215291)",1
3259,111,"MURDER, NON-NEGLIGIENT MANSLAUGHTER",2018-08-22 15:00:00,2018,42.315689,-71.089514,"(42.31568920, -71.08951391)",1
3260,2662,BALLISTICS EVIDENCE/FOUND,2018-08-22 15:00:00,2018,42.315689,-71.089514,"(42.31568920, -71.08951391)",1
4107,413,ASSAULT - AGGRAVATED - BATTERY,2018-08-19 22:46:00,2018,42.378085,-71.057841,"(42.37808452, -71.05784115)",1


In [207]:
bo_clean = bo_clean.reset_index(drop=True)

In [208]:
bo_clean.to_csv("https://capstone-crime-bucket.s3.amazonaws.com/crime.csv",header=True)

# Los Angeles Crime Data


In [209]:
la_df = pd.read_csv('https://capstone-crime-bucket.s3.amazonaws.com/Crimes_2012-2016.csv', encoding='latin-1')

la_df.head()

Unnamed: 0.1,Unnamed: 0,DR.NO,DATE.OCC,TIME.OCC,AREA,AREA.NAME,RD,Crm.Cd,CrmCd.Desc,Location.1,lat,long,Year
0,0,132007717,03/20/2013,2015,20,Olympic,2004,997,TRAFFIC DR #,"(34.0776, -118.308)",34.0776,-118.308,2013
1,1,130608787,03/10/2013,445,6,Hollywood,635,997,TRAFFIC DR #,"(34.1113, -118.3336)",34.1113,-118.3336,2013
2,2,131820260,12/18/2013,745,18,Southeast,1839,997,TRAFFIC DR #,"(33.9406, -118.2338)",33.9406,-118.2338,2013
3,3,131817514,10/18/2013,1730,18,Southeast,1827,997,TRAFFIC DR #,"(33.9449, -118.2332)",33.9449,-118.2332,2013
4,4,130510483,05/25/2013,2000,5,Harbor,507,440,THEFT PLAIN - PETTY (UNDER $400),"(33.8135, -118.2992)",33.8135,-118.2992,2013


In [210]:
print("Total number of crimes in the dataset: {}".format(len(la_df)))
la_df.head()

Total number of crimes in the dataset: 1132034


Unnamed: 0.1,Unnamed: 0,DR.NO,DATE.OCC,TIME.OCC,AREA,AREA.NAME,RD,Crm.Cd,CrmCd.Desc,Location.1,lat,long,Year
0,0,132007717,03/20/2013,2015,20,Olympic,2004,997,TRAFFIC DR #,"(34.0776, -118.308)",34.0776,-118.308,2013
1,1,130608787,03/10/2013,445,6,Hollywood,635,997,TRAFFIC DR #,"(34.1113, -118.3336)",34.1113,-118.3336,2013
2,2,131820260,12/18/2013,745,18,Southeast,1839,997,TRAFFIC DR #,"(33.9406, -118.2338)",33.9406,-118.2338,2013
3,3,131817514,10/18/2013,1730,18,Southeast,1827,997,TRAFFIC DR #,"(33.9449, -118.2332)",33.9449,-118.2332,2013
4,4,130510483,05/25/2013,2000,5,Harbor,507,440,THEFT PLAIN - PETTY (UNDER $400),"(33.8135, -118.2992)",33.8135,-118.2992,2013


In [211]:
for col in la_df.columns:
        print(col)

Unnamed: 0
DR.NO
DATE.OCC
TIME.OCC
AREA
AREA.NAME
RD
Crm.Cd
CrmCd.Desc
Location.1
lat
long
Year


In [212]:
crimeByType = la_df['CrmCd.Desc'].value_counts()
crimeByType

TRAFFIC DR #                               164915
BATTERY - SIMPLE ASSAULT                    89993
VEHICLE - STOLEN                            73127
BURGLARY FROM VEHICLE                       72291
BURGLARY                                    69339
                                            ...  
TILL TAP - GRAND THEFT ($950.01 & OVER)         3
MANSLAUGHTER, NEGLIGENT                         2
BLOCKING DOOR INDUCTION CENTER                  2
THEFT, COIN MACHINE - GRAND                     1
HOMICIDE (NON-UCR)                              1
Name: CrmCd.Desc, Length: 161, dtype: int64

In [213]:
la_df.dtypes

Unnamed: 0      int64
DR.NO           int64
DATE.OCC       object
TIME.OCC        int64
AREA            int64
AREA.NAME      object
RD              int64
Crm.Cd          int64
CrmCd.Desc     object
Location.1     object
lat           float64
long          float64
Year            int64
dtype: object

In [214]:
crimeByCode = la_df['Crm.Cd'].value_counts()
crimeByCode

997    164915
624     89993
440     74815
510     73127
330     72291
        ...  
347         4
470         3
113         2
432         2
111         1
Name: Crm.Cd, Length: 136, dtype: int64

In [215]:
la_df.nunique()

Unnamed: 0    1132034
DR.NO         1131013
DATE.OCC         1770
TIME.OCC         1439
AREA               21
AREA.NAME          21
RD               1303
Crm.Cd            136
CrmCd.Desc        161
Location.1      68551
lat              5440
long             5139
Year                5
dtype: int64

In [216]:
# Drop null rows
la_clean = la_df.dropna()

In [217]:
la_clean.drop(['Unnamed: 0', 'DR.NO', 'TIME.OCC', 'RD', 'AREA.NAME', 'AREA'], axis = 1, inplace = True)
#'LOCATION', 'Status', 'Status.Desc', 'Cross.Street', 'AREA', 'Neighborhood'
la_clean.head()

Unnamed: 0,DATE.OCC,Crm.Cd,CrmCd.Desc,Location.1,lat,long,Year
0,03/20/2013,997,TRAFFIC DR #,"(34.0776, -118.308)",34.0776,-118.308,2013
1,03/10/2013,997,TRAFFIC DR #,"(34.1113, -118.3336)",34.1113,-118.3336,2013
2,12/18/2013,997,TRAFFIC DR #,"(33.9406, -118.2338)",33.9406,-118.2338,2013
3,10/18/2013,997,TRAFFIC DR #,"(33.9449, -118.2332)",33.9449,-118.2332,2013
4,05/25/2013,440,THEFT PLAIN - PETTY (UNDER $400),"(33.8135, -118.2992)",33.8135,-118.2992,2013


In [218]:
la_clean.describe()

Unnamed: 0,Crm.Cd,lat,long,Year
count,1132034.0,1132034.0,1132034.0,1132034.0
mean,579.1831,33.81956,-117.4601,2013.932
std,258.9706,2.962788,10.28295,1.397677
min,110.0,0.0,-118.8551,2012.0
25%,341.0,34.0094,-118.4378,2013.0
50%,510.0,34.062,-118.3308,2014.0
75%,745.0,34.1749,-118.2782,2015.0
max,997.0,34.8087,0.0,2016.0


In [219]:
#find null values
for column in la_clean.columns:
    print(f"Column {column} has {la_clean[column].isnull().sum()} null values")

Column DATE.OCC has 0 null values
Column Crm.Cd has 0 null values
Column CrmCd.Desc has 0 null values
Column Location.1 has 0 null values
Column lat has 0 null values
Column long has 0 null values
Column Year has 0 null values


In [220]:
import ast
la_clean['Location.1'] = la_clean['Location.1'].apply(ast.literal_eval)

In [221]:
la_clean['lat'] = la_clean['Location.1'].str[0]
la_clean['long'] = la_clean['Location.1'].str[1]

In [222]:
la_clean["City"] = 2
la_clean.head()

Unnamed: 0,DATE.OCC,Crm.Cd,CrmCd.Desc,Location.1,lat,long,Year,City
0,03/20/2013,997,TRAFFIC DR #,"(34.0776, -118.308)",34.0776,-118.308,2013,2
1,03/10/2013,997,TRAFFIC DR #,"(34.1113, -118.3336)",34.1113,-118.3336,2013,2
2,12/18/2013,997,TRAFFIC DR #,"(33.9406, -118.2338)",33.9406,-118.2338,2013,2
3,10/18/2013,997,TRAFFIC DR #,"(33.9449, -118.2332)",33.9449,-118.2332,2013,2
4,05/25/2013,440,THEFT PLAIN - PETTY (UNDER $400),"(33.8135, -118.2992)",33.8135,-118.2992,2013,2


In [223]:
la_clean.head()

Unnamed: 0,DATE.OCC,Crm.Cd,CrmCd.Desc,Location.1,lat,long,Year,City
0,03/20/2013,997,TRAFFIC DR #,"(34.0776, -118.308)",34.0776,-118.308,2013,2
1,03/10/2013,997,TRAFFIC DR #,"(34.1113, -118.3336)",34.1113,-118.3336,2013,2
2,12/18/2013,997,TRAFFIC DR #,"(33.9406, -118.2338)",33.9406,-118.2338,2013,2
3,10/18/2013,997,TRAFFIC DR #,"(33.9449, -118.2332)",33.9449,-118.2332,2013,2
4,05/25/2013,440,THEFT PLAIN - PETTY (UNDER $400),"(33.8135, -118.2992)",33.8135,-118.2992,2013,2


In [224]:
la_clean['Year'] = pd.DatetimeIndex(la_clean['DATE.OCC']).year
la_clean.head()

Unnamed: 0,DATE.OCC,Crm.Cd,CrmCd.Desc,Location.1,lat,long,Year,City
0,03/20/2013,997,TRAFFIC DR #,"(34.0776, -118.308)",34.0776,-118.308,2013,2
1,03/10/2013,997,TRAFFIC DR #,"(34.1113, -118.3336)",34.1113,-118.3336,2013,2
2,12/18/2013,997,TRAFFIC DR #,"(33.9406, -118.2338)",33.9406,-118.2338,2013,2
3,10/18/2013,997,TRAFFIC DR #,"(33.9449, -118.2332)",33.9449,-118.2332,2013,2
4,05/25/2013,440,THEFT PLAIN - PETTY (UNDER $400),"(33.8135, -118.2992)",33.8135,-118.2992,2013,2


In [225]:
la_clean = la_clean.reset_index(drop=True)

In [226]:
la_clean.to_csv("https://capstone-crime-bucket.s3.amazonaws.com/Crimes_2012-2016.csv",header=True)

# Baltimore Crime Data

In [227]:
bal_df = pd.read_csv('https://capstone-crime-bucket.s3.amazonaws.com/Baltimore_Crime_Data.csv', encoding='latin-1')

bal_df.head()

Unnamed: 0,CrimeDate,CrimeTime,CrimeCode,Location,Description,Inside/Outside,Weapon,Post,District,Neighborhood,Longitude,Latitude,Location 1,Premise,Total Incidents
0,09/02/2017,23:30:00,3JK,4200 AUDREY AVE,ROBBERY - RESIDENCE,I,KNIFE,913.0,SOUTHERN,Brooklyn,-76.60541,39.22951,"(39.2295100000, -76.6054100000)",ROW/TOWNHO,1
1,09/02/2017,23:00:00,7A,800 NEWINGTON AVE,AUTO THEFT,O,,133.0,CENTRAL,Reservoir Hill,-76.63217,39.3136,"(39.3136000000, -76.6321700000)",STREET,1
2,09/02/2017,22:53:00,9S,600 RADNOR AV,SHOOTING,Outside,FIREARM,524.0,NORTHERN,Winston-Govans,-76.60697,39.34768,"(39.3476800000, -76.6069700000)",Street,1
3,09/02/2017,22:50:00,4C,1800 RAMSAY ST,AGG. ASSAULT,I,OTHER,934.0,SOUTHERN,Carrollton Ridge,-76.64526,39.28315,"(39.2831500000, -76.6452600000)",ROW/TOWNHO,1
4,09/02/2017,22:31:00,4E,100 LIGHT ST,COMMON ASSAULT,O,HANDS,113.0,CENTRAL,Downtown West,-76.61365,39.28756,"(39.2875600000, -76.6136500000)",STREET,1


In [228]:
print("Total number of crimes in the dataset: {}".format(len(bal_df)))
bal_df.head()

Total number of crimes in the dataset: 276529


Unnamed: 0,CrimeDate,CrimeTime,CrimeCode,Location,Description,Inside/Outside,Weapon,Post,District,Neighborhood,Longitude,Latitude,Location 1,Premise,Total Incidents
0,09/02/2017,23:30:00,3JK,4200 AUDREY AVE,ROBBERY - RESIDENCE,I,KNIFE,913.0,SOUTHERN,Brooklyn,-76.60541,39.22951,"(39.2295100000, -76.6054100000)",ROW/TOWNHO,1
1,09/02/2017,23:00:00,7A,800 NEWINGTON AVE,AUTO THEFT,O,,133.0,CENTRAL,Reservoir Hill,-76.63217,39.3136,"(39.3136000000, -76.6321700000)",STREET,1
2,09/02/2017,22:53:00,9S,600 RADNOR AV,SHOOTING,Outside,FIREARM,524.0,NORTHERN,Winston-Govans,-76.60697,39.34768,"(39.3476800000, -76.6069700000)",Street,1
3,09/02/2017,22:50:00,4C,1800 RAMSAY ST,AGG. ASSAULT,I,OTHER,934.0,SOUTHERN,Carrollton Ridge,-76.64526,39.28315,"(39.2831500000, -76.6452600000)",ROW/TOWNHO,1
4,09/02/2017,22:31:00,4E,100 LIGHT ST,COMMON ASSAULT,O,HANDS,113.0,CENTRAL,Downtown West,-76.61365,39.28756,"(39.2875600000, -76.6136500000)",STREET,1


In [229]:
for col in bal_df.columns:
        print(col)

CrimeDate
CrimeTime
CrimeCode
Location
Description
Inside/Outside
Weapon
Post
District
Neighborhood
Longitude
Latitude
Location 1
Premise
Total Incidents


In [230]:
crimeByType = bal_df['Description'].value_counts()
crimeByType

LARCENY                 60528
COMMON ASSAULT          45518
BURGLARY                42538
LARCENY FROM AUTO       36295
AGG. ASSAULT            27513
AUTO THEFT              26838
ROBBERY - STREET        17691
ROBBERY - COMMERCIAL     4141
ASSAULT BY THREAT        3503
SHOOTING                 2910
ROBBERY - RESIDENCE      2866
RAPE                     1637
HOMICIDE                 1559
ROBBERY - CARJACKING     1528
ARSON                    1464
Name: Description, dtype: int64

In [231]:
bal_df.dtypes

CrimeDate           object
CrimeTime           object
CrimeCode           object
Location            object
Description         object
Inside/Outside      object
Weapon              object
Post               float64
District            object
Neighborhood        object
Longitude          float64
Latitude           float64
Location 1          object
Premise             object
Total Incidents      int64
dtype: object

In [232]:
crimeByCode = bal_df['CrimeCode'].value_counts()
crimeByCode

4E     45518
6D     36295
5A     25699
7A     25230
6G     15920
       ...  
3N         5
8CV        4
8DO        2
3LK        2
6K         1
Name: CrimeCode, Length: 81, dtype: int64

In [233]:
bal_df.nunique()

CrimeDate           2072
CrimeTime           3017
CrimeCode             81
Location           26008
Description           15
Inside/Outside         4
Weapon                 4
Post                 179
District               9
Neighborhood         278
Longitude          17678
Latitude           13507
Location 1         95497
Premise              123
Total Incidents        1
dtype: int64

In [234]:
# Drop null rows
bal_clean = bal_df.dropna()


In [235]:
bal_clean.describe()

Unnamed: 0,Post,Longitude,Latitude,Total Incidents
count,89943.0,89943.0,89943.0,89943.0
mean,512.234815,-76.618072,39.305959,1.0
std,261.351552,0.041477,0.028629,0.0
min,111.0,-76.71127,39.20041,1.0
25%,311.0,-76.64888,39.289285,1.0
50%,511.0,-76.61445,39.30277,1.0
75%,732.0,-76.58863,39.32368,1.0
max,943.0,-76.52977,39.37197,1.0


In [236]:
bal_clean.drop(['Total Incidents','Weapon', 'Post', 'Premise', 'Inside/Outside', 'Location', 'CrimeTime', 'Neighborhood', 'District'], axis = 1, inplace = True)
bal_clean.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,CrimeDate,CrimeCode,Description,Longitude,Latitude,Location 1
0,09/02/2017,3JK,ROBBERY - RESIDENCE,-76.60541,39.22951,"(39.2295100000, -76.6054100000)"
2,09/02/2017,9S,SHOOTING,-76.60697,39.34768,"(39.3476800000, -76.6069700000)"
3,09/02/2017,4C,AGG. ASSAULT,-76.64526,39.28315,"(39.2831500000, -76.6452600000)"
4,09/02/2017,4E,COMMON ASSAULT,-76.61365,39.28756,"(39.2875600000, -76.6136500000)"
6,09/02/2017,1F,HOMICIDE,-76.56827,39.28202,"(39.2820200000, -76.5682700000)"


In [237]:
#find null values
for column in bal_clean.columns:
    print(f"Column {column} has {bal_clean[column].isnull().sum()} null values")

Column CrimeDate has 0 null values
Column CrimeCode has 0 null values
Column Description has 0 null values
Column Longitude has 0 null values
Column Latitude has 0 null values
Column Location 1 has 0 null values


In [238]:
bal_clean["City"] = 3
bal_clean.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,CrimeDate,CrimeCode,Description,Longitude,Latitude,Location 1,City
0,09/02/2017,3JK,ROBBERY - RESIDENCE,-76.60541,39.22951,"(39.2295100000, -76.6054100000)",3
2,09/02/2017,9S,SHOOTING,-76.60697,39.34768,"(39.3476800000, -76.6069700000)",3
3,09/02/2017,4C,AGG. ASSAULT,-76.64526,39.28315,"(39.2831500000, -76.6452600000)",3
4,09/02/2017,4E,COMMON ASSAULT,-76.61365,39.28756,"(39.2875600000, -76.6136500000)",3
6,09/02/2017,1F,HOMICIDE,-76.56827,39.28202,"(39.2820200000, -76.5682700000)",3


In [239]:
bal_clean.head()

Unnamed: 0,CrimeDate,CrimeCode,Description,Longitude,Latitude,Location 1,City
0,09/02/2017,3JK,ROBBERY - RESIDENCE,-76.60541,39.22951,"(39.2295100000, -76.6054100000)",3
2,09/02/2017,9S,SHOOTING,-76.60697,39.34768,"(39.3476800000, -76.6069700000)",3
3,09/02/2017,4C,AGG. ASSAULT,-76.64526,39.28315,"(39.2831500000, -76.6452600000)",3
4,09/02/2017,4E,COMMON ASSAULT,-76.61365,39.28756,"(39.2875600000, -76.6136500000)",3
6,09/02/2017,1F,HOMICIDE,-76.56827,39.28202,"(39.2820200000, -76.5682700000)",3


In [240]:
bal_clean['Year'] = pd.DatetimeIndex(bal_clean['CrimeDate']).year
bal_clean.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,CrimeDate,CrimeCode,Description,Longitude,Latitude,Location 1,City,Year
0,09/02/2017,3JK,ROBBERY - RESIDENCE,-76.60541,39.22951,"(39.2295100000, -76.6054100000)",3,2017
2,09/02/2017,9S,SHOOTING,-76.60697,39.34768,"(39.3476800000, -76.6069700000)",3,2017
3,09/02/2017,4C,AGG. ASSAULT,-76.64526,39.28315,"(39.2831500000, -76.6452600000)",3,2017
4,09/02/2017,4E,COMMON ASSAULT,-76.61365,39.28756,"(39.2875600000, -76.6136500000)",3,2017
6,09/02/2017,1F,HOMICIDE,-76.56827,39.28202,"(39.2820200000, -76.5682700000)",3,2017


In [241]:
bal_clean = bal_clean.reset_index(drop=True)

In [242]:
bal_clean.to_csv("https://capstone-crime-bucket.s3.amazonaws.com/Baltimore_Crime_Data.csv",header=True)

# Merge Datasets

In [243]:
bo_clean.columns.tolist()

['OFFENSE_CODE',
 'OFFENSE_DESCRIPTION',
 'OCCURRED_ON_DATE',
 'YEAR',
 'Lat',
 'Long',
 'Location',
 'City']

In [244]:
la_clean.columns.tolist()

['DATE.OCC',
 'Crm.Cd',
 'CrmCd.Desc',
 'Location.1',
 'lat',
 'long',
 'Year',
 'City']

In [245]:
bal_clean.columns.tolist()

['CrimeDate',
 'CrimeCode',
 'Description',
 'Longitude',
 'Latitude',
 'Location 1',
 'City',
 'Year']

In [246]:
 
bo_clean.rename(columns={'OFFENSE_CODE': 'CrimeCode', 'YEAR': 'Year', 'Lat': 'Latitude', 'Long': 'Longitude', 'OCCURRED_ON_DATE': 'CrimeDate',  'OFFENSE_DESCRIPTION': 'Description', 'REPORTING_AREA': 'Neighborhood'}, inplace=True)

la_clean.rename(columns={'Crm.Cd': 'CrimeCode', 'lat': 'Latitude', 'long': 'Longitude', 'DATE.OCC': 'CrimeDate', 'AREA.NAME': 'Neighborhood', 'CrmCd.Desc': 'Description', 'Crm.Cd': 'CrimeCode', 'Location.1': 'Location'}, inplace=True)

bal_clean.rename(columns={'Location 1': 'Location'}, inplace=True)

In [247]:
bo_clean.columns.tolist()


['CrimeCode',
 'Description',
 'CrimeDate',
 'Year',
 'Latitude',
 'Longitude',
 'Location',
 'City']

In [248]:
la_clean.columns.tolist()

['CrimeDate',
 'CrimeCode',
 'Description',
 'Location',
 'Latitude',
 'Longitude',
 'Year',
 'City']

In [249]:
#remove all string variables; need lat and long, 

In [252]:
bal_clean.columns.tolist()

['CrimeDate',
 'CrimeCode',
 'Description',
 'Longitude',
 'Latitude',
 'Location',
 'City',
 'Year']

In [253]:
#df1 = bo_clean['CrimeCode', 'Description', 'CrimeDate', 'Year', 'City', 'Latitude', 'Longitude']


In [254]:
df_merged=pd.concat([bo_clean,la_clean,bal_clean])
df_merged.head()

Unnamed: 0,CrimeCode,Description,CrimeDate,Year,Latitude,Longitude,Location,City
0,413,ASSAULT - AGGRAVATED - BATTERY,2018-08-29 23:19:00,2018,42.250405,-71.131737,"(42.25040550, -71.13173740)",1
1,413,ASSAULT - AGGRAVATED - BATTERY,2018-08-27 22:56:00,2018,42.321042,-71.072153,"(42.32104169, -71.07215291)",1
2,111,"MURDER, NON-NEGLIGIENT MANSLAUGHTER",2018-08-22 15:00:00,2018,42.315689,-71.089514,"(42.31568920, -71.08951391)",1
3,2662,BALLISTICS EVIDENCE/FOUND,2018-08-22 15:00:00,2018,42.315689,-71.089514,"(42.31568920, -71.08951391)",1
4,413,ASSAULT - AGGRAVATED - BATTERY,2018-08-19 22:46:00,2018,42.378085,-71.057841,"(42.37808452, -71.05784115)",1


In [255]:
df_merged.nunique()          

CrimeCode         220
Description       219
CrimeDate        2670
Year                7
Latitude        17051
Longitude       20145
Location       106492
City                3
dtype: int64

In [256]:
#df_merged['CrimeCode'] = df.Column2.str.replace('b,?' , '')

In [257]:
df_merged.to_csv('/Users/rotation/Desktop/DataClass/capstone-project/MergedCrimeData.csv', index=False)
