In [1]:
import pandas as pd
import numpy as np


In [2]:
df=pd.read_csv('data/crime_csv_all_years.csv')

In [3]:
df.dropna(inplace=True)
#remove missing data, all (or nearly all) of which is the non-property crime data

In [4]:
df.rename(index=str, columns={"YEAR": "year", "MONTH": "month", "DAY":"day","HOUR":"hour", "MINUTE":"minute", "NEIGHBOURHOOD":"neighbourhood"},inplace=True)
#rename columns as all caps is annoying

In [5]:
df.head()

Unnamed: 0,TYPE,year,month,day,hour,minute,HUNDRED_BLOCK,neighbourhood,X,Y
0,Theft from Vehicle,2003,11,23,0.0,1.0,13XX W GEORGIA ST,Central Business District,490745.08,5459529.81
1,Theft from Vehicle,2003,5,9,18.0,0.0,30XX W 8TH AVE,Kitsilano,487465.51,5456929.11
4,Break and Enter Commercial,2003,2,7,0.0,30.0,71XX VICTORIA DR,Victoria-Fraserview,495196.35,5451832.55
5,Break and Enter Commercial,2003,6,2,3.0,4.0,71XX VICTORIA DR,Victoria-Fraserview,495196.35,5451832.55
6,Mischief,2003,12,31,8.0,40.0,10XX W GEORGIA ST,West End,491128.32,5459137.86


In [6]:
df.sort_values(['year','month','day','hour','minute'],inplace=True)
#sort by date

In [7]:
df.head(20)

Unnamed: 0,TYPE,year,month,day,hour,minute,HUNDRED_BLOCK,neighbourhood,X,Y
2148,Theft from Vehicle,2003,1,1,0.0,0.0,2XX DRAKE ST,Central Business District,490968.94,5457901.26
3215,Theft of Bicycle,2003,1,1,0.0,0.0,6X KEEFER PL,Central Business District,492231.06,5458545.72
7453,Theft from Vehicle,2003,1,1,0.0,0.0,24XX W 1ST AVE,Kitsilano,488324.36,5457580.13
18527,Mischief,2003,1,1,0.0,0.0,17XX PARKER ST,Grandview-Woodland,494981.08,5458118.12
32204,Theft from Vehicle,2003,1,1,0.0,0.0,11XX HOWE ST,Central Business District,490849.97,5458442.5
34232,Theft from Vehicle,2003,1,1,0.0,0.0,57XX CULLODEN ST,Sunset,494285.25,5453254.66
34819,Theft from Vehicle,2003,1,1,0.0,0.0,10XX W 26TH AVE,Shaughnessy,490707.25,5455045.5
45038,Break and Enter Residential/Other,2003,1,1,0.0,0.0,9XX W 15TH AVE,Fairview,490791.12,5456127.88
48249,Break and Enter Commercial,2003,1,1,0.0,0.0,64XX VICTORIA DR,Victoria-Fraserview,495220.95,5452631.14
48331,Theft from Vehicle,2003,1,1,0.0,0.0,33XX ROSEMONT DR,Killarney,497388.6,5451449.73


In [8]:
df['TYPE'].unique()
#perhaps remove the vehicle entries before removing type

array(['Theft from Vehicle', 'Theft of Bicycle', 'Mischief',
       'Break and Enter Residential/Other', 'Break and Enter Commercial',
       'Vehicle Collision or Pedestrian Struck (with Injury)',
       'Theft of Vehicle', 'Other Theft',
       'Vehicle Collision or Pedestrian Struck (with Fatality)'], dtype=object)

In [9]:
df.drop(['minute', 'HUNDRED_BLOCK','TYPE'], axis=1,inplace=True)
#remove extraneous data 

In [10]:
df.apply(pd.to_numeric, errors='ignore')

Unnamed: 0,year,month,day,hour,neighbourhood,X,Y
2148,2003,1,1,0.0,Central Business District,490968.94,5457901.26
3215,2003,1,1,0.0,Central Business District,492231.06,5458545.72
7453,2003,1,1,0.0,Kitsilano,488324.36,5457580.13
18527,2003,1,1,0.0,Grandview-Woodland,494981.08,5458118.12
32204,2003,1,1,0.0,Central Business District,490849.97,5458442.50
34232,2003,1,1,0.0,Sunset,494285.25,5453254.66
34819,2003,1,1,0.0,Shaughnessy,490707.25,5455045.50
45038,2003,1,1,0.0,Fairview,490791.12,5456127.88
48249,2003,1,1,0.0,Victoria-Fraserview,495220.95,5452631.14
48331,2003,1,1,0.0,Killarney,497388.60,5451449.73


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 501528 entries, 2148 to 558908
Data columns (total 7 columns):
year             501528 non-null int64
month            501528 non-null int64
day              501528 non-null int64
hour             501528 non-null float64
neighbourhood    501528 non-null object
X                501528 non-null float64
Y                501528 non-null float64
dtypes: float64(3), int64(3), object(1)
memory usage: 30.6+ MB


In [12]:
#bin by 12am-8am, 8am-4pm, 4pm -12am

In [13]:
hourbins = [-0.1,8.0,16.0,24.1]
hourlabels = ['1200am-0759am', '0800am-0359pm', '0400pm-1159pm']
df['day_segment'] = pd.cut(df["hour"], bins=hourbins,labels=hourlabels)

#df['day_segment'] = pd.cut(df['hour'], [0, 8, 16,24], labels=['1200am-0759am', '0800am-0359pm', '0400pm-1159pm'])

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 501528 entries, 2148 to 558908
Data columns (total 8 columns):
year             501528 non-null int64
month            501528 non-null int64
day              501528 non-null int64
hour             501528 non-null float64
neighbourhood    501528 non-null object
X                501528 non-null float64
Y                501528 non-null float64
day_segment      501528 non-null category
dtypes: category(1), float64(3), int64(3), object(1)
memory usage: 31.1+ MB


In [15]:
df.head(100)

Unnamed: 0,year,month,day,hour,neighbourhood,X,Y,day_segment
2148,2003,1,1,0.0,Central Business District,490968.94,5457901.26,1200am-0759am
3215,2003,1,1,0.0,Central Business District,492231.06,5458545.72,1200am-0759am
7453,2003,1,1,0.0,Kitsilano,488324.36,5457580.13,1200am-0759am
18527,2003,1,1,0.0,Grandview-Woodland,494981.08,5458118.12,1200am-0759am
32204,2003,1,1,0.0,Central Business District,490849.97,5458442.50,1200am-0759am
34232,2003,1,1,0.0,Sunset,494285.25,5453254.66,1200am-0759am
34819,2003,1,1,0.0,Shaughnessy,490707.25,5455045.50,1200am-0759am
45038,2003,1,1,0.0,Fairview,490791.12,5456127.88,1200am-0759am
48249,2003,1,1,0.0,Victoria-Fraserview,495220.95,5452631.14,1200am-0759am
48331,2003,1,1,0.0,Killarney,497388.60,5451449.73,1200am-0759am


In [16]:
df_temp=df[['year', 'month', 'day', 'day_segment', 'neighbourhood']]
#group by neighborhood, by day_segment, with a count



In [17]:
df_temp.head()

Unnamed: 0,year,month,day,day_segment,neighbourhood
2148,2003,1,1,1200am-0759am,Central Business District
3215,2003,1,1,1200am-0759am,Central Business District
7453,2003,1,1,1200am-0759am,Kitsilano
18527,2003,1,1,1200am-0759am,Grandview-Woodland
32204,2003,1,1,1200am-0759am,Central Business District


In [18]:
df_temp2=df_temp.groupby(df_temp.columns.tolist()).size()

In [19]:
df_temp2.head(20)


year  month  day  day_segment    neighbourhood            
2003  1      1    1200am-0759am  Central Business District    14
                                 Dunbar-Southlands             1
                                 Fairview                      1
                                 Grandview-Woodland            6
                                 Hastings-Sunrise              3
                                 Kensington-Cedar Cottage      3
                                 Kerrisdale                    1
                                 Killarney                     1
                                 Kitsilano                     3
                                 Marpole                       3
                                 Mount Pleasant                3
                                 Renfrew-Collingwood           3
                                 Riley Park                    2
                                 Shaughnessy                   1
                               

In [20]:
df_temp2[0]

14

In [21]:
df_new=pd.DataFrame(df_temp2).reset_index()

In [22]:
df_new.rename(index=str, columns={ 0 :"number_of_crimes"},inplace=True)


In [23]:
df_new.head()

Unnamed: 0,year,month,day,day_segment,neighbourhood,number_of_crimes
0,2003,1,1,1200am-0759am,Central Business District,14
1,2003,1,1,1200am-0759am,Dunbar-Southlands,1
2,2003,1,1,1200am-0759am,Fairview,1
3,2003,1,1,1200am-0759am,Grandview-Woodland,6
4,2003,1,1,1200am-0759am,Hastings-Sunrise,3


In [24]:
wdf=pd.read_csv('data/Blaine.csv')

In [25]:
wdf.head()

Unnamed: 0,STATION,NAME,DATE,MDPR,PRCP,SNOW,SNWD,TMAX,TMIN,TOBS,WT01
0,USC00450729,"BLAINE, WA US",2003-01-01,,0.43,0.0,0.0,45.0,38.0,39.0,
1,USC00450729,"BLAINE, WA US",2003-01-02,,0.87,0.0,0.0,54.0,39.0,54.0,
2,USC00450729,"BLAINE, WA US",2003-01-03,,0.33,0.0,0.0,54.0,40.0,47.0,
3,USC00450729,"BLAINE, WA US",2003-01-04,,0.77,0.0,0.0,53.0,43.0,51.0,
4,USC00450729,"BLAINE, WA US",2003-01-05,,0.0,0.0,0.0,52.0,37.0,47.0,


In [26]:
wdf=wdf[['DATE','PRCP','TMAX','TMIN']]
wdf.rename(index=str, columns={ "PRCP": "prcp", "TMAX":"tmax","TMIN":"tmin"},inplace=True)


In [27]:
wdf.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4718 entries, 0 to 4717
Data columns (total 4 columns):
DATE    4718 non-null object
prcp    4684 non-null float64
tmax    4675 non-null float64
tmin    4673 non-null float64
dtypes: float64(3), object(1)
memory usage: 184.3+ KB


In [28]:
wdf.interpolate(inplace=True)

In [29]:
wdf.isnull().sum()

DATE    0
prcp    0
tmax    0
tmin    0
dtype: int64

In [30]:
wdf['year'] = wdf.DATE.str.split('-').str.get(0)
wdf['month'] = wdf.DATE.str.split('-').str.get(1) 
wdf['day'] = wdf.DATE.str.split('-').str.get(2) 
wdf.drop('DATE', axis=1,inplace=True)

In [31]:
wdf.head()

Unnamed: 0,prcp,tmax,tmin,year,month,day
0,0.43,45.0,38.0,2003,1,1
1,0.87,54.0,39.0,2003,1,2
2,0.33,54.0,40.0,2003,1,3
3,0.77,53.0,43.0,2003,1,4
4,0.0,52.0,37.0,2003,1,5


In [32]:
wdf.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4718 entries, 0 to 4717
Data columns (total 6 columns):
prcp     4718 non-null float64
tmax     4718 non-null float64
tmin     4718 non-null float64
year     4718 non-null object
month    4718 non-null object
day      4718 non-null object
dtypes: float64(3), object(3)
memory usage: 258.0+ KB


In [33]:
wdf=wdf.apply(pd.to_numeric, errors='ignore')

In [34]:
wdf.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4718 entries, 0 to 4717
Data columns (total 6 columns):
prcp     4718 non-null float64
tmax     4718 non-null float64
tmin     4718 non-null float64
year     4718 non-null int64
month    4718 non-null int64
day      4718 non-null int64
dtypes: float64(3), int64(3)
memory usage: 258.0+ KB


In [35]:
new_df1=pd.merge(df_new,wdf, how='inner', on=['year','month','day'])

In [36]:
new_df1.head(200)

Unnamed: 0,year,month,day,day_segment,neighbourhood,number_of_crimes,prcp,tmax,tmin
0,2003,1,1,1200am-0759am,Central Business District,14,0.43,45.0,38.0
1,2003,1,1,1200am-0759am,Dunbar-Southlands,1,0.43,45.0,38.0
2,2003,1,1,1200am-0759am,Fairview,1,0.43,45.0,38.0
3,2003,1,1,1200am-0759am,Grandview-Woodland,6,0.43,45.0,38.0
4,2003,1,1,1200am-0759am,Hastings-Sunrise,3,0.43,45.0,38.0
5,2003,1,1,1200am-0759am,Kensington-Cedar Cottage,3,0.43,45.0,38.0
6,2003,1,1,1200am-0759am,Kerrisdale,1,0.43,45.0,38.0
7,2003,1,1,1200am-0759am,Killarney,1,0.43,45.0,38.0
8,2003,1,1,1200am-0759am,Kitsilano,3,0.43,45.0,38.0
9,2003,1,1,1200am-0759am,Marpole,3,0.43,45.0,38.0
