In [1]:
import pandas as pd
import numpy as np


In [2]:
df=pd.read_csv('data/crime_csv_all_years.csv',parse_dates={'dttime':[1,2,3]}, keep_date_col=True)
df.head()

Unnamed: 0,dttime,TYPE,YEAR,MONTH,DAY,HOUR,MINUTE,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y
0,2003-11-23,Theft from Vehicle,2003,11,23,0.0,1.0,13XX W GEORGIA ST,Central Business District,490745.08,5459529.81
1,2003-05-09,Theft from Vehicle,2003,5,9,18.0,0.0,30XX W 8TH AVE,Kitsilano,487465.51,5456929.11
2,2003-03-09,Offence Against a Person,2003,3,9,,,OFFSET TO PROTECT PRIVACY,,0.0,0.0
3,2003-01-20,Offence Against a Person,2003,1,20,,,OFFSET TO PROTECT PRIVACY,,0.0,0.0
4,2003-02-07,Break and Enter Commercial,2003,2,7,0.0,30.0,71XX VICTORIA DR,Victoria-Fraserview,495196.35,5451832.55


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 549774 entries, 0 to 549773
Data columns (total 11 columns):
dttime           549774 non-null datetime64[ns]
TYPE             549774 non-null object
YEAR             549774 non-null object
MONTH            549774 non-null object
DAY              549774 non-null object
HOUR             493779 non-null float64
MINUTE           493779 non-null float64
HUNDRED_BLOCK    549761 non-null object
NEIGHBOURHOOD    491472 non-null object
X                549774 non-null float64
Y                549774 non-null float64
dtypes: datetime64[ns](1), float64(4), object(6)
memory usage: 46.1+ MB


In [4]:
df['day_of_week']=df['dttime'].dt.weekday_name

In [5]:
df_temp=df.copy()

In [6]:
df_temp.head()

Unnamed: 0,dttime,TYPE,YEAR,MONTH,DAY,HOUR,MINUTE,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y,day_of_week
0,2003-11-23,Theft from Vehicle,2003,11,23,0.0,1.0,13XX W GEORGIA ST,Central Business District,490745.08,5459529.81,Sunday
1,2003-05-09,Theft from Vehicle,2003,5,9,18.0,0.0,30XX W 8TH AVE,Kitsilano,487465.51,5456929.11,Friday
2,2003-03-09,Offence Against a Person,2003,3,9,,,OFFSET TO PROTECT PRIVACY,,0.0,0.0,Sunday
3,2003-01-20,Offence Against a Person,2003,1,20,,,OFFSET TO PROTECT PRIVACY,,0.0,0.0,Monday
4,2003-02-07,Break and Enter Commercial,2003,2,7,0.0,30.0,71XX VICTORIA DR,Victoria-Fraserview,495196.35,5451832.55,Friday


In [7]:
df2=df_temp.dropna()
#remove missing data, all (or nearly all) of which is the non-property crime data

In [8]:
df3=df2.rename(index=str, columns={"YEAR": "year", "MONTH": "month", "DAY":"day","HOUR":"hour", "MINUTE":"minute", "NEIGHBOURHOOD":"neighborhood"})
#rename columns as all caps is annoying

In [9]:
df3.head()

Unnamed: 0,dttime,TYPE,year,month,day,hour,minute,HUNDRED_BLOCK,neighborhood,X,Y,day_of_week
0,2003-11-23,Theft from Vehicle,2003,11,23,0.0,1.0,13XX W GEORGIA ST,Central Business District,490745.08,5459529.81,Sunday
1,2003-05-09,Theft from Vehicle,2003,5,9,18.0,0.0,30XX W 8TH AVE,Kitsilano,487465.51,5456929.11,Friday
4,2003-02-07,Break and Enter Commercial,2003,2,7,0.0,30.0,71XX VICTORIA DR,Victoria-Fraserview,495196.35,5451832.55,Friday
5,2003-06-02,Break and Enter Commercial,2003,6,2,3.0,4.0,71XX VICTORIA DR,Victoria-Fraserview,495196.35,5451832.55,Monday
6,2003-12-31,Mischief,2003,12,31,8.0,40.0,10XX W GEORGIA ST,West End,491128.32,5459137.86,Wednesday


In [10]:
df4=df3.sort_values(['year','month','day','hour','minute'])
#sort by date

In [11]:
df5=df4.drop(['minute', 'HUNDRED_BLOCK','TYPE'], axis=1)
#remove extraneous data 

In [12]:
df6=df5.apply(pd.to_numeric, errors='ignore')

In [13]:
#bin by 12am-8am, 8am-4pm, 4pm -12am
hourbins = [-0.1,8.0,16.0,24.1]
hourlabels = ['1200am-0759am', '0800am-0359pm', '0400pm-1159pm']
df6['day_segment'] = pd.cut(df6["hour"], bins=hourbins,labels=hourlabels)



In [14]:
df7=df6[['year', 'month', 'day', 'day_segment', 'neighborhood']]
#group by neighborhood, by day_segment



In [15]:
df7.head()

Unnamed: 0,year,month,day,day_segment,neighborhood
2148,2003,1,1,1200am-0759am,Central Business District
3215,2003,1,1,1200am-0759am,Central Business District
7453,2003,1,1,1200am-0759am,Kitsilano
18527,2003,1,1,1200am-0759am,Grandview-Woodland
32204,2003,1,1,1200am-0759am,Central Business District


In [16]:
df8=df7.groupby(df7.columns.tolist()).size()

In [17]:
df8.head()

year  month  day  day_segment    neighborhood             
2003  1      1    1200am-0759am  Central Business District    14
                                 Dunbar-Southlands             1
                                 Fairview                      1
                                 Grandview-Woodland            6
                                 Hastings-Sunrise              3
dtype: int64

In [18]:
df9=pd.DataFrame(df8).reset_index()

In [19]:
df9.head()

Unnamed: 0,year,month,day,day_segment,neighborhood,0
0,2003,1,1,1200am-0759am,Central Business District,14
1,2003,1,1,1200am-0759am,Dunbar-Southlands,1
2,2003,1,1,1200am-0759am,Fairview,1
3,2003,1,1,1200am-0759am,Grandview-Woodland,6
4,2003,1,1,1200am-0759am,Hastings-Sunrise,3


In [20]:
df10=df9.rename(index=str, columns={ 0 :"number_of_crimes"})


In [21]:
df10.info()

<class 'pandas.core.frame.DataFrame'>
Index: 206124 entries, 0 to 206123
Data columns (total 6 columns):
year                206124 non-null int64
month               206124 non-null int64
day                 206124 non-null int64
day_segment         206124 non-null category
neighborhood        206124 non-null object
number_of_crimes    206124 non-null int64
dtypes: category(1), int64(4), object(1)
memory usage: 9.6+ MB


In [22]:
df10.isnull().sum()

year                0
month               0
day                 0
day_segment         0
neighborhood        0
number_of_crimes    0
dtype: int64

In [23]:
df10.iloc[7270,:]

year                               2003
month                                 6
day                                  12
day_segment               0400pm-1159pm
neighborhood        Victoria-Fraserview
number_of_crimes                      3
Name: 7270, dtype: object

In [24]:
df_final=df10.copy()

In [25]:
df_final.head()

Unnamed: 0,year,month,day,day_segment,neighborhood,number_of_crimes
0,2003,1,1,1200am-0759am,Central Business District,14
1,2003,1,1,1200am-0759am,Dunbar-Southlands,1
2,2003,1,1,1200am-0759am,Fairview,1
3,2003,1,1,1200am-0759am,Grandview-Woodland,6
4,2003,1,1,1200am-0759am,Hastings-Sunrise,3


In [26]:
wdf=pd.read_csv('data/BA_weather_data.csv')

In [27]:
wdf_temp=wdf.copy()

In [28]:
wdf_temp.head()

Unnamed: 0,STN---,WBAN,YEARMODA,TMAX,TMIN,DEWP,Unnamed: 7,SLP,.1,STP,...,.3,WDSP,.4,MXSPD,GUST,MAX,MIN,PRCP,SNDP,FRSHTT
0,727976,24217,20030101,44.8,24,37.2,24,1017.0,23,9999.9,...,24,11.9,24,20.0,28.0,48.2*,39.2*,0.00G,999.9,10000
1,727976,24217,20030102,49.5,23,43.3,23,1008.5,22,9999.9,...,23,18.2,23,35.9,53.0,55.4*,39.2*,0.54G,999.9,10000
2,727976,24217,20030103,51.0,24,40.6,24,1013.1,24,9999.9,...,24,14.3,24,33.0,42.9,55.4*,42.8*,0.17G,999.9,10010
3,727976,24217,20030104,49.7,24,43.0,24,1012.7,23,9999.9,...,24,8.9,24,22.9,30.9,57.9,43,0.08G,999.9,10000
4,727976,24217,20030105,46.4,24,42.6,24,1028.7,24,9999.9,...,24,6.0,24,9.9,999.9,57.9,39,0.39G,999.9,0


In [29]:
wdf2=wdf_temp[[' YEARMODA', 'TMAX', 'TMIN','PRCP']]
wdf2.head()


Unnamed: 0,YEARMODA,TMAX,TMIN,PRCP
0,20030101,44.8,24,0.00G
1,20030102,49.5,23,0.54G
2,20030103,51.0,24,0.17G
3,20030104,49.7,24,0.08G
4,20030105,46.4,24,0.39G


In [30]:
wdf3=wdf2.rename(index=str, columns={ " YEARMODA":"date","PRCP": "prcp", "TMAX":"tmax","TMIN":"tmin"})


In [31]:
def get_year(x):
    return int (str(x)[0:4])

def get_month(x):
    return int (str(x)[4:6])

def get_day(x):
    return int(str(x)[6:8])

def get_prcp(x):
    return float(x[1:5])
    
get_year(20030101)

2003

In [32]:
wdf3['year'] = wdf3['date'].apply(get_year)
wdf3['month'] =  wdf3['date'].apply(get_month)
wdf3['day'] = wdf3['date'].apply(get_day)
wdf3['precipitation']=wdf3['prcp'].apply(get_prcp)

In [33]:
wdf3.head()


Unnamed: 0,date,tmax,tmin,prcp,year,month,day,precipitation
0,20030101,44.8,24,0.00G,2003,1,1,0.0
1,20030102,49.5,23,0.54G,2003,1,2,0.54
2,20030103,51.0,24,0.17G,2003,1,3,0.17
3,20030104,49.7,24,0.08G,2003,1,4,0.08
4,20030105,46.4,24,0.39G,2003,1,5,0.39


In [34]:
wdf4=wdf3[['year','month','day','tmax', 'tmin', 'precipitation']]

In [35]:
wdf4.head()

Unnamed: 0,year,month,day,tmax,tmin,precipitation
0,2003,1,1,44.8,24,0.0
1,2003,1,2,49.5,23,0.54
2,2003,1,3,51.0,24,0.17
3,2003,1,4,49.7,24,0.08
4,2003,1,5,46.4,24,0.39


In [36]:
wdf4.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5478 entries, 0 to 5477
Data columns (total 6 columns):
year             5478 non-null int64
month            5478 non-null int64
day              5478 non-null int64
tmax             5478 non-null float64
tmin             5478 non-null int64
precipitation    5478 non-null float64
dtypes: float64(2), int64(4)
memory usage: 299.6+ KB


In [37]:
wdf_final=wdf4.copy()


In [38]:
wdf_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5478 entries, 0 to 5477
Data columns (total 6 columns):
year             5478 non-null int64
month            5478 non-null int64
day              5478 non-null int64
tmax             5478 non-null float64
tmin             5478 non-null int64
precipitation    5478 non-null float64
dtypes: float64(2), int64(4)
memory usage: 299.6+ KB


In [39]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 206124 entries, 0 to 206123
Data columns (total 6 columns):
year                206124 non-null int64
month               206124 non-null int64
day                 206124 non-null int64
day_segment         206124 non-null category
neighborhood        206124 non-null object
number_of_crimes    206124 non-null int64
dtypes: category(1), int64(4), object(1)
memory usage: 9.6+ MB


In [40]:
wdf_final.head()

Unnamed: 0,year,month,day,tmax,tmin,precipitation
0,2003,1,1,44.8,24,0.0
1,2003,1,2,49.5,23,0.54
2,2003,1,3,51.0,24,0.17
3,2003,1,4,49.7,24,0.08
4,2003,1,5,46.4,24,0.39


In [41]:
df_final.tail(10)

Unnamed: 0,year,month,day,day_segment,neighborhood,number_of_crimes
206114,2017,12,31,0400pm-1159pm,Central Business District,17
206115,2017,12,31,0400pm-1159pm,Grandview-Woodland,1
206116,2017,12,31,0400pm-1159pm,Killarney,3
206117,2017,12,31,0400pm-1159pm,Kitsilano,1
206118,2017,12,31,0400pm-1159pm,Mount Pleasant,3
206119,2017,12,31,0400pm-1159pm,Renfrew-Collingwood,2
206120,2017,12,31,0400pm-1159pm,Shaughnessy,1
206121,2017,12,31,0400pm-1159pm,Strathcona,2
206122,2017,12,31,0400pm-1159pm,Sunset,3
206123,2017,12,31,0400pm-1159pm,West End,3


In [42]:
wdf_final.tail(10)

Unnamed: 0,year,month,day,tmax,tmin,precipitation
5468,2017,12,22,34.2,24,0.0
5469,2017,12,23,30.6,24,0.0
5470,2017,12,24,28.0,24,0.0
5471,2017,12,25,30.9,24,0.0
5472,2017,12,26,29.6,24,0.0
5473,2017,12,27,28.8,24,0.0
5474,2017,12,28,33.7,24,0.16
5475,2017,12,29,37.0,24,0.31
5476,2017,12,30,38.1,24,0.72
5477,2017,12,31,30.2,24,0.01


In [43]:
new_df1=pd.merge(wdf_final,df_final, how='left', on=['year','month','day'])

In [44]:
new_df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 206074 entries, 0 to 206073
Data columns (total 9 columns):
year                206074 non-null int64
month               206074 non-null int64
day                 206074 non-null int64
tmax                206074 non-null float64
tmin                206074 non-null int64
precipitation       206074 non-null float64
day_segment         206074 non-null category
neighborhood        206074 non-null object
number_of_crimes    206074 non-null int64
dtypes: category(1), float64(2), int64(5), object(1)
memory usage: 14.3+ MB


In [45]:
new_df1.isnull().sum()

year                0
month               0
day                 0
tmax                0
tmin                0
precipitation       0
day_segment         0
neighborhood        0
number_of_crimes    0
dtype: int64

In [46]:
cpi_df=pd.read_csv('data/consumer_price_index_nohead.csv')
cpi_df2=cpi_df.copy()

In [47]:
cpi_df2['year'] = cpi_df2.date.str.split('-').str.get(1)
cpi_df2['month'] = cpi_df2.date.str.split('-').str.get(0) 
cpi_df2.drop('date', axis=1,inplace=True)

In [48]:
import calendar
d=dict((v,k) for k,v in enumerate(calendar.month_abbr))
cpi_df2.month=cpi_df2.month.map(d)

cpi_df2.year='20'+ cpi_df2.year

In [49]:
cpi_df2=cpi_df2.apply(pd.to_numeric, errors='ignore')


In [50]:
new_df2=pd.merge(new_df1,cpi_df2, how='inner', on=['year','month'])

In [51]:
new_df2.info(0)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 206074 entries, 0 to 206073
Data columns (total 10 columns):
year                    206074 non-null int64
month                   206074 non-null int64
day                     206074 non-null int64
tmax                    206074 non-null float64
tmin                    206074 non-null int64
precipitation           206074 non-null float64
day_segment             206074 non-null category
neighborhood            206074 non-null object
number_of_crimes        206074 non-null int64
consumer_price_index    206074 non-null float64
dtypes: category(1), float64(3), int64(5), object(1)
memory usage: 15.9+ MB


In [52]:
new_df2.isnull().sum()

year                    0
month                   0
day                     0
tmax                    0
tmin                    0
precipitation           0
day_segment             0
neighborhood            0
number_of_crimes        0
consumer_price_index    0
dtype: int64

In [53]:
gdp_dftemp=pd.read_csv('data/gdp_2007dollars_nohead.csv')

In [54]:
gdp_df=gdp_dftemp.copy()

In [55]:
gdp_df.head()

Unnamed: 0,date,gdp_millions_2007
0,Jan-03,1305716
1,Feb-03,1309824
2,Mar-03,1309047
3,Apr-03,1305168
4,May-03,1309303


In [56]:
gdp_df['year'] = gdp_df.date.str.split('-').str.get(1)
gdp_df['month'] = gdp_df.date.str.split('-').str.get(0) 
gdp_df.drop('date', axis=1,inplace=True)

In [57]:
import calendar
d=dict((v,k) for k,v in enumerate(calendar.month_abbr))
gdp_df.month=gdp_df.month.map(d)

gdp_df.year='20'+ gdp_df.year

In [58]:
gdp_df=gdp_df.apply(pd.to_numeric, errors='ignore')


In [59]:
new_df3=pd.merge(new_df2,gdp_df, how='inner', on=['year','month'])

In [60]:
new_df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 206074 entries, 0 to 206073
Data columns (total 11 columns):
year                    206074 non-null int64
month                   206074 non-null int64
day                     206074 non-null int64
tmax                    206074 non-null float64
tmin                    206074 non-null int64
precipitation           206074 non-null float64
day_segment             206074 non-null category
neighborhood            206074 non-null object
number_of_crimes        206074 non-null int64
consumer_price_index    206074 non-null float64
gdp_millions_2007       206074 non-null int64
dtypes: category(1), float64(3), int64(6), object(1)
memory usage: 17.5+ MB


In [61]:
new_df3.isnull().sum()

year                    0
month                   0
day                     0
tmax                    0
tmin                    0
precipitation           0
day_segment             0
neighborhood            0
number_of_crimes        0
consumer_price_index    0
gdp_millions_2007       0
dtype: int64

In [62]:
new_df3.head()

Unnamed: 0,year,month,day,tmax,tmin,precipitation,day_segment,neighborhood,number_of_crimes,consumer_price_index,gdp_millions_2007
0,2003,1,1,44.8,24,0.0,1200am-0759am,Central Business District,14,100.9,1305716
1,2003,1,1,44.8,24,0.0,1200am-0759am,Dunbar-Southlands,1,100.9,1305716
2,2003,1,1,44.8,24,0.0,1200am-0759am,Fairview,1,100.9,1305716
3,2003,1,1,44.8,24,0.0,1200am-0759am,Grandview-Woodland,6,100.9,1305716
4,2003,1,1,44.8,24,0.0,1200am-0759am,Hastings-Sunrise,3,100.9,1305716


In [63]:
emp_df_init=pd.read_csv('data/employment_nohead.csv')
emp_df=emp_df_init.copy()

In [64]:
emp_df.head()

Unnamed: 0,date,seasonally_adjusted_unemployment,unadjusted_unemployment
0,Jan-03,7.9,7.5
1,Feb-03,7.6,7.5
2,Mar-03,7.0,7.2
3,Apr-03,7.1,7.1
4,May-03,7.2,7.1


In [65]:
emp_df['year'] = emp_df.date.str.split('-').str.get(1)
emp_df['month'] = emp_df.date.str.split('-').str.get(0) 
emp_df.drop('date', axis=1,inplace=True)

In [66]:
import calendar
d=dict((v,k) for k,v in enumerate(calendar.month_abbr))
emp_df.month=emp_df.month.map(d)

emp_df.year='20'+ emp_df.year

In [67]:
emp_df=emp_df.apply(pd.to_numeric, errors='ignore')


In [68]:
new_df4=pd.merge(new_df3,emp_df, how='inner', on=['year','month'])

In [69]:
new_df4.head()

Unnamed: 0,year,month,day,tmax,tmin,precipitation,day_segment,neighborhood,number_of_crimes,consumer_price_index,gdp_millions_2007,seasonally_adjusted_unemployment,unadjusted_unemployment
0,2003,1,1,44.8,24,0.0,1200am-0759am,Central Business District,14,100.9,1305716,7.9,7.5
1,2003,1,1,44.8,24,0.0,1200am-0759am,Dunbar-Southlands,1,100.9,1305716,7.9,7.5
2,2003,1,1,44.8,24,0.0,1200am-0759am,Fairview,1,100.9,1305716,7.9,7.5
3,2003,1,1,44.8,24,0.0,1200am-0759am,Grandview-Woodland,6,100.9,1305716,7.9,7.5
4,2003,1,1,44.8,24,0.0,1200am-0759am,Hastings-Sunrise,3,100.9,1305716,7.9,7.5


In [70]:
new_df4.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 206074 entries, 0 to 206073
Data columns (total 13 columns):
year                                206074 non-null int64
month                               206074 non-null int64
day                                 206074 non-null int64
tmax                                206074 non-null float64
tmin                                206074 non-null int64
precipitation                       206074 non-null float64
day_segment                         206074 non-null category
neighborhood                        206074 non-null object
number_of_crimes                    206074 non-null int64
consumer_price_index                206074 non-null float64
gdp_millions_2007                   206074 non-null int64
seasonally_adjusted_unemployment    206074 non-null float64
unadjusted_unemployment             206074 non-null float64
dtypes: category(1), float64(5), int64(6), object(1)
memory usage: 20.6+ MB


In [71]:
new_df4.isnull().sum()

year                                0
month                               0
day                                 0
tmax                                0
tmin                                0
precipitation                       0
day_segment                         0
neighborhood                        0
number_of_crimes                    0
consumer_price_index                0
gdp_millions_2007                   0
seasonally_adjusted_unemployment    0
unadjusted_unemployment             0
dtype: int64

In [72]:
new_df4.head(10)

Unnamed: 0,year,month,day,tmax,tmin,precipitation,day_segment,neighborhood,number_of_crimes,consumer_price_index,gdp_millions_2007,seasonally_adjusted_unemployment,unadjusted_unemployment
0,2003,1,1,44.8,24,0.0,1200am-0759am,Central Business District,14,100.9,1305716,7.9,7.5
1,2003,1,1,44.8,24,0.0,1200am-0759am,Dunbar-Southlands,1,100.9,1305716,7.9,7.5
2,2003,1,1,44.8,24,0.0,1200am-0759am,Fairview,1,100.9,1305716,7.9,7.5
3,2003,1,1,44.8,24,0.0,1200am-0759am,Grandview-Woodland,6,100.9,1305716,7.9,7.5
4,2003,1,1,44.8,24,0.0,1200am-0759am,Hastings-Sunrise,3,100.9,1305716,7.9,7.5
5,2003,1,1,44.8,24,0.0,1200am-0759am,Kensington-Cedar Cottage,3,100.9,1305716,7.9,7.5
6,2003,1,1,44.8,24,0.0,1200am-0759am,Kerrisdale,1,100.9,1305716,7.9,7.5
7,2003,1,1,44.8,24,0.0,1200am-0759am,Killarney,1,100.9,1305716,7.9,7.5
8,2003,1,1,44.8,24,0.0,1200am-0759am,Kitsilano,3,100.9,1305716,7.9,7.5
9,2003,1,1,44.8,24,0.0,1200am-0759am,Marpole,3,100.9,1305716,7.9,7.5


In [73]:
drugs_init=pd.read_csv('data/drug_offences_2006_to_2016.csv')
drugs_init

Unnamed: 0,year,Total Drug violations,"Possession, cocaine","Heroin, possession","Other Controlled Drugs and Substances Act, possession","Methamphetamines (crystal meth), possession"
0,2006,25630,4682,515,1562,1
1,2007,28472,4428,378,2265,306
2,2008,27032,3992,389,1881,334
3,2009,23948,2891,347,1079,310
4,2010,26928,2642,316,1165,429
5,2011,27178,2524,379,1196,539
6,2012,25455,2597,508,1106,663
7,2013,26354,2654,617,997,1058
8,2014,24321,2310,853,1130,1549
9,2015,22132,2068,1139,1278,1828


In [74]:
drugs_init.columns


Index(['year', 'Total Drug violations ', 'Possession, cocaine ',
       'Heroin, possession ',
       'Other Controlled Drugs and Substances Act, possession',
       'Methamphetamines (crystal meth), possession '],
      dtype='object')

In [75]:
drugs_df=drugs_init.copy()

In [76]:
drugs_df2=drugs_df[['year','Possession, cocaine ',
       'Heroin, possession ',]]

In [80]:
drugs_df3=drugs_df2.copy()

In [81]:
drugs_df3.loc[11]=[2017, 2047,1550]

In [82]:
drugs_df3.loc[12]=[2003, 4682,515]
drugs_df3.loc[13]=[2004, 4682,515]
drugs_df3.loc[14]=[2005, 4682,515]

In [83]:
drugs_df3

Unnamed: 0,year,"Possession, cocaine","Heroin, possession"
0,2006,4682,515
1,2007,4428,378
2,2008,3992,389
3,2009,2891,347
4,2010,2642,316
5,2011,2524,379
6,2012,2597,508
7,2013,2654,617
8,2014,2310,853
9,2015,2068,1139


In [84]:
new_df5=pd.merge(new_df4,drugs_df3, how='inner', on=['year'])

In [85]:
new_df5.head(10)

Unnamed: 0,year,month,day,tmax,tmin,precipitation,day_segment,neighborhood,number_of_crimes,consumer_price_index,gdp_millions_2007,seasonally_adjusted_unemployment,unadjusted_unemployment,"Possession, cocaine","Heroin, possession"
0,2003,1,1,44.8,24,0.0,1200am-0759am,Central Business District,14,100.9,1305716,7.9,7.5,4682,515
1,2003,1,1,44.8,24,0.0,1200am-0759am,Dunbar-Southlands,1,100.9,1305716,7.9,7.5,4682,515
2,2003,1,1,44.8,24,0.0,1200am-0759am,Fairview,1,100.9,1305716,7.9,7.5,4682,515
3,2003,1,1,44.8,24,0.0,1200am-0759am,Grandview-Woodland,6,100.9,1305716,7.9,7.5,4682,515
4,2003,1,1,44.8,24,0.0,1200am-0759am,Hastings-Sunrise,3,100.9,1305716,7.9,7.5,4682,515
5,2003,1,1,44.8,24,0.0,1200am-0759am,Kensington-Cedar Cottage,3,100.9,1305716,7.9,7.5,4682,515
6,2003,1,1,44.8,24,0.0,1200am-0759am,Kerrisdale,1,100.9,1305716,7.9,7.5,4682,515
7,2003,1,1,44.8,24,0.0,1200am-0759am,Killarney,1,100.9,1305716,7.9,7.5,4682,515
8,2003,1,1,44.8,24,0.0,1200am-0759am,Kitsilano,3,100.9,1305716,7.9,7.5,4682,515
9,2003,1,1,44.8,24,0.0,1200am-0759am,Marpole,3,100.9,1305716,7.9,7.5,4682,515
