In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
df=pd.read_csv('data/crime_csv_all_years.csv',parse_dates={'dttime':[1,2,3]}, keep_date_col=True)


In [3]:
df['day_of_week']=df['dttime'].dt.weekday_name

In [4]:
df_temp=df.copy()

In [5]:
df_temp.head()

Unnamed: 0,dttime,TYPE,YEAR,MONTH,DAY,HOUR,MINUTE,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y,day_of_week
0,2003-11-23,Theft from Vehicle,2003,11,23,0.0,1.0,13XX W GEORGIA ST,Central Business District,490745.08,5459529.81,Sunday
1,2003-05-09,Theft from Vehicle,2003,5,9,18.0,0.0,30XX W 8TH AVE,Kitsilano,487465.51,5456929.11,Friday
2,2003-03-09,Offence Against a Person,2003,3,9,,,OFFSET TO PROTECT PRIVACY,,0.0,0.0,Sunday
3,2003-01-20,Offence Against a Person,2003,1,20,,,OFFSET TO PROTECT PRIVACY,,0.0,0.0,Monday
4,2003-02-07,Break and Enter Commercial,2003,2,7,0.0,30.0,71XX VICTORIA DR,Victoria-Fraserview,495196.35,5451832.55,Friday


In [6]:
df2=df_temp.dropna()
#remove missing data, all (or nearly all) of which is the non-property crime data

In [7]:
df3=df2.rename(index=str, columns={"YEAR": "year", "MONTH": "month", "DAY":"day","HOUR":"hour", "MINUTE":"minute", "NEIGHBOURHOOD":"neighborhood"})
#rename columns as all caps is annoying

In [8]:
df4=df3.sort_values(['year','month','day','hour','minute'])
#sort by date

In [9]:
df5=df4.drop(['minute', 'HUNDRED_BLOCK','TYPE'], axis=1)
#remove extraneous data 

In [10]:
df6=df5.apply(pd.to_numeric, errors='ignore')

In [11]:
#bin by 12am-8am, 8am-4pm, 4pm -12am
hourbins = [-0.1,8.0,16.0,24.1]
hourlabels = ['1200am-0759am', '0800am-0359pm', '0400pm-1159pm']
df6['day_segment'] = pd.cut(df6["hour"], bins=hourbins,labels=hourlabels)
#group by neighborhood, by day_segment



In [12]:
df7=df6[['year', 'month', 'day', 'day_of_week','day_segment', 'neighborhood']]



In [13]:
df8=df7.groupby(df7.columns.tolist()).size()
#group by neighborhood, by day_segment

In [14]:
df9=pd.DataFrame(df8).reset_index()

In [15]:
df10=df9.rename(index=str, columns={ 0 :"number_of_crimes"})


In [19]:
df11=df10.copy()

In [20]:
df11.groupby('neighborhood', as_index=False)['number_of_crimes'].mean()

Unnamed: 0,neighborhood,number_of_crimes
0,Arbutus Ridge,1.343011
1,Central Business District,7.118849
2,Dunbar-Southlands,1.464528
3,Fairview,2.548079
4,Grandview-Woodland,2.2436
5,Hastings-Sunrise,1.833561
6,Kensington-Cedar Cottage,2.104495
7,Kerrisdale,1.430356
8,Killarney,1.517892
9,Kitsilano,2.267956


In [23]:
df11['neighborhood'].unique()


array(['Central Business District', 'Dunbar-Southlands', 'Fairview',
       'Grandview-Woodland', 'Hastings-Sunrise',
       'Kensington-Cedar Cottage', 'Kerrisdale', 'Killarney', 'Kitsilano',
       'Marpole', 'Mount Pleasant', 'Renfrew-Collingwood', 'Riley Park',
       'Shaughnessy', 'Strathcona', 'Sunset', 'Victoria-Fraserview',
       'West End', 'Arbutus Ridge', 'South Cambie', 'Oakridge',
       'West Point Grey', 'Musqueam', 'Stanley Park'], dtype=object)

In [27]:
df11.head()

Unnamed: 0,year,month,day,day_of_week,day_segment,neighborhood,number_of_crimes
0,2003,1,1,Wednesday,1200am-0759am,Central Business District,14
1,2003,1,1,Wednesday,1200am-0759am,Dunbar-Southlands,1
2,2003,1,1,Wednesday,1200am-0759am,Fairview,1
3,2003,1,1,Wednesday,1200am-0759am,Grandview-Woodland,6
4,2003,1,1,Wednesday,1200am-0759am,Hastings-Sunrise,3


In [24]:
nbhds1_175=[ 'Arbutus Ridge', 'Dunbar-Southlands',  'Kerrisdale', 'Killarney', 
            'Marpole', 'Musqueam','Oakridge','Riley Park','Shaughnessy', 'South Cambie', 'Stanley Park' ,
            'Victoria-Fraserview','West Point Grey']
nbhds175_3=['Fairview','Grandview-Woodland', 'Hastings-Sunrise','Kensington-Cedar Cottage',  'Kitsilano',
            'Mount Pleasant', 'Renfrew-Collingwood','Strathcona', 'Sunset', 'West End']
nbhdscbr=['Central Business District']

In [31]:
df_final=df11[df11['neighborhood'].isin(nbhds1_175)]

In [32]:
df_final.head()

Unnamed: 0,year,month,day,day_of_week,day_segment,neighborhood,number_of_crimes
1,2003,1,1,Wednesday,1200am-0759am,Dunbar-Southlands,1
6,2003,1,1,Wednesday,1200am-0759am,Kerrisdale,1
7,2003,1,1,Wednesday,1200am-0759am,Killarney,1
9,2003,1,1,Wednesday,1200am-0759am,Marpole,3
12,2003,1,1,Wednesday,1200am-0759am,Riley Park,2


In [None]:
wdf=pd.read_csv('data/BA_weather_data.csv')

In [None]:
wdf_temp=wdf.copy()

In [None]:
wdf2=wdf_temp[[' YEARMODA', 'TMAX', 'TMIN','PRCP']]


In [None]:
wdf3=wdf2.rename(index=str, columns={ " YEARMODA":"date","PRCP": "prcp", "TMAX":"tmax","TMIN":"tmin"})


In [None]:
def get_year(x):
    return int (str(x)[0:4])

def get_month(x):
    return int (str(x)[4:6])

def get_day(x):
    return int(str(x)[6:8])

def get_prcp(x):
    return float(x[1:5])
    
get_year(20030101)

In [None]:
wdf3['year'] = wdf3['date'].apply(get_year)
wdf3['month'] =  wdf3['date'].apply(get_month)
wdf3['day'] = wdf3['date'].apply(get_day)
wdf3['precipitation']=wdf3['prcp'].apply(get_prcp)

In [None]:
wdf3.head()


In [None]:
wdf4=wdf3[['year','month','day','tmax', 'tmin', 'precipitation']]

In [None]:
wdf_final=wdf4.copy()


In [None]:
new_df1=pd.merge(wdf_final,df_final, how='left', on=['year','month','day'])

In [None]:
new_df1.columns

In [None]:
cpi_df=pd.read_csv('data/consumer_price_index_nohead.csv')
cpi_df2=cpi_df.copy()

In [None]:
cpi_df2['year'] = cpi_df2.date.str.split('-').str.get(1)
cpi_df2['month'] = cpi_df2.date.str.split('-').str.get(0) 
cpi_df2.drop('date', axis=1,inplace=True)

In [None]:
import calendar
d=dict((v,k) for k,v in enumerate(calendar.month_abbr))
cpi_df2.month=cpi_df2.month.map(d)

cpi_df2.year='20'+ cpi_df2.year

In [None]:
cpi_df2=cpi_df2.apply(pd.to_numeric, errors='ignore')


In [None]:
new_df2=pd.merge(new_df1,cpi_df2, how='inner', on=['year','month'])

In [None]:
new_df2.columns

In [None]:
gdp_dftemp=pd.read_csv('data/gdp_2007dollars_nohead.csv')

In [None]:
gdp_df=gdp_dftemp.copy()

In [None]:
gdp_df['year'] = gdp_df.date.str.split('-').str.get(1)
gdp_df['month'] = gdp_df.date.str.split('-').str.get(0) 
gdp_df.drop('date', axis=1,inplace=True)

In [None]:
import calendar
d=dict((v,k) for k,v in enumerate(calendar.month_abbr))
gdp_df.month=gdp_df.month.map(d)

gdp_df.year='20'+ gdp_df.year

In [None]:
gdp_df=gdp_df.apply(pd.to_numeric, errors='ignore')


In [None]:
new_df3=pd.merge(new_df2,gdp_df, how='inner', on=['year','month'])

In [None]:
new_df3.columns

In [None]:
emp_df_init=pd.read_csv('data/employment_nohead.csv')
emp_df=emp_df_init.copy()


In [None]:
emp_df.head()

In [None]:
emp_df['year'] = emp_df.date.str.split('-').str.get(1)
emp_df['month'] = emp_df.date.str.split('-').str.get(0) 
emp_df.drop('date', axis=1,inplace=True)

In [None]:
import calendar
d=dict((v,k) for k,v in enumerate(calendar.month_abbr))
emp_df.month=emp_df.month.map(d)

emp_df.year='20'+ emp_df.year

In [None]:
emp_df=emp_df.apply(pd.to_numeric, errors='ignore')


In [None]:
new_df4=pd.merge(new_df3,emp_df, how='inner', on=['year','month'])

In [None]:
new_df4.columns

In [None]:
drugs_init=pd.read_csv('data/drug_offences_2006_to_2016.csv')
drugs_init

In [None]:
drugs_init.columns


In [None]:
drugs_df=drugs_init.copy()

In [None]:
drugs_df2=drugs_df[['year','Possession, cocaine ',
       'Heroin, possession ',]]

In [None]:
drugs_df3=drugs_df2.copy()

In [None]:
drugs_df3.loc[11]=[2017, 2047,1550]

In [None]:
drugs_df3.loc[12]=[2003, 4682,515]
drugs_df3.loc[13]=[2004, 4682,515]
drugs_df3.loc[14]=[2005, 4682,515]
#need real 2003, 2010, 2017
drugs_df3['Average_Heroin4_Price']=[210,191.3,141.1,261.6,271,282.3,335.5,298,254.5,254.5,304.4,257.95,281.16,257.95,245]
#drugs_df3['Average_Cocaine_Price']=['2003','2004','2005','2006','2007',70.2,96.8,'2010',80.5,'2012','2013',86,'2015','2016','2017']

In [None]:
new_df5=pd.merge(new_df4,drugs_df3, how='inner', on=['year'])

In [None]:
new_df6=new_df5.copy()
new_df6.columns

In [None]:
new_df6.columns

In [None]:
new_df6.info()

In [None]:
new_df6.apply(pd.to_numeric, errors='ignore')

In [None]:
day_segment_number=['day_segment']
neighborhood_number=['neighborhood']
day_of_week_number=['day_of_week']

In [None]:
new_df7=pd.get_dummies(new_df6,columns=neighborhood_number, drop_first=True)

In [None]:
new_df8=pd.get_dummies(new_df7,columns=day_segment_number, drop_first=True)

In [None]:
new_df9=pd.get_dummies(new_df8,columns=day_of_week_number, drop_first=True)

In [None]:
new_df9.dropna()

In [None]:
new_df9['number_of_crimes'].mean()

In [None]:
new_df9.describe()

In [None]:
new_df9.loc[new_df9['number_of_crimes']==471]

In [None]:
#remove outlier from 2011 riot
new_df9.loc[121067, 'number_of_crimes']=7

In [None]:
new_df9.describe()

In [None]:
new_df9.loc[new_df9['number_of_crimes']==97]

In [None]:
new_df9.loc[79928, 'number_of_crimes']=7

In [None]:
new_df9.describe()

In [None]:
new_df9.isna().sum()

In [None]:
new_df_final=new_df9.copy()
sns.distplot(new_df_final['number_of_crimes'])

In [None]:
nbhds1_175=[0,2,7,8,10,12,13,15, 16,17,18,21,23]
nbhds175_3=[3,4,5,6,9,11,14,19,20,22]
nbhdscbr=[1]

In [None]:
new_df_final.columns

In [None]:
df_high=new_df_final[new_df_final['neighborhodd']]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.3, random_state=42)

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)  # Don't cheat - fit only on training data
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)  # apply same transformation to test data

In [None]:
#sns.heatmap(new_df_final)

In [None]:
corr = new_df_final.corr()
#sns.heatmap(corr, 
            #xticklabels=corr.columns.values,
            #yticklabels=corr.columns.values)

In [None]:
corr

In [None]:
#pd.scatter_matrix(new_df_final, alpha = 0.3, figsize = (14,8), diagonal = 'kde');


In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
lm=LinearRegression()

In [None]:
lm.fit(X_train, y_train)

In [None]:
print(lm.intercept_)

In [None]:
coeff_df=pd.DataFrame(lm.coef_, X.columns, columns=['Coefficient'])
coeff_df

In [None]:
from sklearn import metrics
predictions=lm.predict(X_test)

In [None]:
sns.distplot(y_test-predictions)

In [None]:
print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', metrics.mean_squared_error(y_test, predictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))

In [None]:
from sklearn.metrics import r2_score
r2_score(y_test,predictions)

In [None]:
from sklearn import linear_model
clf=linear_model.Lasso(alpha=0.1)
clf.fit(X_train,y_train)

In [None]:
print(clf.coef_)

In [None]:
print(clf.intercept_)

In [None]:
coeff_df=pd.DataFrame(clf.coef_, X.columns, columns=['Coefficeient'])
coeff_df

In [None]:
predictions2=clf.predict(X_test)

In [None]:
sns.distplot(y_test-predictions2)

In [None]:
print('MAE:', metrics.mean_absolute_error(y_test, predictions2))
print('MSE:', metrics.mean_squared_error(y_test, predictions2))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions2)))

In [None]:
r2_score(y_test,predictions2)

In [None]:
from sklearn.ensemble import RandomForestRegressor

regr=RandomForestRegressor(max_depth=2, random_state=42)
regr.fit(X_train,y_train)

In [None]:
predictions3=regr.predict(X_test)

In [None]:
sns.distplot(y_test-predictions3)

In [None]:
print('MAE:', metrics.mean_absolute_error(y_test, predictions3))
print('MSE:', metrics.mean_squared_error(y_test, predictions3))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions3)))

In [None]:
r2_score(y_test,predictions3)

In [None]:
# from sklearn.grid_search import GridSearchCV


# param_dist = {"n_estimators":[10, 25, 50, 100],
#               "max_depth": [2, 5, 10, 30, 50],
#               "max_features": [3,4,5]}
              
# grid=GridSearchCV(regr,param_dist)
# grid.fit(X_train, y_train)

In [None]:
estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)


In [None]:
estimator.fit(X_train, y_train)

predictions4 = estimator.predict(X_test)

In [None]:
sns.distplot(y_test-predictions4)

In [None]:
print('MAE:', metrics.mean_absolute_error(y_test, predictions4))
print('MSE:', metrics.mean_squared_error(y_test, predictions4))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions4)))

In [None]:
r2_score(y_test,predictions4)

In [None]:
# from sklearn.grid_search import GridSearchCV


# param_dist = {"n_estimators":[10, 25, 50, 100,200,500],
#               "max_depth": [2,3,4,5, 10, 25, 50, 100],
#               "max_features": [3,4,5,7, 10,20]}
              
# grid=GridSearchCV(regr,param_dist)
# grid.fit(X_train, y_train)

In [None]:
estimator5=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [None]:
estimator5.fit(X_train, y_train)

predictions5 = estimator5.predict(X_test)

In [None]:
sns.distplot(y_test-predictions5)

In [None]:
print('MAE:', metrics.mean_absolute_error(y_test, predictions5))
print('MSE:', metrics.mean_squared_error(y_test, predictions5))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions5)))

In [None]:
r2_score(y_test,predictions5)

In [None]:
from sklearn.ensemble import AdaBoostRegressor

In [None]:
# ada=AdaBoostRegressor()
# from sklearn.grid_search import GridSearchCV


# param_dist = {"n_estimators":[10, 25, 50, 100],
#               "learning_rate": [1,0.5,0.1,0.05,0.01],
#               "random_state": [42]}
              
# grid=GridSearchCV(ada,param_dist)
# grid.fit(X_train, y_train)

In [None]:
estimator6=AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
         n_estimators=50, random_state=None)

In [None]:
estimator6.fit(X_train, y_train)

predictions6 = estimator6.predict(X_test)

In [None]:
sns.distplot(y_test-predictions6)

In [None]:
print('MAE:', metrics.mean_absolute_error(y_test, predictions6))
print('MSE:', metrics.mean_squared_error(y_test, predictions6))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions6)))

In [None]:
r2_score(y_test,predictions6)

In [None]:
from sklearn.linear_model import SGDRegressor

In [None]:
sgdr=SGDRegressor(shuffle=True)
sgdr.fit(X_train,y_train)

In [None]:
predictions7=sgdr.predict(X_test)

In [None]:
sns.distplot(y_test-predictions7)

In [None]:
print('MAE:', metrics.mean_absolute_error(y_test, predictions7))
print('MSE:', metrics.mean_squared_error(y_test, predictions7))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions7)))

In [None]:
r2_score(y_test,predictions7)

In [None]:
sgdr.get_params()

In [None]:
#  from sklearn.grid_search import GridSearchCV



# param_dist = {"alpha":[0.01,0.001,0.001],
#               'epsilon':[0.2,0.1,0.05,0.01],
#               "loss": ['squared_loss','huber','epsilon_insensitive'],
#               "penalty": ['l2','l1','elasticnet'],
#              "average":[True, False]}
              
# grid=GridSearchCV(sgdr,param_dist)
# grid.fit(X_train, y_train)

In [None]:
estimator8=SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', max_iter=None, n_iter=None, penalty='l2',
       power_t=0.25, random_state=None, shuffle=True, tol=None, verbose=0,
       warm_start=False)

In [None]:
estimator8.fit(X_train, y_train)

predictions8 = estimator8.predict(X_test)

In [None]:
sns.distplot(y_test-predictions8)

In [None]:
print('MAE:', metrics.mean_absolute_error(y_test, predictions8))
print('MSE:', metrics.mean_squared_error(y_test, predictions8))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions8)))

In [None]:
r2_score(y_test,predictions8)

In [None]:
from sklearn.linear_model import ElasticNetCV

In [None]:
elan=ElasticNetCV(normalize=False)

In [None]:
elan.fit(X_train,y_train)

In [None]:
predictions9=elan.predict(X_test)

In [None]:
sns.distplot(y_test-predictions9)

In [None]:
print('MAE:', metrics.mean_absolute_error(y_test, predictions9))
print('MSE:', metrics.mean_squared_error(y_test, predictions9))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions9)))

In [None]:
r2_score(y_test,predictions9)

In [None]:
from sklearn.linear_model import BayesianRidge

In [None]:
br = BayesianRidge(normalize=False)
br.fit(X_train,y_train)

In [None]:
predictions10=br.predict(X_test)
print('MAE:', metrics.mean_absolute_error(y_test, predictions10))
print('MSE:', metrics.mean_squared_error(y_test, predictions10))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions10)))

In [None]:
r2_score(y_test,predictions10)

In [None]:
sns.distplot(y_test-predictions10)

In [None]:
# from sklearn.grid_search import GridSearchCV
# br = BayesianRidge(normalize=False)

# param_dist = {"n_iter":[100,300,500,1000]}
              
# grid=GridSearchCV(sgdr,param_dist)
# grid.fit(X_train, y_train)

In [None]:
estimator11=SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', max_iter=None, n_iter=None, penalty='l2',
       power_t=0.25, random_state=None, shuffle=True, tol=None, verbose=0,
       warm_start=False)

estimator11.fit(X_train, y_train)

predictions11 = estimator11.predict(X_test)

In [None]:
print('MAE:', metrics.mean_absolute_error(y_test, predictions11))
print('MSE:', metrics.mean_squared_error(y_test, predictions11))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions11)))

In [None]:
r2_score(y_test,predictions11)

In [None]:
 sns.distplot(y_test-predictions11)

In [None]:
from sklearn.ensemble import BaggingRegressor
bagreg=BaggingRegressor()
bagreg.fit(X_train,y_train)


In [None]:
predictions12=bagreg.predict(X_test)
print('MAE:', metrics.mean_absolute_error(y_test, predictions12))
print('MSE:', metrics.mean_squared_error(y_test, predictions12))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions12)))

In [None]:
r2_score(y_test,predictions12)

In [None]:
sns.distplot(y_test-predictions12)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
gbr=GradientBoostingRegressor()
gbr.fit(X_train,y_train)

In [None]:
predictions13=gbr.predict(X_test)
print('MAE:', metrics.mean_absolute_error(y_test, predictions13))
print('MSE:', metrics.mean_squared_error(y_test, predictions13))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions13)))

In [None]:
r2_score(y_test,predictions13)

In [None]:
sns.distplot(y_test-predictions13)

In [None]:
# from sklearn.grid_search import GridSearchCV
              
# grid=GridSearchCV(gbr, {"max_depth":[2,4,6,8,10],
#               'n_estimators':[50,100,200]},
#               verbose=1)
# grid.fit(X_train, y_train)

In [None]:
from xgboost.sklearn import XGBRegressor
xgb=XGBRegressor()
xgb.fit(X_train,y_train)

In [None]:
predictions14=xgb.predict(X_test)
print('MAE:', metrics.mean_absolute_error(y_test, predictions14))
print('MSE:', metrics.mean_squared_error(y_test, predictions14))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions14)))

In [None]:
r2_score(y_test,predictions14)

In [None]:
sns.distplot(y_test-predictions14)

In [None]:
xgb.feature_importances_


In [None]:
coeff_df2=pd.DataFrame(xgb.feature_importances_, X.columns, columns=['Coefficient'])
coeff_df

In [None]:
# from sklearn.grid_search import GridSearchCV
              
# grid=GridSearchCV(xgb, {"max_depth":[2,4,6,8,10],
#               'n_estimators':[50,100,200]},
#               verbose=1)
# grid.fit(X_train, y_train)

In [None]:
estimator15=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)
estimator15.fit(X_train,y_train)

In [None]:
predictions15=estimator15.predict(X_test)
print('MAE:', metrics.mean_absolute_error(y_test, predictions15))
print('MSE:', metrics.mean_squared_error(y_test, predictions15))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions15)))

In [None]:
r2_score(y_test,predictions15)

In [None]:
sns.distplot(y_test-predictions15)

In [None]:
from sklearn.neural_network import MLPRegressor
mlp=MLPRegressor()
mlp.fit(X_train,y_train)
predictions16=mlp.predict(X_test)
print('MAE:', metrics.mean_absolute_error(y_test, predictions16))
print('MSE:', metrics.mean_squared_error(y_test, predictions16))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions16)))

In [None]:
r2_score(y_test,predictions16)

In [None]:
sns.distplot(y_test-predictions16)

In [None]:
gs = GridSearchCV(mlp, param_grid={
    'learning_rate': [0.05, 0.01, 0.005, 0.001],
    'hidden0__units': [4, 8, 12],
    'hidden0__type': ["Rectifier", "Sigmoid", "Tanh"]})
gs.fit(X_train,y_train)