In [1]:
!pip install xgboost

Collecting xgboost
  Using cached xgboost-1.6.2-py3-none-manylinux2014_x86_64.whl (255.9 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.6.2


In [2]:
import pandas as pd
import xgboost as xgb
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from google.cloud import bigquery

Data we have used:

We use the BigQuery public dataset of natality which describes all United States births registered in the 50 States, the District of Columbia, and New York City from 1969 to 2008.
In that, we use the weight (in pounds), sex, mother's age, plurality and gestation period data from the year 2000 onwards for our Case Study. We use the following query to extract the necessary dataset.

In [3]:
query="""
SELECT
  weight_pounds,
  is_male,
  mother_age,
  plurality,
  gestation_weeks
FROM
  publicdata.samples.natality
WHERE year > 2000
LIMIT 10000
"""

In [4]:
df=bigquery.Client().query(query).to_dataframe()

In [5]:
df.head()# showing the head of the dataset

Unnamed: 0,weight_pounds,is_male,mother_age,plurality,gestation_weeks
0,7.561856,False,18,1,39.0
1,4.750962,True,22,1,40.0
2,7.374463,False,28,1,37.0
3,7.813183,False,31,1,41.0
4,3.688334,False,29,2,35.0


Now we need to check the number of male babies in the data set through value counts and check whether the data set is balanced or not

In [6]:
df['is_male'].value_counts()

True     5152
False    4848
Name: is_male, dtype: int64

In [7]:
df=df.dropna()#use dropna to delete the rows which contains the null values

In [8]:
#shuffle the data
df=shuffle(df, random_state=42)#using shuffle in the piror of the model to create more representative traing and testing set
df.head()

Unnamed: 0,weight_pounds,is_male,mother_age,plurality,gestation_weeks
5309,7.813183,True,34,1,39.0
514,7.828615,False,35,1,39.0
4359,7.716179,False,24,1,38.0
8445,6.311835,True,23,1,39.0
9104,4.373971,False,37,3,34.0


In [9]:
df.head(10)

Unnamed: 0,weight_pounds,is_male,mother_age,plurality,gestation_weeks
5309,7.813183,True,34,1,39.0
514,7.828615,False,35,1,39.0
4359,7.716179,False,24,1,38.0
8445,6.311835,True,23,1,39.0
9104,4.373971,False,37,3,34.0
8711,7.835229,True,39,1,37.0
7085,6.503637,False,35,1,37.0
4000,8.688418,True,33,1,37.0
9253,3.6244,False,42,2,32.0
1322,6.311835,True,33,1,39.0


In [10]:
labels= df['weight_pounds']
data=df.drop(columns=['weight_pounds'])

In [11]:
data['is_male']=data['is_male'].astype(int)#converting the is_male column from boolean to integer

In [12]:
data.head()

Unnamed: 0,is_male,mother_age,plurality,gestation_weeks
5309,1,34,1,39.0
514,0,35,1,39.0
4359,0,24,1,38.0
8445,1,23,1,39.0
9104,0,37,3,34.0


In [13]:
x,y=data,labels
x_train,x_test,y_train,y_test=train_test_split(x,y)

In [14]:
model= xgb.XGBRegressor(objective='reg:linear')#creating the model through xgboostregressor

In [15]:
model.fit(x_train, y_train)#train the model



XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, objective='reg:linear', predictor='auto',
             random_state=0, reg_alpha=0, ...)

In [16]:
y_pred= model.predict(x_test)#predict the test set model

In [17]:
for i in range(20):
    print('Predicted weight:', y_pred[i])
    print('Actual weight:', y_test.iloc[i])
    print()

Predicted weight: 7.4805546
Actual weight: 6.6910296517

Predicted weight: 7.24817
Actual weight: 6.5256829552

Predicted weight: 6.5114985
Actual weight: 5.43659938092

Predicted weight: 7.5639505
Actual weight: 7.20250209954

Predicted weight: 7.641326
Actual weight: 8.000575487979999

Predicted weight: 7.4151044
Actual weight: 6.2501051276999995

Predicted weight: 7.1612544
Actual weight: 8.62448368944

Predicted weight: 4.4595304
Actual weight: 3.42157430624

Predicted weight: 6.028756
Actual weight: 5.401325419

Predicted weight: 7.4352884
Actual weight: 9.31232594688

Predicted weight: 6.9320893
Actual weight: 8.298199541679999

Predicted weight: 7.4337134
Actual weight: 6.6359140862

Predicted weight: 7.7485557
Actual weight: 8.437090766739999

Predicted weight: 7.3122745
Actual weight: 8.14828520352

Predicted weight: 5.628828
Actual weight: 6.503636729

Predicted weight: 6.942165
Actual weight: 8.18796841068

Predicted weight: 7.8251133
Actual weight: 7.9608922808199996

Predi

In [18]:
model.save_model('kk=model.bst')

In [19]:
GCP_PROJECT='1428Kaushik Kar CTS'
MODEL_BUCKET='gs://kk-bucket'
VERSION_NAME='v1'
MODEL_NAME='model.bst'

In [20]:
!gsutil mb $MODEL_BUCKET

Creating gs://kk-bucket/...
ServiceException: 409 A Cloud Storage bucket named 'kk-bucket' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.


In [21]:
!gsutil cp ./kk=model.bst $MODEL_BUCKET

Copying file://./kk=model.bst [Content-Type=application/octet-stream]...
/ [1 files][293.2 KiB/293.2 KiB]                                                
Operation completed over 1 objects/293.2 KiB.                                    


In [59]:
model1= xgb.XGBRegressor(objective='reg:linear',max_depth=7,learning_rate=0.1)

In [60]:
model1.fit(x_train, y_train)



XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=7, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, objective='reg:linear', predictor='auto',
             random_state=0, reg_alpha=0, ...)

In [61]:
y_pred1= model1.predict(x_test)

In [62]:
for i in range(20):
    print('Predicted weight:', y_pred1[i])
    print('Actual weight:', y_test.iloc[i])
    print()

Predicted weight: 7.5014534
Actual weight: 6.6910296517

Predicted weight: 7.2278037
Actual weight: 6.5256829552

Predicted weight: 6.5189414
Actual weight: 5.43659938092

Predicted weight: 7.5135183
Actual weight: 7.20250209954

Predicted weight: 7.5784307
Actual weight: 8.000575487979999

Predicted weight: 7.4096274
Actual weight: 6.2501051276999995

Predicted weight: 7.1798453
Actual weight: 8.62448368944

Predicted weight: 4.441638
Actual weight: 3.42157430624

Predicted weight: 6.0949483
Actual weight: 5.401325419

Predicted weight: 7.4621286
Actual weight: 9.31232594688

Predicted weight: 7.0019193
Actual weight: 8.298199541679999

Predicted weight: 7.3816953
Actual weight: 6.6359140862

Predicted weight: 7.746577
Actual weight: 8.437090766739999

Predicted weight: 7.302982
Actual weight: 8.14828520352

Predicted weight: 5.9219513
Actual weight: 6.503636729

Predicted weight: 7.0177374
Actual weight: 8.18796841068

Predicted weight: 7.8975396
Actual weight: 7.9608922808199996

Pr

CONCLUSION(Comparing the two above results)

In the above model we have seen that by changing the learning rate from 0.1 to 0.3 and having max depth 3 the model will not ne better furhter. So by changing again the learning rate to 0.1 and max_depth 7 we have seen the model predicts slightly better in the above case

In [None]:
#we can also use a for loop for different learning rate
lr = [0.001,0.01,0.05,0.1,0.25]

for i in lr:
    model_i = Xgb.XGBRegressor(objective='reg:linear',learning_rate=i)
    model_i.fit(x_train,y_train)
    y_pred_i = model_i.predict(x_test)
    print(f"R-square value when max_depth is {i} is: ",r2_score(y_test,y_pred_i))