In [None]:
import os
from google.cloud import bigquery

# Python Variable

In [None]:
# Notebook-wide settings: change these to try this notebook out.
# Google account e-mail passed to `gcloud auth login` and `gcloud config set account`.
ACCOUNT = 'sandcorp2014@gmail.com'
# Name of the service account that the gcloud cells create and authorize.
SAC = 'jupyter-notebook-sac'
# Local directory in which the service-account JSON key file is written.
SAC_KEY_DESTINATION = '/media/mujahid7292/Data/Gcloud_Tem_SAC'
# Google Cloud project ID used by the gcloud / gsutil cells.
PROJECT = 'ml-practice-260405'
# Cloud Storage bucket name used by the gsutil cells.
BUCKET = 'bucket-ml-practice-260405'
# Region used for the gcloud compute/region default and as the bucket location.
REGION = 'us-central1'

# Bash Variable

In [None]:
# Mirror the Python settings into the process environment so the %%bash cells
# can read them as $ACCOUNT, $SAC, $SAC_KEY_DESTINATION, $BUCKET, $PROJECT, $REGION.
os.environ.update({
    'ACCOUNT': ACCOUNT,
    'SAC': SAC,
    'SAC_KEY_DESTINATION': SAC_KEY_DESTINATION,
    'BUCKET': BUCKET,
    'PROJECT': PROJECT,
    'REGION': REGION,
})

# Log In To Google Cloud

In [None]:
%%bash
# Authorize gcloud with the user account stored in $ACCOUNT.
# The expansion is quoted so the value is always passed as a single argument.
gcloud auth login "${ACCOUNT}"

# Set Current Project

In [6]:
%%bash
# Make $PROJECT the default project for subsequent gcloud commands.
# The expansion is quoted so the value is always passed as a single argument.
gcloud config set project "${PROJECT}"

Updated property [core/project].


# Create Service Account For Jupyter Notebook

In [None]:
%%bash
# Create the notebook's service account unless it already exists, so the cell
# can be re-run without failing on an "already exists" error.
SAC_EMAIL="${SAC}@${PROJECT}.iam.gserviceaccount.com"
if ! gcloud iam service-accounts describe "${SAC_EMAIL}" > /dev/null 2>&1; then
  gcloud iam service-accounts create "${SAC}" \
      --description "This service account will help us to connect local jupyter notebook with server" \
      --display-name "jupyter-sac"
fi

# Enabling The Service Account

In [None]:
%%bash
# Enable the service account in $PROJECT.
# Braces and quotes make the e-mail address expand as one unambiguous argument.
gcloud iam service-accounts enable "${SAC}@${PROJECT}.iam.gserviceaccount.com" --project "${PROJECT}"

# See the list of service account

In [None]:
%%bash
# List service accounts using the current gcloud configuration; the notebook's
# service account should appear in the output.
gcloud iam service-accounts list

# Create New Key For Above Service Account

In [None]:
%%bash
# Create a new JSON key for the service account and save it under
# $SAC_KEY_DESTINATION. The directory is created first if it is missing, and
# the path is quoted so that directories containing spaces still work.
mkdir -p "${SAC_KEY_DESTINATION}"
gcloud iam service-accounts keys create "${SAC_KEY_DESTINATION}/${SAC}.json" \
  --iam-account "${SAC}@${PROJECT}.iam.gserviceaccount.com"

It may take up to 60 seconds before a newly created key can be used for authentication. If you experience authentication failures immediately after creating a new key, ensure that 60 seconds have elapsed before trying again.

# Activate the service account with above key

In [None]:
%%bash
# Switch gcloud's active credentials to the service account via its key file.
# The path is quoted so that directories containing spaces still work.
gcloud auth activate-service-account \
  --key-file="${SAC_KEY_DESTINATION}/${SAC}.json"

# Set Google Application Credentials

In [None]:
import os

# Point GOOGLE_APPLICATION_CREDENTIALS at the service-account key file so that
# code running in this kernel (and the %%bash cells it spawns) can find it.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = '%s/%s.json' % (
    SAC_KEY_DESTINATION,
    SAC,
)

Check Whether Google Application Credential Was Set Successfully Outside Virtual Environment

In [None]:
%%bash
# Print the shell's GOOGLE_APPLICATION_CREDENTIALS entry to confirm it was set;
# grep exits non-zero when there is no matching line.
set | grep GOOGLE_APPLICATION_CREDENTIALS 

# Set Default Project And Region

In [None]:
%%bash
# Make the user account, project and compute region the gcloud defaults.
# Expansions are quoted so each value is always passed as a single argument.
gcloud config set account "${ACCOUNT}"
gcloud config set project "${PROJECT}"
gcloud config set compute/region "${REGION}"

# Give BigQuery Access Permission To This Service Account

In [None]:
%%bash
# Grant the service account the roles/bigquery.admin role on the project.
# Braces and quotes make the member string expand as one unambiguous argument.
gcloud projects add-iam-policy-binding "${PROJECT}" \
    --member "serviceAccount:${SAC}@${PROJECT}.iam.gserviceaccount.com" \
    --role roles/bigquery.admin

# Give GCS Access Permission To This Service Account

In [None]:
%%bash
# Grant the service account the roles/storage.admin role on the project.
# Braces and quotes make the member string expand as one unambiguous argument.
gcloud projects add-iam-policy-binding "${PROJECT}" \
    --member "serviceAccount:${SAC}@${PROJECT}.iam.gserviceaccount.com" \
    --role roles/storage.admin

<h1> 1. Exploring natality dataset </h1>

This notebook illustrates:
<ol>
<li> Exploring a BigQuery dataset using AI Platform Notebooks.</li>
</ol>

In [None]:
%%bash
# Create the bucket only when it is not already listed, so re-running is safe.
# grep -F matches the bucket URL literally rather than as a regular expression,
# and every expansion is quoted so it is passed as a single argument.
if ! gsutil ls | grep -qF "gs://${BUCKET}/"; then
  gsutil mb -l "${REGION}" -p "${PROJECT}" "gs://${BUCKET}"
fi

In [None]:
%%bash
# Create the bucket from the configured $REGION / $PROJECT / $BUCKET settings
# rather than hard-coded names, and skip creation when the bucket already
# exists so that the cell can be re-run without failing.
if ! gsutil ls -b "gs://${BUCKET}" > /dev/null 2>&1; then
  gsutil mb -l "${REGION}" -p "${PROJECT}" "gs://${BUCKET}"
fi

<h2> Explore data </h2>

The data is <a href='https://console.cloud.google.com/bigquery?GK=publicdata&page=table&t=natality&d=samples&p=publicdata&redirect_from_classic=true&project=corona-patient-finder&folder=&organizationId='>natality data</a> (record of births in the US). My goal is to predict the baby's weight given a number of factors about the pregnancy and the baby's mother.  Later, we will want to split the data into training and eval datasets. The hash of the year-month will be used for that -- this way, twins born on the same day won't end up in different cuts of the data.

In [None]:
# Create SQL query using natality data after the year 2000.
# Besides the raw columns, it computes `hashmonth`: a FARM_FINGERPRINT of the
# year and month concatenated as strings. The notebook text describes using
# this hash later to split the data into training and eval sets.
# The query has no LIMIT clause; callers append one when only a sample is needed.
query = """
SELECT
  weight_pounds,
  is_male,
  mother_age,
  plurality,
  gestation_weeks,
  FARM_FINGERPRINT(CONCAT(CAST(YEAR AS STRING), CAST(month AS STRING))) AS hashmonth
FROM
  publicdata.samples.natality
WHERE year > 2000
"""

In [None]:
# Run the query capped at 100 rows and preview the result as a dataframe
preview_job = bigquery.Client().query(query + " LIMIT 100")
df = preview_job.to_dataframe()
df.head()

Let's write a query to find the unique values for each of the columns and the count of those values.
This is important to ensure that we have enough examples of each data value, and to verify our hunch that the parameter has predictive value.

In [None]:
def get_distinct_values(column_name):
  """Summarise natality records after the year 2000, grouped by one column.

  Args:
    column_name: Name of the natality column to group by. It is substituted
      directly into the SQL text.

  Returns:
    The query result as a dataframe with one row per distinct value of
    `column_name`, holding the number of records (`num_babies`) and the
    average `weight_pounds` (`avg_wt`) for that value.
  """
  template = """
SELECT
  {0},
  COUNT(1) AS num_babies,
  AVG(weight_pounds) AS avg_wt
FROM
  publicdata.samples.natality
WHERE
  year > 2000
GROUP BY
  {0}
  """
  sql = template.format(column_name)
  client = bigquery.Client()
  return client.query(sql).to_dataframe()

In [None]:
# Bar plots of record count and average weight per is_male value (linear axes)
df = get_distinct_values('is_male')
df.plot.bar(x='is_male', y='num_babies');
df.plot.bar(x='is_male', y='avg_wt');

In [None]:
# Line plots of record count and average weight by mother_age (linear axes),
# sorted by age so the lines run left to right
df = get_distinct_values('mother_age').sort_values('mother_age')
df.plot.line(x='mother_age', y='num_babies');
df.plot.line(x='mother_age', y='avg_wt');

In [None]:
# Bar plots by plurality (singleton, twins, etc.): record count on a
# logarithmic y-axis, average weight on a linear one
df = get_distinct_values('plurality').sort_values('plurality')
df.plot.bar(x='plurality', y='num_babies', logy=True);
df.plot.bar(x='plurality', y='avg_wt');

In [None]:
# Bar plots by gestation_weeks: record count on a logarithmic y-axis,
# average weight on a linear one
df = get_distinct_values('gestation_weeks').sort_values('gestation_weeks')
df.plot.bar(x='gestation_weeks', y='num_babies', logy=True);
df.plot.bar(x='gestation_weeks', y='avg_wt');

All these factors seem to play a part in the baby's weight. Male babies are heavier on average than female babies. Teenaged and older moms tend to have lower-weight babies. Twins, triplets, etc. are lower weight than single births. Preemies weigh in lower as do babies born to single moms. In addition, it is important to check whether you have enough data (number of babies) for each input value. Otherwise, the model prediction against input values that don't have enough data may not be reliable.
<p>
In the next notebook, I will develop a machine learning model to combine all of these factors to come up with a prediction of a baby's weight.

Copyright 2017-2018 Google Inc. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License