In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

import numpy as np
import json

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from google.cloud import bigquery
from witwidget.notebook.visualization import WitWidget, WitConfigBuilder

In [18]:
# Pull a 10,000-row sample of births after the year 2000 from the public
# natality table: the birth weight plus four other columns.
query = """
SELECT
  weight_pounds,
  is_male,
  mother_age,
  plurality,
  gestation_weeks
FROM
  publicdata.samples.natality
WHERE year > 2000
LIMIT 10000
"""
# Run the query with a client built with no arguments and load the
# result into a DataFrame.
bq_client = bigquery.Client()
query_job = bq_client.query(query)
df = query_job.to_dataframe()
df.head()

Unnamed: 0,weight_pounds,is_male,mother_age,plurality,gestation_weeks
0,6.68662,True,18,1,43.0
1,9.360828,True,32,1,41.0
2,8.437091,False,30,1,39.0
3,6.124442,False,24,1,40.0
4,7.12534,False,26,1,41.0


In [19]:
# Summary statistics (count, mean, std, min, quartiles, max) for the DataFrame's columns
df.describe()

Unnamed: 0,weight_pounds,mother_age,plurality,gestation_weeks
count,9989.0,10000.0,10000.0,9890.0
mean,7.297602,27.2989,1.0344,38.699798
std,1.291685,6.165838,0.192926,2.539957
min,0.612885,12.0,1.0,17.0
25%,6.624891,22.0,1.0,38.0
50%,7.374463,27.0,1.0,39.0
75%,8.124034,32.0,1.0,40.0
max,12.257702,50.0,3.0,47.0


In [20]:
# Count how many rows carry each is_male value
df['is_male'].value_counts()

True     5150
False    4850
Name: is_male, dtype: int64

In [21]:
# Share of rows carrying each is_male value: per-value counts divided by the total row count
is_male_counts = df['is_male'].value_counts()
is_male_counts / len(df)

True     0.515
False    0.485
Name: is_male, dtype: float64

In [6]:
# Regression model - to predict a baby's weight

In [22]:
# Number of missing values in each column
df.isna().sum()

weight_pounds       11
is_male              0
mother_age           0
plurality            0
gestation_weeks    110
dtype: int64

In [24]:
# Remove every row that contains a missing value, then randomise the row
# order; the fixed random_state keeps the permutation repeatable.
df = shuffle(df.dropna(), random_state=2)
df.head()

Unnamed: 0,weight_pounds,is_male,mother_age,plurality,gestation_weeks
8103,7.12534,True,27,1,40.0
9816,4.563569,False,36,2,34.0
3310,8.653144,True,36,1,36.0
4843,7.275255,True,25,1,39.0
5719,6.937947,False,39,1,38.0


In [25]:
# Target: the weight_pounds column.
labels = df['weight_pounds']
# Features: every other column, with is_male cast to an integer.
data = (
    df.drop(columns=['weight_pounds'])
      .assign(is_male=lambda features: features['is_male'].astype(int))
)

In [26]:
# Split the features and labels into train and test sets with scikit-learn's
# train_test_split utility before building the model.
x, y = data, labels
# test_size=0.25 spells out the library default the original call relied on.
# random_state pins the split (same seed as the shuffle step) so the same rows
# land in the train and test sets on every Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=2
)

In [27]:
# Build the TensorFlow model: as in any neural network, the first layer fixes
# the number of inputs and the last layer fixes the number of outputs.
# The tf.keras Sequential API defines the model as a stack of layers, added
# here one at a time.
n_features = len(x_train.iloc[0])  # length of the first training row: one value per feature column

model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(n_features,)))
model.add(Dense(32, activation='relu'))
model.add(Dense(1))  # single output unit, no activation argument

In [29]:
# How the model is measured: mean squared error is the training loss, and
# both 'mae' and 'mse' are reported as metrics.
rmsprop = tf.keras.optimizers.RMSprop()
mse_loss = tf.keras.losses.MeanSquaredError()
model.compile(optimizer=rmsprop, loss=mse_loss, metrics=['mae', 'mse'])

In [30]:
# Train the model. `epochs` sets how many training passes to run; more passes
# train the model further, but watch out for overfitting.
# The training log lets you see how the loss (error) falls as the number of
# passes grows.
model.fit(
    x_train,
    y_train,
    epochs=10,
    validation_split=0.1,
)

Train on 6670 samples, validate on 742 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7ff4f2827f10>

In [31]:
# Number of rows in the test set
x_test.shape[0]

2471

In [32]:
# Generate predictions for the first few test examples
num_examples = 10
first_test_rows = x_test.iloc[:num_examples]
predictions = model.predict(first_test_rows)

In [33]:
# Print each prediction next to the true label at the same position in the test set.
for row_pos in range(num_examples):
    predicted_weight = predictions[row_pos][0]
    print('Predicted val: ', predicted_weight)
    actual_weight = y_test.iloc[row_pos]
    print('Actual val: ', actual_weight)
    print()

Predicted val:  7.7905955
Actual val:  8.56275425608

Predicted val:  7.8353987
Actual val:  7.3744626639

Predicted val:  7.932739
Actual val:  7.62578964258

Predicted val:  7.4239016
Actual val:  7.12534030784

Predicted val:  7.437936
Actual val:  8.062304921339999

Predicted val:  8.073793
Actual val:  7.43839671988

Predicted val:  7.701111
Actual val:  8.1019881285

Predicted val:  8.153933
Actual val:  8.313631900019999

Predicted val:  7.8906646
Actual val:  9.6231777363

Predicted val:  7.776035
Actual val:  7.3744626639

