# ESTIMATORS


Let's go over the steps of using Estimators one by one (yet again):
#### PREPROCESSING
1. Clean the data.
2. Create normalized columns.
3. Fix Missing records.
4. Split into train and test sets.

#### CREATE FEATURE COLUMNS
- Create a feature column for every column using tf.feature_column.
- Make a list of all columns created.

#### CREATING THE ESTIMATOR OBJECT / MODEL
1. Use tf.estimator to create an estimator object.
2. Pass the feature columns list and the number of classes to it.

#### CREATE INPUT, TEST AND PREDICTION FUNCTIONS.
1. INPUT
    - Input X_train and y_train.
    - Use tf.estimator.inputs
    - Will go through model.train
2. TEST
    - Input X_test and y_test.
    - Use tf.estimator.inputs
    - Will go through model.evaluate
3. PREDICTION
    - Input X_test.
    - Use tf.estimator.inputs
    - Will go through model.predict




In [17]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline
# Load the diabetes dataset from a CSV file (path is relative to this notebook).
diabetes = pd.read_csv(filepath_or_buffer='../Datasets/diabetes.csv')
# Preview the first five rows as a quick sanity check of the load.
diabetes.head(n=5)

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Labels
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [18]:
# Inspect the column labels of the loaded frame: the feature columns
# (Var1..Var8) and the 'Labels' column that is later used as the target.
diabetes.columns

Index(['Var1', 'Var2', 'Var3', 'Var4', 'Var5', 'Var6', 'Var7', 'Var8',
       'Labels'],
      dtype='object')

In [19]:
# Min-max normalisation of the feature columns: (value - min) / (max - min),
# computed column by column. The 'Labels' column is left untouched.
# NOTE(review): min and max are taken over the whole dataset, i.e. before the
# train/test split that happens in a later cell.
feature_scaling = ['Var{}'.format(idx) for idx in range(1, 9)]
raw_features = diabetes[feature_scaling]
col_min = raw_features.min()
col_range = raw_features.max() - col_min
# DataFrame/Series arithmetic aligns on column labels, so each column is
# shifted and scaled by its own min and range.
diabetes[feature_scaling] = (raw_features - col_min) / col_range
diabetes.head()

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Labels
0,0.352941,0.743719,0.590164,0.353535,0.0,0.500745,0.234415,0.483333,1
1,0.058824,0.427136,0.540984,0.292929,0.0,0.396423,0.116567,0.166667,0
2,0.470588,0.919598,0.52459,0.0,0.0,0.347243,0.253629,0.183333,1
3,0.058824,0.447236,0.540984,0.232323,0.111111,0.418778,0.038002,0.0,0
4,0.0,0.688442,0.327869,0.353535,0.198582,0.642325,0.943638,0.2,1


## LET'S CREATE A SIMPLE LINEAR CLASSIFIER

In [20]:
#STEPS TO USE THE ESTIMATOR API
# Imports used by this cell are kept at the top of the cell.
from sklearn.model_selection import train_test_split

#STEP1: DEFINE A LIST OF FEATURE COLUMNS
# One numeric feature column per input variable; the key passed to
# numeric_column must match the DataFrame column name.
var1 = tf.feature_column.numeric_column('Var1')
var2 = tf.feature_column.numeric_column('Var2')
var3 = tf.feature_column.numeric_column('Var3')
var4 = tf.feature_column.numeric_column('Var4')
var5 = tf.feature_column.numeric_column('Var5')
var6 = tf.feature_column.numeric_column('Var6')
var7 = tf.feature_column.numeric_column('Var7')
var8 = tf.feature_column.numeric_column('Var8')
#===================================================
# Categorical Columns (reference templates, not used for this dataset)
#cat1 = tf.feature_column.categorical_column_with_vocabulary_list('Column Name', ['A', 'B', 'C','D'])
#cat2 = tf.feature_column.categorical_column_with_hash_bucket('Column Name',hash_bucket_size=10)
#===================================================
# Numeric to Categorical (reference template, not used for this dataset)
#cat3 = tf.feature_column.bucketized_column(variable_name, boundaries=[10,20,30,40,50,60])
feat_cols = [var1, var2, var3, var4, var5, var6, var7, var8]
labels = diabetes['Labels']

#SPLIT DATA
# Features are every column except the target; the target is 'Labels'.
x_data = diabetes.drop('Labels', axis=1)
y_data = labels
print("============================================")
print(x_data.head())
print("============================================")
print(y_data.head())
print("============================================")
# Hold out 30% of the rows for testing. A fixed random_state makes the split
# itself identical on every run (model training is still not seeded here).
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=0.3, random_state=101
)

#STEP2: CREATE AN ESTIMATOR OBJECT
model = tf.estimator.LinearClassifier(feature_columns=feat_cols, n_classes=2)

#STEP3: CREATE THE INPUT FUNCTIONS (TRAIN, TEST, PREDICT)
# NOTE: tf.estimator.inputs is the TensorFlow 1.x location of pandas_input_fn;
# under TensorFlow 2.x it is reached via tf.compat.v1.estimator.inputs.
# Training input: shuffled mini-batches of 10, up to 100 passes over the data.
input_func = tf.estimator.inputs.pandas_input_fn(
    x=x_train, y=y_train, batch_size=10, num_epochs=100, shuffle=True
)
# Evaluation input: a single ordered pass over the held-out rows, with labels.
test_func = tf.estimator.inputs.pandas_input_fn(
    x=x_test, y=y_test, batch_size=10, num_epochs=1, shuffle=False
)
# Prediction input: the same held-out rows, without labels.
pred_func = tf.estimator.inputs.pandas_input_fn(
    x=x_test, batch_size=10, num_epochs=1, shuffle=False
)

#STEP4: TRAIN, EVALUATE AND PREDICT ON ESTIMATOR OBJECT
print("============================================")
model.train(input_fn=input_func, steps=100)
print("============================================")
results = model.evaluate(test_func)
# Show the evaluation metrics instead of silently discarding them.
print(results)
print("============================================")
# model.predict yields one dict per test row; keep only the class probabilities.
prob = [p['probabilities'] for p in model.predict(pred_func)]
prob[:10]



       Var1      Var2      Var3      Var4      Var5      Var6      Var7  \
0  0.352941  0.743719  0.590164  0.353535  0.000000  0.500745  0.234415   
1  0.058824  0.427136  0.540984  0.292929  0.000000  0.396423  0.116567   
2  0.470588  0.919598  0.524590  0.000000  0.000000  0.347243  0.253629   
3  0.058824  0.447236  0.540984  0.232323  0.111111  0.418778  0.038002   
4  0.000000  0.688442  0.327869  0.353535  0.198582  0.642325  0.943638   

       Var8  
0  0.483333  
1  0.166667  
2  0.183333  
3  0.000000  
4  0.200000  
0    1
1    0
2    1
3    0
4    1
Name: Labels, dtype: int64
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\HPPROB~1\\AppData\\Local\\Temp\\tmpt6dkr8a9', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': N

[array([0.5848643 , 0.41513574], dtype=float32),
 array([0.5614314 , 0.43856862], dtype=float32),
 array([0.70129   , 0.29871005], dtype=float32),
 array([0.7230214 , 0.27697858], dtype=float32),
 array([0.67705727, 0.32294276], dtype=float32),
 array([0.66021055, 0.33978948], dtype=float32),
 array([0.63302696, 0.36697304], dtype=float32),
 array([0.62387997, 0.37612006], dtype=float32),
 array([0.5737051 , 0.42629492], dtype=float32),
 array([0.6426825 , 0.35731748], dtype=float32)]

## DNN CLASSIFIER

In [22]:
#STEPS TO USE THE ESTIMATOR API
# Imports used by this cell are kept at the top of the cell.
from sklearn.model_selection import train_test_split

#STEP1: DEFINE A LIST OF FEATURE COLUMNS
# One numeric feature column per input variable; the key passed to
# numeric_column must match the DataFrame column name.
var1 = tf.feature_column.numeric_column('Var1')
var2 = tf.feature_column.numeric_column('Var2')
var3 = tf.feature_column.numeric_column('Var3')
var4 = tf.feature_column.numeric_column('Var4')
var5 = tf.feature_column.numeric_column('Var5')
var6 = tf.feature_column.numeric_column('Var6')
var7 = tf.feature_column.numeric_column('Var7')
var8 = tf.feature_column.numeric_column('Var8')
#===================================================
# Categorical Columns (reference templates, not used for this dataset)
#cat1 = tf.feature_column.categorical_column_with_vocabulary_list('Column Name', ['A', 'B', 'C','D'])
#cat2 = tf.feature_column.categorical_column_with_hash_bucket('Column Name',hash_bucket_size=10)
# The template below wraps a categorical column in an embedding column;
# if used, it would also have to be added to feat_cols.
#embed_cat1 = tf.feature_column.embedding_column(cat1, dimension=4)
#feat_cols update
#===================================================
# Numeric to Categorical (reference template, not used for this dataset)
#cat3 = tf.feature_column.bucketized_column(variable_name, boundaries=[10,20,30,40,50,60])
feat_cols = [var1, var2, var3, var4, var5, var6, var7, var8]
labels = diabetes['Labels']

#SPLIT DATA
# Features are every column except the target; the target is 'Labels'.
x_data = diabetes.drop('Labels', axis=1)
y_data = labels
print("============================================")
print(x_data.head())
print("============================================")
print(y_data.head())
print("============================================")
# Hold out 30% of the rows for testing. A fixed random_state makes the split
# itself identical on every run (model training is still not seeded here).
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=0.3, random_state=101
)

#STEP2: CREATE AN ESTIMATOR OBJECT
# Fully connected network with three hidden layers of 10 units each.
model = tf.estimator.DNNClassifier(
    hidden_units=[10, 10, 10], feature_columns=feat_cols, n_classes=2
)

#STEP3: CREATE THE INPUT FUNCTIONS (TRAIN, TEST, PREDICT)
# NOTE: tf.estimator.inputs is the TensorFlow 1.x location of pandas_input_fn;
# under TensorFlow 2.x it is reached via tf.compat.v1.estimator.inputs.
# Training input: shuffled mini-batches of 10, up to 100 passes over the data.
input_func = tf.estimator.inputs.pandas_input_fn(
    x=x_train, y=y_train, batch_size=10, num_epochs=100, shuffle=True
)
# Evaluation input: a single ordered pass over the held-out rows, with labels.
test_func = tf.estimator.inputs.pandas_input_fn(
    x=x_test, y=y_test, batch_size=10, num_epochs=1, shuffle=False
)
# Prediction input: the same held-out rows, without labels.
pred_func = tf.estimator.inputs.pandas_input_fn(
    x=x_test, batch_size=10, num_epochs=1, shuffle=False
)

#STEP4: TRAIN, EVALUATE AND PREDICT ON ESTIMATOR OBJECT
print("============================================")
model.train(input_fn=input_func, steps=1000)
print("============================================")
results = model.evaluate(test_func)
# Show the evaluation metrics instead of silently discarding them.
print(results)
print("============================================")
# model.predict yields one dict per test row; keep only the class probabilities.
prob = [p['probabilities'] for p in model.predict(pred_func)]
prob[:10]



       Var1      Var2      Var3      Var4      Var5      Var6      Var7  \
0  0.352941  0.743719  0.590164  0.353535  0.000000  0.500745  0.234415   
1  0.058824  0.427136  0.540984  0.292929  0.000000  0.396423  0.116567   
2  0.470588  0.919598  0.524590  0.000000  0.000000  0.347243  0.253629   
3  0.058824  0.447236  0.540984  0.232323  0.111111  0.418778  0.038002   
4  0.000000  0.688442  0.327869  0.353535  0.198582  0.642325  0.943638   

       Var8  
0  0.483333  
1  0.166667  
2  0.183333  
3  0.000000  
4  0.200000  
0    1
1    0
2    1
3    0
4    1
Name: Labels, dtype: int64
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\HPPROB~1\\AppData\\Local\\Temp\\tmpi_raujsk', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': N

[array([0.71806836, 0.28193164], dtype=float32),
 array([0.48589292, 0.51410705], dtype=float32),
 array([0.6468724 , 0.35312763], dtype=float32),
 array([0.24678583, 0.7532142 ], dtype=float32),
 array([0.43355697, 0.566443  ], dtype=float32),
 array([0.14526527, 0.8547348 ], dtype=float32),
 array([0.9811777 , 0.01882234], dtype=float32),
 array([0.20710394, 0.79289603], dtype=float32),
 array([0.87874347, 0.12125654], dtype=float32),
 array([0.813821  , 0.18617903], dtype=float32)]

### WELCOME TO THE END OF THE TUTORIAL


---------------------------------------------------------------------------------------
Copyrights © 2018, All Rights Reserved.
- Author: Mahnoor Anjum.
- Course: The Complete Hands-On Machine Learning Course
- Date Created: 2018-07-11
- Date Modified: -