## Use the right version of TensorFlow



The following hidden code cell ensures that the Colab will run on TensorFlow 2.X.

In [31]:
# @title Run on TensorFlow 2.x
# %tensorflow_version 2.x

## Call the import statements

The following code imports the necessary modules.

In [32]:
#@title Load the imports

# from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import feature_column
from tensorflow.keras import layers
from matplotlib import pyplot as plt

# The following lines adjust the granularity of reporting.
pd.options.display.max_rows = 10
pd.options.display.float_format = "{:.1f}".format
# tf.keras.backend.set_floatx('float32')

print("Ran the import statements.")

Ran the import statements.


## Load the datasets 

The following code cell loads the separate .csv files and creates the following two pandas DataFrames:

* `train_df`, which contains the training set
* `test_df`, which contains the test set

In [44]:
train_df = pd.read_csv("Dataset/training/fsi-2006-to-2013.csv")
test_df = pd.read_csv("Dataset/testing/fsi-2014-to2020.csv")
train_df.dtypes

Country                   object
Year                       int64
Rank                      object
Total                    float64
Security_Apparatus       float64
                          ...   
Public_Services          float64
Human_Rights             float64
Demographic_Pressures    float64
Refugees_and_IDPs        float64
External_Intervention    float64
Length: 16, dtype: object

In [45]:
train_df['Total'] = train_df['Total']/10
train_df_mean = train_df.mean()
train_df_std = train_df.std()
train_df_norm = (train_df - train_df_mean)/train_df_std
train_df_norm.head()
train_df.dtypes

Country                   object
Year                       int64
Rank                      object
Total                    float64
Security_Apparatus       float64
                          ...   
Public_Services          float64
Human_Rights             float64
Demographic_Pressures    float64
Refugees_and_IDPs        float64
External_Intervention    float64
Length: 16, dtype: object

In [35]:

test_df['Total'] = test_df['Total']/10
test_df_mean = test_df.mean()
test_df_std  = test_df.std()
test_df_norm = (test_df - test_df_mean)/test_df_std
test_df_norm.head()

Unnamed: 0,Change_from_Previous_Year,Country,Demographic_Pressures,Economic_Inequality,Economy,External_Intervention,Factionalized_Elites,Group_Grievance,Human_Flight_and_Brain_Drain,Human_Rights,Public_Services,Rank,Refugees_and_IDPs,Security_Apparatus,State_Legitimacy,Total,Year
0,0.2,,1.7,0.9,1.9,1.7,1.5,1.7,0.7,1.7,1.6,,1.9,1.8,1.5,1.8,1.5
1,0.0,,1.7,1.7,1.8,1.3,1.5,1.2,1.6,1.3,1.4,,1.7,1.8,1.1,1.7,1.5
2,-0.0,,1.5,1.6,2.0,1.5,1.3,1.4,0.6,1.3,1.6,,1.9,1.6,1.5,1.7,1.5
3,0.4,,0.8,0.7,1.6,1.7,1.4,1.9,1.4,1.7,1.4,,2.0,1.8,1.5,1.7,1.5
4,0.4,,1.7,1.3,1.2,1.5,1.4,1.7,0.6,1.5,1.6,,2.0,1.2,1.4,1.7,1.5


In [36]:
# train_df.corr()
# train_df.info()
train_df.describe( )

Unnamed: 0,Year,Total,Security_Apparatus,Factionalized_Elites,Group_Grievance,Economy,Economic_Inequality,Human_Flight_and_Brain_Drain,State_Legitimacy,Public_Services,Human_Rights,Demographic_Pressures,Refugees_and_IDPs,External_Intervention
count,1387.0,1387.0,1387.0,1387.0,1387.0,1387.0,1387.0,1387.0,1387.0,1387.0,1387.0,1387.0,1387.0,1387.0
mean,2009.6,7.1,5.7,6.1,6.0,5.8,6.7,5.6,6.4,5.8,5.9,6.3,5.1,5.8
std,2.3,2.3,2.4,2.5,2.0,1.9,1.8,2.0,2.4,2.3,2.3,2.1,2.3,2.2
min,2006.0,1.7,0.9,0.7,1.0,1.0,1.0,1.0,0.5,1.0,1.0,0.8,0.9,0.8
25%,2008.0,5.6,4.1,4.2,4.5,4.3,5.7,4.3,5.1,4.0,4.3,4.9,3.2,4.4
50%,2010.0,7.7,6.0,6.8,6.0,5.9,7.1,6.0,7.0,6.0,6.3,6.5,5.0,6.1
75%,2012.0,8.8,7.5,8.0,7.4,7.3,8.0,7.1,8.1,7.8,7.6,8.0,6.8,7.3
max,2013.0,11.5,10.0,10.0,10.0,10.0,9.7,10.0,10.0,10.0,10.0,10.0,10.0,10.0


In [37]:
#@title Double-click for possible solutions.

# We arbitrarily set the threshold to 265,000, which is 
# the 75th percentile for median house values.  Every neighborhood
# with a median house price above 265,000 will be labeled 1, 
# and all other neighborhoods will be labeled 0.
threshold = 9
train_df_norm["median_house_value_is_high"] = (train_df["Total"] > threshold).astype(float)
test_df_norm["median_house_value_is_high"] = (test_df["Total"] > threshold).astype(float) 
train_df_norm["median_house_value_is_high"].head(8000)


# Alternatively, instead of picking the threshold
# based on raw house values, you can work with Z-scores.
# For example, the following possible solution uses a Z-score
# of +1.0 as the threshold, meaning that no more
# than 16% of the values in median_house_value_is_high
# will be labeled 1.

# threshold_in_Z = 1.0 
# train_df_norm["median_house_value_is_high"] = (train_df_norm["median_house_value"] > threshold_in_Z).astype(float)
# test_df_norm["median_house_value_is_high"] = (test_df_norm["median_house_value"] > threshold_in_Z).astype(float) 


0      1.0
1      1.0
2      1.0
3      1.0
4      1.0
        ..
1382   0.0
1383   0.0
1384   0.0
1385   0.0
1386   0.0
Name: median_house_value_is_high, Length: 1387, dtype: float64

## Represent features in feature columns

This code cell specifies the features that you'll ultimately train the model on and how each of those features will be represented. The transformations (collected in `feature_layer`) don't actually get applied until you pass a DataFrame to it, which will happen when we train the model. 

In [38]:
# Create an empty list that will eventually hold all created feature columns.
feature_columns = []

# Create a numerical feature column to represent median_income.
all_features = ['Security_Apparatus','Factionalized_Elites','Group_Grievance','Economy',
                'Economic_Inequality','Human_Flight_and_Brain_Drain','State_Legitimacy',
                'Public_Services','Human_Rights','Demographic_Pressures','Refugees_and_IDPs',
                'External_Intervention']
for f in all_features:
  x=tf.feature_column.numeric_column(f)
  feature_columns.append(x)
# median_income = tf.feature_column.numeric_column("Security_Apparatus")
# feature_columns.append(median_income)

# # Create a numerical feature column to represent total_rooms.
# tr = tf.feature_column.numeric_column("Factionalized_Elites")
# feature_columns.append(tr)

# Convert the list of feature columns into a layer that will later be fed into
# the model. 
feature_layer = layers.DenseFeatures(feature_columns)

# Print the first 3 and last 3 rows of the feature_layer's output when applied
# to train_df_norm:
feature_layer(dict(train_df_norm))

<tf.Tensor: shape=(1387, 12), dtype=float32, numpy=
array([[ 1.5307974,  0.9408891,  1.878555 , ...,  2.1330411,  1.6568434,
         1.2987458],
       [ 1.7723134,  1.159695 ,  1.4147588, ...,  2.1330411,  1.7807449,
         1.340441 ],
       [ 1.1926749,  0.9955906,  1.0540284, ...,  2.1330411,  1.698144 ,
         1.340441 ],
       ...,
       [-1.6089113, -2.5600057, -1.8318151, ..., -1.533415 , -1.9362992,
        -2.2453513],
       [-1.6089113, -2.5600057, -2.3471444, ..., -1.1020671, -1.9362992,
        -2.2453513],
       [-1.6089113, -2.5600057, -2.0379467, ..., -1.533415 , -1.9362992,
        -2.2453513]], dtype=float32)>

## Define functions that build and train a model

The following code cell defines two functions:

  * `create_model(my_learning_rate, feature_layer, my_metrics)`, which defines the model's
    topography.
  * `train_model(model, dataset, epochs, label_name, batch_size, shuffle)`, uses input features and labels to train the model.

Prior exercises used [ReLU](https://developers.google.com/machine-learning/glossary#ReLU) as the [activation function](https://developers.google.com/machine-learning/glossary#activation_function). By contrast, this exercise uses [sigmoid](https://developers.google.com/machine-learning/glossary#sigmoid_function) as the activation function. 

In [39]:
#@title Define the functions that create and train a model.
def create_model(my_learning_rate, feature_layer, my_metrics):
  """Create and compile a simple classification model."""
  # Most simple tf.keras models are sequential.
  model = tf.keras.models.Sequential()

  # Add the feature layer (the list of features and how they are represented)
  # to the model.
  model.add(feature_layer)

  # Funnel the regression value through a sigmoid function.
  model.add(tf.keras.layers.Dense(units=1, input_shape=(1,),
                                  activation=tf.sigmoid),)

  # Call the compile method to construct the layers into a model that
  # TensorFlow can execute.  Notice that we're using a different loss
  # function for classification than for regression.    
  model.compile(optimizer=tf.keras.optimizers.RMSprop(lr=my_learning_rate),                                                   
                loss=tf.keras.losses.BinaryCrossentropy(),
                metrics=my_metrics)

  return model        


def train_model(model, dataset, epochs, label_name,
                batch_size=None, shuffle=True):
  """Feed a dataset into the model in order to train it."""

  # The x parameter of tf.keras.Model.fit can be a list of arrays, where
  # each array contains the data for one feature.  Here, we're passing
  # every column in the dataset. Note that the feature_layer will filter
  # away most of those columns, leaving only the desired columns and their
  # representations as features.
  features = {name:np.array(value) for name, value in dataset.items()}
  label = np.array(features.pop(label_name)).astype('float64')
  history = model.fit(x=features, y=label, batch_size=batch_size,
                      epochs=epochs, shuffle=shuffle)
  
  # The list of epochs is stored separately from the rest of history.
  epochs = history.epoch

  # Isolate the classification metric for each epoch.
  hist = pd.DataFrame(history.history)

  return epochs, hist  

print("Defined the create_model and train_model functions.")   

Defined the create_model and train_model functions.


## Define a plotting function

The following [matplotlib](https://developers.google.com/machine-learning/glossary/#matplotlib) function plots one or more curves, showing how various classification metrics change with each epoch.

In [40]:
#@title Define the plotting function.
def plot_curve(epochs, hist, list_of_metrics):
  """Plot a curve of one or more classification metrics vs. epoch."""  
  # list_of_metrics should be one of the names shown in:
  # https://www.tensorflow.org/tutorials/structured_data/imbalanced_data#define_the_model_and_metrics  

  plt.figure()
  plt.xlabel("Epoch")
  plt.ylabel("Value")

  for m in list_of_metrics:
    x = hist[m]
    plt.plot(epochs[1:], x[1:], label=m)

  plt.legend()

print("Defined the plot_curve function.")

Defined the plot_curve function.


## Invoke the creating, training, and plotting functions

The following code cell calls specify the hyperparameters, and then invokes the 
functions to create and train the model, and then to plot the results.

In [41]:
# The following variables are the hyperparameters.
from time import time
st=time()
learning_rate = 0.001
epochs = 2000
batch_size = 50
label_name = "median_house_value_is_high"
classification_threshold = 0.4

# Establish the metrics the model will measure.
METRICS = [
           tf.keras.metrics.BinaryAccuracy(name='accuracy', 
                                           threshold=classification_threshold),
          ]

# Establish the model's topography.
my_model = create_model(learning_rate, feature_layer, METRICS)

# Train the model on the training set.
epochs, hist = train_model(my_model, train_df_norm, epochs, 
                           label_name, batch_size)

# Plot a graph of the metric(s) vs. epochs.
list_of_metrics_to_plot = ['accuracy'] 

plot_curve(epochs, hist, list_of_metrics_to_plot)
print('it took',time()-st,'sec')



ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type float).

Accuracy should gradually improve during training (until it can 
improve no more).

## Evaluate the model against the test set

At the end of model training, you ended up with a certain accuracy against the *training set*. Invoke the following code cell to determine your model's accuracy against the *test set*.

In [None]:
features = {name:np.array(value) for name, value in test_df_norm.items()}
label = np.array(features.pop(label_name))

my_model.evaluate(x = features, y = label, batch_size=batch_size)

## Add precision and recall as metrics

Relying solely on accuracy, particularly for a class-imbalanced data set (like ours), can be a poor way to judge a classification model.  Modify the code in the following code cell to enable the model to measure not only accuracy but also precision and recall. We have
added accuracy and precision; your task is to add recall. See the [TensorFlow Reference](https://www.tensorflow.org/api_docs/python/tf/keras/metrics/Recall) for details.



In [None]:
#@title Double-click to view the solution for Task 3.

# The following variables are the hyperparameters.
learning_rate = 0.001
epochs = 2000
batch_size = 50
label_name = "median_house_value_is_high"
classification_threshold = 0.4

# Here is the updated definition of METRICS:
METRICS = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy', 
                                      threshold=classification_threshold),
      tf.keras.metrics.Precision(thresholds=classification_threshold,
                                 name='precision' 
                                 ),
      tf.keras.metrics.Recall(thresholds=classification_threshold,
                              name="recall"),
]

# Establish the model's topography.
my_model = create_model(learning_rate, feature_layer, METRICS)

# Train the model on the training set.
epochs, hist = train_model(my_model, train_df_norm, epochs, 
                           label_name, batch_size)

# Plot metrics vs. epochs
list_of_metrics_to_plot = ['accuracy', "precision", "recall"] 
plot_curve(epochs, hist, list_of_metrics_to_plot)
print('it took',time()-st,'sec')

# The new graphs suggest that precision and recall are 
# somewhat in conflict. That is, improvements to one of
# those metrics may hurt the other metric.

## Summarize model performance 

If time permits, add one more metric that attempts to summarize the model's overall performance. 

In [None]:
#@title Double-click to view the solution for Task 5.

# The following variables are the hyperparameters.
learning_rate = 0.001
epochs = 2000
batch_size = 50
label_name = "median_house_value_is_high"
classification_threshold = 0.4

# AUC is a reasonable "summary" metric for 
# classification models.
# Here is the updated definition of METRICS to 
# measure AUC:
METRICS = [
      tf.keras.metrics.AUC(num_thresholds=100, name='auc'),
]

# Establish the model's topography.
my_model = create_model(learning_rate, feature_layer, METRICS)

# Train the model on the training set.
epochs, hist = train_model(my_model, train_df_norm, epochs, 
                           label_name, batch_size)

# Plot metrics vs. epochs
list_of_metrics_to_plot = ['auc'] 
plot_curve(epochs, hist, list_of_metrics_to_plot)