In [14]:
# Import the packages

# Utilities
import os
import logging

# For visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd

# For modelling
import tensorflow as tf
from tensorflow import feature_column as fc
from tensorflow.keras import layers, models

# Set TF logger to only print errors (dismiss warnings)
logging.getLogger("tensorflow").setLevel(logging.ERROR)

## Load dataset




Let's check that the files were copied correctly and look like we expect them to.

In [4]:
df = pd.read_csv('..\dataset\data_job_new_with_predicted_cat.csv')

In [5]:
df.head()

Unnamed: 0,ID,Company,Job_Title,YOE,Location,Est_Salary,Job_Description,Link,Min_level,Max_level,...,Data_Scientist_prob,Business_Analyst_prob,Business_Intelligence_prob,Others_prob,Data_Engineer,Data_Analyst,Data_Scientist,Business_Analyst,Business_Intelligence,Others
0,3617,FIDT,( Data ) Research Intern,0.0,HCM,65.0,Minimum 10 hours/week; Students complete at le...,https://www.linkedin.com/jobs/view/3631700502,0,0,...,0.000332037,1.42e-05,4.02e-09,0.8928759,0,0,0,0,0,1
1,1084,Kyanon Digital,Machine Learning Engineer Intern,0.0,HCM,85.0,"Familiar with Object-Oriented Programming, Mod...",https://glints.com/vn/opportunities/jobs/machi...,0,0,...,4.2e-10,2.54e-07,1.49e-07,0.9947981,0,0,0,0,0,1
2,2526,Paditech,Blockchain Intern,0.0,HN,85.0,"3rd, 4th, 5th year students or have graduated ...",https://www.topcv.vn/viec-lam/thuc-tap-sinh-bl...,0,0,...,0.003848903,0.002628318,2.48e-05,0.9846249,0,0,0,0,0,1
3,2858,Maico Group,BI Intern,0.0,HCM,85.0,The program will not be suitable for you who a...,https://www.careerlink.vn/tim-viec-lam/thuc-ta...,0,0,...,0.002765148,0.07810333,0.002572874,0.9400327,0,0,0,0,0,1
4,3423,DataGenius,Data Engineer | Data Analyst | AI Engineer,0.0,HCM,105.0,DataGenius company is recruiting Data Engineer...,https://www.facebook.com/groups/datanalyticsvn...,0,0,...,0.000205168,7.03e-07,3.56e-09,5.59e-10,1,1,0,0,0,0


In [10]:
col_to_drop = ['Job_Title','ID', 'Company', 'Link', 'Data_Engineer_prob','Data_Analyst_prob', 'Data_Scientist_prob','Business_Analyst_prob','Business_Intelligence_prob','Others_prob']
clean_df=df.drop(columns=col_to_drop)
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3631 entries, 0 to 3630
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   YOE                    2974 non-null   float64
 1   Location               3621 non-null   object 
 2   Est_Salary             3377 non-null   float64
 3   Job_Description        3631 non-null   object 
 4   Min_level              3631 non-null   int64  
 5   Max_level              3631 non-null   int64  
 6   VN                     3631 non-null   int64  
 7   Overseas               3631 non-null   int64  
 8   Remote                 3631 non-null   int64  
 9   Data_Engineer          3631 non-null   int64  
 10  Data_Analyst           3631 non-null   int64  
 11  Data_Scientist         3631 non-null   int64  
 12  Business_Analyst       3631 non-null   int64  
 13  Business_Intelligence  3631 non-null   int64  
 14  Others                 3631 non-null   int64  
dtypes: f

In [12]:
clean_df.dropna(subset='Est_Salary')
clean_df.fillna(value=0, inplace=True)

job_categories = ['Data_Engineer', 'Data_Analyst', 'Data_Scientist', 'Business_Analyst', 'Business_Intelligence','Others']
# Calculate the average YOE for each job category and round it
average_yoe_by_category = clean_df.groupby(job_categories)['YOE'].mean().round().reset_index()

# Fill in the missing 'YOE' values based on the job category
for _, row in average_yoe_by_category.iterrows():
    condition = (clean_df[job_categories] == tuple(row[job_categories])).all(axis=1)
    clean_df.loc[condition, 'YOE'] = row['YOE']
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3631 entries, 0 to 3630
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   YOE                    3631 non-null   float64
 1   Location               3631 non-null   object 
 2   Est_Salary             3631 non-null   float64
 3   Job_Description        3631 non-null   object 
 4   Min_level              3631 non-null   int64  
 5   Max_level              3631 non-null   int64  
 6   VN                     3631 non-null   int64  
 7   Overseas               3631 non-null   int64  
 8   Remote                 3631 non-null   int64  
 9   Data_Engineer          3631 non-null   int64  
 10  Data_Analyst           3631 non-null   int64  
 11  Data_Scientist         3631 non-null   int64  
 12  Business_Analyst       3631 non-null   int64  
 13  Business_Intelligence  3631 non-null   int64  
 14  Others                 3631 non-null   int64  
dtypes: f

## Create an input pipeline


In [20]:
# Specify which column is the target
LABEL_COLUMN = 'Est_Salary'

# Specify numerical columns
# Note you should create another list with STRING_COLS if you
# had text data but in this case all features are numerical
NUMERIC_COLS = ['YOE', 'Min_level',
                'Max_level', 'VN',
                'Overseas', 'Remote', 'dayofweek'
                'Data_Engineer','Data_Analyst',	'Data_Scientist','Business_Analyst','Business_Intelligence','Others']

CATEGORICAL_COLS  = ['Location']


# A function to separate features and labels
def features_and_labels(row_data):
    label = row_data.pop(LABEL_COLUMN)
    return row_data, label


# A utility method to create a tf.data dataset from a CSV file
def load_dataset(path, batch_size=1, mode='eval'):
    dataset = tf.data.experimental.make_csv_dataset(path, batch_size)

    dataset = dataset.map(features_and_labels)  # features, label
    if mode == 'train':
        # Notice the repeat method is used so this dataset will loop infinitely
        dataset = dataset.shuffle(1000).repeat()
        # take advantage of multi-threading; 1=AUTOTUNE
        dataset = dataset.prefetch(1)
    return dataset

## Create a DNN Model in Keras

Since the model is defined using `feature columns` the first layer might look different to what you are used to. This is done by declaring two dictionaries, one for the inputs (defined as Input layers) and one for the features (defined as feature columns).

Then computing the `DenseFeatures` tensor by passing in the feature columns to the constructor of the `DenseFeatures` layer and passing in the inputs to the resulting tensor (this is easier to understand with code):

In [27]:
def build_dnn_model():
    # Define input layers for numeric features
    numeric_inputs = {
        colname: layers.Input(name=colname, shape=(), dtype='float32')
        for colname in NUMERIC_COLS
    }

    # Define input layer for categorical feature
    categorical_inputs = {
        colname: layers.Input(name=colname, shape=(), dtype='string')
        for colname in CATEGORICAL_COLS
    }

    # Define feature columns for numeric features
    numeric_feature_columns = {
        colname: fc.numeric_column(colname)
        for colname in NUMERIC_COLS
    }

    # Define feature column for categorical feature
    categorical_feature_columns = {
        colname: fc.embedding_column(
            fc.categorical_column_with_vocabulary_list(colname, vocabulary_list=clean_df["Location"].unique()),
            dimension=8  # Specify the embedding dimension
        )
        for colname in CATEGORICAL_COLS
    }
     

    # Construct DenseFeatures for numeric features
    numeric_dnn_inputs = layers.DenseFeatures(numeric_feature_columns.values())(numeric_inputs)

    # Construct DenseFeatures for categorical features
    categorical_dnn_inputs = layers.DenseFeatures(categorical_feature_columns.values())(categorical_inputs)

    # Concatenate numeric and categorical features
    concatenated_inputs = layers.concatenate([numeric_dnn_inputs, categorical_dnn_inputs])

    # Two hidden layers of 32 and 8 units, respectively
    h1 = layers.Dense(32, activation='relu', name='h1')(concatenated_inputs)
    h2 = layers.Dense(8, activation='relu', name='h2')(h1)

    # Final output is a linear activation because this is a regression problem
    output = layers.Dense(1, activation='linear', name='output')(h2)

    # Create model with inputs and output
    model = models.Model({**numeric_inputs, **categorical_inputs}, output)

    # Compile model (Mean Squared Error is suitable for regression)
    model.compile(optimizer='adam',
                  loss='mse',
                  metrics=[
                      tf.keras.metrics.RootMeanSquaredError(name='rmse'),
                      'mse'
                  ])

    return model

We'll build our DNN model and inspect the model architecture.

In [28]:
# Save compiled model into a variable
model = build_dnn_model()

# Plot the layer architecture and relationship between input features
tf.keras.utils.plot_model(model, 'dnn_model.png', show_shapes=False, rankdir='LR')

ValueError: Exception encountered when calling layer "dense_features_5" (type DenseFeatures).

Can't convert Python sequence with mixed types to Tensor.

Call arguments received by layer "dense_features_5" (type DenseFeatures):
  • features={'Location': 'tf.Tensor(shape=(None,), dtype=string)'}
  • cols_to_output_tensors=None
  • training=False

With the model architecture defined it is time to train it!

## Train the model

You are going to train the model for 20 epochs using a batch size of 32.

In [None]:
NUM_EPOCHS = 20
TRAIN_BATCH_SIZE = 32
NUM_TRAIN_EXAMPLES = len(pd.read_csv('/tmp/data/taxi-train.csv'))
NUM_EVAL_EXAMPLES = len(pd.read_csv('/tmp/data/taxi-valid.csv'))

print(f"training split has {NUM_TRAIN_EXAMPLES} examples\n")
print(f"evaluation split has {NUM_EVAL_EXAMPLES} examples\n")

Use the previously defined function to load the datasets from the original csv files.

In [None]:
# Training dataset
trainds = load_dataset('/tmp/data/taxi-train*', TRAIN_BATCH_SIZE, 'train')

# Evaluation dataset
evalds = load_dataset('/tmp/data/taxi-valid*', 1000, 'eval').take(NUM_EVAL_EXAMPLES//1000)

# Needs to be specified since the dataset is infinite
# This happens because the repeat method was used when creating the dataset
steps_per_epoch = NUM_TRAIN_EXAMPLES // TRAIN_BATCH_SIZE

# Train the model and save the history
history = model.fit(trainds,
                    validation_data=evalds,
                    epochs=NUM_EPOCHS,
                    steps_per_epoch=steps_per_epoch)

### Visualize training curves

Now lets visualize the training history of the model with the raw features:

In [None]:
# Function for plotting metrics for a given history
def plot_curves(history, metrics):
    nrows = 1
    ncols = 2
    fig = plt.figure(figsize=(10, 5))

    for idx, key in enumerate(metrics):
        ax = fig.add_subplot(nrows, ncols, idx+1)
        plt.plot(history.history[key])
        plt.plot(history.history[f'val_{key}'])
        plt.title(f'model {key}')
        plt.ylabel(key)
        plt.xlabel('epoch')
        plt.legend(['train', 'validation'], loc='upper left')


# Plot history metrics
plot_curves(history, ['loss', 'mse'])

The training history doesn't look very promising showing an erratic behaviour. Looks like the training process struggled to transverse the high dimensional space that the current features create.

Nevertheless let's use it for prediction.

Notice that the latitude and longitude values should revolve around (`37`, `45`) and (`-70`, `-78`) respectively since these are the range of coordinates for New York city.

In [None]:
# Define a taxi ride (a data point)
taxi_ride = {
    'pickup_longitude': tf.convert_to_tensor([-73.982683]),
    'pickup_latitude': tf.convert_to_tensor([40.742104]),
    'dropoff_longitude': tf.convert_to_tensor([-73.983766]),
    'dropoff_latitude': tf.convert_to_tensor([40.755174]),
    'passenger_count': tf.convert_to_tensor([3.0]),
    'hourofday': tf.convert_to_tensor([3.0]),
    'dayofweek': tf.convert_to_tensor([3.0]),
}

# Use the model to predict
prediction = model.predict(taxi_ride, steps=1)

# Print prediction
print(f"the model predicted a fare total of {float(prediction):.2f} USD for the ride.")

The model predicted this particular ride to be around 12 USD. However you know the model performance is not the best as it was showcased by the training history. Let's improve it by using **Feature Engineering**.

## Improve Model Performance Using Feature Engineering

Going forward you will only use geo-spatial features as these are the most relevant when calculating the fare since this value is mostly dependant on the distance transversed:

In [None]:
# Drop dayofweek and hourofday features
NUMERIC_COLS = ['pickup_longitude', 'pickup_latitude',
                'dropoff_longitude', 'dropoff_latitude']

Since you are dealing exclusively with geospatial data you will create some transformations that are aware of this geospatial nature. This help the model make a better representation of the problem at hand.

For instance the model cannot magically understand what a coordinate is supposed to represent and since the data is taken from New York only, the latitude and longitude revolve around (`37`, `45`) and (`-70`, `-78`) respectively, which is arbitrary for the model. A good first step is to scale these values.

**Notice all transformations are created by defining functions**.

In [None]:
def scale_longitude(lon_column):
    return (lon_column + 78)/8.

In [None]:
def scale_latitude(lat_column):
    return (lat_column - 37)/8.

Another important fact is that the fare of a taxi ride is proportional to the distance of the ride. But as the features currently are, there is no way for the model to infer that the pair of (`pickup_latitude`, `pickup_longitude`) represent the point where the passenger started the ride and the pair (`dropoff_latitude`, `dropoff_longitude`) represent the point where the ride ended. More importantly, the model is not aware that the distance between these two points is crucial for predicting the fare.

To solve this, a new feature (which is a transformation of the other ones) that provides this information is required.

In [None]:
def euclidean(params):
    lon1, lat1, lon2, lat2 = params
    londiff = lon2 - lon1
    latdiff = lat2 - lat1
    return tf.sqrt(londiff*londiff + latdiff*latdiff)

### Applying transformations

Now you will define the `transform` function which will apply the previously defined transformation functions. To apply the actual transformations you will be using `Lambda` layers apply a function to values (in this case the inputs).


In [None]:
def transform(inputs, numeric_cols):

    # Make a copy of the inputs to apply the transformations to
    transformed = inputs.copy()

    # Define feature columns
    feature_columns = {
        colname: tf.feature_column.numeric_column(colname)
        for colname in numeric_cols
    }

    # Scaling longitude from range [-70, -78] to [0, 1]
    for lon_col in ['pickup_longitude', 'dropoff_longitude']:
        transformed[lon_col] = layers.Lambda(
            scale_longitude,
            name=f"scale_{lon_col}")(inputs[lon_col])

    # Scaling latitude from range [37, 45] to [0, 1]
    for lat_col in ['pickup_latitude', 'dropoff_latitude']:
        transformed[lat_col] = layers.Lambda(
            scale_latitude,
            name=f'scale_{lat_col}')(inputs[lat_col])

    # add Euclidean distance
    transformed['euclidean'] = layers.Lambda(
        euclidean,
        name='euclidean')([inputs['pickup_longitude'],
                           inputs['pickup_latitude'],
                           inputs['dropoff_longitude'],
                           inputs['dropoff_latitude']])


    # Add euclidean distance to feature columns
    feature_columns['euclidean'] = fc.numeric_column('euclidean')

    return transformed, feature_columns

## Update the model

Next, you'll create the DNN model now with the engineered (transformed) features.

In [None]:
def build_dnn_model():

    # input layer (notice type of float32 since features are numeric)
    inputs = {
        colname: layers.Input(name=colname, shape=(), dtype='float32')
        for colname in NUMERIC_COLS
    }

    # transformed features
    transformed, feature_columns = transform(inputs, numeric_cols=NUMERIC_COLS)

    # Constructor for DenseFeatures takes a list of numeric columns
    # and the resulting tensor takes a dictionary of Lambda layers
    dnn_inputs = layers.DenseFeatures(feature_columns.values())(transformed)

    # two hidden layers of 32 and 8 units, respectively
    h1 = layers.Dense(32, activation='relu', name='h1')(dnn_inputs)
    h2 = layers.Dense(8, activation='relu', name='h2')(h1)

    # final output is a linear activation because this is a regression problem
    output = layers.Dense(1, activation='linear', name='fare')(h2)

    # Create model with inputs and output
    model = models.Model(inputs, output)

    # Compile model (Mean Squared Error is suitable for regression)
    model.compile(optimizer='adam',
                  loss='mse',
                  metrics=[tf.keras.metrics.RootMeanSquaredError(name='rmse'), 'mse'])

    return model

In [None]:
# Save compiled model into a variable
model = build_dnn_model()

Let's see how the model architecture has changed.

In [None]:
# Plot the layer architecture and relationship between input features
tf.keras.utils.plot_model(model, 'dnn_model_engineered.png', show_shapes=False, rankdir='LR')

This plot is very useful for understanding the relationships and dependencies between the original and the transformed features!

**Notice that the input of the model now consists of 5 features instead of the original 7, thus reducing the dimensionality of the problem.**

Let's now train the model that includes feature engineering.

In [None]:
# Train the model and save the history
history = model.fit(trainds,
                    validation_data=evalds,
                    epochs=NUM_EPOCHS,
                    steps_per_epoch=steps_per_epoch)

Notice that the features `passenger_count`, `hourofday` and `dayofweek` were excluded since they were omitted when defining the input pipeline.

Now lets visualize the training history of the model with the engineered features.

In [None]:
# Plot history metrics
plot_curves(history, ['loss', 'mse'])

This looks a lot better than the previous training history! Now the loss and error metrics are decreasing with each epoch and both curves (train and validation) are very close to each other. Nice job!

Let's do a prediction with this new model on the example we previously used.

In [None]:
# Use the model to predict
prediction = model.predict(taxi_ride, steps=1)

# Print prediction
print(f"the model predicted a fare total of {float(prediction):.2f} USD for the ride.")

Wow, now the model predicts a fare that is roughly half of what the previous model predicted! Looks like the model with the raw features was overestimating the fare by a great margin.

Notice that you get a warning since the `taxi_ride` dictionary contains information about the unused features. You can supress it by redefining `taxi_ride` without these values but it is useful to know that Keras is smart enough to handle it on its own.

**Congratulations on finishing this ungraded lab!** Now you should have a clearer understanding of the importance and impact of performing feature engineering on your data.

This process is very domain-specific and requires a great understanding of the situation that is being modelled. Because of this, new techniques that switch from a manual to an automatic feature engineering have been developed and you will check some of them in an upcoming lab.


**Keep it up!**