<a href="https://colab.research.google.com/github/nhuyen183/LungCancerSupportSystem/blob/master/ANNs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#@title Import relevant modules
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
from matplotlib import pyplot as plt
import seaborn as sns

import os
from glob import glob
import joblib
import pandas as pd
from sklearn import model_selection
from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import train_test_split
from tensorflow.python.keras import Sequential
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras import models
from tensorflow.python.keras.models import load_model

# The following lines adjust the granularity of reporting. 
pd.options.display.max_rows = 10
pd.options.display.float_format = "{:.1f}".format

print("Imported modules.")

Imported modules.


In [10]:
filenames = glob('/content/*.csv')
appended_data=[] #create a list
for f in filenames:
    df = pd.read_csv(f, index_col=None)
    appended_data.append(df) #append to the list

df = pd.concat(appended_data, axis=0)

In [11]:
df.head(5)

Unnamed: 0,Gender,Age65,GeneralHealth,Smoked100,SmokerStatus,FirstSmokedAge,LastSmokedAge,AvgNumCigADay,HasCTScan,StopSmoking,HasAsthma,HasLungCancer
0,1,1,3,1,2,0,0,0,0,1,2,0
1,2,1,3,2,4,0,0,0,0,0,2,0
2,2,2,4,1,3,0,0,0,0,0,2,0
3,1,1,3,1,1,0,0,0,0,1,1,0
4,1,1,4,1,1,0,0,0,0,2,2,0


In [38]:
df.shape

(371781, 12)

In [23]:
df.HasLungCancer.value_counts()

0    371625
1       156
Name: HasLungCancer, dtype: int64

In [24]:
count_majority_class, count_minority_class = df.HasLungCancer.value_counts()

In [25]:
df_majority_class = df[df['HasLungCancer'] == 0]
df_minority_class = df[df['HasLungCancer'] == 1]

In [26]:
df_class_oversample = df_minority_class.sample(count_majority_class, replace=True)

In [28]:
df_balanced_os = pd.concat([df_class_oversample, df_majority_class], axis=0)

In [43]:
print('Number of data samples after over-sampling:')
print(df_balanced_os.HasLungCancer.value_counts())

train_df = df_balanced_os.reindex(np.random.permutation(df_balanced_os.index)) # shuffle the examples

Number of data samples after over-sampling:
1    371625
0    371625
Name: HasLungCancer, dtype: int64


In [41]:
X = df_balanced_os.iloc[:, 0:11].astype(int)
y = df_balanced_os['HasLungCancer'].astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [34]:
#@title Create numeric feature columns
# For Numeric features, we can just call on feature_column.numeric_column()
# to use its raw value instead of having to create a map between value and ID.
gender = tf.feature_column.numeric_column("Gender")
age = tf.feature_column.numeric_column("Age65")
genhealth = tf.feature_column.numeric_column("GeneralHealth")
smoked100 = tf.feature_column.numeric_column("Smoked100")
firstsmoked_age = tf.feature_column.numeric_column("FirstSmokedAge")
lastsmoked_age = tf.feature_column.numeric_column("LastSmokedAge")
avg_numcig_day = tf.feature_column.numeric_column("AvgNumCigADay")
scan = tf.feature_column.numeric_column("HasCTScan")
stopsmoked_attempt = tf.feature_column.numeric_column("StopSmoking")
has_asthma = tf.feature_column.numeric_column("HasAsthma")

In [39]:
# Create an empty list that will eventually hold all created feature columns.
feature_columns = [gender, age, genhealth, smoked100, firstsmoked_age, lastsmoked_age, avg_numcig_day, scan, stopsmoked_attempt, has_asthma]

# Convert the list of feature columns into a layer that will later be fed into the model. 
my_feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

In [35]:
#@title Define the plotting function.

def plot_the_loss_curve(epochs, mse):
  """Plot a curve of loss vs. epoch."""

  plt.figure()
  plt.xlabel("Epoch")
  plt.ylabel("Mean Squared Error")

  plt.plot(epochs, mse, label="Loss")
  plt.legend()
  plt.ylim([mse.min()*0.95, mse.max() * 1.03])
  plt.show()  

print("Defined the plot_the_loss_curve function.")

Defined the plot_the_loss_curve function.


In [36]:
#@title Define functions to create and train a linear regression model
def create_model(my_learning_rate, feature_layer):
  """Create and compile a simple linear regression model."""
  # Most simple tf.keras models are sequential.
  model = tf.keras.models.Sequential()

  # Add the layer containing the feature columns to the model.
  model.add(feature_layer)

  # Add one linear layer to the model to yield a simple linear regressor.
  model.add(tf.keras.layers.Dense(units=1, input_shape=(1,)))

  # Construct the layers into a model that TensorFlow can execute.
  model.compile(optimizer=tf.keras.optimizers.RMSprop(lr=my_learning_rate),
                loss="mean_squared_error",
                metrics=[tf.keras.metrics.MeanSquaredError()])

  return model           


def train_model(model, df_balanced_os, epochs, batch_size, label_name):
  """Feed a dataset into the model in order to train it."""

  # Split the dataset into features and label.
  features = {name:np.array(value) for name, value in df_balanced_os.items()}
  label = np.array(features.pop(label_name))
  history = model.fit(x=features, y=label, batch_size=batch_size,
                      epochs=epochs, shuffle=True)

  # Get details that will be useful for plotting the loss curve.
  epochs = history.epoch
  hist = pd.DataFrame(history.history)
  rmse = hist["mean_squared_error"]

  return epochs, rmse   

print("Defined the create_model and train_model functions.")

Defined the create_model and train_model functions.


In [42]:
# The following variables are the hyperparameters.
learning_rate = 0.01
epochs = 15
batch_size = 1000
label_name = "HasLungCancer"

# Establish the model's topography.
my_model = create_model(learning_rate, my_feature_layer)

# Train the model on the training set.
epochs, mse = train_model(my_model, X_train, y_train, epochs, batch_size, label_name)
plot_the_loss_curve(epochs, mse)

test_features = {name:np.array(value) for name, value in test_df_norm.items()}
test_label = np.array(test_features.pop(label_name)) # isolate the label
print("\n Evaluate the linear regression model against the test set:")
my_model.evaluate(x = test_features, y = test_label, batch_size=batch_size)

  super(RMSprop, self).__init__(name, **kwargs)


TypeError: ignored