# COMP30027 Assignment 2
### Lucas Fern (1080613)

## Approach 1: Support Vector Machine
This approach uses the `doc2vec100` features, since an SVM can classify effectively in high-dimensional feature spaces.

In [None]:
import tensorflow as tf
import visualkeras
import pandas as pd
import numpy as np
import pickle
import time
import ast

from sklearn import svm, naive_bayes, ensemble, model_selection
from keras.models import Sequential
from keras import layers, Model
from tcn import TCN, tcn_full_summary

from tensorflow.keras import layers, losses, preprocessing, utils
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [None]:
import matplotlib.pyplot as plt


def plot_graphs(history, metric, n_epochs):
    """Plot a metric and its validation counterpart against the epoch number.

    history: object whose `.history` mapping holds the series for `metric`
        and for `'val_' + metric`.
    metric: name of the metric to plot; also used as the y-axis label.
    n_epochs: number of epochs; the x axis runs from 1 to n_epochs inclusive.
    """
    epochs = range(1, n_epochs+1)
    val_metric = 'val_'+metric

    plt.plot(epochs, history.history[metric])
    plt.plot(epochs, history.history[val_metric], '')

    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, val_metric])

In [None]:
# Pre-computed doc2vec100 features for the training recipes (one headerless CSV per text field).
names = pd.read_csv(r"datasets/recipe_text_features_doc2vec100/train_name_doc2vec100.csv", 
                    index_col = False, delimiter = ',', header=None)
steps = pd.read_csv(r"datasets/recipe_text_features_doc2vec100/train_steps_doc2vec100.csv", 
                    index_col = False, delimiter = ',', header=None)
# Fixed: this previously re-read the *name* features, so the ingredient
# embeddings never reached the model and the name columns were duplicated.
# NOTE(review): the file name follows the `ingr` naming used by the
# count-vectoriser pickles -- confirm it against the dataset directory.
ingrs = pd.read_csv(r"datasets/recipe_text_features_doc2vec100/train_ingr_doc2vec100.csv", 
                          index_col = False, delimiter = ',', header=None)

recipes = pd.read_csv(r"datasets/recipe_train.csv", index_col = False, delimiter = ',')
# 'ingredients' and 'steps' hold string-formatted lists: parse each one and
# join its items into a single space-separated string.
recipes['ingredients'] = recipes['ingredients'].apply(lambda x: ' '.join(ast.literal_eval(x)))
recipes['steps'] = recipes['steps'].apply(lambda x: ' '.join(ast.literal_eval(x)))

In [None]:
# Target: the duration class of every training recipe.
y = recipes['duration_label']

# Scale each count feature by its largest observed value.
recipes['norm_n_ingredients'] = recipes['n_ingredients'].div(max(recipes['n_ingredients']))
recipes['norm_n_steps'] = recipes['n_steps'].div(max(recipes['n_steps']))
n_ingrs, n_steps = recipes['norm_n_ingredients'], recipes['norm_n_steps']

# Feature matrix: the three embedding blocks side by side, then the two scaled counts.
X = pd.concat([names, steps, ingrs, n_ingrs, n_steps], axis=1)

In [None]:
# Keep 80% of the rows for training and hold out the rest for evaluation.
# No random_state is passed, so the split is different on every run.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, train_size=0.8)

In [None]:
# Support vector classifier with an RBF kernel and C=10.
clf = svm.SVC(C=10, kernel='rbf')

# Time the fit and report the elapsed wall-clock seconds.
start = time.time()
clf.fit(X_train, y_train)
elapsed = time.time() - start
print(f"Took: {elapsed}")

# Persist the fitted model, since training takes a few minutes.
with open("SVM-default-testing.pkl", "wb") as f:
    pickle.dump(clf, f)

In [None]:
# Load a previously saved model if required (this rebinds `clf`).
# NOTE(review): this reads "SVM-linear.pkl", not the "SVM-default-testing.pkl"
# written by the training cell -- confirm which model is meant to be evaluated.
# pickle.load can execute arbitrary code; only load pickle files you created yourself.
_MODEL_FILE = r"SVM-linear.pkl"
with open(_MODEL_FILE, "rb") as f:
    clf = pickle.load(f)

In [None]:
from sklearn.metrics import precision_recall_fscore_support

# Predict the held-out split with the current classifier.
predictions = clf.predict(X_test)

# scikit-learn expects (y_true, y_pred). The arguments were previously passed
# the other way round, which swaps the reported precision and recall.
precision, recall, _, _ = precision_recall_fscore_support(y_test, predictions)
print(precision, recall)

### Generating Predictions

In [None]:
# Pre-computed doc2vec100 features for the test recipes (one headerless CSV per text field).
names = pd.read_csv(r"datasets/recipe_text_features_doc2vec100/test_name_doc2vec100.csv", 
                    index_col = False, delimiter = ',', header=None)
steps = pd.read_csv(r"datasets/recipe_text_features_doc2vec100/test_steps_doc2vec100.csv", 
                    index_col = False, delimiter = ',', header=None)
# Fixed: this previously re-read the *name* features instead of the ingredient features.
# NOTE(review): the file name follows the `ingr` naming used by the
# count-vectoriser pickles -- confirm it against the dataset directory.
ingrs = pd.read_csv(r"datasets/recipe_text_features_doc2vec100/test_ingr_doc2vec100.csv", 
                          index_col = False, delimiter = ',', header=None)

# NOTE: this rebinds `recipes` to the test recipes.
recipes = pd.read_csv(r"datasets/recipe_test.csv", index_col = False, delimiter = ',')
# 'ingredients' and 'steps' hold string-formatted lists: parse each one and
# join its items into a single space-separated string.
recipes['ingredients'] = recipes['ingredients'].apply(lambda x: ' '.join(ast.literal_eval(x)))
recipes['steps'] = recipes['steps'].apply(lambda x: ' '.join(ast.literal_eval(x)))

# NOTE(review): these counts are scaled by the test set's own maximum, whereas
# the training features were scaled by the training maximum; the two scales
# only agree when both maxima happen to be equal.
recipes['norm_n_ingredients'] = recipes['n_ingredients'] / max(recipes['n_ingredients'])
n_ingrs = recipes['norm_n_ingredients']

recipes['norm_n_steps'] = recipes['n_steps'] / max(recipes['n_steps'])
n_steps = recipes['norm_n_steps']

# Same column layout as the training feature matrix.
X = pd.concat([names, steps, ingrs, n_ingrs, n_steps], axis=1)

In [None]:
# Predict a label for every row of the test feature matrix with the current classifier.
predictions = clf.predict(X)

In [None]:
# Number the predictions from 1 and write them as an (id, duration_label) CSV.
submission = pd.DataFrame(enumerate(predictions, start=1), columns=('id', 'duration_label'))
with open('prediction-RBF-C10.csv', 'w+') as f:
    submission.to_csv(f, line_terminator='\n', index=False)

## Approach 2: Naive Bayes

In [None]:
def _load_pickle(path):
    """Return the object unpickled from the file at `path`."""
    with open(path, "rb") as f:
        return pickle.load(f)


# Pickled count vectorisers, one per text field.
names_CV = _load_pickle(r"datasets/recipe_text_features_countvec/train_name_countvectorizer.pkl")
steps_CV = _load_pickle(r"datasets/recipe_text_features_countvec/train_steps_countvectorizer.pkl")
ingrs_CV = _load_pickle(r"datasets/recipe_text_features_countvec/train_ingr_countvectorizer.pkl")

In [None]:
# `recipes` was last assigned from recipe_test.csv (the prediction set) in the
# SVM section, so reload the labelled training recipes before splitting.
recipes = pd.read_csv(r"datasets/recipe_train.csv", index_col = False, delimiter = ',')
# 'ingredients' and 'steps' hold string-formatted lists: parse each one and
# join its items into a single space-separated string.
recipes['ingredients'] = recipes['ingredients'].apply(lambda x: ' '.join(ast.literal_eval(x)))
recipes['steps'] = recipes['steps'].apply(lambda x: ' '.join(ast.literal_eval(x)))

# After the split `recipes` holds the training rows and `X_test` the held-out
# rows; both are still raw recipe frames, not feature matrices.
recipes, X_test, y_train, y_test = model_selection.train_test_split(recipes, recipes['duration_label'], train_size=0.8)

X_names = recipes['name']
X_steps = recipes['steps']
X_ingrs = recipes['ingredients']

# Vectorise the three text fields into dense count matrices.
X_names = names_CV.transform(X_names).toarray()
X_steps = steps_CV.transform(X_steps).toarray()
X_ingrs = ingrs_CV.transform(X_ingrs).toarray()
# The two count features become single-column matrices.
X_nstep = recipes['n_steps'].to_numpy().reshape(-1, 1)
X_ningr = recipes['n_ingredients'].to_numpy().reshape(-1, 1)

In [None]:
# One Gaussian Naive Bayes model per feature group.
gnb_names = naive_bayes.GaussianNB()
gnb_steps = naive_bayes.GaussianNB()
gnb_ingrs = naive_bayes.GaussianNB()
gnb_nstep = naive_bayes.GaussianNB()
gnb_ningr = naive_bayes.GaussianNB()

models = (gnb_names, gnb_steps, gnb_ingrs, gnb_nstep, gnb_ningr)
data = (X_names, X_steps, X_ingrs, X_nstep, X_ningr)

# `X_test` is a raw recipe frame, so build the matching feature matrix for
# each model before scoring on it.
held_out = (
    names_CV.transform(X_test['name']).toarray(),
    steps_CV.transform(X_test['steps']).toarray(),
    ingrs_CV.transform(X_test['ingredients']).toarray(),
    X_test['n_steps'].to_numpy().reshape(-1, 1),
    X_test['n_ingredients'].to_numpy().reshape(-1, 1),
)

# Fixed: each model is now fitted on its own feature group. The loop variable
# was previously ignored and every model was fitted on the SVM's `X_train`.
for model, X, X_held_out in zip(models, data, held_out):
    model.fit(X, y_train)
    print(model.score(X_held_out, y_test))

In [None]:
def ensemble_predict(X):
    """Return one weighted-vote label per row of the recipe frame `X`.

    Each row is classified by the five Gaussian NB models: name and steps
    carry weight 2, ingredients weight 1, and the step and ingredient counts
    weight 3 each. The label with the largest total wins; on a tie the
    smallest label is returned.

    X: DataFrame with 'name', 'steps', 'ingredients', 'n_steps' and
        'n_ingredients' columns.
    """
    # (column, vectoriser, model, weight) for the text-based voters.
    text_voters = (
        ('name', names_CV, gnb_names, 2),
        ('steps', steps_CV, gnb_steps, 2),
        ('ingredients', ingrs_CV, gnb_ingrs, 1),
    )
    # (column, model, weight) for the count-based voters.
    count_voters = (
        ('n_steps', gnb_nstep, 3),
        ('n_ingredients', gnb_ningr, 3),
    )

    result = []
    for _, row in X.iterrows():
        tally = {1: 0, 2: 0, 3: 0}

        for column, vectoriser, model, weight in text_voters:
            features = vectoriser.transform([row[column]]).toarray()
            tally[model.predict(features)[0]] += weight

        for column, model, weight in count_voters:
            features = np.array(row[column]).reshape(-1, 1)
            tally[model.predict(features)[0]] += weight

        result.append(max(tally, key=tally.get))

    return result
        
def ensemble_score(X, y):
    """Return the fraction of rows of `X` whose ensemble prediction equals `y`.

    Also prints the predictions, the labels and their element-wise comparison.

    X: recipe DataFrame accepted by `ensemble_predict`.
    y: pandas Series of true labels, aligned with the rows of `X`.
    """
    predicted = np.array(ensemble_predict(X))
    actual = y.to_numpy()

    print(predicted)
    print(actual)
    matches = np.equal(predicted, actual)
    print(matches)

    return np.count_nonzero(matches) / len(actual)

In [None]:
# Accuracy of each model on its own training features. These matrices were
# built from the training split, so they must be scored against `y_train`;
# `y` holds the labels of the whole unsplit dataset and has a different length.
print(gnb_names.score(X_names, y_train))
print(gnb_steps.score(X_steps, y_train))
print(gnb_ingrs.score(X_ingrs, y_train))
print(gnb_nstep.score(X_nstep, y_train))
print(gnb_ningr.score(X_ningr, y_train))

In [None]:
# `recipes` holds only the training split here, so compare against `y_train`
# rather than `y`, which is the label column of the whole unsplit dataset.
ensemble_score(recipes, y_train)

In [None]:
# Precision and recall of the ensemble on the training split. scikit-learn
# expects (y_true, y_pred), and the true labels for `recipes` are `y_train`.
predictions = ensemble_predict(recipes)
precision, recall, _, _ = precision_recall_fscore_support(y_train, predictions)
print(precision, recall)

### GNB Predictions
The test recipes must first be count-vectorised with the same vectorisers used for the training data.

In [None]:
test_recipes = pd.read_csv(r"datasets/recipe_test.csv", index_col = False, delimiter = ',')
# 'ingredients' and 'steps' hold string-formatted lists: parse each one and
# join its items into a single space-separated string.
test_recipes['ingredients'] = test_recipes['ingredients'].apply(lambda x: ' '.join(ast.literal_eval(x)))
test_recipes['steps'] = test_recipes['steps'].apply(lambda x: ' '.join(ast.literal_eval(x)))

# Dense count matrices for the three text fields.
test_X_names = names_CV.transform(test_recipes['name']).toarray()
test_X_steps = steps_CV.transform(test_recipes['steps']).toarray()
test_X_ingrs = ingrs_CV.transform(test_recipes['ingredients']).toarray()

# Single-column matrices for the two count features.
test_X_nstep = test_recipes['n_steps'].to_numpy().reshape(-1, 1)
test_X_ningr = test_recipes['n_ingredients'].to_numpy().reshape(-1, 1)

test_data = (test_X_names, test_X_steps, test_X_ingrs, test_X_nstep, test_X_ningr)

In [None]:
# `test_recipes` is intentionally not re-read here: reloading the CSV threw
# away the list parsing applied when it was loaded in the previous cell, so
# the vectorisers received the raw string-formatted lists instead of the
# joined text used for training.
predictions = ensemble_predict(test_recipes)

# Number the predictions from 1 and write them as an (id, duration_label) CSV.
with open('prediction-GNB-ensemble.csv', 'w+') as f: 
    pd.DataFrame(enumerate(predictions, start=1), columns=('id', 'duration_label')).to_csv(f, line_terminator='\n', index=False)

## Approach 3: Neural Network

In [None]:
recipes = pd.read_csv(r"datasets/recipe_train.csv", index_col = False, delimiter = ',')
# 'ingredients' and 'steps' hold string-formatted lists: parse each one and
# join its items into a single space-separated string.
recipes['ingredients'] = recipes['ingredients'].apply(lambda x: ' '.join(ast.literal_eval(x)))
recipes['steps'] = recipes['steps'].apply(lambda x: ' '.join(ast.literal_eval(x)))

# 90% of the recipes go to `train`, the remainder to `test`.
train, test = model_selection.train_test_split(recipes, train_size=0.9)

In [None]:
# for i, instance in a.iterrows():
#     label = int(instance['duration_label'])
# 
#     string_rep = 'number steps: ' + str(instance['n_steps']) + \
#         '\nnumber ingredients: ' + str(instance['n_ingredients']) + \
#         '\nname: ' + instance['name'] + \
#         '\ningredients: ' + instance['ingredients'] + \
#         '\nsteps: ' + instance['steps']
#  
#     with open(f'NN-datasets/full-train/{label}/{i}.txt', 'w+') as f:
#         f.write(string_rep)

In [None]:
_BATCH_SIZE = 50
_SEED = 42069

# Options shared by the training and validation subsets of 'NN-datasets/train':
# the same batch size, the same 0.2 validation fraction and the same seed.
_SPLIT_OPTIONS = dict(batch_size=_BATCH_SIZE, validation_split=0.2, seed=_SEED)

raw_train_ds = preprocessing.text_dataset_from_directory(
    'NN-datasets/train', subset='training', **_SPLIT_OPTIONS)

# raw_train_ds = preprocessing.text_dataset_from_directory(
#     'NN-datasets/full-train',
#     batch_size=_BATCH_SIZE)

raw_val_ds = preprocessing.text_dataset_from_directory(
    'NN-datasets/train', subset='validation', **_SPLIT_OPTIONS)

# The test directory is loaded whole, without a validation split.
raw_test_ds = preprocessing.text_dataset_from_directory(
    'NN-datasets/test', batch_size=_BATCH_SIZE)

In [None]:
# Token cap and fixed output sequence length handed to the vectorisation layer below.
_VOCAB_SIZE = 10000
_MAX_SEQUENCE_LENGTH = 300

# Text vectoriser configured for integer output with the limits above.
int_vectorize_layer = TextVectorization(
    max_tokens=_VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=_MAX_SEQUENCE_LENGTH)

In [None]:
# Make a text-only dataset (without labels), then call adapt
# Drop the labels to get a text-only view of the training subset, then adapt
# the vectorisation layer on that text.
train_text = raw_train_ds.map(lambda text, labels: text)
int_vectorize_layer.adapt(train_text)

In [None]:
def int_vectorize_text(text, label):
    """Return `(vectorised text, label)` for one `(text, label)` pair.

    The text gets a trailing axis added and is passed through
    `int_vectorize_layer`; the label is returned unchanged.
    """
    expanded = tf.expand_dims(text, -1)
    vectorized = int_vectorize_layer(expanded)
    return vectorized, label

# Take the first batch of the training subset and inspect its first recipe.
text_batch, label_batch = next(iter(raw_train_ds))
first_recipe = text_batch[0]
first_label = label_batch[0]
print("Recipe", first_recipe)
print("Label", first_label)

# Show what the vectoriser produces for that recipe.
vectorized_example = int_vectorize_text(first_recipe, first_label)
print("'int' vectorized recipe:", vectorized_example[0])


In [None]:
# Apply the text vectoriser to every (text, label) pair of the three raw datasets.
int_train_ds, int_val_ds, int_test_ds = (
    raw_ds.map(int_vectorize_text)
    for raw_ds in (raw_train_ds, raw_val_ds, raw_test_ds)
)

In [None]:
AUTOTUNE = tf.data.AUTOTUNE


def configure_dataset(dataset):
    """Return `dataset` with caching applied, then prefetching with an AUTOTUNE buffer."""
    cached = dataset.cache()
    return cached.prefetch(buffer_size=AUTOTUNE)


# Configure the three vectorised datasets in place.
int_train_ds, int_val_ds, int_test_ds = map(
    configure_dataset, (int_train_ds, int_val_ds, int_test_ds))

In [None]:
# Bare expression: shows the dataset object as the cell output.
int_train_ds

In [None]:
# def create_model(vocab_size, num_labels):
#   model = tf.keras.Sequential([
#       layers.Embedding(vocab_size, 128, mask_zero=True),
#       layers.LSTM(64),
#       layers.Dropout(0.3),
#       layers.Dense(num_labels)
#   ])
#   return model

def create_model(vocab_size, num_labels):
    """Build an embedding -> 1D convolution -> global max pooling -> dense classifier.

    vocab_size: input size of the embedding layer.
    num_labels: number of units in the final Dense layer, which has no
        activation and therefore outputs raw scores.

    Returns the uncompiled Sequential model.
    """
    model = tf.keras.Sequential()
    model.add(layers.Embedding(vocab_size, 128, mask_zero=True))
    model.add(layers.Conv1D(128, 5, padding="valid", activation="relu", strides=1))
    model.add(layers.GlobalMaxPooling1D())
    model.add(layers.Dense(num_labels))
    return model

In [None]:
# Build the classifier for three labels.
# NOTE(review): the vocabulary size is passed as _VOCAB_SIZE + 1 -- presumably to
# leave room for an extra token id; confirm against the vectoriser's vocabulary.
int_model = create_model(vocab_size=_VOCAB_SIZE + 1, num_labels=3)
# from_logits=True because the final Dense layer has no activation.
int_model.compile(
    loss=losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'])

# TODO: find the shapes 
# current_shape = (300, 1)
# for model in int_model.layers:
#     print(model, current_shape := model.compute_output_shape(current_shape))

# Train for 10 epochs, validating on the validation subset after each one.
history = int_model.fit(int_train_ds, validation_data=int_val_ds, epochs=10)
# history = int_model.fit(int_train_ds, epochs=3)

In [None]:
# Print the layer-by-layer summary of the trained model.
int_model.summary()

In [None]:
# Predicted class names for the validation subset, converted to ints.
# NOTE: `get_string_labels` is defined in a later cell ("Prediction Model");
# that cell must be run before this one.
predictions = [int(i) for i in get_string_labels(int_model.predict(int_val_ds))]


In [None]:
# Collect the label batches of the validation dataset, one list per batch.
labels = [list(batch_labels) for _batch_text, batch_labels in int_val_ds]
# Flatten the per-batch lists into a single list of labels.
flat_labels = [label for batch in labels for label in batch]
# Shift each label up by one before comparing with the predictions
# (presumably class index -> class name; verify against the class directories).
int_labels = [int(label + 1) for label in flat_labels]

In [None]:
# scikit-learn expects (y_true, y_pred). The arguments were previously passed
# the other way round, which swaps the reported precision and recall.
precision, recall, _, _ = precision_recall_fscore_support(int_labels, predictions)
print(precision, recall)

In [None]:
# Evaluate on the test dataset; the model was compiled with one metric
# (accuracy), so evaluate() yields the loss followed by the accuracy.
int_loss, int_accuracy = int_model.evaluate(int_test_ds)

print(f"Int model accuracy: {int_accuracy}")

In [None]:
# Side-by-side plots of accuracy (left) and loss (right) over the epochs.
# The epoch count 10 is hard-coded and must match the `epochs` used when fitting.
plt.figure(figsize=(16, 8))
plt.subplot(1, 2, 1)
plot_graphs(history, 'accuracy', 10)
plt.ylim(None, 1)  # cap the accuracy axis at 1
plt.subplot(1, 2, 2)
plot_graphs(history, 'loss', 10)
plt.ylim(0, None)  # start the loss axis at 0

In [None]:
# Stand-alone plot of the loss curves; the epoch count 10 is hard-coded.
plt.figure(figsize=(8, 8))
plt.subplot(1, 1, 1)
plot_graphs(history, 'loss', 10)
plt.ylim(0, None)  # start the loss axis at 0

### Prediction Model

In [None]:
def get_string_labels(predicted_scores_batch):
    """Return the class name with the highest score for each row of the batch.

    predicted_scores_batch: per-class scores, one row per example.
    Class names are looked up in `raw_train_ds.class_names`.
    """
    best_class_index = tf.argmax(predicted_scores_batch, axis=1)
    return tf.gather(raw_train_ds.class_names, best_class_index)

In [None]:
# Chain the text vectoriser and the trained classifier so that the combined
# model accepts raw strings.
export_model = tf.keras.Sequential(
    [int_vectorize_layer, int_model])

# `int_model` ends in a Dense layer without activation and was trained with
# from_logits=True, so its outputs are logits and the loss here must treat
# them as such (this was previously from_logits=False).
export_model.compile(
    loss=losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'])

# Smoke-test the combined model on a few literal strings.
print(get_string_labels(export_model.predict(['test', 'longer test of a few words', 'small test'])))

In [None]:
def _read_text(path):
    """Return the full contents of the text file at `path`."""
    with open(path) as f:
        return f.read()


# Contents of 0.txt .. 9999.txt, in numeric order.
unlabeled_data = [_read_text(f'NN-datasets/unlabeled-test/{i}.txt') for i in range(10000)]


In [None]:
# Bare expression: shows the raw per-class scores for the unlabeled recipes as the cell output.
export_model.predict(unlabeled_data)

In [None]:
# Bare expression: shows the predicted class names for the unlabeled recipes as the cell output.
get_string_labels(export_model.predict(unlabeled_data))

In [None]:
# Predicted class names for the unlabeled recipes, converted to floats.
predictions = list(map(
    float, get_string_labels(export_model.predict(unlabeled_data)).numpy()))

In [None]:
# Render a layered diagram of the export model and display it.
visualkeras.layered_view(export_model).show()

In [None]:
# Number the predictions from 1 and write them as an (id, duration_label) CSV.
submission = pd.DataFrame(enumerate(predictions, start=1), columns=('id', 'duration_label'))
with open('NN-datasets/prediction-CNN-128.csv', 'w+') as f:
    submission.to_csv(f, line_terminator='\n', index=False)

### Temporal Convolutional Model

In [None]:
def tcn_model(vocab_size, num_labels, kernel_size = 3, activation='relu', input_dim = None, 
                   output_dim=300, max_length = None, emb_matrix = None):
    """Build and compile a temporal convolutional network (TCN) text classifier.

    Args:
        vocab_size: number of distinct token ids; input size of the embedding
            layer when no `emb_matrix` is given.
        num_labels: number of target classes; size of the softmax output.
        kernel_size: kernel size passed to both TCN layers.
        activation: activation passed to both TCN layers.
        input_dim: unused; kept so existing calls keep working.
        output_dim: unused (the embedding width is 128, or the width of
            `emb_matrix` when one is given); kept so existing calls keep working.
        max_length: length of the input token sequences, or None to accept
            sequences of any length.
        emb_matrix: optional pre-trained embedding weights with one row per
            token id. When given, the embedding is initialised from it and frozen.

    Returns:
        A compiled Keras model that maps batches of integer token sequences to
        per-class probabilities. It is trained with sparse categorical
        cross-entropy, i.e. with integer class labels.
    """
    # The input is a sequence of integer token ids. It was previously declared
    # as a (None, 300) float tensor, which the embedding would have turned
    # into a 4-D tensor that the 1-D layers below cannot consume.
    inp = layers.Input(shape=(max_length,), dtype='int64')

    if emb_matrix is not None:
        # Static embedding: initialise from the supplied matrix and keep it fixed.
        emb_matrix = np.asarray(emb_matrix)
        embedding = layers.Embedding(
            input_dim=emb_matrix.shape[0],
            output_dim=emb_matrix.shape[1],
            embeddings_initializer=tf.keras.initializers.Constant(emb_matrix),
            trainable=False)
    else:
        # Without pre-trained weights the embedding has to be learned; freezing
        # it would leave the model with fixed random vectors.
        embedding = layers.Embedding(input_dim=vocab_size, output_dim=128)
    x = embedding(inp)

    x = layers.SpatialDropout1D(0.1)(x)

    x = TCN(128, kernel_size=kernel_size, dilations=[1, 2, 4], return_sequences=True,
            activation=activation, name='tcn1')(x)
    x = TCN(64, kernel_size=kernel_size, dilations=[1, 2, 4], return_sequences=True,
            activation=activation, name='tcn2')(x)

    # Summarise the sequence with both its average and its maximum over time.
    avg_pool = layers.GlobalAveragePooling1D()(x)
    max_pool = layers.GlobalMaxPooling1D()(x)

    conc = layers.concatenate([avg_pool, max_pool])
    conc = layers.Dense(16, activation="relu")(conc)
    conc = layers.Dropout(0.1)(conc)
    # One probability per class. The previous single sigmoid unit trained with
    # binary cross-entropy cannot represent more than two classes.
    outp = layers.Dense(num_labels, activation="softmax")(conc)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [None]:
# Build the TCN classifier for three labels and print its layer summary.
model = tcn_model(vocab_size=_VOCAB_SIZE + 1, num_labels=3)
model.summary()

# Train for 4 epochs on the vectorised datasets.
# NOTE: this rebinds `history`, replacing the history of the earlier CNN run.
history = model.fit(int_train_ds, validation_data=int_val_ds, epochs=4)

### Precision and Recall Calculation