In [None]:
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import tensorflow as tf
import tensorflow_hub as hub

# The CSV is read with header=None, so columns are addressed by integer position (0, 1, ...).
DATA_PATH = "nofo_final_reduced_header.csv"
df = pd.read_csv(DATA_PATH, header=None)

In [None]:
# Shuffle once, then cut at 80% and 90% of the rows: 80% train / 10% validation / 10% test.
# (The previous comment said 60/20/20, which did not match the cut points.)
# Positional slicing replaces np.split(DataFrame), which goes through the deprecated
# DataFrame.swapaxes and emits a FutureWarning on recent pandas releases.
# random_state pins the shuffle so a Restart & Run All reproduces the same partition,
# matching the seeded train_test_split used further down.
shuffled = df.sample(frac=1, random_state=0)
train_end = int(0.8 * len(df))
val_end = int(0.9 * len(df))

train = shuffled.iloc[:train_end]
val = shuffled.iloc[train_end:val_end]
test = shuffled.iloc[val_end:]

len(train), len(val), len(test)

In [None]:
def df_to_dataset(dataframe, shuffle=True, batch_size=256, text_col=0, label_col=1):
  """Build a batched, prefetched tf.data.Dataset of (text, label) pairs.

  Args:
    dataframe: frame holding the inputs in `text_col` and the targets in `label_col`.
    shuffle: when True, shuffle with a buffer covering the whole frame.
    batch_size: number of rows per batch.
    text_col: column label of the model input (defaults to 0, the original behaviour).
    label_col: column label of the target (defaults to 1, the original behaviour).

  Returns:
    A tf.data.Dataset yielding (text_batch, label_batch) tuples.
  """
  # Plain column reads leave the caller's frame untouched, so no defensive copy/pop is needed.
  texts = dataframe[text_col]
  labels = dataframe[label_col]
  ds = tf.data.Dataset.from_tensor_slices((texts, labels))
  if shuffle:
    # shuffle() needs a positive buffer size; guard against an empty frame.
    ds = ds.shuffle(buffer_size=max(len(dataframe), 1))
  ds = ds.batch(batch_size)
  ds = ds.prefetch(tf.data.AUTOTUNE)
  return ds

# One tf.data pipeline per partition, all built with df_to_dataset's default settings.
train_data, valid_data, test_data = (
    df_to_dataset(part) for part in (train, val, test)
)

In [None]:
# TF-Hub handle of the text-embedding module that becomes the model's first layer.
embedding = "https://tfhub.dev/google/nnlm-en-dim50/2"
hub_layer = hub.KerasLayer(
    embedding,
    trainable=True,   # embedding weights are updated during fit
    dtype=tf.string,  # the layer is fed raw strings
)

In [None]:
# Text classifier: hub embedding -> two Dense(16)/Dropout(0.4) stages -> single sigmoid unit.
model = tf.keras.Sequential([
    hub_layer,  # first layer turns raw text into a numeric vector
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

In [None]:
# Adam + binary cross-entropy, tracking accuracy.
adam_opt = tf.keras.optimizers.Adam(learning_rate=0.001)
bce_loss = tf.keras.losses.BinaryCrossentropy()
model.compile(optimizer=adam_opt, loss=bce_loss, metrics=['accuracy'])

In [None]:
# Loss/accuracy on the training batches; this cell sits before the fit cell.
model.evaluate(x=train_data)

In [None]:
# Loss/accuracy on the validation batches; this cell sits before the fit cell.
model.evaluate(x=valid_data)

In [None]:
# Train for 5 epochs, validating on valid_data after each one.
history = model.fit(
    x=train_data,
    validation_data=valid_data,
    epochs=5,
)

In [None]:
# Loss/accuracy of the fitted text model on the test batches.
model.evaluate(x=test_data)

In [None]:
# Score every row of the text column (column 0) with the fitted text model.
# NOTE(review): df still contains the rows the model was trained on, so these scores are
# not out-of-sample for those rows -- confirm that is acceptable before using them as a feature.
examples = df[0].values.tolist()
print(examples[:5])  # preview only; printing the full list floods the cell output
test_predict = model.predict(examples)

In [None]:
# Each prediction row is indexed at [0] to pull out its single score.
outcome = [row[0] for row in test_predict]
df['text_predictions'] = outcome

In [None]:
# Peek at the first five rows, now including the text_predictions column.
df.head(n=5)

In [None]:
# Keep columns 3, 5, 7, 8, 9, 10 and the text-model score, with column 1 (used as the
# label further down) in last position.
# .copy() makes the selection an independent frame, so the later `df[5] = ...` assignment
# does not trigger pandas' SettingWithCopyWarning.
df = df[[3, 5, 7, 8, 9, 10, 'text_predictions', 1]].copy()

In [None]:
# Binarise column 5: 1 for any non-zero value, 0 otherwise.
spec = [1 if value != 0 else 0 for value in df[5]]

In [None]:
# Overwrite column 5 with the 0/1 flags held in `spec` (1 where the original value was non-zero).
df[5] = spec

In [None]:
# Peek at the first five rows after column 5 was replaced.
df.head(n=5)

In [None]:
# for i in range(len(df.columns[:-1])):
#     label = df.columns[i]
#     plt.hist(df[df[1]==1][label],color='blue', label='Subheader',alpha=0.7)
#     plt.hist(df[df[1]==0][label],color='red', label='Not Header',alpha=0.7)
#     plt.title(label)
#     plt.ylabel('N')
#     plt.xlabel(label)
#     plt.legend()
#     plt.show()

In [None]:
# Every column except the last is a feature; the last column is the target.
feature_cols = df.columns[:-1]
label_col = df.columns[-1]
x = df[feature_cols].values
y = df[label_col].values

# Standardise the features.
# NOTE(review): the scaler is fitted on all rows before the train/validation/test split,
# so its statistics include the held-out rows -- consider fitting on the training part only.
scaler = StandardScaler().fit(x)
x = scaler.transform(x)

In [None]:
# over = RandomOverSampler()
# x, y = over.fit_resample(x,y)

# 60/20/20 split: hold out 40% first, then halve the hold-out into validation and test.
first_cut = train_test_split(x, y, test_size=0.4, random_state=0)
x_train, x_temp, y_train, y_temp = first_cut
second_cut = train_test_split(x_temp, y_temp, test_size=0.5, random_state=0)
x_valid, x_test, y_valid, y_test = second_cut

In [None]:
# Second classifier, fed the scaled feature matrix: same Dense/Dropout stack as the
# text model but without the hub embedding layer.
model_2 = tf.keras.Sequential([
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

# Adam + binary cross-entropy, tracking accuracy.
adam_opt_2 = tf.keras.optimizers.Adam(learning_rate=0.001)
bce_loss_2 = tf.keras.losses.BinaryCrossentropy()
model_2.compile(optimizer=adam_opt_2, loss=bce_loss_2, metrics=['accuracy'])

In [None]:
# Loss/accuracy on the training split; this cell sits before the fit cell.
model_2.evaluate(x=x_train, y=y_train)

In [None]:
# Train for 20 epochs in mini-batches of 16, validating on the validation split each epoch.
model_2.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_valid, y_valid),
    epochs=20,
    batch_size=16,
)

In [63]:
# Loss/accuracy of the fitted feature model on the test split.
model_2.evaluate(x=x_test, y=y_test)



[0.005966543219983578, 0.9987203478813171]

In [None]:
# Persist both fitted models under header_models/.
for net, out_dir in (
    (model, 'header_models/model_1'),
    (model_2, 'header_models/model_2'),
):
    net.save(out_dir)