<a href="https://colab.research.google.com/github/mawalz05/WSTP/blob/main/WSTP_NLPModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Installng tensorflow and transfomers to use pre-trained BERT
import tensorflow as tf
!pip install -q transformers

# Upload the three CSV files from the local drive, one upload prompt per call.
# Later cells look the file contents up by filename in the returned mappings:
#   uploaded  -> 'nlp_batch_3_complete.csv'
#   uploaded2 -> 'nlp_batch_csr_complete.csv'
#   uploaded3 -> 'nlp_unlabelled_test.csv'
from google.colab import files
uploaded, uploaded2, uploaded3 = (files.upload() for _ in range(3))

In [None]:
import io
import pandas as pd
import numpy as np
def _read_labelled_csv(upload, filename):
  """Read one uploaded CSV and keep only complete (text, sustainable) rows."""
  raw = pd.read_csv(io.BytesIO(upload[filename]), encoding = "ISO-8859-1")
  # Rows with a missing text or label are dropped
  return raw[['text', 'sustainable']].dropna()

# The two labelled batches
df_1 = _read_labelled_csv(uploaded, 'nlp_batch_3_complete.csv')
df_2 = _read_labelled_csv(uploaded2, 'nlp_batch_csr_complete.csv')

# Stack both batches row-wise into a single frame
df = pd.concat([df_1, df_2], axis = 0)


# Turn the 'sustainable' column into a binary 'label':
#   'Company' / 'General' -> 1, 'No' -> 0, anything else is kept as-is and
#   filtered out right below.
# Uses numpy directly: the `pd.np` alias was deprecated in pandas 1.0 and no
# longer exists in pandas 2.x.
sustainable = df['sustainable']
df['label'] = np.where(
    sustainable.isin(['Company', 'General']), 1,
    np.where(sustainable == 'No', 0, sustainable))

# Keep only the rows that ended up with a 0/1 label
df = df[(df['label'] == 0) | (df['label'] == 1)]

df = df.dropna()


# To iterate faster, a shuffled subsample can be used instead of the full frame,
# e.g. df = df.sample(frac = 1)[:100000]

# Target (binary label) and feature (raw text) vectors
target = df['label']
features = df['text']

# Hold out 30% of the rows for testing. random_state is fixed so that the split
# (and therefore the reported validation numbers) is the same on every run.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size = .30, shuffle = True, random_state = 42)

# Turn the feature text into plain Python lists of tweets
text_list_train = X_train.tolist()
text_list_test = X_test.tolist()

In [None]:
# Sanity check: print the column dtypes and non-null counts of the merged, labelled frame
df.info()

In [None]:
import transformers
from transformers import BertTokenizer

# Sequence length every example is encoded to (BERT accepts up to 512 tokens)
max_length = 50

# Examples per batch. 16 or 32 is the usual recommendation for BERT; smaller
# batches work better for regularization.
batch_size = 6

# Lower-casing WordPiece tokenizer matching the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)

# Creating a function that tokenizes the data
def convert_example_to_feature(tweet):
  """Tokenize one text with the module-level ``tokenizer``.

  Args:
    tweet: the raw text to encode.

  Returns:
    The tokenizer's encoding for the text; callers read its 'input_ids',
    'token_type_ids' and 'attention_mask' entries. Every encoding is exactly
    ``max_length`` tokens long: shorter texts are padded with [PAD], longer
    ones are truncated.
  """
  return tokenizer.encode_plus(
      tweet,
      add_special_tokens = True, # add [CLS] and [SEP]
      max_length = max_length, # length every encoding is padded / truncated to
      padding = 'max_length', # replaces the deprecated pad_to_max_length = True
      truncation = True, # explicitly cut texts longer than max_length
      return_attention_mask = True, # mask so attention does not focus on [PAD] tokens
      )

# Creating a function that can map the input, tokens, and attention masks, and the label
def map_example_to_dict(input_ids, attention_mask, token_type_ids, label):
  """Pack one encoded example into a ``(features, label)`` pair.

  Note the positional order: ``attention_mask`` comes before ``token_type_ids``.

  Returns:
    A tuple of the feature dict (keys 'input_ids', 'token_type_ids',
    'attention_mask') and the unchanged ``label``.
  """
  model_inputs = dict(
      input_ids = input_ids,
      token_type_ids = token_type_ids,
      attention_mask = attention_mask,
  )
  return model_inputs, label

# Creating a function to conform to the input standards of BERT
def encode_examples(X, y, limit = -1):
  """Encode texts and labels into a ``tf.data.Dataset`` of model inputs.

  Args:
    X: iterable of raw texts.
    y: iterable of labels, aligned with ``X``.
    limit: when positive, only the first ``limit`` texts and labels are
      encoded; the default (-1) encodes everything.

  Returns:
    An unbatched dataset whose elements are the ``(features, label)`` pairs
    produced by ``map_example_to_dict``.
  """
  if limit > 0:
    # Slice a list copy: the inputs are a Python list and a pandas Series, and
    # neither supports `.take(limit)` as "first `limit` items" (a list has no
    # `.take`, and Series.take expects positional indices).
    X = list(X)[:limit]
    y = list(y)[:limit]

  input_ids_list = []
  token_type_ids_list = []
  attention_mask_list = []

  for tweet in X:
    bert_input = convert_example_to_feature(tweet)

    input_ids_list.append(bert_input['input_ids'])
    token_type_ids_list.append(bert_input['token_type_ids'])
    attention_mask_list.append(bert_input['attention_mask'])

  # Each label is wrapped in its own one-element list
  label_list = [[tf.dtypes.cast(label, tf.int32)] for label in y]

  # The tuple order here must match map_example_to_dict's positional parameters
  encoded = tf.data.Dataset.from_tensor_slices(
      (input_ids_list, attention_mask_list, token_type_ids_list, label_list))
  return encoded.map(map_example_to_dict)

In [None]:
# Training set: encode, shuffle with a 10000-element buffer, then batch
train_examples = encode_examples(text_list_train, y_train)
ds_train_encoded = train_examples.shuffle(10000).batch(batch_size)

# Test set: encode and batch only; the order is left untouched
test_examples = encode_examples(text_list_test, y_test)
ds_test_encoded = test_examples.batch(batch_size)

In [None]:
from transformers import TFBertForSequenceClassification
import tensorflow as tf

# Fine-tuning hyperparameters.
# Learning rates recommended for Adam with BERT: 5e-5, 3e-5, 2e-5
learning_rate = 2e-5
# Five epochs only, for illustration
number_of_epochs = 5

# Pre-trained BERT with a sequence-classification head
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')

# The labels are integer class ids rather than one-hot vectors, hence the
# sparse variants of cross entropy and accuracy. The loss is configured to
# receive raw logits (from_logits = True).
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

# Adam optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate, epsilon = 1e-08)

model.compile(
    optimizer = optimizer,
    loss = loss,
    metrics = [metric],
)

In [None]:
# Path prefix under which the model weights are checkpointed
checkpoint_path = "training_1/cp.ckpt"

# Callback that saves only the model's weights (not the full model) during training
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_path,
    save_weights_only = True,
    verbose = 1,
)

# Fine-tune BERT; the held-out test split is used as validation data
fit_callbacks = [cp_callback]
bert_history = model.fit(
    ds_train_encoded,
    epochs = number_of_epochs,
    validation_data = ds_test_encoded,
    callbacks = fit_callbacks,
)

In [None]:
# New data to run predictions on, read from the third upload
new_csv_bytes = uploaded3['nlp_unlabelled_test.csv']
df2 = pd.read_csv(io.BytesIO(new_csv_bytes), encoding = "ISO-8859-1")

# Text column to predict on, plus the file's 'sustainable' column.
# NOTE(review): target_new is not used by the prediction cells below - confirm it is still needed.
features_new, target_new = df2['text'], df2['sustainable']

# Plain Python list of texts for the encoder
text_list_new = features_new.tolist()

import transformers
from transformers import BertTokenizer

# Same encoding settings as used for training, redefined for the prediction data.
# Token length of every encoded example (BERT's upper bound is 512)
max_length = 50

# Batch size; BERT is usually run with 16 or 32, smaller batches work better
# for regularization
batch_size = 6

# Lower-casing tokenizer for the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)

# Creating a function that tokenizes the data
def convert_example_to_feature(tweet):
  """Tokenize one text with the module-level ``tokenizer``.

  Args:
    tweet: the raw text to encode.

  Returns:
    The tokenizer's encoding for the text; callers read its 'input_ids',
    'token_type_ids' and 'attention_mask' entries. Every encoding is exactly
    ``max_length`` tokens long: shorter texts are padded with [PAD], longer
    ones are truncated.
  """
  return tokenizer.encode_plus(
      tweet,
      add_special_tokens = True, # add [CLS] and [SEP]
      max_length = max_length, # length every encoding is padded / truncated to
      padding = 'max_length', # replaces the deprecated pad_to_max_length = True
      truncation = True, # explicitly cut texts longer than max_length
      return_attention_mask = True, # mask so attention does not focus on [PAD] tokens
      )

# Creating a function that maps the input ids, token type ids, and attention mask into a dict (no label: this data is unlabelled)
def map_example_to_dict(input_ids, attention_mask, token_type_ids):
  """Pack one encoded, unlabelled example into a feature dict.

  Note the positional order: ``attention_mask`` comes before ``token_type_ids``.

  Returns:
    A dict with the keys 'input_ids', 'token_type_ids' and 'attention_mask'.
  """
  model_inputs = dict(
      input_ids = input_ids,
      token_type_ids = token_type_ids,
      attention_mask = attention_mask,
  )
  return model_inputs

# Creating a function to conform to the input standards of BERT
def encode_examples(X, limit = -1):
  """Encode unlabelled texts into a ``tf.data.Dataset`` of model inputs.

  Args:
    X: iterable of raw texts.
    limit: when positive, only the first ``limit`` texts are encoded; the
      default (-1) encodes everything.

  Returns:
    An unbatched dataset whose elements are the feature dicts produced by
    ``map_example_to_dict``.
  """
  if limit > 0:
    # Slice a list copy: the input is a Python list, which has no `.take`
    X = list(X)[:limit]

  input_ids_list = []
  token_type_ids_list = []
  attention_mask_list = []

  for tweet in X:
    bert_input = convert_example_to_feature(tweet)

    input_ids_list.append(bert_input['input_ids'])
    token_type_ids_list.append(bert_input['token_type_ids'])
    attention_mask_list.append(bert_input['attention_mask'])

  # The tuple order here must match map_example_to_dict's positional parameters
  encoded = tf.data.Dataset.from_tensor_slices(
      (input_ids_list, attention_mask_list, token_type_ids_list))
  return encoded.map(map_example_to_dict)

# Encode the new, unlabelled texts and batch them for prediction
new_examples = encode_examples(text_list_new)
ds_new_encoded = new_examples.batch(batch_size)

In [None]:
# Run the fine-tuned model on the new data; the first element of the output
# holds the per-class scores.
sentiment = model.predict(ds_new_encoded, verbose = 1)

# The model was compiled with a sparse categorical cross-entropy loss on
# logits, so class probabilities come from a softmax across the class axis
# (an element-wise sigmoid would not sum to 1 across classes).
tf_prediction = tf.nn.softmax(sentiment[0], axis = -1)

# Predicted class per example: index of the highest probability
label = tf.argmax(tf_prediction, axis = 1).numpy()

den = len(label)        # number of examples scored
nom = int(label.sum())  # number of examples predicted as class 1

print(den)
print(nom)

# Share of examples predicted as class 1
score = nom/den
print(score)

# Rescale the share from [0, 1] to a 50-100 range
total = (score/2 + .5)*100
print(total)

In [None]:
######################################################
# This is the end

In [None]:
####################################################
# Extra code not needed.

In [None]:
import os

from transformers import TFBertForSequenceClassification
import tensorflow as tf

# tf.train.latest_checkpoint takes the DIRECTORY that holds the checkpoint
# files, not the checkpoint path prefix itself (it returns None otherwise).
latest = tf.train.latest_checkpoint(os.path.dirname(checkpoint_path))
if latest is None:
  raise FileNotFoundError(
      "No checkpoint found in '%s'" % os.path.dirname(checkpoint_path))

# Recommended learning rates for Adam 5e-5, 3e-5, 2e-5
learning_rate = 2e-5

# Running only five epochs for illustration
number_of_epochs = 5

# Model initialization
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')

# Use Adam
optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate, epsilon = 1e-08)

# We do not have one-hot vectors, so we can use sparse categorical cross entropy and accuracy
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer = optimizer, loss = loss, metrics = [metric])

# Restore the fine-tuned weights saved by the ModelCheckpoint callback
model.load_weights(latest)

# Make predictions. The model has no `predict_proba`; `predict` returns the
# raw class scores, so apply a softmax afterwards if probabilities are needed.
sentiment = model.predict(ds_new_encoded, verbose = 1)