In [None]:
%load_ext autoreload
%autoreload 2

!pip install transformers
!pip install sentencepiece

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import re
import random
import numpy as np
import pandas as pd

import tensorflow as tf
tf.config.optimizer.set_jit(True)

from tqdm.auto import tqdm
from transformers import XLMRobertaTokenizer, BartTokenizer, RobertaTokenizer, T5Tokenizer

import os
import yaml
from pathlib import Path

import sys
sys.path.append('/content/drive/MyDrive/Colab Notebooks/ML Projects/Google AI4Code/modules')
import pipeline, protobuf_handler, dual_code_rep_fc, data_cleaner, triple_code_rep_fc

print('Libraries Imported')

In [None]:
# Extract the three dataset archives stored on Drive to directories under /content
main_path = '/content/drive/MyDrive/Colab Notebooks/ML Projects/Google AI4Code/data/AI4Code.zip'
kag_path = '/content/drive/MyDrive/Colab Notebooks/ML Projects/Google AI4Code/data/kag_train.zip'
jup_path = '/content/drive/MyDrive/Colab Notebooks/ML Projects/Google AI4Code/data/jup_train.zip'

main_disk_path = '/content/AI4Code'
kag_disk_path = '/content/kag_train'
jup_disk_path = '/content/jup_train'

# (archive on Drive, extraction target) pairs, handled in the same order as before.
for archive_path, target_dir in ((main_path, main_disk_path),
                                 (kag_path, kag_disk_path),
                                 (jup_path, jup_disk_path)):
  pipeline.unzip_files(archive_path, target_dir)

Files have already been unzipped to disk path.
Files have already been unzipped to disk path.
Files have already been unzipped to disk path.


In [None]:
# Get correct order of documents
# `df_orders` ends up as one Series combining the competition orders with the
# extra Kaggle ("kag") and Jupyter ("jup") orders loaded from JSON.
main_data_dir = Path('/content/AI4Code')

# `read_csv(squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0.
# Squeezing the single non-index column afterwards yields the same Series,
# whose strings are then split on whitespace into lists.
df_orders = (pd.read_csv(main_data_dir / 'train_orders.csv', index_col='id')
             .squeeze('columns')
             .str.split())

kag_orders_path = '/content/drive/MyDrive/Colab Notebooks/ML Projects/Google AI4Code/data/kag_orders.json'
kag_orders = pd.read_json(kag_orders_path, typ='series')

jup_orders_path = '/content/drive/MyDrive/Colab Notebooks/ML Projects/Google AI4Code/data/jup_orders.json'
jup_orders = pd.read_json(jup_orders_path, typ='series')

df_orders = pd.concat([df_orders, kag_orders, jup_orders])

In [None]:
# Load in excluded IDs and drop them from the data filepath
# Available options: 'similars', 'dupe_md', 'single_code_cell', 'less_than_five', 'less_than_eight', 'less_than_four', 'single_md', 'non_english_ids', 'outliers'

excluded_ids_path = '/content/drive/MyDrive/Colab Notebooks/ML Projects/Google AI4Code/data/excluded_ids.yaml'
outlier_ids_path = '/content/drive/MyDrive/Colab Notebooks/ML Projects/Google AI4Code/data/outlier_ids.yaml'

similar_ids = pipeline.load_yaml_file(excluded_ids_path)['similars']
outlier_ids = pipeline.load_yaml_file(outlier_ids_path)['ids']

random_seed = 20
data_paths = list((main_data_dir / 'train').glob('*.json'))
data_paths = pipeline.remove_id_paths(data_paths, similar_ids)

kag_paths = list(Path(kag_disk_path).glob('*.csv'))
jup_paths = list(Path(jup_disk_path).glob('*.csv'))
data_paths = data_paths + kag_paths + jup_paths

data_paths = pipeline.remove_id_paths(data_paths, outlier_ids)

# Directory listings come back in a filesystem-dependent order, so a seeded
# shuffle alone does not reproduce the same sequence on another machine or
# session. Sorting first gives the shuffle a fixed starting point, which keeps
# the shard boundaries below pointing at the same documents on every run.
data_paths = sorted(data_paths)
random.Random(random_seed).shuffle(data_paths)  # Shuffle data paths list

# 97 evenly spaced boundaries over the path list -> 96 contiguous shards.
all_dataset_splits = np.linspace(0, len(data_paths), num=97, dtype=np.int32)
print(f"Training with {all_dataset_splits}")

In [None]:
# Select Feature Creation Type and Transformer Model
# Token statistics noted by the author:
#   markdown cells: 112.53 tokens mean, 50.0 median; 11 cells per document median, 15 mean
#   code cells:      43.66 tokens mean, 15.0 median; 23 cells per document median, 30 mean

c1_trans_model = 'gbert'
c2_trans_model = 'gbert'
c3_trans_model = 'multi'

folder_name = 'gbert_gbert_multi_code_rep'
feature_creation_type = 'triple_code_rep_fc'

# Initialize Model Name and Tokenizer
# Each option maps a short key to [checkpoint name, tokenizer class].
trans_options = {
    'multi': ['Unbabel/xlm-roberta-comet-small', XLMRobertaTokenizer],
    'cbert': ['microsoft/codebert-base', RobertaTokenizer],
    'gbert': ['microsoft/graphcodebert-base', RobertaTokenizer],
    't5': ['Salesforce/codet5-base', RobertaTokenizer],
}

# Each feature-creation type maps to its [TFRecords writer, protobuf decoder, tensor parser].
fc_reqs = {
    'triple_code_rep_fc': [protobuf_handler.write_triple_set_to_tfrecords,
                           protobuf_handler.decode_triple_protobuf,
                           protobuf_handler.parse_triple_tensor_arrays],
    'dual_code_rep_fc': [protobuf_handler.write_dual_set_to_tfrecords,
                         protobuf_handler.decode_dual_protobuf,
                         protobuf_handler.parse_dual_tensor_arrays],
}

write_tensor_arrays_to_tfrecords, decode_protobuf, parse_tensor_arrays = fc_reqs[feature_creation_type]

# Resolve checkpoint name and tokenizer class for each of the three slots,
# then load the pretrained tokenizer.
c1_trans_name, c1_tokenizer_class = trans_options[c1_trans_model]
c1_tokenizer = c1_tokenizer_class.from_pretrained(c1_trans_name)

c2_trans_name, c2_tokenizer_class = trans_options[c2_trans_model]
c2_tokenizer = c2_tokenizer_class.from_pretrained(c2_trans_name)

c3_trans_name, c3_tokenizer_class = trans_options[c3_trans_model]
c3_tokenizer = c3_tokenizer_class.from_pretrained(c3_trans_name)

In [None]:
# Register the project's custom tokens with every tokenizer
custom_tokens = ['<d>', '<c>', '[DIVIDER]', '[EMPTY]', '[EMOJI]']

# The first two tokenizers receive the shared token list unchanged.
c1_tokenizer, c2_tokenizer = (
    pipeline.add_custom_tokens_to_tokenizer(tokenizer, custom_tokens)
    for tokenizer in (c1_tokenizer, c2_tokenizer)
)
# The third tokenizer additionally gets the newline character as a token.
c3_tokenizer = pipeline.add_custom_tokens_to_tokenizer(c3_tokenizer, [*custom_tokens, '\n'])

### Create One Dataset to Test

In [None]:
# Load one slice of documents into memory and preprocess the text
# (author's note: the preprocessing covers formatting and stemming)

start_idx, end_idx = 0, 3000

doc_ids, cell_metadata, unprocessed_text = pipeline.load_and_parse_data(data_paths, start_idx, end_idx)

# Flatten the values of every per-document metadata mapping into one list,
# keeping document order.
code_markdown_locs = []
for doc_cells in cell_metadata:
  code_markdown_locs.extend(doc_cells.values())

text = pipeline.preprocess_text(unprocessed_text, code_markdown_locs)

In [None]:
# Prepare Features and Labels
# Encode the same text once per tokenizer; the first two calls pass disable_print=True.
quiet = dict(disable_print=True)
c1_input_ids = pipeline.encode_text_for_input_ids(text, c1_tokenizer, **quiet)
c2_input_ids = pipeline.encode_text_for_input_ids(text, c2_tokenizer, **quiet)
c3_input_ids = pipeline.encode_text_for_input_ids(text, c3_tokenizer)

md_pct_ranks = dual_code_rep_fc.get_md_pct_ranks(doc_ids, cell_metadata, df_orders)
c1_groupings, c2_groupings, c3_groupings, labels = triple_code_rep_fc.collect_md_code_groupings(
    c1_input_ids,
    c2_input_ids,
    c3_input_ids,
    cell_metadata,
    md_pct_ranks,
)

# The first two representations use md_len=70, the third md_len=130.
c1_features = triple_code_rep_fc.create_features(c1_groupings, c1_tokenizer, md_len=70, **quiet)
c2_features = triple_code_rep_fc.create_features(c2_groupings, c2_tokenizer, md_len=70, **quiet)
c3_features = triple_code_rep_fc.create_features(c3_groupings, c3_tokenizer, md_len=130)

# Take the first two entries of each result, in c1, c2, c3 order.
features = (
    c1_features[0], c1_features[1],
    c2_features[0], c2_features[1],
    c3_features[0], c3_features[1],
)

In [None]:
# Write the sample features and labels to a TFRecords file named after the
# document index range it covers.
sample_filepath = f'/content/drive/MyDrive/Colab Notebooks/ML Projects/Google AI4Code/tf_data/{folder_name}/{start_idx}-{end_idx}.tfrecords'

# Make sure the per-experiment folder exists, so the first write for a new
# `folder_name` does not depend on the directory having been created by hand.
Path(sample_filepath).parent.mkdir(parents=True, exist_ok=True)

write_tensor_arrays_to_tfrecords(features, labels, sample_filepath)
print(f"Finished dataset for {start_idx}-{end_idx}")

In [None]:
# # Prepare Features and Labels
# c1_input_ids = pipeline.encode_text_for_input_ids(text, c1_tokenizer, disable_print=True)
# c2_input_ids = pipeline.encode_text_for_input_ids(text, c2_tokenizer)

# md_pct_ranks = dual_code_rep_fc.get_md_pct_ranks(doc_ids, cell_metadata, df_orders)
# c1_groupings, c2_groupings, labels = dual_code_rep_fc.collect_md_code_groupings(c1_input_ids,
#                                                                                 c2_input_ids,
#                                                                                 cell_metadata,
#                                                                                 md_pct_ranks)
# c1_features = dual_code_rep_fc.create_features(c1_groupings, c1_tokenizer, disable_print=True)
# c2_features = dual_code_rep_fc.create_features(c2_groupings, c2_tokenizer)
# features = tuple((c1_features[0], c1_features[1],
#                   c2_features[0], c2_features[1]))

### Double Check Created Features

In [None]:
# Take a look at the unparsed data
# The first four feature arrays go out in one call; sep='\n' reproduces the
# line break that separate print calls would emit between them.
print(f'Markdown II:\n{features[0]}\n',
      f'Markdown AM:\n{features[1]}\n',
      f'Code II:\n{features[2]}\n',
      f'Code AM:\n{features[3]}\n',
      sep='\n')

# Show the first ten labels, then the shape of the full label array.
print(f'Labels:\n{labels[:10]}\n')
print(f'Labels Shapes (Please Ensure 2D): {labels.shape}')

In [None]:
# Read the sample TFRecords file back and print one batch to check the round trip
auto = tf.data.experimental.AUTOTUNE
batch_size = 32
epochs = 2
print(f'Sample Filepath is {sample_filepath}')

# Build the input pipeline stage by stage:
# read -> decode -> parse -> shuffle -> repeat -> batch -> prefetch.
raw_records = tf.data.TFRecordDataset(sample_filepath, num_parallel_reads=auto)
decoded_records = raw_records.map(decode_protobuf, num_parallel_calls=auto)
parsed_records = decoded_records.map(parse_tensor_arrays, num_parallel_calls=auto)
shuffled_records = parsed_records.shuffle(len(labels)+1, seed=42)
sample_dataset = (shuffled_records
                  .repeat(epochs)
                  .batch(batch_size, drop_remainder=True, num_parallel_calls=auto)
                  .prefetch(auto))
print(sample_dataset)

# Only the first batch is inspected.
for batch_features, batch_labels in sample_dataset.take(1):
  md_ids = batch_features[0]
  md_mask = batch_features[1]

  print(f'\nParsed MD II:\n{md_ids}\n')
  print(f'\nFirst MD II:\n{md_ids[0]}\n')

  print(f'Parsed MD AM:\n{md_mask}\n')
  print(f'\nFirst MD AM:\n{md_mask[0]}\n')

  print(f'Parsed Code II:\n{batch_features[2]}\n')
  print(f'Parsed Code AM:\n{batch_features[3]}\n')

  print(f'Parsed Labels:\n{batch_labels}\n')
  print(f'\nFirst Label:\n{batch_labels[0]}\n')

# Remove sample file to avoid noise in final TFRecords folder
# Only a missing file is treated as "already deleted"; any other failure
# (permissions, an unmounted Drive, an interrupt) propagates and is not
# misreported as a successful cleanup.
try:
  os.remove(sample_filepath)
  print('Deleted sample file to avoid noise creation in final TFRecords folder.')
except FileNotFoundError:
  print('Sample file has already been deleted.')

### Create all TFRecord Datasets

In [None]:
# Create one TFRecords file per shard; a shard is the slice of `data_paths`
# between two consecutive entries of `all_dataset_splits`.
avg_samples_per_file = []  # Number of samples written to each file
jumpstart = 0  # Index of the first shard to build; raise it to skip earlier shards

# Pair every boundary with its successor: (b0, b1), (b1, b2), ...
shard_bounds = all_dataset_splits[jumpstart:]
for start_idx, end_idx in zip(shard_bounds[:-1], shard_bounds[1:]):
  filepath = f'/content/drive/MyDrive/Colab Notebooks/ML Projects/Google AI4Code/tf_data/{folder_name}/{start_idx}-{end_idx}.tfrecords'
  # Make sure the per-experiment output folder exists before writing into it.
  Path(filepath).parent.mkdir(parents=True, exist_ok=True)

  doc_ids, cell_metadata, text = pipeline.load_and_parse_data(data_paths, start_idx, end_idx)
  code_markdown_locs = [cell for cells in cell_metadata for cell in cells.values()]
  text = pipeline.preprocess_text(text, code_markdown_locs)

  # Prepare Features and Labels
  c1_input_ids = pipeline.encode_text_for_input_ids(text, c1_tokenizer, disable_print=True)
  c2_input_ids = pipeline.encode_text_for_input_ids(text, c2_tokenizer, disable_print=True)
  c3_input_ids = pipeline.encode_text_for_input_ids(text, c3_tokenizer)

  md_pct_ranks = dual_code_rep_fc.get_md_pct_ranks(doc_ids, cell_metadata, df_orders)
  c1_groupings, c2_groupings, c3_groupings, labels = triple_code_rep_fc.collect_md_code_groupings(c1_input_ids,
                                                                                                  c2_input_ids,
                                                                                                  c3_input_ids,
                                                                                                  cell_metadata,
                                                                                                  md_pct_ranks)
  c1_features = triple_code_rep_fc.create_features(c1_groupings, c1_tokenizer, md_len=70, disable_print=True)
  c2_features = triple_code_rep_fc.create_features(c2_groupings, c2_tokenizer, md_len=70, disable_print=True)
  c3_features = triple_code_rep_fc.create_features(c3_groupings, c3_tokenizer, md_len=130)
  features = (c1_features[0], c1_features[1],
              c2_features[0], c2_features[1],
              c3_features[0], c3_features[1])

  # Write Features to Protobufs
  avg_samples_per_file.append(len(features[0]))
  write_tensor_arrays_to_tfrecords(features, labels, filepath)
  print(f"Finished dataset for {start_idx}-{end_idx}\n")
print('All requested datasets have been recoreded as TFRecords!')

# Report the mean only when at least one shard was processed; an empty list
# (e.g. `jumpstart` past the last shard) would otherwise divide by zero.
if avg_samples_per_file:
  print(f'The average samples per file was {sum(avg_samples_per_file)/len(avg_samples_per_file):.2f}')
else:
  print('No shards were selected, so no TFRecords files were written.')

In [None]:
# Save Test IDS for Post Processing
# test_ids_path = '/content/drive/MyDrive/Colab Notebooks/ML Projects/Google AI4Code/data/test_ids.yaml'
# test_id_idxs = all_dataset_splits[-17:]

# test_ids = {}
# test_ids['ids'] = [test_id_idx.stem for test_id_idx in data_paths[test_id_idxs[0]:test_id_idxs[-1]]]

# with open(test_ids_path, 'r+') as stream:
#   try:
#     yaml.dump(test_ids, stream, default_flow_style=False)
#   except yaml.YAMLError as error:
#     print(error)
# print('Succesfully dumped test ids to yaml file')