In [None]:
%load_ext autoreload
%autoreload 2

!pip install transformers
!pip install sentencepiece

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import random
import math
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
# Turn on XLA just-in-time compilation in the TensorFlow optimizer.
tf.config.optimizer.set_jit(True)

from tqdm.auto import tqdm
from transformers import AdamWeightDecay
from transformers import XLMRobertaTokenizer, RobertaTokenizer, T5Tokenizer

import os
from pathlib import Path

import sys
# The project modules are stored on the mounted Drive; add their folder to the
# import path so the plain `import` on the next line can find them.
sys.path.append('/content/drive/MyDrive/Colab Notebooks/ML Projects/Google AI4Code/modules')
import pipeline, post_processing, tfmodels, dual_code_rep_fc, triple_code_rep_fc

print('Libraries Imported')

In [None]:
# Check GPU Model and Status
# A100 > V100 > T4 > P100
pipeline.check_for_gpu_status()
gpu_strategy = tf.distribute.MirroredStrategy()

!nvidia-smi
!nvidia-smi --query-gpu=gpu_name, driver_version, memory.total --format=csv

In [None]:
# Transformer keys for the three inputs (c1, c2, c3); each key selects an entry
# of the tokenizer options table.
c1_trans_model, c2_trans_model, c3_trans_model = 'gbert', 'gbert', 'multi'

# Folder name of the saved model to evaluate.
model_name = 'gbert_gbert_multi_code_rep'

# Inference settings.
params = dict(batch_size=64)

In [None]:
# Initialize Model Name and Tokenizer with Custom Tokens
trans_options = {
    'multi': ['Unbabel/xlm-roberta-comet-small', XLMRobertaTokenizer],
    'cbert': ['microsoft/codebert-base', RobertaTokenizer],
    'gbert': ['microsoft/graphcodebert-base', RobertaTokenizer],
    't5': ['Salesforce/codet5-base', RobertaTokenizer],
}
custom_tokens = ['<d>', '<c>', '[DIVIDER]', '[EMPTY]', '[EMOJI]']


def _build_tokenizer(trans_key, extra_tokens):
    """Load the tokenizer registered under `trans_key` and add `extra_tokens` to it.

    Args:
        trans_key: key into `trans_options`.
        extra_tokens: list of tokens handed to
            `pipeline.add_custom_tokens_to_tokenizer`.

    Returns:
        Tuple `(options, checkpoint_name, tokenizer_class, tokenizer)` where
        `options` is the `trans_options` entry itself.
    """
    options = trans_options[trans_key]
    checkpoint_name, tokenizer_class = options[0], options[1]
    tokenizer = tokenizer_class.from_pretrained(checkpoint_name)
    tokenizer = pipeline.add_custom_tokens_to_tokenizer(tokenizer, extra_tokens)
    return options, checkpoint_name, tokenizer_class, tokenizer


# C1 Model
c1_options, c1_trans_name, c1_tokenizer_class, c1_tokenizer = _build_tokenizer(
    c1_trans_model, custom_tokens)

# C2 Model
c2_options, c2_trans_name, c2_tokenizer_class, c2_tokenizer = _build_tokenizer(
    c2_trans_model, custom_tokens)

# C3 Model (this tokenizer additionally gets '\n' as a custom token)
c3_options, c3_trans_name, c3_tokenizer_class, c3_tokenizer = _build_tokenizer(
    c3_trans_model, custom_tokens + ['\n'])

In [None]:
# Extract the zipped dataset stored on Drive onto the local Colab disk.
disk_path = '/content/AI4Code'    # destination on the local disk
data_path = '/content/drive/MyDrive/Colab Notebooks/ML Projects/Google AI4Code/data/AI4Code.zip'

pipeline.unzip_files(data_path, disk_path)

In [None]:
# Restore the saved Keras model for inference (it is loaded uncompiled).
model_path = f'/content/drive/MyDrive/Colab Notebooks/ML Projects/Google AI4Code/saved_models/{model_name}'

with gpu_strategy.scope():
  # Custom classes passed to Keras as `custom_objects` for deserialization.
  lr_scheduler = dict(AdamWeightDecay=AdamWeightDecay,
                      WarmupCosineDecayRestarts=tfmodels.WarmupCosineDecay,
                      LRTracker=tfmodels.LRTracker)
  load_options = tf.saved_model.LoadOptions(experimental_io_device='/job:localhost')

  model = tf.keras.models.load_model(
      model_path,
      compile=False,
      options=load_options,
      custom_objects=lr_scheduler,
  )
  print('Loaded in model')

In [None]:
# Notebook ids to evaluate, read from the 'ids' entry of the YAML file.
test_ids_path = '/content/drive/MyDrive/Colab Notebooks/ML Projects/Google AI4Code/data/test_ids.yaml'
test_ids = pipeline.load_yaml_file(test_ids_path)['ids']

# Every id maps to <disk_path>/train/<id>.json.
data_dir = Path(disk_path)
train_dir = data_dir / 'train'
test_paths = [train_dir / (test_id + '.json') for test_id in test_ids]

print(f"There are {len(test_paths)} files after removing outliers and similar files.")

In [None]:
# We should just load it all in to see what it all looks like
def read_notebook(path):
    """Read one notebook JSON file into a DataFrame indexed by cell id.

    Args:
        path: `pathlib.Path` of the notebook file; its stem becomes the
            value of the added `id` column.

    Returns:
        DataFrame with the file's columns (`cell_type` read as category,
        `source` as str) plus `id`, with the index named `cell_id`.
    """
    column_dtypes = {'cell_type': 'category', 'source': 'str'}
    notebook = pd.read_json(path, dtype=column_dtypes)
    notebook = notebook.assign(id=path.stem)
    return notebook.rename_axis('cell_id')
    
# Read every selected notebook and stack them into one frame indexed by
# (id, cell_id); rows keep their per-notebook order within each id.
notebooks = [read_notebook(path) for path in tqdm(test_paths, desc='Train NBs')]
df = (pd.concat(notebooks)
        .set_index('id', append=True)
        .swaplevel()
        .sort_index(level='id', sort_remaining=False))

# Cell order per notebook id, split into a list of cell ids.
# `read_csv(squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0;
# squeezing the one-column frame afterwards gives the same Series on every version.
df_orders = (pd.read_csv(data_dir / 'train_orders.csv', index_col='id')
               .squeeze('columns')
               .str.split())

In [None]:
# Slice of the test ids to evaluate in this run.
first_doc, last_doc = 10000, 18000
doc_ids = test_ids[first_doc:last_doc]

## Assemble Predicted Order

### Markdown Code Rep

In [None]:
# Per-document bookkeeping read by the prediction and order-assembly cells.
pred_map = {}      # doc_id -> 1 if the document added rows to the dataset, otherwise 0
dupes = {}         # doc_id -> md_dupes reported for the document (stored only when truthy)
dataset_len = 0    # total number of groupings collected across all documents

# One pair of lists per tokenizer: features[0] goes to *_ii and features[1] to *_am
# (presumably input ids and attention masks -- confirm against
# triple_code_rep_fc.create_features).
c1_ii = []
c1_am = []

c2_ii = []
c2_am = []

c3_ii = []
c3_am = []

for doc_id in tqdm(doc_ids, desc='Preparing Mardown Code Rep Features'):
  current_doc = df.loc[doc_id]
  code_markdown_locs, text = current_doc['cell_type'], current_doc['source']
  cell_metadata = [code_markdown_locs.to_dict()]

  # The markdown/code counts are not needed here; the assembly step computes
  # them itself, so the two value_counts() passes per document were dropped.
  text = pipeline.preprocess_text(text, code_markdown_locs, disable_print=True)
  c1_input_ids = pipeline.encode_text_for_input_ids(text, c1_tokenizer, disable_print=True)
  c2_input_ids = pipeline.encode_text_for_input_ids(text, c2_tokenizer, disable_print=True)
  c3_input_ids = pipeline.encode_text_for_input_ids(text, c3_tokenizer, disable_print=True)
  c1_groupings, c2_groupings, c3_groupings, md_dupes = triple_code_rep_fc.collect_best_code_groupings(
      c1_input_ids, c2_input_ids, c3_input_ids, cell_metadata)

  if md_dupes:
    dupes[doc_id] = md_dupes

  # A document without groupings contributes no rows, so skip building
  # features that would only be thrown away.
  if len(c1_groupings) == 0:
    pred_map[doc_id] = 0
    continue

  c1_features = triple_code_rep_fc.create_features(c1_groupings, c1_tokenizer, disable_print=True)
  c2_features = triple_code_rep_fc.create_features(c2_groupings, c2_tokenizer, disable_print=True)
  c3_features = triple_code_rep_fc.create_features(c3_groupings, c3_tokenizer, disable_print=True)

  pred_map[doc_id] = 1

  c1_ii.append(c1_features[0])
  c1_am.append(c1_features[1])

  c2_ii.append(c2_features[0])
  c2_am.append(c2_features[1])

  c3_ii.append(c3_features[0])
  c3_am.append(c3_features[1])
  dataset_len += len(c1_groupings)

In [None]:
# MD Code Rep Predictions
# Stack the per-document feature arrays of each input into one int32 array,
# keeping the order c1 ids/mask, c2 ids/mask, c3 ids/mask.
feature_lists = (c1_ii, c1_am, c2_ii, c2_am, c3_ii, c3_am)
stacked_features = tuple(np.concatenate(chunks, axis=0, dtype=np.int32)
                         for chunks in feature_lists)

base_features = tf.data.Dataset.from_tensor_slices(stacked_features)
# The dataset is zipped with all-zero labels, one row per feature row.
dummy_labels = tf.data.Dataset.from_tensor_slices(np.zeros((dataset_len, 1), dtype=np.float32))
dataset = tf.data.Dataset.zip((base_features, dummy_labels)).batch(params['batch_size'])

pred_steps = math.ceil(dataset_len / params['batch_size'])
preds = model.predict(dataset, steps=pred_steps, verbose=1)

In [None]:
# Assemble Doc Order
pred_start = 0   # cursor into `preds`; advances by the rows each document contributed
y_preds = {}     # doc_id -> predicted cell order (list of cell ids)

for doc_id in tqdm(doc_ids, desc='Assembling Doc Order'):
  code_markdown_locs = df.loc[doc_id]['cell_type']
  pred_count_per_md = pred_map[doc_id]   # 0 when the document added no rows to the dataset
  md_dupes = dupes.get(doc_id, [])

  cell_type_counts = code_markdown_locs.value_counts()
  code_count = cell_type_counts['code']
  md_count = cell_type_counts['markdown']

  # Code cells keep their given order: ranks 1..code_count.
  code_pct_ranks = list(np.arange(1, code_count+1, dtype=np.float32))
  md_pct_ranks = []

  # Only documents that contributed rows may consume predictions. Without the
  # pred_count_per_md factor a document with pred_map == 0 would still advance the
  # cursor and shift the predictions of every later document.
  pred_end = pred_start + (md_count - len(md_dupes)) * pred_count_per_md
  doc_mc_preds = preds[pred_start:pred_end]
  mini_pred_start = 0
  default_dupe_loc = 0.54288805 * (code_count-1)  # Avg dupe location is at 0.54288805

  for i in range(md_count):
    if i in md_dupes:
      md_pct_ranks.append(default_dupe_loc)
    else:
      rel_pred_rank = doc_mc_preds[mini_pred_start:mini_pred_start+1]
      # Rescale the prediction onto the code-rank axis
      # (presumably the model outputs values in [-1, 1] -- TODO confirm).
      rank = (rel_pred_rank + 1)/2 * (code_count+1)
      mini_pred_start += 1
      md_pct_ranks.append(rank)

  metadata_df = code_markdown_locs.to_frame()
  # NOTE(review): the concatenation assumes all code cells precede the markdown
  # cells in `df` for this document -- confirm against the data layout.
  pct_ranks = code_pct_ranks + md_pct_ranks
  metadata_df['pct_rank'] = pct_ranks

  pred_order = metadata_df.sort_values("pct_rank").index.tolist()
  y_preds[doc_id] = pred_order
  pred_start = pred_end

In [None]:
# Score the predicted orders against the ground-truth orders of the same documents.
ground_truth = df_orders.loc[doc_ids]
y_preds = pd.Series(y_preds)

kendall_tau_score = post_processing.calculate_kendall_tau(ground_truth, y_preds)
print(f"The Kendall Tau Scores for the documents was {kendall_tau_score:.4f}.")

In [None]:
# Last one was at 86.39 kendall tau same as before

### Kendall Tau Post Analysis

In [None]:
# Score every document on its own (one ground-truth/prediction pair per call)
# so the best and worst documents can be looked up afterwards.
kendall_taus = np.asarray([
    post_processing.calculate_kendall_tau([gt], [y_pred])
    for gt, y_pred in zip(ground_truth, y_preds)
])
print(f'The mean score was {kendall_taus.mean():.4f} and the median score was {np.median(kendall_taus):.4f}.')

In [None]:
# Histogram of the per-document scores.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(kendall_taus, ax=ax)
ax.set_title('Kendall Tau Score Distribution');

In [None]:
# Collect Worst and Highest Ids
# Rank documents by position with argsort. Looking scores up by value
# (np.where(kendall_taus == score)) returns every document tied at that score,
# once per tie, which is why the outputs used to repeat the same doc ids.
top_n = 1000
ascending_idx = np.argsort(kendall_taus, kind='stable')
lowest_idx = ascending_idx[:top_n]
highest_idx = ascending_idx[::-1][:top_n]

highest_kt = kendall_taus[highest_idx]
lowest_kt = kendall_taus[lowest_idx]
print(f"Highest Scores are {highest_kt[:5]}")
print(f"\nLowest Scores are {lowest_kt[:5]}")

# Convert to an array so the index arrays above can select ids directly.
doc_ids = np.asarray(doc_ids)
highest_doc_id = doc_ids[highest_idx]
lowest_doc_id = doc_ids[lowest_idx]

In [None]:
# Pick one of the lowest-scoring documents and show it as arranged by
# post_processing.get_ordered_df from `df_orders`.
worst_position = 10
selected_id = lowest_doc_id[worst_position]
ordered_df = post_processing.get_ordered_df(selected_id, df, df_orders)
display(ordered_df)

In [None]:
# Show the same document with its cells arranged in the predicted order.
predicted_order = y_preds.loc[selected_id]
predicted_df = df.loc[selected_id].loc[predicted_order]
# Display the frame built above instead of recomputing the identical selection.
display(predicted_df)

In [None]:
# Run the text preprocessing on the predicted-order cells with printing enabled.
predicted_cells = df.loc[selected_id].loc[predicted_order]
pipeline.preprocess_text(predicted_cells['source'],
                         predicted_cells['cell_type'],
                         disable_print=False)

### Check Layer Weights

In [None]:
def plot_layer_weights(layer_weights, cmap):
  """Draw a heatmap of `layer_weights`, titled with its minimum and maximum.

  Args:
    layer_weights: 2D array-like given to `sns.heatmap` as `data`.
    cmap: colormap forwarded to `sns.heatmap`.

  Returns:
    None. The heatmap is drawn on a newly created 20x8 figure.
  """
  layer_min, layer_max = np.amin(layer_weights), np.amax(layer_weights)
  # No `ax` is passed to heatmap, so it draws on the current axes, i.e. the
  # ones created on the next line.
  plt.subplots(figsize=(20,8))
  heatmap_ax = sns.heatmap(data=layer_weights, cmap=cmap)
  heatmap_ax.set(title=f'Min and Max at [{layer_min:.7f}, {layer_max:.7f}]')

# Colormap names available for the weight plots.
cmaps = ['Blues', 'viridis', 'Spectral', 'rocket', 'mako']

In [None]:
# CLS
# Fetch the weights once: get_weights() returns the layer's weights as NumPy
# arrays on every call, so calling it inside the loop repeated that work 1536 times.
md_layer_weights = model.layers[2].get_weights()[2]
md_mean_weights = [md_layer_weights[i].mean() for i in range(1536)]

md_mean_weights = np.asarray(md_mean_weights, dtype=np.float32).reshape(1, 1536)
plot_layer_weights(md_mean_weights, cmap='Blues')
print(f"The markdown transformer mean weights are {md_mean_weights.mean():.7f} and median at {np.median(md_mean_weights):.7f}.\n")

In [None]:
# DOC
# Fetch the weights once: get_weights() returns the layer's weights as NumPy
# arrays on every call, so calling it inside the loop repeated that work 1536 times.
code_layer_weights = model.layers[3].get_weights()[2]
code_mean_weights = [code_layer_weights[i].mean() for i in range(1536)]

code_mean_weights = np.asarray(code_mean_weights, dtype=np.float32).reshape(1, 1536)
plot_layer_weights(code_mean_weights, cmap='Blues')
print(f"The quadrant transformer mean weights are {code_mean_weights.mean():.7f} and median at {np.median(code_mean_weights):.7f}.\n")

### Kaggle Check

In [None]:
# def read_notebook(path):
#     return (pd.read_json(path, dtype={'cell_type': 'category', 'source': 'str'})
#               .assign(id=path.stem)
#               .rename_axis('cell_id'))

# def create_df(notebooks):
#     df = (pd.concat(notebooks)
#             .set_index('id', append=True)
#             .swaplevel()
#             .sort_index(level='id', sort_remaining=False))
#     return df

In [None]:
# Create Features and Split Test Dataset Into Four
# Each quarter of the documents is featurized, predicted and assembled on its own.

y_preds = {}
file_amount = len(doc_ids)
path_splits = [0, int(file_amount*0.25), int(file_amount*0.5), int(file_amount*0.75), file_amount]

for split in range(4):
    start_idx = path_splits[split]
    end_idx = path_splits[split+1]
    current_paths = doc_ids[start_idx:end_idx]

    # With fewer than four documents some quarters are empty; nothing to predict.
    if len(current_paths) == 0:
        continue

    pred_map = {}      # doc_id -> 1 if the document added rows to the dataset, otherwise 0
    dupes = {}         # doc_id -> md_dupes reported for the document (stored only when truthy)
    pred_start = 0     # cursor into this split's `preds`
    dataset_len = 0    # number of feature rows in this split

    c1_ii = []
    c1_am = []

    c2_ii = []
    c2_am = []

    c3_ii = []
    c3_am = []

    for doc_id in tqdm(current_paths, desc='Preparing Mardown Code Rep Features'):
        current_doc = df.loc[doc_id]
        code_markdown_locs, text = current_doc['cell_type'], current_doc['source']
        cell_metadata = [code_markdown_locs.to_dict()]

        # The markdown/code counts are computed in the assembly loop where they
        # are used; they were dead values here.
        text = pipeline.preprocess_text(text, code_markdown_locs, disable_print=True)
        c1_input_ids = pipeline.encode_text_for_input_ids(text, c1_tokenizer, disable_print=True)
        c2_input_ids = pipeline.encode_text_for_input_ids(text, c2_tokenizer, disable_print=True)
        c3_input_ids = pipeline.encode_text_for_input_ids(text, c3_tokenizer, disable_print=True)
        c1_groupings, c2_groupings, c3_groupings, md_dupes = triple_code_rep_fc.collect_best_code_groupings(
            c1_input_ids, c2_input_ids, c3_input_ids, cell_metadata)

        if md_dupes:
            dupes[doc_id] = md_dupes

        # A document without groupings contributes no rows, so skip building
        # features that would only be thrown away.
        if len(c1_groupings) == 0:
            pred_map[doc_id] = 0
            continue

        c1_features = triple_code_rep_fc.create_features(c1_groupings, c1_tokenizer, disable_print=True)
        c2_features = triple_code_rep_fc.create_features(c2_groupings, c2_tokenizer, disable_print=True)
        c3_features = triple_code_rep_fc.create_features(c3_groupings, c3_tokenizer, disable_print=True)

        pred_map[doc_id] = 1

        c1_ii.append(c1_features[0])
        c1_am.append(c1_features[1])

        c2_ii.append(c2_features[0])
        c2_am.append(c2_features[1])

        c3_ii.append(c3_features[0])
        c3_am.append(c3_features[1])
        dataset_len += len(c1_groupings)

    # MC Rep Predictions
    if c1_ii:
        base_features = tf.data.Dataset.from_tensor_slices((np.concatenate(c1_ii, axis=0, dtype=np.int32),
                                                            np.concatenate(c1_am, axis=0, dtype=np.int32),
                                                            np.concatenate(c2_ii, axis=0, dtype=np.int32),
                                                            np.concatenate(c2_am, axis=0, dtype=np.int32),
                                                            np.concatenate(c3_ii, axis=0, dtype=np.int32),
                                                            np.concatenate(c3_am, axis=0, dtype=np.int32)))
        dummy_labels = tf.data.Dataset.from_tensor_slices(np.zeros((dataset_len, 1), dtype=np.float32))
        dataset = (tf.data.Dataset.zip((base_features, dummy_labels))
                    .batch(params['batch_size']))

        pred_steps = math.ceil(dataset_len / params['batch_size'])
        preds = model.predict(dataset, steps=pred_steps, verbose=1)
    else:
        # No document in this split produced feature rows; np.concatenate would
        # raise on the empty lists, and there is nothing to feed the model.
        preds = np.zeros((0, 1), dtype=np.float32)

    # Assemble Doc Order
    for doc_id in tqdm(current_paths, desc='Assembling Doc Order'):
        code_markdown_locs = df.loc[doc_id]['cell_type']
        pred_count_per_md = pred_map[doc_id]   # 0 when the document added no rows
        md_dupes = dupes.get(doc_id, [])

        cell_type_counts = code_markdown_locs.value_counts()
        code_count = cell_type_counts['code']
        md_count = cell_type_counts['markdown']

        # Slice of `preds` belonging to this document.
        pred_end = pred_start + ((md_count-len(md_dupes))*pred_count_per_md)
        doc_mc_preds = preds[pred_start:pred_end]

        # Code cells keep their given order: ranks 1..code_count.
        code_pct_ranks = list(np.arange(1, code_count+1, dtype=np.float32))

        md_pct_ranks = []
        mini_pred_start = 0
        default_dupe_loc = 0.54263765 * (code_count - 1) # Avg dupe location is at 0.54263765

        for i in range(md_count):
            if i in md_dupes:
                md_pct_ranks.append(default_dupe_loc)
            else:
                rel_pred_rank = doc_mc_preds[mini_pred_start:mini_pred_start+1]
                # Rescale the prediction onto the code-rank axis
                # (presumably the model outputs values in [-1, 1] -- TODO confirm).
                rank = (rel_pred_rank + 1)/2 * (code_count+1)
                mini_pred_start += 1
                md_pct_ranks.append(rank)

        metadata_df = code_markdown_locs.to_frame()
        # NOTE(review): the concatenation assumes all code cells precede the
        # markdown cells in `df` for this document -- confirm against the data layout.
        pct_ranks = code_pct_ranks + md_pct_ranks
        metadata_df['pct_rank'] = pct_ranks

        pred_order = metadata_df.sort_values("pct_rank").index.tolist()
        # For a submission file the order would be stored as ' '.join(pred_order).
        y_preds[doc_id] = pred_order
        pred_start = pred_end

In [None]:
# Score the split-wise predicted orders against the ground-truth orders.
ground_truth = df_orders.loc[doc_ids]
y_preds = pd.Series(y_preds)

kendall_tau_score = post_processing.calculate_kendall_tau(ground_truth, y_preds)
print(f"The Kendall Tau Scores for the documents was {kendall_tau_score:.4f}.")

In [None]:
#  Here 0.8639