In [None]:
%load_ext autoreload
%autoreload 2

%pip install transformers

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import numpy as np
import pandas as pd
import yaml

import seaborn as sns
import matplotlib.pyplot as plt

import tensorflow as tf
from tqdm.auto import tqdm

import os
from pathlib import Path

from transformers import DistilBertTokenizerFast, BartTokenizer, RobertaTokenizer

import sys
sys.path.append('/content/drive/MyDrive/Colab Notebooks/ML Projects/Google AI4Code/modules')
import pipeline

import re
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

print('Libraries Imported')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 4.4 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 26.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 57.9 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 12.5 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Un

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Libraries Imported


In [None]:
# Unzip files from drive to disk
data_path = '/content/drive/MyDrive/Colab Notebooks/ML Projects/Google AI4Code/data/AI4Code.zip'
disk_path = '/content'

pipeline.unzip_files(data_path, disk_path)

Unzipping files:   0%|          | 0/139263 [00:00<?, ?it/s]


 Done unzipping data to disk path.


In [None]:
# Load in excluded IDs and drop them from the data filepath
excluded_ids_path = '/content/drive/MyDrive/Colab Notebooks/ML Projects/Google AI4Code/data/excluded_ids.yaml'

# excluded_keys = ['outliers', 'similars', 'single_code_cell', 'less_than_five', 'less_than_eight']
# excluded_ids = pipeline.load_yaml_file(excluded_ids_path)['dupe_md']

# excluded_ids = {id
#                 for key, ids in excluded_ids.items()
#                 if key in excluded_keys
#                 for id in ids}

excluded_ids = {}

data_dir = Path(disk_path)
data_paths = list((data_dir / 'train').glob('*.json'))
data_paths = pipeline.remove_excluded_id_paths(data_paths, excluded_ids)

print(f"There are {len(data_paths)} files after removing excluded ids.")

There are 139256 files after removing excluded ids.


In [None]:
# Load all your dataframes in
def read_notebook(path):
    return (pd.read_json(path, dtype={'cell_type': 'category', 'source': 'str'})
              .assign(id=path.stem)
              .rename_axis('cell_id'))

notebooks_train = [read_notebook(path) for path in tqdm(data_paths, desc='Train NBs')]

df = (pd.concat(notebooks_train)
        .set_index('id', append=True)
        .swaplevel()
        .sort_index(level='id', sort_remaining=False))

df_orders = (pd.read_csv(data_dir / 'train_orders.csv',
                         index_col='id',
                         squeeze=True)
                          .str.split()
                          .drop(index=excluded_ids))

Train NBs:   0%|          | 0/139256 [00:00<?, ?it/s]

In [None]:
# Initalize Model Name and Tokenizer
bert_name = 'distilbert-base-uncased'
bart_name = 'facebook/bart-base'
roberta_name = 'microsoft/codebert-base'

bert_tokenizer_class = DistilBertTokenizerFast
bart_tokenizer_class = BartTokenizer
roberta_tokenizer_class = RobertaTokenizer

bert_tokenizer = bert_tokenizer_class.from_pretrained(bert_name, do_lower_case=True)
bart_tokenizer = bart_tokenizer_class.from_pretrained(bart_name, do_lower_case=False)
roberta_tokenizer = roberta_tokenizer_class.from_pretrained(roberta_name, do_lower_case=False)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.68k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/498 [00:00<?, ?B/s]

In [26]:
# Helper function for ordered analysis
def get_ordered_df(id, df, df_orders):
  ordered_id = pd.Series(df.loc[id, 'source'].loc[df_orders[id]].to_numpy(),
                         index=df.loc[id, 'cell_type'].loc[df_orders[id]])
  return ordered_id

# Helper function for token inspection
def get_tokens(text):
  roberta_tokens = roberta_tokenizer(text)
  return roberta_tokens

### Outlier Texts

In [7]:
# Collect and inspect both small and large outlier texts
high_len_markdowns = {}
low_len_markdowns = {}
no_markdowns_left = []

high_len_code_cells = {}
low_len_code_cells = {}
no_code_cells_left = []

max_code_thresh = 120
max_mark_thresh = 60
min_thresh = 4

## Statistics from Overview Outlier EDA ##
# The norm mean norm code cell length was 263.73 and the median was 128.0
# The norm mean  markdown cell length was 188.85 and the median was 67.0

# The norm longest code cell length was 3242 and the shortest code cell length was 1
# The norm longest markdown cell length was 8457 and the shortest markdown cell length was

for line, cell_type, doc_id in tqdm(zip(df.loc[:, 'source'],
                                        df.loc[:, 'cell_type'],
                                        df.index.get_level_values(0)),
                                    desc='Checking for Low Text Amounts', total=len(df)):
  if cell_type == 'markdown':
    if len(line) < min_thresh:
      if doc_id in low_len_markdowns:
        low_len_markdowns[doc_id].append(line)
      else:
        low_len_markdowns[doc_id] = [line]
      if len(low_len_markdowns[doc_id]) == sum(df.loc[doc_id, 'cell_type'] == 'markdown'):
        no_markdowns_left.append(doc_id)
    if len(line) >= max_mark_thresh:
      if doc_id in high_len_markdowns:
        high_len_markdowns[doc_id].append(line)
      else:
        high_len_markdowns[doc_id] = [line]

  elif cell_type == 'code':
    if len(line) < min_thresh:
      if doc_id in low_len_code_cells:
        low_len_code_cells[doc_id].append(line)
      else:
        low_len_code_cells[doc_id] = [line]
      if len(low_len_code_cells[doc_id]) == sum(df.loc[doc_id, 'cell_type'] == 'code'):
        no_code_cells_left.append(doc_id)
    if len(line) >= max_code_thresh:
      if doc_id in high_len_code_cells:
        high_len_code_cells[doc_id].append(line)
      else:
        high_len_code_cells[doc_id] = [line]

print(f'There are {len(no_markdowns_left)} documents with all flagged markdowns and {len(no_code_cells_left)} documents with all flagged code cells after preprocessing.')
print(no_markdowns_left[:10])  # These documents must be handled, impossible to tell where they go

Checking for Low Text Amounts:   0%|          | 0/6370646 [00:00<?, ?it/s]

There are 74 documents with all flagged markdowns and 0 documents with all flagged code cells after preprocessing.
['0032b483ebaf5a', '00ded9fd6fd29b', '00eee98a220c10', '01474fdc513442', '01f45e17a3d050', '064515cce2b452', '0b3b0242291cba', '19750de05eb55f', '1c215796e751ba', '1c51d2a44ac5df']


In [8]:
# Ordering the common low len cells by frequencies of texts
common_low_len_markdowns = {}
common_low_len_code_cells = {}

for outlier_texts in low_len_markdowns.values():
  for cell in outlier_texts:
    if cell in common_low_len_markdowns:
      common_low_len_markdowns[cell] += 1
    else:
      common_low_len_markdowns[cell] = 1

for outlier_texts in low_len_code_cells.values():
  for cell in outlier_texts:
    if cell in common_low_len_code_cells:
      common_low_len_code_cells[cell] += 1
    else:
      common_low_len_code_cells[cell] = 1

# Sorting them
sorted_low_common_markdowns = sorted(common_low_len_markdowns.items(), key=lambda x:x[1], reverse=True)
sorted_low_common_code_cells = sorted(common_low_len_code_cells.items(), key=lambda x:x[1], reverse=True)

print(sorted_low_common_markdowns[:20])
print(sorted_low_common_code_cells[:20])

[('---', 2151), ('***', 563), ('___', 198), ('# ', 157), ('## ', 85), ('> ', 75), ('EDA', 61), ('Age', 50), ('KNN', 41), ('.', 35), ('And', 34), ('SVM', 30), ('#  ', 27), ('#', 19), ('>', 18), ('PCA', 18), ('...', 17), ('SVC', 17), ('Sex', 15), ('##', 14)]
[('df', 3251), ('X', 1003), ('y', 879), ('x', 467), ('!ls', 370), ('df1', 314), ('sub', 224), ('a', 224), ('df2', 188), ('ls', 129), ('cm', 123), ('Y', 121), ('pwd', 107), ('arr', 83), ('m', 73), ('b', 72), ('df3', 54), ('d', 53), ('c', 53), ('res', 49)]


In [9]:
code_markdown_locs = df['cell_type'].to_numpy()
markdown_locs = np.where(code_markdown_locs=='markdown')[0]
md_text = df.iloc[markdown_locs]['source'].to_numpy()

In [10]:
# no_preprocessing_text = [markdown
#                         for markdowns in list(high_len_markdowns.values())[2000:3000]
#                         for markdown in markdowns]
# preprocessed_text = [pipeline.markdown_preprocessing(markdown, lower_case=False, emojis=False)
#                     for markdowns in list(high_len_markdowns.values())[2000:3000]
#                     for markdown in markdowns]

no_preprocessing_text = md_text[0:5000]
preprocessed_text = [pipeline.markdown_preprocessing(markdown)
                    for markdown in md_text[0:5000]]

### Looking into Code Structure and Duplicates

In [11]:
doc_ids = df.index.get_level_values(0).unique()

dupe_ids = []
avg_loc = []

for doc_id in tqdm(doc_ids, desc='Collecting Dupe Ids and Avg Loc'):
  current_doc = df.loc[doc_id]
  ordered_cell_ids = df_orders.loc[doc_id]

  code_loc = 0
  seen_mds = {}
  dupe_mds = []

  for cell_id in ordered_cell_ids:
    if current_doc.loc[cell_id, 'cell_type'] == 'code':
      code_loc += 1
    else:
      md_text = tuple(pipeline.markdown_preprocessing(current_doc.loc[cell_id, 'source']))
      if md_text not in seen_mds:
        seen_mds[md_text] = code_loc
      else:
        dupe_mds.extend([seen_mds[md_text], code_loc])
  
  if dupe_mds:
    dupe_mds = np.asarray(list(set(dupe_mds))) / code_loc
    dupe_ids.append(doc_id)

    total_dupe_amount = len(dupe_mds)
    avg_loc.append(np.sum(dupe_mds)/total_dupe_amount)

avg_loc = np.asarray(avg_loc, dtype=np.float32)
print(avg_loc.mean())
print(len(dupe_ids))  # Current is at 8000, I think you got rid of the wrong one
# When you do this you have to make sure you are re-writing the right one

Collecting Dupe Ids and Avg Loc:   0%|          | 0/139256 [00:00<?, ?it/s]

0.54288805
0


In [None]:
# Right now there is an insane amount of dupes even without preprocessing?
# TBH though when you create a dupe, there is not much data to scrap from it unfortuneatly
# Check unprocessed dupes, how many times [empty], [divider], [emoji]
# You have to consider they may eb the lowest successss rate markdowns

# Average location when there are many emptys
# Average location of markdown in low code cell environments with both high and low markdown concentrations
# Can we check markdown pairings, what do they look like? Do they have high cosine similarity?
# How well related are they?
# How often do two appear together for markdowns? For code cells?
# Cosine similarity

### Markdown Pairings and Groupings

### Create Option to Remove All Duplicate IDs

In [None]:
# doc_ids = df.index.get_level_values(0).unique()
# dupe_ids = []

# for doc_id in tqdm(doc_ids, desc='Collecting All Dupe IDs'):
#   seen_markdowns = []
#   doc = df.loc[doc_id]
#   markdown_texts = doc[doc['cell_type'] == 'markdown'].loc[:, 'source'].to_numpy()
#   for markdown_text in markdown_texts:
#     markdown_text = pipeline.markdown_preprocessing(markdown_text, lower_case=False, emojis=False)
#     markdown_tokens = tuple(bert_tokenizer(markdown_text,
#                                            padding=False,
#                                            truncation=False,
#                                            add_special_tokens=False,
#                                            return_attention_mask=False)['input_ids'])
#     if markdown_tokens in seen_markdowns:
#       dupe_ids.append(doc_id)
#       break
#     else:
#       seen_markdowns.append(markdown_tokens)

In [13]:
# excluded_ids_path = '/content/drive/MyDrive/Colab Notebooks/ML Projects/Google AI4Code/data/excluded_ids.yaml'

# with open(excluded_ids_path, 'r+') as stream:
#   try:
#     excluded_ids = yaml.safe_load(stream)
#     excluded_ids['dupes'] = dupe_ids
#     yaml.dump(excluded_ids, stream, default_flow_style=False)
#   except yaml.YAMLError as error:
#     print(error)

### Checking for Custom Tokens

In [46]:
# Checks for opportunities to add special tokens to help the BERT or BART understand 

def get_word_freq(text):
  word_freq = {}
  for line in text:
    for word in line.split():
      if word in word_freq:
        word_freq[word] += 1
      else:
        word_freq[word] = 0
  return word_freq

def get_tokens(text):
  roberta_tokens = roberta_tokenizer(text)
  return roberta_tokens

def check_for_broken_tokens(word_freq):
  broken_tokens = {}

  for rank, word in enumerate(word_freq):
    roberta_tokens = get_tokens(word)['input_ids']
    if len(roberta_tokens) > 3:
      broken_tokens[word] = rank
  return broken_tokens

In [55]:
# Run above functions
code_broken_tokens = []
markdown_broken_tokens = []
all_dataset_splits = list(range(0, len(data_paths), round(len(data_paths)/11)))

for i in range(len(all_dataset_splits[:-1])):
  start_idx = all_dataset_splits[i]
  end_idx = all_dataset_splits[i+1]

  doc_ids, cell_metadata, text = pipeline.load_and_parse_data(data_paths, start_idx, end_idx)
  code_markdown_locs = [cell for cells in cell_metadata for cell in cells.values()]

  # Code has no preprocessing yet
  code_mask = np.asarray(code_markdown_locs) == 'code'
  code_text = np.asarray(text, dtype='object')[code_mask]
  code_word_freq = get_word_freq(code_text)
  sorted_code_word_freq = [word for word, freq in sorted(code_word_freq.items(), key=lambda x: x[1], reverse=True)]
  code_broken_tokens.append(check_for_broken_tokens(sorted_code_word_freq[:100]))

  markdown_mask = np.asarray(code_markdown_locs) == 'markdown'
  markdown_text = np.asarray(text, dtype='object')[markdown_mask]
  markdown_text = [pipeline.markdown_preprocessing(text) for text in markdown_text]
  markdown_word_freq = get_word_freq(markdown_text)
  sorted_markdown_word_freq = [word for word, freq in sorted(markdown_word_freq.items(), key=lambda x: x[1], reverse=True)]
  markdown_broken_tokens.append(check_for_broken_tokens(sorted_markdown_word_freq[:100]))
  print(f"Finished finding broken tokens for indices: {start_idx}-{end_idx}")         

Loading Json Files:   0%|          | 0/12660 [00:00<?, ?it/s]

Finished finding broken tokens for indices: 0-12660


Loading Json Files:   0%|          | 0/12660 [00:00<?, ?it/s]

Finished finding broken tokens for indices: 12660-25320


Loading Json Files:   0%|          | 0/12660 [00:00<?, ?it/s]

Finished finding broken tokens for indices: 25320-37980


Loading Json Files:   0%|          | 0/12660 [00:00<?, ?it/s]

Finished finding broken tokens for indices: 37980-50640


Loading Json Files:   0%|          | 0/12660 [00:00<?, ?it/s]

Finished finding broken tokens for indices: 50640-63300


Loading Json Files:   0%|          | 0/12660 [00:00<?, ?it/s]

Finished finding broken tokens for indices: 63300-75960


Loading Json Files:   0%|          | 0/12660 [00:00<?, ?it/s]

Finished finding broken tokens for indices: 75960-88620


Loading Json Files:   0%|          | 0/12660 [00:00<?, ?it/s]

Finished finding broken tokens for indices: 88620-101280


Loading Json Files:   0%|          | 0/12660 [00:00<?, ?it/s]

Finished finding broken tokens for indices: 101280-113940


Loading Json Files:   0%|          | 0/12660 [00:00<?, ?it/s]

Finished finding broken tokens for indices: 113940-126600


In [56]:
# Collect the top frequent words that had to be split into two or more tokens
unique_code_broken_tokens = {word
                             for section in code_broken_tokens
                             for word in section.keys()}
unique_markdown_broken_tokens = {word
                                 for section in markdown_broken_tokens
                                 for word in section.keys()}             