In [None]:
# Mount Google Drive at /content/drive: the id list read below and the CSVs
# written by the cleaning loop all live under /content/drive/MyDrive.
# force_remount=True is passed so re-running this cell refreshes an existing mount.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import re
import json
import requests
from pathlib import Path

import yaml
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import seaborn as sns
import matplotlib.pyplot as plt

Mounted at /content/drive


In [None]:
# Read the YAML file holding the cleared Python notebook ids, then wrap each id
# in a Path; these are later interpolated into the download URL.
python_ids_path = '/content/drive/MyDrive/Colab Notebooks/ML Projects/Google AI4Code/data/jup_python_ids/extra_ids_240000_500000.yaml'

with open(python_ids_path) as stream:
  python_ids = yaml.safe_load(stream)

python_paths = list(map(Path, python_ids))

### Clean the Data and Filter Out Notebooks

In [None]:
def convert_list_to_text(text):
  """Return a notebook cell's source as a single string.

  Notebook JSON may store a multi-line field either as one string or as a
  list of line strings that already carry their own line endings.

  Args:
    text: A string, or an iterable of strings (one entry per line).

  Returns:
    `text` unchanged when it is already a string; otherwise all pieces
    concatenated with no separator.
  """
  if isinstance(text, str):
    return text
  # Join with '' rather than ' ': each line already ends in '\n', so a space
  # separator would prefix every later line with a stray blank and silently
  # change the indentation of the stored code.
  return "".join(text)

def create_ids(length, count, prefix):
  """Build `length` consecutive ids of the form prefix + number.

  Numbering starts at `count` and stops before `count + length`.
  """
  first = count
  stop = count + length
  ids = []
  for number in range(first, stop):
    ids.append(prefix + str(number))
  return ids

In [None]:
# Download each notebook, keep its non-empty code/markdown cells, record every
# cell's true position in a `rank` column, and save one CSV per notebook laid
# out as: code cells in original order, followed by markdown cells shuffled.
save_path = '/content/drive/MyDrive/Colab Notebooks/ML Projects/Google AI4Code/data/juypter_train'
empty_text_pattern = re.compile('^[ \t\n]*$')
request_timeout = 30  # seconds; without a timeout one stalled request blocks the whole run
shuffle_rng = np.random.RandomState(42)  # seeded so the markdown shuffles are reproducible

# Create the output directory up front instead of failing on the first to_csv.
Path(save_path).mkdir(parents=True, exist_ok=True)

doc_num = 0  # number of notebooks saved so far; also used in the output file name
count = 0    # running offset that keeps cell ids unique across notebooks

with requests.Session() as s:
  for python_path in tqdm(python_paths, desc='Cleaning, Filtering, and Saving Data Frames'):
    # A single unreachable or malformed notebook must not abort the whole run.
    try:
      response = s.get(
          f'https://github-notebooks-update1.s3-eu-west-1.amazonaws.com/{python_path}',
          timeout=request_timeout)
      response.raise_for_status()
      nb_file = response.json()
    except (requests.RequestException, ValueError):
      continue

    # Skip payloads that carry no top-level list of cells.
    cells = nb_file.get('cells') if isinstance(nb_file, dict) else None
    if not cells:
      continue

    # Load in as DataFrame and keep only the two columns that are needed.
    # Selecting columns (rather than dropping a fixed list) tolerates notebooks
    # lacking 'outputs'/'execution_count' and stops sparse extras such as
    # 'attachments' from making dropna() discard valid cells.
    current_doc = pd.DataFrame(cells)
    if not {'cell_type', 'source'}.issubset(current_doc.columns):
      continue
    keep = current_doc['cell_type'].isin(['code', 'markdown']) & current_doc['source'].notna()
    # .copy() gives an independent frame, so the assignments below do not
    # raise SettingWithCopyWarning.
    current_doc = current_doc.loc[keep, ['cell_type', 'source']].copy()

    # Clean Text: flatten list sources, then drop whitespace-only cells.
    current_doc['source'] = current_doc['source'].apply(convert_list_to_text)
    current_doc['source'] = current_doc['source'].replace(empty_text_pattern, np.nan)
    current_doc = current_doc.dropna(subset=['source'])

    # Setup Ranks: position among the surviving cells, in notebook order.
    current_doc = current_doc.reset_index(drop=True)
    current_doc['rank'] = current_doc.index

    # Split by cell type; boolean masks preserve the rank order in each part.
    is_code = current_doc['cell_type'] == 'code'
    code_rows = current_doc[is_code]
    md_rows = current_doc[~is_code]
    code_count = len(code_rows)
    md_count = len(md_rows)

    # Check for minimum cell type counts: need at least one of each.
    if code_count < 1 or md_count < 1:
      continue

    # Shuffle markdowns, then rebuild the frame explicitly. concat avoids
    # depending on how slice assignment aligns a re-ordered frame.
    md_rows = md_rows.sample(frac=1, random_state=shuffle_rng)
    current_doc = pd.concat([code_rows, md_rows], ignore_index=True)

    # Create cell_ids (code ids first, matching the row layout above).
    code_ids = create_ids(code_count, count, prefix='jup_p2_code_id_')
    md_ids = create_ids(md_count, count, prefix='jup_p2_md_id_')
    current_doc['cell_id'] = code_ids + md_ids

    current_doc.to_csv(save_path + f'/jup_extra_p2_{doc_num}.csv', index=False)
    count += len(current_doc)
    doc_num += 1

print(f'Saved {doc_num} of {len(python_paths)} notebooks to {save_path}')

Cleaning, Filtering, and Saving Data Frames:   0%|          | 0/82221 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
# Spot-check one saved notebook CSV by loading it and displaying the frame.
# NOTE(review): this reads 'Juypter Extra/jup_extra_200.csv', not one of the
# 'juypter_train/jup_extra_p2_*.csv' files written by the cleaning cell above.
# Confirm this is the file you intend to inspect.
check_csv_path = '/content/drive/MyDrive/Colab Notebooks/ML Projects/Google AI4Code/data/Juypter Extra/jup_extra_200.csv'
check_df = pd.read_csv(check_csv_path)
check_df

Unnamed: 0,cell_type,source,rank,cell_id
0,code,### Introducing dataset \n import numpy as np ...,3,jup_code_id_9409
1,code,# After loading packages & data : have a quick...,4,jup_code_id_9410
2,code,anime_users.head(),5,jup_code_id_9411
3,code,# We should at first merge both anime_names & ...,7,jup_code_id_9412
4,code,# Create a crosstable \n # Swow detail of anim...,8,jup_code_id_9413
5,code,from sklearn.decomposition import PCA \n pca =...,10,jup_code_id_9414
6,code,ps = pd.DataFrame(pca_samples)\n ps.head(),11,jup_code_id_9415
7,code,"tocluster = pd.DataFrame(ps[[0,1,2]])",12,jup_code_id_9416
8,code,from mpl_toolkits.mplot3d import Axes3D,13,jup_code_id_9417
9,code,"plt.rcParams['figure.figsize'] = (16, 9)\n \n ...",14,jup_code_id_9418


In [None]:
# juypter_orders = {}
# for data_path in tqdm(juypter_paths, desc='Loading Juypter CSV Files'):
#   data = pd.read_csv(data_path, index_col='cell_id') 
#   id = data_path.stem
#   orders = list(data.sort_values(['rank']).index)
#   juypter_orders[id] = orders

# juypter_orders = pd.Series(juypter_orders)
# juypter_orders.to_json('/content/drive/MyDrive/Colab Notebooks/ML Projects/Google AI4Code/data/juypter_orders.json')