In [None]:
# Colab-only setup: mount Google Drive at /content/drive so the data paths
# under /content/drive/MyDrive used in the cells below can be opened.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import csv
import json
import os
import re
import token
from io import BytesIO
from tokenize import tokenize

import pandas as pd
from sklearn.model_selection import train_test_split

#converting the raw data for CodeXGLUE#

In [None]:
# Raw dataset: a JSONL file that the parse_* functions below read line by line.
jsonl_path = '/content/drive/MyDrive/CS4650_proj/data/leetcodecomplete.jsonl'
# NOTE(review): despite its name this holds a file path, not a directory, and
# none of the cells visible here reference it -- confirm whether it is still needed.
output_dir = '/content/drive/MyDrive/CS4650_proj/data/clean_data.jsonl'

In [None]:
def tokenize_code(code, include_comments=False):
    """Split Python source text into a flat list of token strings.

    The source is run through the standard-library tokenizer and only the
    "content" tokens are kept: names/keywords, operators, numbers and string
    literals. Structural tokens (encoding marker, newlines, indents, dedents,
    end marker) are dropped.

    Token kinds are selected through the symbolic constants of the ``token``
    module rather than raw integers. The numeric ids are not stable across
    Python releases, and the id previously used for comments (51) is not the
    COMMENT id on Python 3, so comments were never actually collected. That
    effective behaviour is kept as the default.

    Args:
        code: Python source code as a string.
        include_comments: When True, ``# ...`` comment tokens are kept as
            well. Defaults to False.

    Returns:
        list of str: the kept token strings, in source order.

    Raises:
        Any exception raised by the standard-library tokenizer (for example
        on incomplete source) is not caught here.
    """
    kept_types = {token.NAME, token.OP, token.NUMBER, token.STRING}

    # Newer Python versions emit f-strings as separate start/middle/end tokens
    # instead of a single STRING token; keep them where they exist so literal
    # text inside f-strings is not silently dropped.
    for fstring_name in ('FSTRING_START', 'FSTRING_MIDDLE', 'FSTRING_END'):
        fstring_type = getattr(token, fstring_name, None)
        if fstring_type is not None:
            kept_types.add(fstring_type)

    if include_comments:
        kept_types.add(token.COMMENT)

    readline = BytesIO(code.encode('utf-8')).readline
    # `tok.string` is never empty for the classic kept kinds; the truthiness
    # check only guards against empty f-string fragments.
    return [
        tok.string
        for tok in tokenize(readline)
        if tok.type in kept_types and tok.string
    ]

def tokenize_text(text):
    """Break free text into words, discarding punctuation.

    Every character that is neither a regex word character (letters, digits,
    underscore) nor whitespace is deleted -- not replaced by a space -- so
    "it's" becomes "its". The remainder is split on runs of whitespace.

    Args:
        text: The text to tokenize.

    Returns:
        list of str: the words in order; empty when nothing is left.
    """
    punctuation = re.compile(r'[^\w\s]')
    return punctuation.sub('', text).split()

# Updated function to parse JSONL file and extract Python code snippets along with input as docstrings
def parse_jsonl_for_python_code_with_docstrings(file_path):
    """Read a JSONL file and build code/docstring token pairs.

    Each non-blank line is parsed as one JSON object. Records that contain
    both an 'output' and an 'input' key are converted; all others are
    skipped. The 'output' value has its Markdown code fence removed and is
    tokenized with ``tokenize_code``; the 'input' value is tokenized with
    ``tokenize_text``.

    Args:
        file_path: Path of the UTF-8 encoded JSONL file to read.

    Returns:
        list of dict: one ``{"code_tokens": [...], "docstring_tokens": [...]}``
        dict per usable record, in file order.
    """
    def strip_code_fence(raw_output):
        # Remove the fence as literal text. `str.strip(chars)` treats its
        # argument as a SET of characters, so stripping '```python\n' that way
        # also eats leading/trailing code characters such as the final `n` of
        # `return n` or the first letters of `path`.
        code = raw_output.strip()
        for opening in ('```python', '```'):
            if code.startswith(opening):
                code = code[len(opening):]
                break
        if code.endswith('```'):
            code = code[:-len('```')]
        return code.strip()

    entries = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                # A blank line (e.g. a trailing newline) is not a record and
                # would make json.loads raise.
                continue
            entry = json.loads(line)
            if 'output' in entry and 'input' in entry:
                python_code = strip_code_fence(entry['output'])
                entries.append({
                    "code_tokens": tokenize_code(python_code),
                    "docstring_tokens": tokenize_text(entry['input']),
                })
    return entries

def parse_raw_input_step(file_path):
    """Read a JSONL file and split each record's input text into step segments.

    Each non-blank line is parsed as one JSON object; records lacking an
    'output' or 'input' key are skipped. The 'input' text is split on the
    delimiters ' 2 ' and the whole word 'then' (case-insensitive):

    * every segment that precedes a delimiter goes to the first dataset;
    * the final segment goes to the second dataset when at least one
      delimiter was found, otherwise (input not split) to the first dataset.

    Every emitted entry carries the code tokens of its source record. The
    code has its Markdown fence removed and is tokenized with
    ``tokenize_code``, the same tokenizer that
    ``parse_jsonl_for_python_code_with_docstrings`` applies to code; segments
    are tokenized with ``tokenize_text``.

    Args:
        file_path: Path of the UTF-8 encoded JSONL file to read.

    Returns:
        tuple: ``(dataset1, dataset2)``, two lists of
        ``{"code_tokens": [...], "docstring_tokens": [...]}`` dicts.
    """
    def strip_code_fence(raw_output):
        # Remove the fence as literal text. `str.strip(chars)` treats its
        # argument as a SET of characters, so stripping '```python\n' that way
        # also eats leading/trailing code characters such as the final `n` of
        # `return n` or the first letters of `path`.
        code = raw_output.strip()
        for opening in ('```python', '```'):
            if code.startswith(opening):
                code = code[len(opening):]
                break
        if code.endswith('```'):
            code = code[:-len('```')]
        return code.strip()

    dataset1 = []  # Segments followed by a delimiter, plus inputs that were not split
    dataset2 = []  # Final segment of every input that was split
    # `\bthen\b` keeps words that merely contain "then" (e.g. "lengthen",
    # "strengthen") from being cut in half.
    delimiter_pattern = re.compile(r' \b2\b |\bthen\b', flags=re.IGNORECASE)

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines are not records and would make json.loads raise.
                continue
            entry = json.loads(line)
            if 'output' not in entry or 'input' not in entry:
                continue

            # Code must go through the code tokenizer: the text tokenizer
            # deletes all punctuation, turning `foo(x)` into `foox`.
            code_tokens = tokenize_code(strip_code_fence(entry['output']))

            # The pattern has no capture groups, so split() returns exactly
            # one more part than there were delimiter matches.
            *leading_parts, last_part = delimiter_pattern.split(entry['input'])

            for part in leading_parts:
                dataset1.append({
                    "code_tokens": code_tokens,
                    "docstring_tokens": tokenize_text(part.strip())
                })

            destination = dataset2 if leading_parts else dataset1
            destination.append({
                "code_tokens": code_tokens,
                "docstring_tokens": tokenize_text(last_part.strip())
            })

    return dataset1, dataset2


def save_to_jsonl(data, output_file_path):
    """Write an iterable of JSON-serializable items as a JSONL file.

    The target file is opened for writing (replacing existing content) with
    UTF-8 encoding, and each item becomes one line of JSON text followed by
    a newline.

    Args:
        data: Iterable of objects accepted by ``json.dumps``.
        output_file_path: Path of the file to write.
    """
    with open(output_file_path, 'w', encoding='utf-8') as sink:
        sink.writelines(json.dumps(record) + '\n' for record in data)

In [None]:
# Destination files for the three splits of the full dataset.
train_path = '/content/drive/MyDrive/CS4650_proj/data/clean_data_train.jsonl'
test_path = '/content/drive/MyDrive/CS4650_proj/data/clean_data_test.jsonl'
val_path = '/content/drive/MyDrive/CS4650_proj/data/clean_data_val.jsonl'

# Parse the raw file once, hold out 30% of the records, then divide that
# hold-out 2:1 -- roughly 70% train / 20% test / 10% validation overall.
# The fixed random_state makes the split reproducible.
results = parse_jsonl_for_python_code_with_docstrings(jsonl_path)
train, temp = train_test_split(results, random_state=42, test_size=0.3)
test, val = train_test_split(temp, random_state=42, test_size=1/3)

# Write each split to its own JSONL file.
for split_rows, split_path in [(train, train_path), (test, test_path), (val, val_path)]:
    save_to_jsonl(split_rows, split_path)

In [None]:
# Build the two step-wise datasets: `results_step1` holds the segments that
# precede a delimiter (plus unsplit inputs), `results_rest` holds the final
# segment of every input that was split.
results_step1, results_rest = parse_raw_input_step(jsonl_path)

# Hold out 30% of each dataset, then divide the hold-out 2:1 -- roughly
# 70% train / 20% test / 10% validation. The fixed random_state makes the
# splits reproducible.
train_step1, temp_step1 = train_test_split(results_step1, test_size=0.3, random_state=42)
test_step1, val_step1 = train_test_split(temp_step1, test_size=1/3, random_state=42)

# The "rest" splits must come from `results_rest`. Splitting `results_step1`
# here would write copies of the step1 data into the *_rest files and leave
# `results_rest` unused.
train_rest, temp_rest = train_test_split(results_rest, test_size=0.3, random_state=42)
test_rest, val_rest = train_test_split(temp_rest, test_size=1/3, random_state=42)

train_step1_path = '/content/drive/MyDrive/CS4650_proj/data/clean_data_train_step1.jsonl'
test_step1_path = '/content/drive/MyDrive/CS4650_proj/data/clean_data_test_step1.jsonl'
val_step1_path = '/content/drive/MyDrive/CS4650_proj/data/clean_data_val_step1.jsonl'

train_rest_path = '/content/drive/MyDrive/CS4650_proj/data/clean_data_train_rest.jsonl'
test_rest_path = '/content/drive/MyDrive/CS4650_proj/data/clean_data_test_rest.jsonl'
val_rest_path = '/content/drive/MyDrive/CS4650_proj/data/clean_data_val_rest.jsonl'

save_to_jsonl(train_step1, train_step1_path)
save_to_jsonl(test_step1, test_step1_path)
save_to_jsonl(val_step1, val_step1_path)

save_to_jsonl(train_rest, train_rest_path)
save_to_jsonl(test_rest, test_rest_path)
save_to_jsonl(val_rest, val_rest_path)