In [1]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os
from pathlib import Path
import json
import re

## Exploring XLCoST Dataset

In [None]:
# Relative directories of the three Python subsets explored below.
python_desc_path = Path('1.Python-desc') / 'Python-desc'
python_comment_path = Path('2.Python-comment') / 'Python-comment'
# NOTE(review): this path ends in 'Python-desc', the same leaf name as the
# first subset -- confirm that is the real directory name on disk.
python_desc_comment_path = Path('3.Python-desc and comment') / 'Python-desc'

In [2]:
def load_dataset(data_dir, mode='test'):
    """Load the line-aligned code/text files of one split into a DataFrame.

    Files directly inside ``data_dir`` whose name starts with ``mode`` are
    read line by line. A ``.py`` file becomes the ``'code'`` column and a
    ``.txt`` file becomes the ``'text'`` column. Files with any other suffix
    (including ``.jsonl``) and non-file entries are ignored, so they can no
    longer end up as a column named ``None``.

    Args:
        data_dir: Directory to scan (``str`` or ``pathlib.Path``).
        mode: File-name prefix selecting the split; defaults to ``'test'``.

    Returns:
        pandas.DataFrame with one row per line. Lines keep their trailing
        newline, exactly as returned by ``readlines()``.

    Raises:
        ValueError: raised by pandas when the selected files do not have the
            same number of lines.
    """
    suffix_to_column = {'.py': 'code', '.txt': 'text'}
    data = {}
    # Sorted iteration keeps the column order deterministic; if several files
    # share a suffix, the last one in sorted order wins (as before).
    for path in sorted(Path(data_dir).glob('*')):
        column = suffix_to_column.get(path.suffix)
        if column is None or not path.name.startswith(mode) or not path.is_file():
            continue
        with open(path, 'r', encoding='utf-8') as f:
            data[column] = f.readlines()

    return pd.DataFrame(data)

In [3]:
def dump_code(code_token):
    """Rebuild indented source text from a flat token sequence.

    ``NEW_LINE`` ends the current line, while ``INDENT`` / ``DEDENT`` raise or
    lower the indentation depth used for the following lines (four spaces per
    level). Every other token is emitted followed by a single space, so each
    rendered line carries a trailing space.

    Args:
        code_token: Iterable of token strings.

    Returns:
        The reconstructed code as one string.
    """
    pieces = []
    depth = 0
    at_line_start = True
    for token in code_token:
        if token == 'NEW_LINE':
            pieces.append('\n')
            at_line_start = True
        elif token == 'INDENT':
            depth += 1
        elif token == 'DEDENT':
            depth -= 1
        else:
            # Only the first token of a line is prefixed with the indentation.
            prefix = '    ' * depth if at_line_start else ''
            pieces.append(prefix + token + ' ')
            at_line_start = False

    return ''.join(pieces)

In [4]:
def dump_comment(text_token):
    """Placeholder for rendering comment tokens; not implemented yet.

    Args:
        text_token: Currently ignored.

    Returns:
        None.
    """
    # TODO: implement the comment rendering counterpart of dump_code.
    return None

In [83]:
# NOTE(review): the variable is named *_train, but load_dataset defaults to
# mode='test'; the default is spelled out here so the mismatch is visible.
python_desc_comment_train = load_dataset(python_desc_comment_path, mode='test')

In [84]:
# Preview only the first few samples: printing every row of the dataset
# floods the notebook output and bloats the saved file.
N_PREVIEW = 5
for _, row in python_desc_comment_train.head(N_PREVIEW).iterrows():
    print('#', row['text'])
    # The code column holds whitespace-separated tokens; re-indent for reading.
    print(dump_code(row['code'].split()))
    print('----')

# Minimum sum possible by removing all occurrences of any array element | Function to find minimum sum after deletion ; Stores frequency of array elements ; Traverse the array ; Calculate sum ; Update frequency of the current element ; Stores the minimum sum required ; Traverse map ; Find the minimum sum obtained ; Return minimum sum ; Input array ; Size of array

def minSum ( A , N ) : 
    mp = { } 
    sum = 0 
    for i in range ( N ) : 
        sum += A [ i ] 
        if A [ i ] in mp : 
            mp [ A [ i ] ] += 1 
        else : 
            mp [ A [ i ] ] = 1 
    minSum = float ( ' inf ' ) 
    for it in mp : 
        minSum = min ( minSum , sum - ( it * mp [ it ] ) ) 
    return minSum 
arr = [ 4 , 5 , 6 , 6 ] 
N = len ( arr ) 
print ( minSum ( arr , N ) ) 

----
# Maximum difference between a pair of adjacent elements by excluding every element once | Function to calculate maximum difference between adjacent elements excluding every array element once ; Traverse the arra

In [89]:
def get_similarity(x, y):
    """Return the dot-product similarity of two embedding batches as a float.

    The einsum ``'ac,bc->ab'`` computes every pairwise dot product between the
    rows of ``x`` (shape ``(a, c)``) and the rows of ``y`` (shape ``(b, c)``).
    The result is then converted with ``float()``, which only succeeds when
    that matrix holds a single element (i.e. one row in each input).

    Args:
        x: 2-D ``torch.Tensor`` or array-like (e.g. ``numpy.ndarray``).
        y: 2-D ``torch.Tensor`` or array-like with the same trailing dimension.

    Returns:
        The single similarity value as a Python ``float``.
    """
    # torch.einsum only accepts tensors; arrays loaded with np.load must be
    # converted first. Existing tensors pass through unchanged.
    x = torch.as_tensor(x)
    y = torch.as_tensor(y)
    return float(torch.einsum('ac,bc->ab', x, y))

In [90]:
# Directory holding the saved embedding arrays, and its .npy files in
# sorted (deterministic) order.
emb_dir = Path('results') / 'xlcost' / 'embeddings'
emb_paths = sorted(emb_dir.glob('*.npy'))

In [91]:
# Similarity between the first row of each embedding file and the remaining
# row(s). NOTE(review): get_similarity converts its result with float(), so
# this presumably expects exactly two rows per file -- confirm.
sims = []
for path in emb_paths:
    # np.load returns a NumPy array, but get_similarity runs torch.einsum,
    # which requires tensors -- convert before calling.
    emb = torch.from_numpy(np.load(path))
    sims.append(get_similarity(emb[:1], emb[1:]))

NameError: name 'torch' is not defined

## Exploring CodeChef's Pseudo-code results

In [2]:
# Directory with the pseudo-code result files, and its .json files in
# sorted (deterministic) order.
pseudo_dir = Path('pseudo_all')
pseudo_paths = sorted(pseudo_dir.glob('*.json'))

In [3]:
# Parse every result file and index the records by file stem
# (file name without the .json extension).
pseudos = {}
for path in pseudo_paths:
    pseudos[path.stem] = json.loads(path.read_text(encoding='utf-8'))

In [4]:
# Inspect one raw record to see which fields each pseudo-code entry carries.
pseudos['ALTDIA_18']

{'id': 'ALTDIA_18',
 'repo': 'ALTDIA',
 'src': 'for _ in range(int(input())):\n    b,w=map(int,input().split())\n    if b+w==1:print("B" if b==1 else "W")\n    elif b==1 and w==1:print("BW");print(1,2)\n    elif not b or not w:print(-1)\n    elif w==1 and b>1:\n        ans="W"\n        cnt=2\n        for i in range(b):\n            ans+="B"\n        print(ans)\n        for i in range(2,b+1):\n            print(1,i)\n    else:\n        ans="B"\n        for i in range(w):\n            ans+="W"\n        for i in range(b-1):\n            ans+="B"\n        cnt=2\n        print(ans)\n        for i in range(2,b+w+1):\n            print(1,i)',
 'tgt': 'for _ in range(int(input())):\n    b,w=map(int,input().split())\n    if b+w==1:print("B" if b==1 else "W")\n    elif b==1 and w==1:print("BW");print(1,2)\n    elif not b or not w:print(-1)\n    elif w==1 and b>1:\n        ans="W"\n        cnt=2\n        for i in range(b):\n            ans+="B"\n        print(ans)\n        for i in range(2,b+2):\

In [5]:
def show_pseudo_item(d):
    """Print one pseudo-code record in a readable layout.

    The wrong code (``d['src']``) is printed first, followed by its
    pseudo-code entries (``d['pseudo_src']``), then the correct code
    (``d['tgt']``) with its pseudo-code entries (``d['pseudo_tgt']``).
    Each pseudo-code entry is prefixed with ``'>>> '``.

    Args:
        d: Mapping with the keys ``'id'``, ``'src'``, ``'tgt'``,
            ``'pseudo_src'`` and ``'pseudo_tgt'``.

    Returns:
        None; the record is written to standard output.
    """
    def print_steps(steps):
        # One '>>> ' line per pseudo-code entry.
        for step in steps:
            print(f'>>> {step}')

    print(f'ID: {d["id"]}\n--------\n[Wrong code]\n{d["src"]}\n--------\n')
    print_steps(d['pseudo_src'])
    print(f'--------\n[Correct code]\n{d["tgt"]}\n--------\n')
    print_steps(d['pseudo_tgt'])
    print('--------\n')

In [6]:
# Render the same record: wrong code, correct code, and the pseudo-code entries for each.
show_pseudo_item(pseudos['ALTDIA_18'])

ID: ALTDIA_18
--------
[Wrong code]
for _ in range(int(input())):
    b,w=map(int,input().split())
    if b+w==1:print("B" if b==1 else "W")
    elif b==1 and w==1:print("BW");print(1,2)
    elif not b or not w:print(-1)
    elif w==1 and b>1:
        ans="W"
        cnt=2
        for i in range(b):
            ans+="B"
        print(ans)
        for i in range(2,b+1):
            print(1,i)
    else:
        ans="B"
        for i in range(w):
            ans+="W"
        for i in range(b-1):
            ans+="B"
        cnt=2
        print(ans)
        for i in range(2,b+w+1):
            print(1,i)
--------

>>> For each test case: 
    1.1. If there are only one black or white, print that color. 
    1.2. Else if there are no white or black, print -1. 
    1.3. Else if there are only one white and more than one black: 
        1.3.1. Print "W" and print 1,2. 
        1.3.2. Print "B" for the rest of the black. 
        1.3.3. Print 1,i for all i from 2 to the number of black + 1. 
 