# Data preprocessing notebook
This notebook contains all code related to the pre-processing of raw data, into the format we used by the trainer
The kind of pre-processing depends on which source. The man pages and scraped sites are used for pre-training, while nl2bash data, ainix data, mankier data and Stack overflow data are used for pairs. Lastly, al commands are converted to a template for.  
In the man pages, the non-technical usefull sections are removed. For the stack overflow, a series of heuritics are used to filter bad questions. 

In [2]:
import json
import random
import html
import re
import bs4
import nltk
from collections import Counter
import matplotlib.pyplot as plt

module_path = os.path.abspath(os.path.join('..'))
sys.path.insert(0, module_path)

import bashlint.bash as bash
from bashlint.data_tools import *

Setting bashlex grammar using file: /home/jaron/shared/internship-jaron/bashlint/grammar/grammar100.txt
Bashlint grammar set up (148 utilities)



# Man pages
source: https://github.com/IBM/clai/blob/nlc2cmd/docs/manpage-data.md

In [2]:
with open('data/raw/manpage-data.json') as f:
    mandump = f.readlines()
    
mandump = [json.loads(line) for line in mandump]
print(len(mandump), "man pages loaded")

36669 man pages loaded


In [3]:
def clean_text(text):
    # remove bold and underscore
    text = text.replace('<b>','').replace('</b>', '')
    text = text.replace('<u>','').replace('</u>', '')
    # remove links
    text = re.sub(r'<a.*?>', '', text).replace('</a>', '')
    # decode html escaping
    text = html.unescape(text)
    return text 

with open('data/raw/man_cleaned.txt', 'w+') as f:
    for page in mandump:
        content = page['paragraphs']

        for paragraph in content:
            section_name = paragraph['section']
            if not section_name:
                continue
            section_name = section_name.lower().strip()
            if 'examp' in section_name or 'flag' in section_name or 'option' in section_name:
                text = clean_text(paragraph['text'])
                # throw way the very long paragraphs
                if len(text) < 800:
                    print(text, file=f)

# Mankier doc

In [4]:
with open('data/raw/mankierdocs.json') as f:
    mankier = json.load(f)

def process_line(line):
    line = line.strip("$ ")
    if not line:
        return None
    # remove shebangs
    if line.startswith('#!'):
        return None
    # remove comments
    if line.startswith('#'):
        return None
    
    parsed = bash_parser(line)
    try:
        template = ast2template(parsed)
    except:
        return None
    
    #if len(template) > 7 and len(template) != len(line):
    return template

In [5]:
with open('data/raw/mankier_nl.txt', 'w+') as f_nl:
    with open('data/raw/mankier_cm.txt', 'w+') as f_cm:
        for k, v in tuple(mankier.items()):
            #print(k)
            page = bs4.BeautifulSoup(v)
            page = page.body.main
            examples = page.find('section', id="Examples_(TL;DR)")
            if examples is not None:
                for li in examples.find_all('li'):
                    nl = li.span.get_text().strip(" :")
                    cm = li.find_all('code')[-1].get_text()
                    cm = process_line(cm)
                    
                    if cm is None:
                        continue
                    print(nl, file=f_nl)
                    print(cm, file=f_cm)


# AInix
source: https://github.com/DNGros/ai-nix-kernal-dataset-archie-json

In [6]:
with open('data/raw/ainix-kernal.json') as f:
    ainix = json.load(f)

In [7]:
def replace_templates(s):
    # quick'n dirty replace of the template tokens used in AInix
    # note: specific values in the cms are not important, 
    # as it will go through the templater later
    s = s.replace('[-[ENGWORD]-]', 'word')
    s = s.replace('[-[1=ENGWORD]-]', 'foo')
    s = s.replace('[-[2=ENGWORD]-]', 'bar')
    s = s.replace('[-[3=ENGWORD]-]', 'spam')
    s = s.replace('[-[USERNAME]-]', 'username')
    s = s.replace('[-[GROUPNAME]-]', 'groupname')
    s = s.replace('[-[DIRNAME]-]', 'folder')
    s = s.replace('[-[1=DIRNAME]-]', 'folder1')
    s = s.replace('[-[2=DIRNAME]-]', 'folder2')
    s = s.replace('[-[3=DIRNAME]-]', 'folder3')
    s = s.replace('[-[FILENAME]-]', 'file.txt')
    s = s.replace('[-[1=FILENAME]-]', 'file1.txt')
    s = s.replace('[-[2=FILENAME]-]', 'file2.txt')
    s = s.replace('[-[3=FILENAME]-]', 'file3.txt')
    s = s.replace('[-[EXTENSION]-]', 'txt')
    s = s.replace('[-[$1]-]', 'arg1')
    s = s.replace('[-[$2]-]', 'arg2')
    s = s.replace('[-[$3]-]', 'arg3')
    return s

with open('data/raw/ainix_nl.txt', 'w+') as f_nl:
    with open('data/raw/ainix_cm.txt', 'w+') as f_cm:
        for _, v in ainix.items():
            nls = v['x']
            cms = v['y']
            
            nls = [replace_templates(nl['x_text']) for nl in nls]
            cms = [replace_templates(cm['y_text']) for cm in cms]
            cms = [process_line(cm) for cm in cms]
            cms = list(set([cm for cm in cms if cm]))
            A, B = len(nls), len(cms)
            random.shuffle(nls)
            random.shuffle(cms)
            for i in range(max(A, B)):
                nl = nls[i%A]
                cm = cms[i%B]
                assert cm is not None
                print(nl, file=f_nl)
                print(cm, file=f_cm)
    


# Scraped sites
source: https://drive.google.com/file/d/1KmijhOXS9PI7TB0XWJ8E1g5eP2hLVlCp/view

In [8]:
with open('data/raw/scraped_sites.json') as f:
    scrape = json.load(f)
scrape = tuple(scrape.items())

In [9]:
with open('data/raw/scrape_examples.txt', 'w+') as f:
    for k, v in scrape:
        #print(k)
        page = bs4.BeautifulSoup(v).body.find('div', id='main-content')
        if not page:
            continue

        nlp = page.find('h2', id='examples')
        previous_nl = ''
        while nlp is not None:
            cmp = nlp.find_next_sibling('pre')
            if cmp is None:
                break

            nlp = cmp.find_next_sibling('p')
            cm = cmp.get_text().strip()
            nl = nlp.get_text().strip()
            if nl.startswith('Same as'):
                nl = previous_nl
            else:
                previous_nl = nl
            if '\n' not in str(cm):
                print(nl, file=f)
                print(cm, file=f)


# Stack-overflow dump

In [10]:
with open('data/raw/stackoverflow.com-bash-top5answers.score.0.json') as f:
    dump = json.load(f)
print(len(dump), 'entries loaded')

103734 entries loaded


In [11]:
# We want to remove questions that:
# 1) ask for explanation/rationale about something
# 2) ask for help with an error/issue

illegal_words = {
    # type 1
    'understand',
    'why',
    'difference'
    'explain',
    'mean',
    'how is',
    'how are',
    # type 2
    'syntax error',
    'error:',
    'exception',
    'fail',
    'crash',
    'issue',
    'problem',
    'expected',
    'invalid',
    'bad',
    'wrong',
    "n't work",
    'not work',
    "won't",
    "can't",
    'always',
    ' fix ', # avoid matching e.g. suffix
    'solve',
    'not found',
    'help me',
    'stuck',
    'cause',
    'throws',
    'denied',
    'messed',
}


cdump = []
for entry in dump:
    title = entry['title'].lower()
    body = entry['body'].lower()
    illegal = False
    for iword in illegal_words:
        if iword in title or iword in body:
            illegal = True
    if len(title)<18:
        illegal=True
        
    if not illegal:
        entry['title'] = entry['title'].strip(" ?\n\t")
        cdump.append(entry)

print(len(dump)-len(cdump), 'entries removed')

64363 entries removed


In [12]:
# remove redundant prefies from the titles
redundant_prefixes = {
    'shell:',
    'bash:',
    'in bash',
    'how do i',
    'how do you',
    'how to',
    'how can i',
    'how can you',
    'trying to',
    'best way to',
    'a way to',
    'is there',
    'is it possible to'
}
for entry in cdump:
    title = entry['title'].lower()
    for prefix in redundant_prefixes:
        if title.startswith(prefix):
            entry['title'] = entry['title'][len(prefix):].strip(" ").capitalize()
            title = entry['title'].lower()


In [13]:
# remove questions that don't contain any code in their answers
# also throws away answers with score <= 0

def contains_snippet(s):
    return s.find("<pre><code>") != -1

ccdump = []
for entry in cdump:
    entry['answers'] = [x for x in entry['answers']
                       if contains_snippet(x['body']) and int(x['score']) > 0]
        
    if len(entry['answers']):
        ccdump.append(entry)
        
print(len(cdump)-len(ccdump), 'entries removed')
print(len(ccdump), "entries remaining")

10028 entries removed
29343 entries remaining


In [14]:
def process_line(line):
    line = line.strip("$ ")
    if not line:
        return None
    # remove shebangs
    if line.startswith('#!'):
        return None
    # remove comments
    if line.startswith('#'):
        return None
    
    parsed = bash_parser(line)
    try:
        template = ast2template(parsed)
    except:
        return None
    
    if len(template) > 7 and len(template) != len(line):
        return template


def get_snippets(answer):
    answer = bs4.BeautifulSoup(answer)
    # doesn't include inline code
    snippets = answer.find_all('pre')
    r = set()
    for snip in snippets:
        snip = snip.find('code')
        if not snip:
            continue
        snip = snip.text.strip()
        lsnip = snip.lower()
        
        # no long snippets
        if lsnip.count('\n')>4:
            continue
        
        # no if statements
        if 'if' in lsnip and 'then' in lsnip:
            continue
        # no while statements
        if 'while ' in lsnip and 'done' in lsnip:
            continue
        # no for loops
        if 'for ' in lsnip and 'done' in lsnip:
            continue
        # no function defenitions
        if ('()' in lsnip or 'function' in lsnip) and '{' in lsnip and '}' in lsnip:
            continue
        
        for line in snip.split('\n'):
            line = process_line(line)
            if line is not None:
                r.add(line)
    return r


In [15]:
with open('data/raw/ST_nl.txt', 'w+') as f_nl:
    with open('data/raw/ST_cm.txt', 'w+') as f_cm:
        for entry in ccdump:
            snippets = set()
            for answer in entry['answers']:
                snippets |= get_snippets(answer['body'])
            for s in tuple(snippets)[:5]:
                print(entry['title'], file=f_nl)
                print(s, file=f_cm)

# NL2Bash / Tellina
source: https://github.com/IBM/clai/blob/nlc2cmd/docs/nl2bash-data.md

In [16]:
with open("data/raw/nl2bash-data.json") as f:
    data = json.load(f)
    data = [x[1] for x in data.items()]
    
nls = [x['invocation'] for x in data]
cms = [x['cmd'] for x in data]

with open('data/raw/nl2bash_nl.txt', 'w+') as f_nl:
    with open('data/raw/nl2bash_cm.txt', 'w+') as f_cm:
        for nl, cm in zip(nls, cms):
            parsed = bash_parser(cm)
            try:
                template = ast2template(parsed)
            except:
                continue
            print(nl, file=f_nl)
            print(template, file=f_cm)

# Putting everything together
The data is combined and divided into train/val/test splits.
Note that the construction of test/val is to be used to check e.g. overfitting and NOT to get a objective result on the accuracy! There's NO check for overlap between test-train!

In [3]:
# how large test/val splits?
test_size = 200
val_size = 200

with open('data/raw/nl2bash_nl.txt') as f:
    nls = f.readlines()
with open('data/raw/nl2bash_cm.txt') as f:
    cms = f.readlines()

with open('data/raw/ainix_nl.txt') as f:
    nls += f.readlines()
with open('data/raw/ainix_cm.txt') as f:
    cms += f.readlines()
   
with open('data/raw/mankier_nl.txt') as f:
    nls += f.readlines()
with open('data/raw/mankier_cm.txt') as f:
    cms += f.readlines()
    
pairs = list(zip(nls, cms))
random.shuffle(pairs)

test_pairs = pairs[:test_size]
pairs = pairs[test_size:]

val_pairs = pairs[:val_size]
pairs = pairs[val_size:]

train_pairs = pairs

def write_pairs(pairs, name):
    nls = [x[0] for x in pairs]
    cms = [x[1] for x in pairs]
    with open('data/clai/'+name+'_nl.txt', 'w+') as f:
        f.write(''.join(nls))
    with open('data/clai/'+name+'_cm.txt', 'w+') as f:
        f.write(''.join(cms))        
        
write_pairs(test_pairs, 'test')
write_pairs(val_pairs, 'dev')
write_pairs(train_pairs, 'train')


In [4]:
with open('data/raw/ST_nl.txt') as f:
    nls = f.readlines()
with open('data/raw/ST_cm.txt') as f:
    cms = f.readlines()
    
pairs = list(zip(nls, cms))
write_pairs(pairs, 'dirty')

In [5]:
with open('data/raw/man_cleaned.txt') as f:
    content = f.read()
with open('data/raw/scrape_examples.txt') as f:
    content += f.read()
    
with open('data/clai/pre.txt', 'w+') as f:
    f.write(content)