# Verified Commands: Weighting to Match Training Data

Running the scraping, generation, and verification pipeline to generate a bash command dataset, matching the utility distributions of the training dataset.

In [9]:
from generator import Generator, replace
from scraper import WebScraper
import collections

## Reading the Training Data and Measuring the Distribution

First, measuring the distribution of the training data/

In [2]:
with open('cmds_proccess_train.txt') as fp:
    txt = fp.read()
    
txt = txt.split('\n')
txt

['find Path -type d ! -perm -Permission',
 'find Path -type d -name Regex -execdir tar -c -v -f File File \\;',
 'find Path -user Regex',
 'find Path -name Regex -delete',
 'find Path -name Regex',
 'date -u -d @1267619929',
 'find Path -group Regex',
 'find Path -type d -exec chmod Permission {} +',
 'find Path -type f -print0 | xargs -r -0 -I {} grep -F Regex {}',
 'find Path -name Regex | xargs -I {} grep -r Regex {}',
 'find Path Path Path Path',
 'find Path -type f -name Regex',
 'zcat Regex | grep -i Regex',
 'zcat Regex | head -n Quantity',
 'fold File | wc -l',
 'find Path -print0 | xargs -0 -I {} echo {}',
 'find Path -type d -name Regex -exec rsync -a -v -R {} File \\; -exec rm -r -f File \\;',
 'cd $( find Path -name Regex | xargs -I {} dirname {} )',
 'set | grep Regex',
 'read -r -p $( echo Regex ) Regex',
 'tree Directory',
 'kill -s Regex Regex',
 "date '+%Y' --date Regex",
 'find Path -mmin -Quantity',
 'find Path -type f -exec bzip2 {} \\;',
 'find Path -name Regex -ty

In [17]:
utility_dist = collections.defaultdict(int)
for cmd in txt:
    ut = cmd.split(' ')[0]
    utility_dist[ut] += 1

utilities = []
scale = utility_dist['find']
ut_scale = []
for ut in utility_dist:
    utilities.append(ut)
    ut_scale.append((ut, float(utility_dist[ut]) / scale))

scale_d = {}
for ut, s in ut_scale:
    scale_d[ut] = s
scale_d

{'find': 1.0,
 'date': 0.0025299662156950467,
 'fold': 0.0001388396093978989,
 'who': 0.014932971321907348,
 'ls': 0.5014886691452108,
 'df': 0.008623482405936165,
 'chown': 0.008623482405936165,
 'watch': 0.007003686962960678,
 'ln': 0.012572697962143067,
 'seq': 0.0001388396093978989,
 'cut': 0.0018357681687055521,
 'head': 0.00029310584206223097,
 'mv': 0.007003686962960678,
 'sort': 0.056353454792280516,
 'readlink': 0.0025299662156950467,
 'mkdir': 0.00029310584206223097,
 'w': 0.0012804097311139564,
 'file': 0.06261666383845241,
 'sed': 0.007003686962960678,
 'tac': 0.00029310584206223097,
 'echo': 0.00029310584206223097,
 'cat': 0.004396587630933465,
 'od': 0.014932971321907348,
 'rmdir': 0.00029310584206223097,
 'dirname': 4.627986979929963e-05,
 'ps': 0.15239961124909368,
 'comm': 0.0025299662156950467,
 'grep': 0.32124400290020516,
 'join': 0.0018357681687055521,
 'hostname': 0.004396587630933465,
 'du': 0.045107446431050705,
 'mktemp': 0.0012804097311139564,
 'chmod': 0.0018

## Scraping

In [4]:
ws = WebScraper(utilities=utilities)
ws.extract_utilities()
ws.save_json('test_syntax.json', 'test_map.json')

find
syntax not found for find




  soup = BeautifulSoup(r.text)


date
No web page found for zcat
fold
No web page found for cd
No web page found for set
No web page found for read
No web page found for tree
kill
syntax not found for kill
who
ls
df
No web page found for history
chown
watch
ln
seq
cut
head
mv
sort
No web page found for shopt
No web page found for pushd
readlink
mkdir
w
rsync
syntax not found for rsync
No web page found for which
file
syntax not found for file
sed
tac
echo
crontab
syntax not found for crontab
ssh
syntax not found for ssh
No web page found for ifconfig
No web page found for awk
cat
od
false
syntax not found for false
rmdir
dirname
No web page found for dig
ps
comm
No web page found for jobs
grep
No web page found for source
join
hostname
syntax not found for hostname
du
No web page found for gzip
mktemp
chmod
basename
syntax not found for basename
tr
su
split
No web page found for ping
tail
No web page found for $(
No web page found for finger
nl
env
No web page found for mount
ssh-keygen
syntax not found for ssh-keygen

# Generating

Since we currently don't know the number of valid commands, we will generate the maximum number of commands, run them all through the validator, and then process and sample accordingly.

In [5]:
g = Generator('test_syntax.json', 'test_map.json', utilities)

In [6]:
g.generate_all_commands('test_unverified.txt')

['find Folder -o -cmin Number -size Number',
 'find Folder -name Regex -regex Regex -okdir',
 'find Folder -mtime Number -newer Number -help',
 'find Folder -iwholename Regex -mtime Number -n',
 'find Folder -print -nouser -help',
 'find Folder -ok -false',
 'find Folder -execdir -n -fprintf',
 'find Folder -P -daystart -noignore_readdir_race',
 'find Folder -delete -anewer Number -ok',
 'find Folder -inum Number -samefile File -xdev',
 'find Folder -execdir -iregex Regex -path Regex',
 'find Folder -delete -mount -uid',
 'find Folder -used Number -fprint0 File -xdev',
 'find Folder -inum Number -Olevel -group Number',
 'find Folder -mmin Number -uid -path Regex',
 'find Folder -ls -noignore_readdir_race -size Number',
 'find Folder -depth -used Number -ilname Regex',
 'find Folder -version -lname Regex -empty',
 'find Folder -or -executable -fprint0 File',
 'find Folder -o -noignore_readdir_race -used Number',
 'find Folder -version -depth -samefile File',
 'find Folder -ipath Regex -

In [10]:
replace('rep_map.json', 'test_unverified.txt', 'test_rep.txt')

In [11]:
with open('test_rep.txt') as fp:
    print(len(fp.read().split('\n')))

3323603


This is way too many commands to verify, so we will make sure every utility has no more commands than half the number that find has.

In [13]:
with open('test_rep.txt') as fp:
    txt = fp.read().split('\n')

d = collections.defaultdict(list)
for line in txt:
    d[line.split(" ")[0]].append(line)
    
print(f"Number of find commands: {len(d['find'])}")

import random
ret_list = []

maximum = len(d['find']) // 2
for ut in d:
    if ut == 'find':
        ret_list.extend(d[ut])
    elif len(d[ut]) > maximum:
        ret_list.extend(random.sample(d[ut], maximum))

with open('short_rep.txt', 'w') as fp:
    fp.write("\n".join(ret_list))

Number of find commands: 64823


In [14]:
len(ret_list)

194467

In [33]:
with open('verified_find_3.txt') as fp:
    txt = fp.read().split('\n')
    print(len(txt))

53564


In [35]:
d2 = collections.defaultdict(list)
for line in txt:
    ut = line.split(" ")[0]
    if "--" not in line:
        d2[ut].append(line)
        
multiplier = len(d2['find'])
final_ret = []
for ut in d2:
    num_wanted = int(scale_d[ut] * multiplier)
    if len(d2[ut]) <= num_wanted:
        final_ret.extend(d2[ut])
    else:
        final_ret.extend(random.sample(d2[ut], num_wanted))
len(final_ret)

35440

In [36]:
with open('new_training.txt', 'w') as fp:
    fp.write('\n'.join(final_ret))

In [40]:
import json
def make_generic(rep_path, in_file, out_file):
    with open(rep_path, 'r') as fp:
        reps = json.load(fp)
    
    d = {}
    for k in reps:
        d[reps[k]] = k
        
    with open(in_file) as fp:
        cmds = fp.read().split('\n')
    
    ret = []
    for cmd in cmds:
        ret_cmd = []
        lst = cmd.split(" ")
        for val in lst:
            if val in d:
                val = d[val]
            ret_cmd.append(val)
        ret.append(" ".join(ret_cmd))
    
    with open(out_file, 'w') as fp:
        fp.write("\n".join(ret))

In [41]:
make_generic('rep_map.json', 'new_training.txt', 'generic_training.txt')
ret = []
import json
with open('generic_training.txt') as fp:
    cmds = fp.read().split('\n')
    print(len(cmds))
    for cmd in cmds:
        cmd = cmd.replace("Folder", "Path")
        ret.append(cmd)
ret = "\n".join(ret)

with open('generic_training.txt', 'w') as fp:
    fp.write(ret)

35440


Verifying the training data

In [None]:
with open('cmds_proccess_train.txt') as fp:
    txt = fp.