# Exploratory Data Analysis: Web Scraping Utilities with Beautiful Soup

In this notebook, I hope to create a parser for web scraping the linux man pages to gather information on some of the most common utilities and their corresponding flags. 

In [1]:
import requests
from bs4 import BeautifulSoup

In [111]:
# some of the most common utilities
utilities = [
    "find",
    "xargs",
    "grep",
    "rm",
    "echo",
    "ls",
    "sort",
    "chmod",
    "wc",
    "cat",
    "cut",
    "head",
    "mv",
    "chown",
    "cp",
    "mkdir",
    "tr",
    "tail",
    "dirname",
    "tar",
    "uniq",
    "ln",
    "split",
    "tee",
    "date",
    "pwd",
    "ssh",
    "diff",
    "du",
    "file",
    "rename",
    "md5sum",
    "comm",
    "mktemp",
    "df",
    "rev",
    "rmdir",
    "od",
    "hostname",
]
len(utilities)

39

## Finding Flags for 1 Utility
First, we want to create a parser to find the available flags for just one of the utilities. We will start with the 'find' utility and use the BeautifulSoup package to parse for the corresponding flags.

In [95]:
utility = "uniq"
utility_url = f'https://man7.org/linux/man-pages/man1/{utility}.1.html'
r = requests.get(utility_url)
soup = BeautifulSoup(r.text)
print(soup.prettify()[:1000])

<!DOCTYPE html>
<html lang="en-US">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <title>
   uniq(1) - Linux manual page
  </title>
  <link href="../../../style.css" rel="stylesheet" title="style" type="text/css"/>
  <link href="../style.css" rel="stylesheet" title="style" type="text/css"/>
 </head>
 <body>
  <div class="page-top">
   <a id="top_of_page">
   </a>
  </div>
  <!--%%%TOP_BAR%%%-->
  <div class="nav-bar">
   <table class="nav-table">
    <tr>
     <td class="nav-cell">
      <p class="nav-text">
       <a href="../../../index.html">
        man7.org
       </a>
       &gt; Linux &gt;
       <a href="../index.html">
        man-pages
       </a>
      </p>
     </td>
     <td class="training-cell">
      <p class="training-text">
       <a class="training-link" href="http://man7.org/training/">
        Linux/UNIX system programming training
       </a>
      </p>
     </td>
    </tr>
   </table>
  </div>
  <hr cl

The syntax example could definitely be extremely useful in the future when creating the generator and determining the order in which to assemble the options and arguments.

In [96]:
desc = soup.find_all('pre')[2].text
print(desc)


       uniq [OPTION]... [INPUT [OUTPUT]]



In [97]:
options = "\n".join([soup.find_all('pre')[3].text, soup.find_all('pre')[4].text])
print(options[:500])


       Filter adjacent matching lines from INPUT (or standard input),
       writing to OUTPUT (or standard output).

       With no options, matching lines are merged to the first
       occurrence.

       Mandatory arguments to long options are mandatory for short
       options too.

       -c, --count
              prefix lines by the number of occurrences

       -d, --repeated
              only print duplicate lines, one for each group

       -D     print all duplicate lines

       --


In [98]:
# extracting and cleaning the corresponding flags
punctuation = set(_ for _ in ",.()")
stripped_options = [line.strip() for line in options.split('\n')]
flag_lines = list(filter(lambda x: x and x[0] == "-", stripped_options))
flags = [line.split(" ")[0] for line in flag_lines]
clean_flags = [f if f[-1] not in punctuation else f[:-1] for f in flags]
clean_flags = [f if "[" not in f else f.split("[")[0] for f in clean_flags]
set(clean_flags)

{'--all-repeated',
 '--group',
 '--help',
 '--version',
 '-D',
 '-c',
 '-d',
 '-f',
 '-i',
 '-s',
 '-u',
 '-w',
 '-z'}

## Scaling for all of our utilities

Now that we've been able to get a list of flags for one of our utilities, we can scale the solution to collect all of the available flags for each utility in our list of 23 utilities. To do this, we will apply the same parsing above to each utility webpage. 

In [99]:
data = {}
descs = {}

In [113]:
for utility in utilities:
    utility_url = f'https://man7.org/linux/man-pages/man1/{utility}.1.html'
    r = requests.get(utility_url)
    soup = BeautifulSoup(r.text)
    desc = soup.find_all('pre')[2].text
    descs[utility] = desc.split('\n')[1].strip()
    pre_len = len(soup.find_all('pre'))
    options = "\n".join([soup.find_all('pre')[i].text for i in range(3, pre_len)])
    stripped_options = [line.strip() for line in options.split('\n')]
    flag_lines = list(filter(lambda x: x and x[0] == "-", stripped_options))
    clean_flags = flag_lines
    d = set(flag for flag in clean_flags)
    data[utility] = {}
    data[utility]['lines'] = d
    print(desc)


       find [-H] [-L] [-P] [-D debugopts] [-Olevel] [starting-point...]
       [expression]


       xargs [options] [command [initial-arguments]]


       grep [OPTION...] PATTERNS [FILE...]
       grep [OPTION...] -e PATTERNS ... [FILE...]
       grep [OPTION...] -f PATTERN_FILE ... [FILE...]


       rm [OPTION]... [FILE]...


       echo [SHORT-OPTION]... [STRING]...
       echo LONG-OPTION


       ls [OPTION]... [FILE]...


       sort [OPTION]... [FILE]...
       sort [OPTION]... --files0-from=F


       chmod [OPTION]... MODE[,MODE]... FILE...
       chmod [OPTION]... OCTAL-MODE FILE...
       chmod [OPTION]... --reference=RFILE FILE...


       wc [OPTION]... [FILE]...
       wc [OPTION]... --files0-from=F


       cat [OPTION]... [FILE]...


       cut OPTION... [FILE]...


       head [OPTION]... [FILE]...


       mv [OPTION]... [-T] SOURCE DEST
       mv [OPTION]... SOURCE... DIRECTORY
       mv [OPTION]... -t DIRECTORY SOURCE...


       chown [OPTION]... [OWNER][:[GROUP

In [114]:
data

{'find': {'lines': {'-D                     4.3.1',
   '-D debugopts',
   '-D help.  Valid debug options include',
   '-H                     4.2.5      POSIX',
   '-H     Do not follow symbolic links, except while processing the',
   '-H     This option is supported.',
   '-H option or the -L option is in effect, then the time of',
   '-H, -L and -P and any previous -follow, but the reference file is',
   '-L                     4.2.5      POSIX',
   '-L     Follow symbolic links.  When find examines or prints',
   '-L     This option is supported.',
   '-O                     4.3.1',
   '-Olevel',
   '-P                     4.2.5      BSD',
   '-P     Never follow symbolic links.  This is the default',
   '-amin n',
   '-anewer reference',
   '-atime n',
   '-cmin n',
   '-cnewer reference',
   '-cnewer.',
   '-context pattern',
   '-ctime n',
   '-d                     4.2.3      BSD',
   '-d     A synonym for -depth, for compatibility with FreeBSD,',
   '-daystart',
   '-delete',
 

In [115]:
# The number of flags in our data structure
sum(len(data[x]['lines']) for x in data)

1001

In [116]:
def get_inner_brackets(s):
    open_idx = s.index("[") + 1
    closed_idx = s.index("]")
    a = s[open_idx: closed_idx]
    a = a.replace("=", "")
    return a

def get_equal_arg(s):
    return remove_punctuation(s.split("=")[1])
    
def remove_punctuation(s):
    punctuation = set(_ for _ in ",.()")
    return "".join([x if x not in punctuation else "" for x in s])
    
def remove_brackets(s):
    brackets = {"[", "]"}
    return "".join([x if x not in brackets else "" for x in s])

def get_flag(line):
    punctuation = set(p for p in "[].,()=[]")
    for val in punctuation:
        line = line.replace(val, " ")
    flag = line.split(" ")[0]
    return flag

## Cleaning Flags

In [117]:
for ut in data:
    for flag_line in data[ut]['lines']:
        flag_line = remove_punctuation(flag_line)
        flag, arg = get_flag(flag_line), None
        if "[" in flag_line and "]" in flag_line:
            arg = get_inner_brackets(flag_line)
        elif "=" in flag_line:
            arg = get_equal_arg(flag_line)
        data[ut][flag] = arg

In [118]:
for ut in data:
    if 'lines' in data[ut]:
        del data[ut]['lines']
        
data

{'find': {'-ls': None,
  '-delete': None,
  '-cnewer': None,
  '-ignore_readdir_race': None,
  '-Olevel': None,
  '-cmin': None,
  '-perm': None,
  '-fstype': None,
  '-not': None,
  '-printf': None,
  '-writable': None,
  '-follow': None,
  '-H': None,
  '-O': None,
  '-print': None,
  '-prune': None,
  '-fls': None,
  '-L': None,
  '-lname': None,
  '-D': None,
  '-links': None,
  '-atime': None,
  '-quit': None,
  '-wholename': None,
  '-name': None,
  '-uid': None,
  '-iwholename': None,
  '-group': None,
  '-iname': None,
  '-execdir': None,
  '-path': None,
  '-newer': None,
  '-inum': None,
  '-fprintf': None,
  '-mtime': None,
  '-noignore_readdir_race': None,
  '-warn': None,
  '-ilname': None,
  '-files0-from': None,
  '-true': None,
  '-ok': None,
  '-xtype': None,
  '-help': None,
  '-iregex': None,
  '-newerXY': None,
  '-regextype': None,
  '-depth': None,
  '-samefile': None,
  '-or': None,
  '-regex': None,
  '-amin': None,
  '-executable': None,
  '-ipath': None,
  '-t

In [119]:
# the number of flags that have arguments
count = 0
for ut in data:
    for flag in data[ut]:
        if data[ut][flag]: count +=1
count

212

## Manually Inserting Argument Types

Now that we have the utilities mapped to the appropriate flags, we will need to manually insert the data types corresponding to each flag.

In [120]:
argument_types = [
    'Regex',
    'File',
    'Directory',
    'Path',
    'Number',
    '+Number',
    '-Number',
    'Quantity',
    '+Quantity',
    '-Quantity',
    'Size',
    '+Size',
    '-Size',
    'Timespan',
    '+Timespan',
    '-Timespan',
    'DateTime',
    '+DateTime',
    '-DateTime',
    'Permission',
    '+Permission',
    '-Permission'
]

Getting a sense of the argument types the parser has discovered

In [121]:
def flag_types(data):
    l = []
    for ut in data:
        for flag in data[ut]:
            if data[ut][flag]:
                l.append(data[ut][flag])
    return l
flag_types(data)[:20]

['cwbkMG',
 'max-args',
 'eof-str',
 'max-procs',
 'file',
 'replace-str',
 'max-lines',
 'name',
 'delim -d delim',
 'SEP',
 'PATTERNS',
 'TYPE',
 'NUM',
 'WHEN',
 'ACTION',
 'FILE',
 'GLOB',
 'LABEL',
 'GLOB',
 'FILE']

In [124]:
type_mapping = {
    'Regex': {'str', 'sep', 'pattern', 'word', 'delim', 'suffix', 'from', 'glob'},
    'File': {'file'},
    'Directory': {'dir'},
    'Path': {'path'},
    'Number': {'num', 'n', 'cols', 'pid', 'max', 'bytes'},
    'Quantity': {},
    'Size':{'size'},
    'Timespan':{},
    'DateTime': {'timestamp', 'time'},
    'Permission':{},
}
# _NUMBER, _PATH,
# _FILE, _DIRECTORY, _DATETIME, _PERMISSION,
# _TIMESPAN, _SIZE
# ,  with  the  default  option  of
# _REGEX

def non_conforming_flags(data, types):
    l = []
    for ut in data:
        for flag in data[ut]:
            if data[ut][flag] and data[ut][flag] not in types:
                l.append(":".join([ut, flag, data[ut][flag]]))
    return l

## Converting Argument Types in Structure to Known Types

Using the type mappings, we can convert the argument types we parsed into the set number of types we know.

In [125]:
def convert_flag_types(data, mapping):
    for ut in data:
        for flag in data[ut]:
            if data[ut][flag]:
                for t in mapping:
                    for substr in mapping[t]:
                        if substr in data[ut][flag].lower():
                            data[ut][flag] = t
                        
convert_flag_types(data, type_mapping)
non_conforming_flags(data, argument_types)

['find:-size:cwbkMG',
 'grep:--binary-files:TYPE',
 'grep:--label:LABEL',
 'rm:--preserve-root:all',
 'ls:-p:slash',
 'sort:--compress-program:PROG',
 'sort:-C:quiet --check',
 'sort:-k:KEYDEF',
 'sort:--files0-from:F',
 'wc:--files0-from:F',
 'cut:-f:LIST',
 'cut:-c:LIST',
 'cut:-b:LIST',
 'head:-n:-',
 'head:-c:-',
 'cp:--no-preserve:ATTR_LIST',
 'cp:--preserve:ATTR_LIST',
 'cp:--context:CTX',
 'mkdir:-m:MODE',
 'mkdir:--context:CTX',
 'tail:-c:+',
 'tar:--hole-detection:METHOD',
 'tar:--group::GID',
 'tar:--owner::UID',
 'tar:--newer-mtime:DATE',
 'tar:-N:DATE --after-date',
 'tar:--file:remotehost:/dev/sr0',
 'tar:--quoting-style:STYLE',
 'tar:-f:ARCHIVE',
 'tar:--sort:ORDER',
 'tar:-V:TEXT',
 'tar:--pax-option:[:',
 'tar:-b:BLOCKS',
 'tar:--atime-preserve:METHOD',
 'tar:-K:MEMBER',
 'tar:-H:FORMAT',
 'uniq:--group:METHOD',
 'uniq:--all-repeated:METHOD',
 'tee:--output-error:MODE',
 'date:--rfc-3339:FMT',
 'date:-I:FMT',
 'diff:-x:PAT',
 'diff:--GTYPE-group-format:GFMT',
 'diff:-I:

In [134]:
len(data['grep'])

51

In [126]:
descs

{'find': 'find [-H] [-L] [-P] [-D debugopts] [-Olevel] [starting-point...]',
 'xargs': 'xargs [options] [command [initial-arguments]]',
 'grep': 'grep [OPTION...] PATTERNS [FILE...]',
 'rm': 'rm [OPTION]... [FILE]...',
 'echo': 'echo [SHORT-OPTION]... [STRING]...',
 'ls': 'ls [OPTION]... [FILE]...',
 'sort': 'sort [OPTION]... [FILE]...',
 'chmod': 'chmod [OPTION]... MODE[,MODE]... FILE...',
 'wc': 'wc [OPTION]... [FILE]...',
 'cat': 'cat [OPTION]... [FILE]...',
 'cut': 'cut OPTION... [FILE]...',
 'head': 'head [OPTION]... [FILE]...',
 'mv': 'mv [OPTION]... [-T] SOURCE DEST',
 'chown': 'chown [OPTION]... [OWNER][:[GROUP]] FILE...',
 'cp': 'cp [OPTION]... [-T] SOURCE DEST',
 'mkdir': 'mkdir [OPTION]... DIRECTORY...',
 'tr': 'tr [OPTION]... SET1 [SET2]',
 'tail': 'tail [OPTION]... [FILE]...',
 'dirname': 'dirname [OPTION] NAME...',
 'tar': 'Traditional usage',
 'uniq': 'uniq [OPTION]... [INPUT [OUTPUT]]',
 'ln': 'ln [OPTION]... [-T] TARGET LINK_NAME',
 'split': 'split [OPTION]... [FILE [P

In [127]:
l = descs['find']

In [128]:
l.strip()

'find [-H] [-L] [-P] [-D debugopts] [-Olevel] [starting-point...]'

In [129]:
def valid_flag(flag):
    return flag in argument_types or not flag

def generate_options(utility):
    flag_map = data[utility]
    
    ret = []
    keys = list(flag_map.keys())
    for i in range(len(keys)):
        for j in range(i + 1, len(keys)):
            for k in range(j + 1, len(keys)):
                if all(valid_flag(flag_map[x]) for x in [keys[i], keys[j], keys[k]]):
                    f1 = " ".join([keys[i], flag_map[keys[i]]]) if flag_map[keys[i]] else keys[i]
                    f2 = " ".join([keys[j], flag_map[keys[j]]]) if flag_map[keys[j]] else keys[j]
                    f3 = " ".join([keys[k], flag_map[keys[k]]]) if flag_map[keys[k]] else keys[k]
                    ret.append(f1)
                    ret.append(" ".join([f1, f2]))
                    ret.append(" ".join([f1, f2, f3]))
    return list(set(ret))

def generate_syntax(utility):
    syntax = descs[utility]
    
    if 'option' not in syntax.lower():
        return f"Invalid syntax for utility {utility}, {syntax}"
        
    s = syntax.replace('...', '')
    s = remove_punctuation(s)
    s = remove_brackets(s)
    s = s.lower()
    
    sp = s.split(" ")
    cleaned = []
    for val in sp:
        if val == utility:
            cleaned.append(val)
        elif "option" in val:
            cleaned.append("option")
        elif val and val[0] != "-":
            s = ""
            for data_type in type_mapping:
                for match in type_mapping[data_type]:
                    if match in val:
                        s = data_type
            cleaned.append(s)
    
    return " ".join(cleaned)

def generate_commands(utility):
    ops = generate_options(utility)
    ret = []
    syntax = generate_syntax(utility)
    if "Invalid" in syntax:
        return ret
    for option_combo in ops:
        ret.append(syntax.replace("option", option_combo))
    return ret

total = []
for ut in utilities:
    print(f"{ut}: {len(generate_commands(ut))}")
    for r in generate_commands(ut):
        total.append(r)
len(total)

find: 0
xargs: 1539
grep: 19599
rm: 219
echo: 19
ls: 32508
sort: 3275
chmod: 119
wc: 55
cat: 285
cut: 119
head: 19
mv: 454
chown: 559
cp: 4494
mkdir: 19
tr: 34
tail: 363
dirname: 9
tar: 0
uniq: 219
ln: 815
split: 815
tee: 19
date: 219
pwd: 9
ssh: 0
diff: 13243
du: 2599
file: 0
rename: 119
md5sum: 285
comm: 164
mktemp: 83
df: 559
rev: 0
rmdir: 19
od: 968
hostname: 0


83823

In [132]:
generate_commands('grep')[:100]

['grep -e Regex -b -A Number Number File',
 'grep -v --exclude Regex -L Number File',
 'grep --count -v -a Number File',
 'grep -r -C Number -b Number File',
 'grep -q --no-messages -a Number File',
 'grep -d -G -a Number File',
 'grep -z -n -l Number File',
 'grep --exclude-dir Regex --exclude Regex -Z Number File',
 'grep --no-group-separator --exclude-dir Regex -P Number File',
 'grep -r -n -h Number File',
 'grep --color Number --line-buffered -o Number File',
 'grep -i -c --only-matching Number File',
 'grep -r -T --line-buffered Number File',
 'grep -T -F Number File',
 'grep --count -c -m Number Number File',
 'grep --no-group-separator --no-ignore-case -i Number File',
 'grep --group-separator Regex --line-buffered -s Number File',
 'grep -e Regex -n -P Number File',
 'grep -C Number -V -R Number File',
 'grep --exclude-from File -v -b Number File',
 'grep -r -G -o Number File',
 'grep --help --only-matching -m Number Number File',
 'grep -e Regex -i -D Number Number File',
 'g

In [131]:
data['find']

{'-ls': None,
 '-delete': None,
 '-cnewer': None,
 '-ignore_readdir_race': None,
 '-Olevel': None,
 '-cmin': None,
 '-perm': None,
 '-fstype': None,
 '-not': None,
 '-printf': None,
 '-writable': None,
 '-follow': None,
 '-H': None,
 '-O': None,
 '-print': None,
 '-prune': None,
 '-fls': None,
 '-L': None,
 '-lname': None,
 '-D': None,
 '-links': None,
 '-atime': None,
 '-quit': None,
 '-wholename': None,
 '-name': None,
 '-uid': None,
 '-iwholename': None,
 '-group': None,
 '-iname': None,
 '-execdir': None,
 '-path': None,
 '-newer': None,
 '-inum': None,
 '-fprintf': None,
 '-mtime': None,
 '-noignore_readdir_race': None,
 '-warn': None,
 '-ilname': None,
 '-files0-from': None,
 '-true': None,
 '-ok': None,
 '-xtype': None,
 '-help': None,
 '-iregex': None,
 '-newerXY': None,
 '-regextype': None,
 '-depth': None,
 '-samefile': None,
 '-or': None,
 '-regex': None,
 '-amin': None,
 '-executable': None,
 '-ipath': None,
 '-type': None,
 '-readable': None,
 '-mmin': None,
 '-user': None