# Exploratory Data Analysis: Web Scraping Utilities with Beautiful Soup

In this notebook, I hope to create a parser for web scraping the linux man pages to gather information on some of the most common utilities and their corresponding flags. 

In [2]:
import requests
from bs4 import BeautifulSoup

In [3]:
# 23 of the most common utilities
utilities = [
    "find",
    "xargs",
    "grep",
    "rm",
    "echo",
    "ls",
    "sort",
    "chmod",
    "wc",
    "cat",
    "cut",
    "head",
    "mv",
    "chown",
    "cp",
    "mkdir",
    "tr",
    "tail",
    "dirname",
    "tar",
    "uniq",
    "ln",
    "split",
]

## Finding Flags for 1 Utility
First, we want to create a parser to find the available flags for just one of the utilities. We will start with the 'find' utility and use the BeautifulSoup package to parse for the corresponding flags.

In [61]:
utility = "uniq"
utility_url = f'https://man7.org/linux/man-pages/man1/{utility}.1.html'
r = requests.get(utility_url)
soup = BeautifulSoup(r.text)
print(soup.prettify()[:])

<!DOCTYPE html>
<html lang="en-US">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <title>
   uniq(1) - Linux manual page
  </title>
  <link href="../../../style.css" rel="stylesheet" title="style" type="text/css"/>
  <link href="../style.css" rel="stylesheet" title="style" type="text/css"/>
 </head>
 <body>
  <div class="page-top">
   <a id="top_of_page">
   </a>
  </div>
  <!--%%%TOP_BAR%%%-->
  <div class="nav-bar">
   <table class="nav-table">
    <tr>
     <td class="nav-cell">
      <p class="nav-text">
       <a href="../../../index.html">
        man7.org
       </a>
       &gt; Linux &gt;
       <a href="../index.html">
        man-pages
       </a>
      </p>
     </td>
     <td class="training-cell">
      <p class="training-text">
       <a class="training-link" href="http://man7.org/training/">
        Linux/UNIX system programming training
       </a>
      </p>
     </td>
    </tr>
   </table>
  </div>
  <hr cl

The syntax example could definitely be extremely useful in the future when creating the generator and determining the order in which to assemble the options and arguments.

In [56]:
desc = soup.find_all('pre')[2].text
print(desc)


       uniq [OPTION]... [INPUT [OUTPUT]]



In [69]:
options = "\n".join([soup.find_all('pre')[3].text, soup.find_all('pre')[4].text])
print(options[:500])


       Filter adjacent matching lines from INPUT (or standard input),
       writing to OUTPUT (or standard output).

       With no options, matching lines are merged to the first
       occurrence.

       Mandatory arguments to long options are mandatory for short
       options too.

       -c, --count
              prefix lines by the number of occurrences

       -d, --repeated
              only print duplicate lines, one for each group

       -D     print all duplicate lines

       --


In [9]:
# extracting and cleaning the corresponding flags
punctuation = set(_ for _ in ",.()")
stripped_options = [line.strip() for line in options.split('\n')]
flag_lines = list(filter(lambda x: x and x[0] == "-", stripped_options))
flags = [line.split(" ")[0] for line in flag_lines]
clean_flags = [f if f[-1] not in punctuation else f[:-1] for f in flags]
clean_flags = [f if "[" not in f else f.split("[")[0] for f in clean_flags]
set(clean_flags)

{'-D',
 '-H',
 '-L',
 '-Olevel',
 '-P',
 '-cnewer',
 '-execdir',
 '-files0-from',
 '-noleaf',
 '-regex'}

## Scaling for all of our utilities

Now that we've been able to get a list of flags for one of our utilities, we can scale the solution to collect all of the available flags for each utility in our list of 23 utilities. To do this, we will apply the same parsing above to each utility webpage. 

In [44]:
data = {}

In [70]:
for utility in utilities:
    utility_url = f'https://man7.org/linux/man-pages/man1/{utility}.1.html'
    r = requests.get(utility_url)
    soup = BeautifulSoup(r.text)
    desc = soup.find_all('pre')[2].text
    options = "\n".join([soup.find_all('pre')[3].text, soup.find_all('pre')[4].text])
    stripped_options = [line.strip() for line in options.split('\n')]
    flag_lines = list(filter(lambda x: x and x[0] == "-", stripped_options))
    clean_flags = flag_lines
    d = set(flag for flag in clean_flags)
    data[utility] = {}
    data[utility]['lines'] = d
    print(desc)


       find [-H] [-L] [-P] [-D debugopts] [-Olevel] [starting-point...]
       [expression]


       xargs [options] [command [initial-arguments]]


       grep [OPTION...] PATTERNS [FILE...]
       grep [OPTION...] -e PATTERNS ... [FILE...]
       grep [OPTION...] -f PATTERN_FILE ... [FILE...]


       rm [OPTION]... [FILE]...


       echo [SHORT-OPTION]... [STRING]...
       echo LONG-OPTION


       ls [OPTION]... [FILE]...


       sort [OPTION]... [FILE]...
       sort [OPTION]... --files0-from=F


       chmod [OPTION]... MODE[,MODE]... FILE...
       chmod [OPTION]... OCTAL-MODE FILE...
       chmod [OPTION]... --reference=RFILE FILE...


       wc [OPTION]... [FILE]...
       wc [OPTION]... --files0-from=F


       cat [OPTION]... [FILE]...


       cut OPTION... [FILE]...


       head [OPTION]... [FILE]...


       mv [OPTION]... [-T] SOURCE DEST
       mv [OPTION]... SOURCE... DIRECTORY
       mv [OPTION]... -t DIRECTORY SOURCE...


       chown [OPTION]... [OWNER][:[GROUP

In [71]:
data

{'find': {'lines': {'-D debugopts',
   '-D help.  Valid debug options include',
   '-H     Do not follow symbolic links, except while processing the',
   '-L     Follow symbolic links.  When find examines or prints',
   '-Olevel',
   '-P     Never follow symbolic links.  This is the default',
   '-cnewer.',
   '-execdir, -ok and -okdir',
   '-files0-from to pass arbitrary starting points to find.',
   '-noleaf.  If you later use the -P option, -noleaf will',
   '-regex) are performed first.'}},
 'xargs': {'lines': {'--delimiter=delim, -d delim',
   '--help Print a summary of the options to xargs and exit.',
   '--process-slot-var=name',
   '--show-limits',
   '--version',
   '-0, --null',
   '-E eof-str',
   '-I replace-str',
   '-L max-lines',
   '-P max-procs, --max-procs=max-procs',
   '-P option.',
   '-a file, --arg-file=file',
   '-e[eof-str], --eof[=eof-str]',
   '-i[replace-str], --replace[=replace-str]',
   '-l[max-lines], --max-lines[=max-lines]',
   '-n max-args, --max-args=

In [73]:
# The number of flags in our data structure
sum(len(data[x]['lines']) for x in data)

399

In [74]:
def get_inner_brackets(s):
    open_idx = s.index("[") + 1
    closed_idx = s.index("]")
    a = s[open_idx: closed_idx]
    a = a.replace("=", "")
    return a

def get_equal_arg(s):
    return remove_punctuation(s.split("=")[1])
    
def remove_punctuation(s):
    punctuation = set(_ for _ in ",.()")
    return "".join([x if x not in punctuation else "" for x in s])
    
def remove_brackets(s):
    brackets = {"[", "]"}
    return "".join([x if x not in brackets else "" for x in s])

def get_flag(line):
    punctuation = set(p for p in "[].,()=[]")
    for val in punctuation:
        line = line.replace(val, " ")
    flag = line.split(" ")[0]
    return flag

## Cleaning Flags

In [75]:
for ut in data:
    for flag_line in data[ut]['lines']:
        flag_line = remove_punctuation(flag_line)
        flag, arg = get_flag(flag_line), None
        if "[" in flag_line and "]" in flag_line:
            arg = get_inner_brackets(flag_line)
        elif "=" in flag_line:
            arg = get_equal_arg(flag_line)
        data[ut][flag] = arg

In [76]:
for ut in data:
    if 'lines' in data[ut]:
        del data[ut]['lines']
        
data

{'find': {'-D': None,
  '-execdir': None,
  '-Olevel': None,
  '-files0-from': None,
  '-regex': None,
  '-cnewer': None,
  '-H': None,
  '-noleaf': None,
  '-P': None,
  '-L': None},
 'xargs': {'-s': 'max-chars',
  '--delimiter': 'delim -d delim',
  '-p': None,
  '-0': None,
  '--process-slot-var': 'name',
  '-n': 'max-args',
  '-t': None,
  '-L': None,
  '-a': 'file',
  '-P': 'max-procs',
  '-l': 'max-lines',
  '-r': None,
  '-I': None,
  '-i': 'replace-str',
  '--version': None,
  '-o': None,
  '--help': None,
  '-E': None,
  '--show-limits': None,
  '-e': 'eof-str',
  '-x': None},
 'grep': {'-n': None,
  '--binary-files': 'TYPE',
  '-b': None,
  '--exclude': 'GLOB',
  '-z': None,
  '-d': None,
  '-R': None,
  '-l': None,
  '--line-buffered': None,
  '--no-ignore-case': None,
  '--count': None,
  '-I': None,
  '-o': None,
  '-x': None,
  '--label': 'LABEL',
  '--include': 'GLOB',
  '-C': 'NUM',
  '-H': None,
  '--no-messages': None,
  '-c': None,
  '-D': 'ACTION',
  '-T': None,
  '-

In [81]:
# the number of flags that have arguments
count = 0
for ut in data:
    for flag in data[ut]:
        if data[ut][flag]: count +=1
count

103

## Manually Inserting Argument Types

Now that we have the utilities mapped to the appropriate flags, we will need to manually insert the data types corresponding to each flag.

In [77]:
argument_types = [
    'Regex',
    'File',
    'Directory',
    'Path'
    'Number',
    '+Number',
    '-Number',
    'Quantity',
    '+Quantity',
    '-Quantity',
    'Size',
    '+Size',
    '-Size',
    'Timespan',
    '+Timespan',
    '-Timespan',
    'DateTime',
    '+DateTime',
    '-DateTime',
    'Permission',
    '+Permission',
    '-Permission'
]

Getting a sense of the argument types the parser has discovered

In [78]:
for ut in data:
    for flag in data[ut]:
        if data[ut][flag]:
            print(data[ut][flag])

max-chars
delim -d delim
name
max-args
file
max-procs
max-lines
replace-str
eof-str
TYPE
GLOB
LABEL
GLOB
NUM
ACTION
FILE
NUM
FILE
NUM
WHEN
GLOB
PATTERNS
NUM
SEP
all
WHEN
none -U disables grouping
PATTERN
COLS
PATTERN
WORD
WHEN
SIZE
WORD
WORD
WHEN
WORD
TIME_STYLE
slash
COLS
WORD
N
DIR
SEP
PROG
KEYDEF
SIZE
NMERGE
F
FILE
diagnose-first
FILE
quiet --check
F
LIST
LIST
LIST
DELIM
STRING
-
-
SUFFIX
DIRECTORY
CONTROL
CURRENT_OWNER:CURRENT_GROUP
RFILE
links
CTX
CONTROL
WHEN
WHEN
ATTR_LIST
SUFFIX
modeownershiptimestamps
ATTR_LIST
DIRECTORY
CTX
MODE
N
PID
name --retry
{name|descriptor}
name in that case  That causes tail to track the named
N
+
N
N
METHOD
N
METHOD
SUFFIX
DIRECTORY
CONTROL
SUFFIX
SIZE
COMMAND
CHUNKS
FROM
N
FROM
SIZE
NUMBER
SEP


In [None]:
type_mappings = {
    'Regex': {'str', 'sep', 'pattern', 'word'},
    'File': {'file'},
    'Directory': {'dir'},
    'Path': {'path'},
    'Number': {'num', 'n'},
    '+Number': {'max'},
}

In [27]:
# # find
# data['find']['-P'] = [None]
# data['find']['-L']= [None]
# data['find']['-noleaf'] = [None]
# data['find']['-H'] = [None]
# data['find']['-cnewer'] = ['File']
# if '-D' in data['find']:
#     del data['find']['-D']
# if '-D' in data['find']:
#     del data['find']['-execdir']
# data['find']['-Olevel'] = ['+Number'] 
# data['find']['-regex'] = ['regex'] 

# # xargs
# data['xargs']['-0'] = [None]
# data['xargs']['-a'] = ['File']
# if '--delimineter=delim' in data['xargs']:
#     del data['xargs']['--delimineter=delim']
# data['xargs']['-E'] = ['Regex']
# if '-e' in data['xargs']:
#     del data['xargs']['-e']
# data['xargs']['-I'] = ['Regex']
# if '-i' in data['xargs']:
#     del data['xargs']['-i']
# data['xargs']['-L'] = ['+Number']
# data['xargs']['-l']= ['+Number']
# data['xargs']['-n'] = ['+Number']
# data['xargs']['-s'] = ['+Number']
# data['xargs']['-P'] = ['Number']
# data['xargs']['-o'] = [None]
# data['xargs']['-p'] = [None]
# if '--process-slot-var=name' in data['xargs']:
#     del data['xargs']['--process-slot-var=name']
# data['xargs']['-r'] = [None]
# data['xargs']['--show-limits'] = [None]
# data['xargs']['-t'] = [None]
# data['xargs']['-x'] = [None]
# data['xargs']['--help'] = [None]
# data['xargs']['--version'] = [None]

# #grep
# data['grep']['--help'] = [None]
# data['grep']['-V'] = [None]
# data['grep']['-E'] = [None]
# data['grep']['-F'] = [None]
# data['grep']['-G'] = [None]
# data['grep']['-P'] = [None]
# data['grep']['-e'] = ['Regex']
# data['grep']['-f'] = ['File']
# data['grep']['-i'] = [None]
# data['grep']['--no-ignore-case'] = [None]
# data['grep']['-v'] = [None]
# data['grep']['-w'] = [None]
# data['grep']['-x'] = [None]
# data['grep']['-y'] = [None]
# data['grep']['-c'] = [None]
# if '--color' in data['grep']:
#     del data['grep']['--color']
# data['grep']['-L'] = [None]
# data['grep']['-l'] = [None]
# data['grep']['-m'] = ['+Number']
# data['grep']['--count'] = [None]
# data['grep']['-o'] = [None]
# data['grep']['-q'] = [None]
# data['grep']['--no-messages'] = [None]
# data['grep']['-s'] = [None]
# data['grep']['-b'] = [None]
# data['grep']['-H'] = [None]
# data['grep']['-h'] = [None]
# data['grep']['--label'] = ['Regex']
# if '--label=LABEL' in data['grep']:
#     del data['grep']['--label=LABEL']
# data['grep']['-n'] = [None]
# data['grep']['-T'] = [None]
# data['grep']['-Z'] = [None]
# if '-z' in data['grep']:
#     del data['grep']['-z']
# data['grep']['-A'] = ['+Number']
# data['grep']['--only-matching'] = [None]
# data['grep']['-B'] = ['+Number']
# data['grep']['-C'] = ['+Number']
# data['grep']['--group-separator'] = ['Regex']
# if '--group-separator=SEP' in data['grep']['']
#     del data['grep']['--group-separator=SEP']
# data['grep']['-a']