# Exploratory Data Analysis: Web Scraping Utilities with Beautiful Soup

In this notebook, I hope to create a parser for web scraping the linux man pages to gather information on some of the most common utilities and their corresponding flags. 

In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
# 23 of the most common utilities
utilities = [
    "find",
    "xargs",
    "grep",
    "rm",
    "echo",
    "ls",
    "sort",
    "chmod",
    "wc",
    "cat",
    "cut",
    "head",
    "mv",
    "chown",
    "cp",
    "mkdir",
    "tr",
    "tail",
    "dirname",
    "tar",
    "uniq",
    "ln",
    "split",
]

## Finding Flags for 1 Utility
First, we want to create a parser to find the available flags for just one of the utilities. We will start with the 'find' utility and use the BeautifulSoup package to parse for the corresponding flags.

In [21]:
utility = "find"
utility_url = f'https://man7.org/linux/man-pages/man1/{utility}.1.html'
r = requests.get(utility_url)
soup = BeautifulSoup(r.text)
print(soup.prettify()[:1000])

<!DOCTYPE html>
<html lang="en-US">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <title>
   find(1) - Linux manual page
  </title>
  <link href="../../../style.css" rel="stylesheet" title="style" type="text/css"/>
  <link href="../style.css" rel="stylesheet" title="style" type="text/css"/>
 </head>
 <body>
  <div class="page-top">
   <a id="top_of_page">
   </a>
  </div>
  <!--%%%TOP_BAR%%%-->
  <div class="nav-bar">
   <table class="nav-table">
    <tr>
     <td class="nav-cell">
      <p class="nav-text">
       <a href="../../../index.html">
        man7.org
       </a>
       &gt; Linux &gt;
       <a href="../index.html">
        man-pages
       </a>
      </p>
     </td>
     <td class="training-cell">
      <p class="training-text">
       <a class="training-link" href="http://man7.org/training/">
        Linux/UNIX system programming training
       </a>
      </p>
     </td>
    </tr>
   </table>
  </div>
  <hr cl

The syntax example could definitely be extremely useful in the future when creating the generator and determining the order in which to assemble the options and arguments.

In [4]:
desc = soup.find_all('pre')[2].text
print(desc)


       find [-H] [-L] [-P] [-D debugopts] [-Olevel] [starting-point...]
       [expression]



In [30]:
options = soup.find_all('pre')[4].text
print(options[2000:4000])

files, and the file is a symbolic link, the information
              used shall be taken from the properties of the symbolic
              link itself.

       -L     Follow symbolic links.  When find examines or prints
              information about files, the information used shall be
              taken from the properties of the file to which the link
              points, not from the link itself (unless it is a broken
              symbolic link or find is unable to examine the file to
              which the link points).  Use of this option implies
              -noleaf.  If you later use the -P option, -noleaf will
              still be in effect.  If -L is in effect and find discovers
              a symbolic link to a subdirectory during its search, the
              subdirectory pointed to by the symbolic link will be
              searched.

              When the -L option is in effect, the -type predicate will
              always match against the type of the file th

In [6]:
# extracting and cleaning the corresponding flags
punctuation = set(_ for _ in ",.()")
stripped_options = [line.strip() for line in options.split('\n')]
flag_lines = list(filter(lambda x: x and x[0] == "-", stripped_options))
flags = [line.split(" ")[0] for line in flag_lines]
clean_flags = [f if f[-1] not in punctuation else f[:-1] for f in flags]
clean_flags = [f if "[" not in f else f.split("[")[0] for f in clean_flags]
set(clean_flags)

{'-D',
 '-H',
 '-L',
 '-Olevel',
 '-P',
 '-cnewer',
 '-execdir',
 '-files0-from',
 '-noleaf',
 '-regex'}

## Scaling for all of our utilities

Now that we've been able to get a list of flags for one of our utilities, we can scale the solution to collect all of the available flags for each utility in our list of 23 utilities. To do this, we will apply the same parsing above to each utility webpage. 

In [7]:
data = {}

In [16]:
for utility in utilities:
    utility_url = f'https://man7.org/linux/man-pages/man1/{utility}.1.html'
    r = requests.get(utility_url)
    soup = BeautifulSoup(r.text)
    desc = soup.find_all('pre')[2].text
    options = soup.find_all('pre')[4].text
    stripped_options = [line.strip() for line in options.split('\n')]
    flag_lines = list(filter(lambda x: x and x[0] == "-", stripped_options))
    flags = [line.split(" ")[0] for line in flag_lines]
    clean_flags = [f if f[-1] not in punctuation else f[:-1] for f in flags]
    clean_flags = [f if "[" not in f else f.split("[")[0] for f in clean_flags]
    d = {flag:[] for flag in clean_flags}
    data[utility] = d
    print(desc)


       find [-H] [-L] [-P] [-D debugopts] [-Olevel] [starting-point...]
       [expression]


       xargs [options] [command [initial-arguments]]


       grep [OPTION...] PATTERNS [FILE...]
       grep [OPTION...] -e PATTERNS ... [FILE...]
       grep [OPTION...] -f PATTERN_FILE ... [FILE...]


       rm [OPTION]... [FILE]...


       echo [SHORT-OPTION]... [STRING]...
       echo LONG-OPTION


       ls [OPTION]... [FILE]...


       sort [OPTION]... [FILE]...
       sort [OPTION]... --files0-from=F


       chmod [OPTION]... MODE[,MODE]... FILE...
       chmod [OPTION]... OCTAL-MODE FILE...
       chmod [OPTION]... --reference=RFILE FILE...


       wc [OPTION]... [FILE]...
       wc [OPTION]... --files0-from=F


       cat [OPTION]... [FILE]...


       cut OPTION... [FILE]...


       head [OPTION]... [FILE]...


       mv [OPTION]... [-T] SOURCE DEST
       mv [OPTION]... SOURCE... DIRECTORY
       mv [OPTION]... -t DIRECTORY SOURCE...


       chown [OPTION]... [OWNER][:[GROUP

In [17]:
data

{'find': {'-files0-from': [],
  '-P': [],
  '-L': [],
  '-noleaf': [],
  '-H': [],
  '-cnewer': [],
  '-D': [],
  '-execdir': [],
  '-Olevel': [],
  '-regex': []},
 'xargs': {'-0': [],
  '-a': [],
  '--delimiter=delim': [],
  '-E': [],
  '-e': [],
  '-I': [],
  '-i': [],
  '-L': [],
  '-l': [],
  '-n': [],
  '-s': [],
  '-P': [],
  '-o': [],
  '-p': [],
  '--process-slot-var=name': [],
  '-r': [],
  '--show-limits': [],
  '-t': [],
  '-x': [],
  '--help': [],
  '--version': []},
 'grep': {'--help': [],
  '-V': [],
  '-E': [],
  '-F': [],
  '-G': [],
  '-P': [],
  '-e': [],
  '-f': [],
  '-i': [],
  '--no-ignore-case': [],
  '-v': [],
  '-w': [],
  '-x': [],
  '-y': [],
  '-c': [],
  '--color': [],
  '-L': [],
  '-l': [],
  '-m': [],
  '--count': [],
  '-o': [],
  '-q': [],
  '--no-messages': [],
  '-s': [],
  '-b': [],
  '-H': [],
  '-h': [],
  '--label=LABEL': [],
  '-n': [],
  '-T': [],
  '-Z': [],
  '-z': [],
  '-A': [],
  '--only-matching': [],
  '-B': [],
  '-C': [],
  '--group-se

In [15]:
# The number of flags in our data structure
sum(len(data[x].values()) for x in data)

124

## Manually Inserting Argument Types

Now that we have the utilities mapped to the appropriate flags, we will need to manually insert the data types corresponding to each flag.

In [None]:
argument_types = [
    'Regex',
    'File',
    'Directory',
    'Path'
    'Number',
    '+Number',
    '-Number',
    'Quantity',
    '+Quantity',
    '-Quantity',
    'Size',
    '+Size',
    '-Size',
    'Timespan',
    '+Timespan',
    '-Timespan',
    'DateTime',
    '+DateTime',
    '-DateTime',
    'Permission',
    '+Permission',
    '-Permission'
]

In [39]:
# find
data['find']['-P'] = [None]
data['find']['-L']= [None]
data['find']['-noleaf'] = [None]
data['find']['-H'] = [None]
data['find']['-cnewer'] = ['File']
if '-D' in data['find']:
    del data['find']['-D']
if '-D' in data['find']:
    del data['find']['-execdir']
data['find']['-Olevel'] = ['+Number'] 
data['find']['-regex'] = ['regex'] 

# xargs
data['xargs']['-0'] = [None]
data['xargs']['-a'] = ['File']
if '--delimineter=delim' in data['xargs']:
    del data['xargs']['--delimineter=delim']
data['xargs']['-E'] = ['Regex']
if '-e' in data['xargs']:
    del data['xargs']['-e']
data['xargs']['-I'] = ['Regex']
if '-i' in data['xargs']:
    del data['xargs']['-i']
data['xargs']['-L'] = ['+Number']
data['xargs']['-l']= ['+Number']
data['xargs']['-n'] = ['+Number']
data['xargs']['-P'] = ['Number']
data['xargs']['-o'] = [None]
data['xargs']['-p'] = [None]
if '--process-slot-var=name' in data['xargs']:
    del data['xargs']['--process-slot-var=name']

In [40]:
data

{'find': {'-files0-from': [],
  '-P': [None],
  '-L': [None],
  '-noleaf': [None],
  '-H': [None],
  '-cnewer': ['File'],
  '-Olevel': ['+Number'],
  '-regex': ['regex']},
 'xargs': {'-0': [None],
  '-a': ['File'],
  '--delimiter=delim': [],
  '-E': ['Regex'],
  '-I': ['Regex'],
  '-L': ['+Number'],
  '-l': ['+Number'],
  '-n': ['+Number'],
  '-s': [],
  '-P': ['Number'],
  '-o': [None],
  '-p': [None],
  '-r': [],
  '--show-limits': [],
  '-t': [],
  '-x': [],
  '--help': [],
  '--version': []},
 'grep': {'--help': [],
  '-V': [],
  '-E': [],
  '-F': [],
  '-G': [],
  '-P': [],
  '-e': [],
  '-f': [],
  '-i': [],
  '--no-ignore-case': [],
  '-v': [],
  '-w': [],
  '-x': [],
  '-y': [],
  '-c': [],
  '--color': [],
  '-L': [],
  '-l': [],
  '-m': [],
  '--count': [],
  '-o': [],
  '-q': [],
  '--no-messages': [],
  '-s': [],
  '-b': [],
  '-H': [],
  '-h': [],
  '--label=LABEL': [],
  '-n': [],
  '-T': [],
  '-Z': [],
  '-z': [],
  '-A': [],
  '--only-matching': [],
  '-B': [],
  '-C':