# Oil & Gas Industry Case Study
Build a business knowledge database using web resources. Keep it short and simple.

## Web Search

In [1]:
# Import dependencies
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from datetime import date

# Define auxiliary functions
def get_url(link):
  """Extract the URL from the href of an <a> element in a Google search.

  An href of the form '/url?q=<target>&...' has its '/url?q=' prefix dropped.
  In every case, everything from the first '&' onwards is discarded.

  Args:
    link: The raw href string.

  Returns:
    The text of `link` (after an optional '/url?q=' prefix) up to its first
    '&', or all of that text when it contains no '&'.
  """
  prefix = '/url?q='
  if link.startswith(prefix):
    link = link[len(prefix):]
  # partition() hands back the whole string when there is no '&'; the earlier
  # regex lookup raised AttributeError on such links.
  return link.partition('&')[0]

from collections import Counter
def count_elements(soup):
    """Tally how often each tag name occurs in the soup document."""
    tally = Counter()
    for element in soup.find_all(True):
        tally[element.name] += 1
    return tally

In [2]:
# Define core code. Beware: the search returns results only when run in Colab!
def search_web(search, count=40, debug=False, timeout=10):
  """Scrape Google result pages for `search` and collect the result links.

  Pages are requested in steps of ten (`start` = 0, 10, 20, ...) while the
  offset is below `count`; paging stops early when a page has no <h3> element.

  Args:
    search: Query string inserted verbatim into the request URL, so it must
      already be URL-ready (e.g. 'oil+gas+industry').
    count: Exclusive upper bound of the `start` offset.
    debug: When true, print the request URL, the status code and each heading.
    timeout: Seconds to wait for each HTTP response.

  Returns:
    A list of [inn, title, url] rows, where `inn` is the host part of `url`
    and `title` is the text of the <h3> heading.
  """

  data = []

  for start in range(0, count, 10):
    gsearch = 'https://www.google.com/search?q=' + search + '&start=' + str(start)
    if debug:
      print(f"gsearch = {gsearch}")
    # Without a timeout a stalled connection would block the notebook forever.
    gpage = requests.get(gsearch, timeout=timeout)
    if debug:
      print(f"gpage.status_code = {gpage.status_code}")
    gsoup = BeautifulSoup(gpage.content, 'html.parser')
    h3_list = gsoup.find_all('h3')
    if debug:
      print(f"len(h3_list) = {len(h3_list)}")

    # Continue only if there are some results
    if not h3_list:
      break

    for h3 in h3_list:
      title = h3.text
      if debug:
        print(f"\th3.text = {h3.text} | h3.parent.attrs.keys() = {h3.parent.attrs.keys()}")

      # Skip the rest of loop if no url in h3.parent
      if 'href' not in h3.parent.attrs:
        continue

      url = get_url(h3.parent['href'])
      # Take the host up to the next '/', '?' or '#', or to the end of the
      # string, so a URL without a path no longer crashes the lookup.
      host = re.search(r'//([^/?#]*)', url)
      if host is None:
        # A link without '//' has no host to record; skip it.
        continue

      data.append([host.group(1), title, url])

    print(gpage.status_code, gsearch, len(h3_list))

  return data

In [6]:
# Search the web resources
# The query is inserted verbatim into the request URL, so words are joined with '+'.
search = 'oil+gas+industry'
data = search_web(search)
# Display the last collected [inn, title, url] row as a quick check
data[-1]


200 https://www.google.com/search?q=oil+gas+industry&start=0 11
200 https://www.google.com/search?q=oil+gas+industry&start=10 10
200 https://www.google.com/search?q=oil+gas+industry&start=20 10
200 https://www.google.com/search?q=oil+gas+industry&start=30 10


['www.eia.gov',
 'Where our natural gas comes from - US Energy Information ...',
 'https://www.eia.gov/energyexplained/natural-gas/where-our-natural-gas-comes-from.php']

In [18]:
# Derive the base file name from the query by replacing '+' with '_'
search_name = search.replace('+','_')
# Save results as a CSV file named <search_name>.csv, without the index column
pd.DataFrame(data, columns=['inn','title','url']).to_csv(search_name+'.csv', index=False)

In [20]:
from pathlib import Path

# Save data as a markdown document
def save_md(data, search, search_name):
  """Save data as a markdown document.

  Writes `<search_name>.md`: a heading with the searched terms followed by
  one bullet per row, each linking the title to its URL.

  Args:
    data: Iterable of rows read as row[0] (shown in bold code style),
      row[1] (link text) and row[2] (link target).
    search: Search terms shown in the heading.
    search_name: Output path without the '.md' suffix.

  Returns:
    None.
  """
  md_lines = [f"# Searched terms: `{search}`"]
  md_lines.extend(f"- **`{row[0]}`** : [{row[1]}]({row[2]})" for row in data)
  # Write UTF-8 explicitly so that non-ASCII titles do not depend on the
  # platform's default encoding.
  Path(search_name + '.md').write_text('\n'.join(md_lines), encoding='utf-8')

# Write the collected results to <search_name>.md
save_md(data, search, search_name)

## Keep a Journal
**2022-01-01 Sat**: (1) **Start Prototyping**. An initial version of this notebook has been derived from [20211202_Search_Web_Resources.ipynb](https://github.com/lustraka/data-analyst-portfolio-project-2022/blob/main/cs01_cds_methods/20211202_Search_Web_Resources.ipynb). The code is simplified to just scraping the web and saving the results as a CSV file (`['inn','title','url']`) and also as an MD file. I realized that this code returns the expected results only when run in the Colab environment. Stored files should be downloaded to the local clone of the repository or uploaded to GitHub. (2) **Inspiration**. The building blocks should have the right granularity. Web search should be as simple as possible. Working with the data, such as categorization, can be done in subsequent steps of the pipeline. [3 fsp]