# Printer Supplies Scraper

Script that loads a CSV into pandas, parses the IPs in Python, requests each printer's web page, updates the values, and saves them to a new CSV.

-- Jupyter Notebook environment running Python 3

## NOTES
https://stackoverflow.com/questions/71025130/how-to-extract-a-table-from-a-website-using-beautifulsoup

Wants: multithreading or multiprocessing to speed up request resolution -> then sort by name.

I'm stripping the index and then adding it back by index. If I just make a JSON file to parse, then I don't need to use the CSV.
- I can break the functions apart and only call what I need
- in flask app I could have the app call the function based on the json and get a return value


```python
{
    "<Name>": {
        "Name": VALUE,
        "IP": VALUE,
        # rest of the supply list
        "Black Toner": VALUE,
        "Cyan Toner": VALUE,
        "Magenta Toner": VALUE,
        "Yellow Toner": VALUE,
        "Drum Cartridge (R1)": VALUE,
        "Drum Cartridge (R2)": VALUE,
        "Drum Cartridge (R3)": VALUE,
        "Drum Cartridge (R4)": VALUE,
        "Waste Toner Container": VALUE,
        "Transfer Belt Cleaner": VALUE,
        "Second Bias Transfer Roll": VALUE,
    }
}
```


JSON is just unstructured data. Do I care whether the dicts match each other? Or should I pull in the CSV, loop through (name, model & ip) -> into function <- {info}, then append to the JSON dict with printer['printer_name'] = {info}?

Now you have a dictionary of dicts with different sets of values.

### PSEUDO CODE:
- Load in csv to Pandas
- Add all additional columns
- Set Index to Name
- Use pandas to_dict function to make a dictionary of dictionaries with the key as printer name
- Loop through dict calling right func for model -> set values through the return
- Save dict to json

### POST scraper - Flask app
- Cronjob to run .py script 
- Launch flask app
- Load in json and present through index.html template
- reload script every so often - have manual refresh


multithreading

# Sketch: fan the per-printer requests out over a thread pool.
# `urls` and `func` are placeholders for the list of printer URLs and the
# scraping function to run against each one.
from concurrent.futures import ThreadPoolExecutor

# The context manager shuts the pool down once every submitted job is done.
with ThreadPoolExecutor(12) as executor:
    # Submit one job per URL, passing the loop variable itself to the worker.
    futures = [executor.submit(func, url) for url in urls]

    # result() blocks until that job finishes, so output keeps submission order.
    for future in futures:
        print(future.result())


## Imports and web driver setup

In [None]:


import asyncio
import datetime
import threading
import time
from concurrent.futures import ThreadPoolExecutor

import pandas
import requests
import urllib3
from bs4 import BeautifulSoup as bs
from pyppeteer import launch

# Just to load the webpage for one type of printer
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

# Build one headless Chrome driver at module level; the VersaLink code in this
# notebook loads pages through this shared `driver`.
service = Service()
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # it's more scalable to work in headless mode
driver = webdriver.Chrome(service=service, options=options)
# Implicit wait of 30 (seconds, per Selenium's API) applied to the driver.
driver.implicitly_wait(30)

# Silence urllib3 warnings; the AltaLink requests in this notebook are made
# with verify=False.
urllib3.disable_warnings() 

# Pull in the initial file of printers to scrape. Later cells read the
# 'Network Name', 'ip' and 'Model' columns from it.
df = pandas.read_csv("test.csv")

## Functions

In [None]:
async def get_url(
    url="http://10.10.140.53/home/index.html#hashSupplies/hashHome",
    screenshot_path="example.png",
):
    """Open a page in a headless browser and save a screenshot of it.

    Args:
        url: Page to load. Defaults to the supplies page that used to be
            hard-coded here.
        screenshot_path: File the screenshot is written to.
    """
    browser = await launch(headless=True)
    try:
        page = await browser.newPage()
        await page.goto(url)
        await page.screenshot({'path': screenshot_path})
    finally:
        # Always close the browser so a failed navigation or screenshot does
        # not leave a headless browser process behind.
        await browser.close()
    

# Run get_url() whether or not an event loop is already running.
# Inside a notebook the cell executes on a running loop, where
# run_until_complete() raises "This event loop is already running"; in a plain
# script there is no running loop and get_running_loop() itself raises
# RuntimeError.
try:
    _loop = asyncio.get_running_loop()
except RuntimeError:
    # No loop running (plain script): start one just for this coroutine.
    asyncio.run(get_url())
else:
    # Loop already running (notebook): schedule the coroutine on it and keep a
    # reference so the task is not garbage-collected before it finishes.
    _get_url_task = _loop.create_task(get_url())

In [None]:
def get_altalink_levels(printer, Logging: bool=None, timeout: float=10) -> dict:
    '''
    Request an AltaLink printer's consumables page and record its supply levels.

    Input:  printer: {}     -> Dictionary of base info (reads 'ip') and a 'Data' key meant to hold the results
            Logging: bool   -> turn on in function console logging
            timeout: float  -> seconds to wait for the printer before giving up

    Output: {} Dictionary    -> The same printer dictionary, with the found supply values
                               merged into printer['Data']. When the printer cannot be
                               reached, printer['Data']['response'] is set to "None".
    '''

    URL = f"https://{ printer['ip'] }/stat/consumables.php"
    results = {}

    #console log
    if Logging:
        print ("testing URL: " + URL)

    #Request page -> log and return if can't
    try:
        # verify=False skips certificate verification for this request.
        # The timeout keeps an unresponsive printer from blocking the caller
        # (and its worker thread) indefinitely.
        response = requests.get(URL, verify=False, timeout=timeout)
    except requests.RequestException:
        # Only request failures count as "unreachable"; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        if Logging:
            print(f"Can't reach { printer['ip'] }, Please check the printer")
        results["response"] = "None"
        printer['Data'].update(results)
        return printer

    soup = bs(response.content, "html.parser")

    for tab in soup.find_all('table', class_="tableDiv"):
        for tr in tab.find_all('tr'):
            row = [td.text.strip() for td in tr.find_all("td")]
            # Rows with fewer than 8 cells are skipped (presumably header or
            # spacer rows rather than supplies).
            if len(row) < 8:
                continue
            # The waste toner container's value is taken from column 2; every
            # other supply's value from column 3. Column 1 holds the name.
            value_col = 2 if row[1] == 'Waste Toner Container' else 3
            results[row[1]] = row[value_col]

    #Return the full dict
    printer['Data'].update(results)
    if Logging:
        print(printer)

    return printer

# Every call shares the one module-level Selenium `driver`. Worker threads must
# take turns with it, otherwise one thread's driver.get() replaces the page
# another thread is about to read and supplies get attributed to the wrong
# printer.
_DRIVER_LOCK = threading.Lock()


#Parse the versalink webpage for the toner level
def get_versalink_levels(printer, Logging: bool=None, wait_seconds: float=10) -> dict:
    '''
    Load a VersaLink printer's supplies page in the shared driver and record its supply levels.

    Input:  printer: {}         -> Dictionary of base info (reads 'ip') and a 'Data' key meant to hold the results
            Logging: bool       -> turn on in function console logging
            wait_seconds: float -> how long to sleep after loading the page before reading its source

    Output: {} Dictionary    -> The same printer dictionary, with the found supply values
                               merged into printer['Data']. When the page cannot be
                               loaded, printer['Data']['response'] is set to "None".
    '''

    URL = f"http://{ printer['ip'] }/home/index.html#hashSupplies/hashHome"
    results = {}

    #console log
    if Logging:
        print ("testing URL: " + URL)

    #request page -> log and return if can't
    try:
        # Hold the lock across load, wait and read so the page source that is
        # parsed below belongs to this printer.
        with _DRIVER_LOCK:
            driver.get(URL)
            time.sleep(wait_seconds)
            page_source = driver.page_source
    except Exception:
        # Any driver failure is reported as "unreachable"; KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        if Logging:
            print(f"Can't reach { printer['ip'] }, Please check the printer")
        results["response"] = "None"
        printer['Data'].update(results)
        return printer

    #Get source and parse for the specifc divs
    soup = bs(page_source, "html.parser")
    supplies = soup.select("html > body > div > div > div > main > div > article > div > section > div > div")

    for supply in supplies:
        # Split "<supply name> <value>" on the last run of whitespace.
        res = supply.get_text().rsplit(maxsplit=1)
        # Empty or single-token divs have no name/value pair; skip them
        # instead of raising IndexError.
        if len(res) != 2:
            continue
        results[res[0].lstrip()] = res[1]  #.strip("%") #can be used to strip the % and we could cast to int

    printer['Data'].update(results)
    if Logging:
        print(printer)
    return printer

## Testing single execution of functions

### Testing of Altalink function

In [None]:
# Sample AltaLink printer record in the shape get_altalink_levels() uses:
# it reads 'ip' and merges what it finds into 'Data'.
tester = {
    "Network Name": "T802-PS-C01",
    "ip": "10.32.181.20",
    "Model": "AltaLink C8135",
    "Data": {}
}
# Logging=True also prints the URL and the updated record from inside the
# function, so the record shows up twice in the output.
print(get_altalink_levels(tester, Logging=True))


### Testing of Versalink function

In [None]:

# Sample VersaLink printer record in the shape get_versalink_levels() uses:
# it reads 'ip' and merges what it finds into 'Data'.
tester = {
    "Network Name": "G311-SA-C01",
    "ip": "10.10.140.53",
    "Model": "Versalink",
    "Data": {}
}
# Logging=False: only the returned record is printed here.
print(get_versalink_levels(tester, Logging=False))

## Function Exploration


### Altalink function exploration for parsing the webpage

In [None]:
# Exploration cell: fetch one AltaLink consumables page and parse it inline.
IP = "10.32.181.20"
URL = f"https://{ IP }/stat/consumables.php"
results = {}

#console log
print ("testing URL: " + URL)

#Request page -> log and skip parsing if can't
try:
    # The timeout keeps the cell from hanging on an unresponsive printer.
    response = requests.get(URL, verify=False, timeout=10)
except requests.RequestException:
    print("Can't reach IP, Please check the printer")
else:
    # Parse only when the request succeeded; otherwise `response` would be
    # undefined (or left over from an earlier cell run).
    soup = bs(response.content, "html.parser")

    for tab in soup.find_all('table', class_="tableDiv"):
        for tr in tab.find_all('tr'):
            row = [td.text.strip() for td in tr.find_all("td")]
            # Rows with fewer than 8 cells are skipped.
            if len(row) < 8:
                continue
            # Waste toner container value comes from column 2, every other
            # supply from column 3; column 1 holds the supply name.
            value_col = 2 if row[1] == 'Waste Toner Container' else 3
            results[row[1]] = row[value_col]

#Show the full dict (empty when the printer could not be reached)
print(results)



### Versalink function exploration for parsing webpage


In [None]:
# Exploration cell: load one VersaLink supplies page in the shared driver and
# parse it inline.
IP = "10.64.30.30"
URL = f"http://{ IP }/home/index.html#hashSupplies/hashHome"
results = {}

#console log
print ("testing URL: " + URL)

#request page -> log and skip parsing if can't
try:
    driver.get(URL)
    time.sleep(5)
except Exception:
    print(f"Can't reach { IP }, Please check the printer")
else:
    # Parse only after a successful load; otherwise driver.page_source would
    # still hold whatever page was loaded previously.
    #Get source and parse for the specifc divs
    soup = bs(driver.page_source, "html.parser")
    supplies = soup.select("html > body > div > div > div > main > div > article > div > section > div > div")

    for supply in supplies:
        # Split "<supply name> <value>" on the last run of whitespace.
        res = supply.get_text().rsplit(maxsplit=1)
        # Empty or single-token divs have no name/value pair; skip them
        # instead of raising IndexError.
        if len(res) != 2:
            continue
        results[res[0].lstrip()] = res[1]

print(results)

## Main Function

### Dataframe conversion for looping

In [None]:
# Keep only the columns the scraper functions read.
scrape_columns = ['Network Name', 'ip', 'Model']
ds = df[scrape_columns]

# One dictionary per CSV row, keyed by the DataFrame index.
ds_dict = ds.to_dict(orient="index")

# Give every printer record an empty 'Data' dictionary to hold its results.
for printer_record in ds_dict.values():
    printer_record["Data"] = {}

### Old function

Loop through dictionary and call specific function -> waits on results from function

In [None]:
# Loop through the printer records one at a time, calling the scraper that
# matches each model, and collect the updated records in a list.
printer_level_list = []
for d in ds_dict:
    # str() guards against a missing Model cell, which is not a string and
    # would make the `in` test raise TypeError.
    model = str(ds_dict[d]["Model"])
    if "AltaLink" in model:
        levels = get_altalink_levels(ds_dict[d])
    elif "VersaLink" in model:
        levels = get_versalink_levels(ds_dict[d])
    else:
        # Unsupported model: skip it. Falling through would append the
        # previous printer's result a second time.
        continue
    printer_level_list.append(levels)

for printer in printer_level_list:
    print(printer)


### New Multi-Threading function

Sets up a pool of 10 threads to run at "once" 

In [None]:
#Make thread pool to run multiple function at once
futures = []
result_dict = {}

# The context manager waits for outstanding jobs and releases the worker
# threads when the block exits.
with ThreadPoolExecutor(10) as executor:
    for d in ds_dict:
        # str() guards against a missing Model cell, which is not a string and
        # would make the `in` test raise TypeError.
        model = str(ds_dict[d]["Model"])
        if "AltaLink" in model:
            scrape = get_altalink_levels
        elif "VersaLink" in model:
            scrape = get_versalink_levels
        else:
            # Unsupported model: submit nothing. Appending unconditionally
            # would queue the previous printer's future again (or fail on the
            # first record, before any future exists).
            continue
        futures.append(executor.submit(scrape, ds_dict[d], True))

    # Key each result by its submission position. enumerate() avoids the
    # repeated list scan of futures.index().
    for position, future in enumerate(futures):
        outcome = future.result()  # blocks until this job is done
        result_dict[position] = outcome
        print(outcome)


In [None]:

# Print every collected printer record, one per line.
for record in result_dict.values():
    print(record)