In [1]:
import yaml
import pandas as pd

# Location of the scraped link list produced by the web scraper.
# NOTE(review): this is an absolute, machine-specific path; consider making it
# relative to a configurable data directory so the notebook runs elsewhere.
folder_location = 'C:/Users/ttrol/CodingProjects/MatNet/scripts/webscraper/results_files/'
filename = 'No_AISI.yaml'
file_location = folder_location + filename

with open(file_location, "r") as stream:
    try:
        raw_materials = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        # Show the parser message, then re-raise: swallowing the error would
        # leave `raw_df` undefined and surface later as an unrelated NameError.
        print(exc)
        raise

# Only reached when the YAML was parsed successfully.
print('Raw Size: ', len(raw_materials))
raw_df = pd.DataFrame(raw_materials)

Raw Size:  11807


In [23]:
import time

def time_it(func):
    """Decorator that prints how long each call to ``func`` takes.

    Parameters
    ----------
    func : callable
        The function to be timed.

    Returns
    -------
    callable
        A wrapper that forwards all positional and keyword arguments to
        ``func``, prints the elapsed time in seconds and returns ``func``'s
        result unchanged. The wrapper keeps ``func``'s name and docstring.
    """
    # Local import so this block does not depend on a new top-of-notebook import.
    import functools

    @functools.wraps(func)  # preserve __name__/__doc__ of the wrapped function
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic and high resolution, unlike time.time(),
        # which can jump when the system clock is adjusted.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        end_time = time.perf_counter()
        execution_time = end_time - start_time
        print(f"Function '{func.__name__}' executed in {execution_time:.6f} seconds.")
        return result
    return wrapper

In [11]:
import requests
import random
from bs4 import BeautifulSoup

import os
from urllib.parse import quote

#oxylabs authentication
# Credentials are read from the environment so that they are never stored in
# (or committed with) the notebook. Set OXYLABS_USERNAME / OXYLABS_PASSWORD
# before starting the kernel.
# NOTE(review): the password that used to be hardcoded here should be treated
# as leaked and rotated.
USERNAME = os.environ["OXYLABS_USERNAME"]
PASSWORD = os.environ["OXYLABS_PASSWORD"]
ENDPOINT = os.environ.get("OXYLABS_ENDPOINT", "pr.oxylabs.io:7777")


# Percent-encode the credentials so characters such as '@' or ':' cannot
# corrupt the proxy URL.
entry = ('http://customer-%s:%s@%s' %
    (quote(USERNAME, safe=''), quote(PASSWORD, safe=''), ENDPOINT))
# Proxy mapping handed to requests; both schemes go through the same proxy.
query = {
    'http': entry,
    'https': entry,
}
session = requests.Session()


# link = raw_df.link[random.randint(0,len(raw_df)-1)]
link = raw_df.link[0]

def convert_to_numeric_value(value_string):
    """Convert a whitespace-separated "<number> [<units>]" string to a dict.

    Parameters
    ----------
    value_string : str
        Text such as ``"1.08 g/cc"`` or ``"118"``.

    Returns
    -------
    dict or None
        ``{'value': <number>, 'units': <str>}`` when the text consists of one
        numeric token, optionally followed by one units token. A lone token is
        returned as ``int`` when it is an integer literal and as ``float``
        otherwise; with a units token the value is always ``float``.
        ``None`` is returned for any other shape (empty text, more than two
        tokens, non-numeric value, or a non-string input).
    """
    try:
        split_string = value_string.split()
        if len(split_string) == 2:
            numeric_value = float(split_string[0])
            units = split_string[1]
            return {'value': numeric_value, 'units': units}
        elif len(split_string) == 1:
            token = split_string[0]
            try:
                numeric_value = int(token)
            except ValueError:
                # Unitless decimals such as "1.5" are not valid for int();
                # fall back to float instead of discarding the value.
                numeric_value = float(token)
            return {'value': numeric_value, 'units': ''}
        else:
            return None
    except (ValueError, AttributeError):
        # ValueError: the token is not numeric.
        # AttributeError: the input is not a string (e.g. None).
        return None

def parse_page(link, sesh, timeout=60):
    """Download one material data-sheet page and parse it into a dictionary.

    The page is fetched through the proxy mapping stored in the module-level
    ``query`` variable and parsed with BeautifulSoup.

    Parameters
    ----------
    link : str
        URL of the data-sheet page.
    sesh : requests.Session
        Session used to issue the GET request.
    timeout : float or tuple, optional
        Timeout in seconds forwarded to ``sesh.get`` so that a stalled proxy
        connection cannot block forever. Defaults to 60.

    Returns
    -------
    dict
        Always contains ``'source'`` (the link), ``'name'`` (page title),
        ``'categories'`` (list of str or None) and ``'material_notes'``
        (str or None). Every property section found in the data table adds
        one key (lower-cased, commas removed, spaces replaced by ``_``) that
        maps property names to the result of ``convert_to_numeric_value``.

    Raises
    ------
    Exception
        ``'IP Blocked'`` when the page does not contain exactly two
        ``table.tabledataformat`` elements inside the material data panel;
        ``'Cannot find dataCell'`` when a property row has no value cell.
    """
    result = {}
    result['source'] = link

    response = sesh.get(
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        },
        url=link,
        proxies= query,
        verify=True,
        timeout=timeout,
    )

    soup = BeautifulSoup(response.content,'html')

    #find name of material in page and remove unnecessary whitespace
    material_name = soup.find('title').get_text() # pyright: ignore[reportOptionalMemberAccess]  # noqa: E501
    material_name = material_name.strip()
    result['name'] = material_name

    table = soup.select('div#ctl00_ContentMain_ucDataSheet1_pnlMaterialData table.tabledataformat')
    if len(table) != 2:
        raise Exception('IP Blocked')

    #find supplementary notes in tables
    matl_notes_table = table[0].find(id= "ctl00_ContentMain_ucDataSheet1_trMatlNotes")
    category_table = table[0].find(id= "ctl00_ContentMain_ucDataSheet1_trMatlGroups")

    material_notes = None
    categories = None

    try:
        if matl_notes_table is not None:
            material_notes = matl_notes_table.find('td').text   #type: ignore

        if category_table is not None:
            category_text = category_table.find('td').text  #type: ignore
            categories = [item.strip() for item in category_text.split(";")]
    except Exception:
        # Notes are supplementary: report the problem and keep parsing.
        exception_msg = 'Issues with grabbing material and category notes: ' \
            + link
        print(exception_msg)

    # Stored outside the try block so both keys exist even when parsing the
    # notes failed (they are then None or whatever was parsed before the error).
    result['categories'] = categories
    result['material_notes'] = material_notes

    #Second table in html page has correct content
    main_table_rows = table[1].find_all('tr')

    #Walk the rows: a header row starts a new property section, every other
    #row with a name cell contributes one property to the current section.
    property_type = None
    for row in main_table_rows:
        header_cells = row.select('th:not([class])')
        if len(header_cells) > 0:
            property_type = header_cells[0].text.strip()
            property_type = property_type.replace(',','').replace(' ','_').lower()
            continue

        name_cells = row.select('td:not([class])')
        if not name_cells:
            continue
        prop = name_cells[0].text.strip().replace(',','').replace(' ','_').lower()
        if not prop:
            continue

        # Check for the value cell BEFORE indexing; indexing an empty result
        # would raise IndexError and the intended error could never fire.
        data_cells = row.select('td.dataCell')
        if not data_cells:
            raise Exception('Cannot find dataCell')
        ele = data_cells[0]

        if property_type not in result:
            result[property_type] = {}

        # Keep the first occurrence when a property name repeats in a section.
        if prop not in result[property_type]:
            result[property_type][prop] = convert_to_numeric_value(ele.text.strip())

    return result

# Smoke test: parse the single link selected above and show the parsed dict
# as the cell output.
result = parse_page(link=link, sesh=session)
result


{'source': 'https://matweb.com/search/DataSheet.aspx?MatGUID=0cd24112533d4d18b2b09265af4747fc',
 'name': 'Polymer Resources PPX-FR1 Modified-PPO, V-0 Flame Retardant',
 'categories': ['Polymer',
  'Thermoplastic',
  'Polyphenylene Ether/PPO',
  'Polyphenylene Ether, Heat Resistant'],
 'material_notes': 'Modified-Polyphenylene Oxide, V-0 Flame Retardant, 190F HDTFeatures:  • Bromine Free  • Chlorine FreeProcess: Injection MoldingNotes: All physical, mechanical and thermal testing conducted on 1/8-inch thick, un-pigmented, test samples.Information provided by Polymer Resources Corporation.',
 'physical_properties': {'specific_gravity': {'value': 1.08, 'units': 'g/cc'},
  'linear_mold_shrinkage': None,
  'melt_flow': None},
 'mechanical_properties': {'hardness_rockwell_r': {'value': 118, 'units': ''},
  'tensile_strength_at_break': {'value': 41.4, 'units': 'MPa'},
  'tensile_strength_yield': {'value': 44.8, 'units': 'MPa'},
  'elongation_at_break': {'value': 9.0, 'units': '%'},
  'elongat

In [13]:
from pymongo import MongoClient
from tqdm.notebook import tqdm
import time

#connect to mongodb client
client = MongoClient("mongodb://localhost:27017/")
db = client.matjet
collection = db.materials


session = requests.Session()


n = 4                        # a fresh requests.Session is created every n-th link
MAX_CONSECUTIVE_ERRORS = 3   # retries allowed before the run is aborted
RETRY_DELAY_SECONDS = 180    # pause after a failed page before retrying it
errCount = 0                 # consecutive failures; reset after every success
i=63                         # index of the first link to process (resume point)
finished = False
while not finished and i <= len(raw_df.link)-1:
    # After a failure the inner loop breaks with `i` still pointing at the
    # failing link, so the next pass resumes (and retries) from there.
    link_range = range(i,len(raw_df.link))
    for i in tqdm(link_range, desc="Processing", unit="iteration"):
        if (i + 1) % n == 0:
            session = requests.Session()

        try:
            document = parse_page(link = raw_df.link[i], sesh=session)
        except Exception as exc:
            errCount += 1
            print(f'Error at index{i}')
            print(f'  cause: {exc!r}')
            # Count first, then test: checking before the increment made this
            # raise unreachable and let the run stop silently.
            if errCount > MAX_CONSECUTIVE_ERRORS:
                raise Exception(f'Error at index{i}') from exc
            time.sleep(RETRY_DELAY_SECONDS)
            break

        errCount = 0

        # Materials are de-duplicated by name.
        duplicates = collection.count_documents({'name': document['name']})

        if duplicates > 0:
            print('Duplicate found: Did not add.')
        else:
            collection.insert_one(document)
    else:
        # The for loop ran to the end without a break: every link has been
        # handled. Without this flag `i` stays at the last index, the while
        # condition remains true and the last link is re-fetched forever.
        finished = True
        

Processing:   0%|          | 0/11744 [00:00<?, ?iteration/s]

Error at index223


Processing:   0%|          | 0/11584 [00:00<?, ?iteration/s]

Error at index563


Processing:   0%|          | 0/11244 [00:00<?, ?iteration/s]