In [372]:
import requests
from bs4 import BeautifulSoup
import pandas as pd 
import polars as pl
import json
import os
import re

In [374]:
SONY_URL = "https://lens-db.com/system/sony-e/"

page = requests.get(SONY_URL)
soup = BeautifulSoup(page.content, 'html.parser')

with open('source/all-lens.html', 'w') as f: 
    f.write(soup.prettify())
    f.close

In [366]:
from typing import List 

def get_lens_model(full_lens_name: str) -> List[str]:
    "Parse the raw text and split into the name, model and filter size"
    pattern = r'^(.*?)\s*\[(.*?)\](?:.*⌀(\d+))?'
    match = re.match(pattern, full_lens_name)
    
    if match:
        results = {}
        lens_name = match.group(1).strip()
        lens_model = match.group(2).strip()
        filter_size = match.group(3)
        if filter_size: 
            filter_size = filter_size.strip()
        else: 
            filter_size = None

        results['name']  = lens_name
        results['model'] = lens_model
        results['filter_size'] = filter_size
        
        return results

In [380]:
all_lenses = soup.find_all("td", {"class": "uk-table-expand"})
# for lens in all_lenses: 

lenses = []
for lens in all_lenses:     
    single_lens = {} 
    lens_data = get_lens_model(lens.get_text())
    # should only have one link per lens and return the href element
    link = lens.find_all('a', href=True)[0]['href']
    
    single_lens['link'] = link 
    single_lens = {**single_lens, **lens_data}
    lenses.append(single_lens)

In [3]:
SINGLE_LENS_URL = "https://lens-db.com/sony-fe-50mm-f14-gm-sel50f14gm-2023/"

page = requests.get(SINGLE_LENS_URL)
soup = BeautifulSoup(page.content, 'html.parser')

with open('source/single-lens.html', 'w') as f:
    f.write(soup.prettify())
    f.close

In [6]:
soup  = BeautifulSoup(open('source/single-lens.html', 'r').read())
source_data = soup.find_all('div', {"itemtype":"http://schema.org/Table"})

In [36]:
features = soup.find_all('td')

In [170]:
def test_regex(string_to_test: str): 
    for regex_pattern in regex_patterns.values(): 
        if re.match(regex_pattern, string_to_test): 
            print(string_to_test)
            return False 
    return True 

In [307]:
def clean_str_list(text: str) -> str: 
    split_texts = text.strip().split('\n')
    cleaned_texts = []
    for split_text in split_texts: 
        test_string = split_text.strip()
        is_white_space = re.match(re_white_space, test_string)
        print(is_white_space)
        if is_white_space: 
            pass 
        else: 
            cleaned_texts.append(test_string)
    
    return ' '.join(cleaned_texts).strip().strip(":")

In [176]:
clean_str_list(features[3].get_text())

'Professional lens with high quality optics and robust build. Meets the highest standards and provides excellent performance and flawless image quality unachievable with traditional optical technologies.'

In [185]:
features_cleaned = []
for feature in features: 
    cleaned_text = clean_str_list(feature.get_text())
    features_cleaned.append(cleaned_text)

In [232]:
import yaml 
required_features = yaml.SafeLoader(open('config/features.yml')).get_data()
feature_keys = required_features.get('specs')

In [233]:
def camel_to_snake(name: str) -> str:
    name = name.replace(' ', '_')
    name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
    return re.sub("([a-z0-9])([A-Z])", r"\1_\2", name).lower().replace(" ", "")

In [234]:
lens_args = [camel_to_snake(spec) for spec in feature_keys]


lens_feature = {} 
for feature_index, feature in enumerate(features_cleaned): 
    if feature in feature_keys: 
        feature_value = features_cleaned[feature_index + 1]
        feature_key = camel_to_snake(feature)
        lens_feature[feature_key] = feature_value
    

In [295]:
from dataclasses import dataclass 
from typing import Optional, Any
import logging 
@dataclass
class Lens: 
    closest_focusing_distance: str
    filters: str
    lens_construction: str
    maximum_magnification: str
    original_name: str
    production_status: str
    speed: str
    system: str
    weight: str
    focal_length: str
    focusing_modes: str
    number_of_blades: Any
    
    # derived fields 
    min_aperture = Optional[float]
    max_aperture = Optional[float]
    min_focal_length = Optional[float]
    max_focal_length = Optional[float]
    is_prime = Optional[bool]
    is_zoom = Optional[bool]
    
    def __post_init__(self): 
        self.min_aperture, self.max_aperture = self.get_apertures()
        self.min_focal_length, self.max_focal_length = self.get_focal_lengths()
    
    @property 
    def filter_size(self) -> float: 
        return self.filters.split('mm').split(' ')
    
    @property 
    def min_focusing_distance(self) -> float: 
        return self.closest_focusing_distance.strip('m [AF]')

    @property 
    def no_of_blades(self) -> int: 
        return int(self.number_of_blades.split(' ')[0])
    
    @property 
    def is_autofocus(self) -> bool: 
        has_autofocus = "auto" in self.focusing_modes.lower()
        return has_autofocus
    
    def get_apertures(self) -> float: 
        apertures = self.speed.strip('F/').split('-')
        if len(apertures) == 1: 
            min = float(apertures[0])
            max = float(apertures[0])
            self.is_zoom = False
        elif len(apertures) >= 2: 
            min = float(apertures[0])
            max = float(apertures[1])
            self.is_zoom = True
        else: 
            logging.error('Error parsing apertures; none found for %s',self.original_name)
        return (min, max)
    
    def get_focal_lengths(self) -> float: 
        focal_length_nos = self.focal_length.strip('mm').split('-') 
        if len(focal_length_nos) == 1: 
            min = float(focal_length_nos[0])
            max = float(focal_length_nos[0])
            self.is_prime = True
        elif len(focal_length_nos) >= 2: 
            min = float(focal_length_nos[0])
            max = float(focal_length_nos[1])
            self.is_prime = False
        else: 
            logging.error('Error parsing focal lengths; none found for %s',self.original_name)
        
        return (min, max)
    
    

In [312]:
pl.from_dict(Lens(**lens_feature).__dict__).to_pandas().to_parquet('outputs/lens.parquet')

In [384]:
def parse_lens_link(lens_link: str) -> pl.DataFrame:
    page = requests.get(lens_link)
    soup = BeautifulSoup(page.content, 'html.parser')
    features = soup.find_all('td')
    features_cleaned = []
    for feature in features:
        cleaned_text = clean_str_list(feature.get_text())
        features_cleaned.append(cleaned_text)
    print(features_cleaned)
    lens_feature = {}
    for feature_index, feature in enumerate(features_cleaned):
        if feature in feature_keys:
            feature_value = features_cleaned[feature_index + 1]
            feature_key = camel_to_snake(feature)
            lens_feature[feature_key] = feature_value
    print(lens_feature)
    df = pl.from_dict(Lens(**lens_feature).__dict__)
    return df

In [385]:
lenses = [{'link': 'https://lens-db.com/sony-fe-50mm-f14-gm-sel50f14gm-2023/'}]
for lens in lenses:
    link = lens.get('link')
    df_lens = parse_lens_link(lens_link = link)


None
None
None
None
<re.Match object; span=(0, 2), match='■ '>
None
None
None
<re.Match object; span=(0, 2), match='●\xa0'>
None
None
None
None
<re.Match object; span=(0, 2), match='■ '>
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
<re.Match object; span=(0, 2), match='■ '>
None
None
None
None
None
None
<re.Match object; span=(0, 2), match='■ '>
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
<re.Match object; span=(0, 2), match='■ '>
None
None
None
None
None
None
<re.Match object; span=(0, 2), match='■ '>
None
<re.Match object; span=(0, 1), match='-'>
<re.Match object; span=(0, 2), match='■ '>
None
None
None
<re.Match object; span=(0, 1), match='⌀'>
None
None
None
None
<re.Match object; span=(0, 2), match='■ '>
None
None
None
None
None
None
<re.Match object; span=(0, 2), match='■ '>
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
Non

TypeError: Lens.__init__() missing 2 required positional arguments: 'optical_design' and 'production_details'