In [372]:
import requests
from bs4 import BeautifulSoup
import pandas as pd 
import polars as pl
import json
import os
import re

In [374]:
SONY_URL = "https://lens-db.com/system/sony-e/"

# Fetch the Sony E system page. The timeout keeps a stalled connection from
# hanging the kernel, and raise_for_status() stops an HTTP error page from
# being saved and parsed as if it were the real page.
page = requests.get(SONY_URL, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# Keep a prettified snapshot on disk for offline inspection; the `with` block
# closes the file on exit, so no explicit close is needed.
with open('source/all-lens.html', 'w') as f:
    f.write(soup.prettify())

In [366]:
from typing import Dict, List, Optional


def get_lens_model(full_lens_name: str) -> Optional[Dict[str, Optional[str]]]:
    """Parse the raw text and split into the name, model and filter size.

    The text is expected to look like ``<name> [<model>] ... ⌀<digits>``; the
    trailing filter-size part is optional.

    Args:
        full_lens_name: raw text to parse.

    Returns:
        A dict with the keys ``'name'``, ``'model'`` and ``'filter_size'``
        (``filter_size`` is ``None`` when no ``⌀<digits>`` part is present),
        or ``None`` when the text has no ``[...]`` section to match.
    """
    pattern = r'^(.*?)\s*\[(.*?)\](?:.*⌀(\d+))?'
    match = re.match(pattern, full_lens_name)

    # Callers must handle None: not every text carries a bracketed model.
    if match is None:
        return None

    lens_name, lens_model, filter_size = match.groups()
    return {
        'name': lens_name.strip(),
        'model': lens_model.strip(),
        'filter_size': filter_size.strip() if filter_size else None,
    }

In [380]:
all_lenses = soup.find_all("td", {"class": "uk-table-expand"})

# Build one record per table cell: the link plus the parsed name fields.
lenses = []
for lens in all_lenses:
    lens_data = get_lens_model(lens.get_text())
    # should only have one link per lens; take the first anchor with an href
    anchor = lens.find('a', href=True)

    # Skip cells whose text does not parse or that carry no link, instead of
    # aborting the whole loop with a TypeError / IndexError.
    if lens_data is None or anchor is None:
        continue

    lenses.append({'link': anchor['href'], **lens_data})

In [3]:
SINGLE_LENS_URL = "https://lens-db.com/sony-fe-50mm-f14-gm-sel50f14gm-2023/"

# Fetch one lens page. The timeout keeps a stalled connection from hanging
# the kernel, and raise_for_status() stops an HTTP error page from being
# saved and parsed as if it were the real page.
page = requests.get(SINGLE_LENS_URL, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# Keep a prettified snapshot on disk for offline inspection; the `with` block
# closes the file on exit, so no explicit close is needed.
with open('source/single-lens.html', 'w') as f:
    f.write(soup.prettify())

In [6]:
# Re-load the saved snapshot. The `with` block closes the file handle, and the
# parser is named explicitly so the result does not depend on which optional
# parsers happen to be installed (the other cells all use 'html.parser').
with open('source/single-lens.html', 'r') as f:
    soup = BeautifulSoup(f.read(), 'html.parser')
source_data = soup.find_all('div', {"itemtype": "http://schema.org/Table"})

In [36]:
# Collect every <td> element from the parsed page.
features = soup.find_all(name='td')

In [418]:
def clean_str_list(text: str) -> str:
    """Collapse multi-line text into a single space-separated string.

    The input is stripped and split on newlines. Each piece is stripped and
    kept unless ``re.match(RE_WHITE_SPACE, piece)`` succeeds. The kept pieces
    are joined with single spaces; the result is stripped of surrounding
    whitespace and then of leading/trailing colons.

    NOTE(review): RE_WHITE_SPACE is a module-level constant defined in a later
    cell; it must exist before this function is called.
    """
    stripped_pieces = (raw_piece.strip() for raw_piece in text.strip().split('\n'))
    kept_pieces = [
        piece for piece in stripped_pieces
        if not re.match(RE_WHITE_SPACE, piece)
    ]
    joined = ' '.join(kept_pieces)
    return joined.strip().strip(":")

def camel_to_snake(name: str) -> str:
    """Convert a camelCase or space-separated label to snake_case.

    Spaces become underscores, an underscore is inserted at each
    lower-to-upper case boundary, and the result is lower-cased. Runs of
    underscores are collapsed to one, so a capitalised word that follows a
    space ("Lens Construction") gives "lens_construction" rather than
    "lens__construction".
    """
    name = name.replace(' ', '_')
    name = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", name)
    name = re.sub("([a-z0-9])([A-Z])", r"\1_\2", name).lower()
    # The first substitution also fires on "_Word", doubling the underscore.
    return re.sub("_{2,}", "_", name)

In [401]:
import yaml

# Read the feature configuration. safe_load builds only plain Python objects,
# and the `with` block closes the file handle once the document is parsed.
with open('config/features.yml') as config_file:
    required_features = yaml.safe_load(config_file)

# Entries under the 'specs' key are the labels looked up on a lens page.
feature_keys = required_features.get('specs')

In [404]:
lens_args = [camel_to_snake(spec) for spec in feature_keys]

# Build the cleaned cell texts in this cell rather than relying on a variable
# left over in the kernel from an earlier, no longer present, cell.
features_cleaned = [clean_str_list(cell.get_text()) for cell in features]

lens_feature = {}
# Each matching label is paired with the entry that follows it. Iterating
# over all but the last entry means a label in the final position cannot
# index past the end of the list.
for feature_index, feature in enumerate(features_cleaned[:-1]):
    if feature in feature_keys:
        lens_feature[camel_to_snake(feature)] = features_cleaned[feature_index + 1]
    

In [425]:
import logging
from dataclasses import dataclass, field
from typing import Any, Optional, Tuple

# global variables
RE_WHITE_SPACE = r"\W+"  # raw string: "\W" is an invalid escape otherwise
RE_SPEED = r'F/(\d+(?:\.\d+)?)'
RE_FOCAL = r'(\d+(?:\.\d+)?)mm'  # accepts decimal focal lengths such as 7.5mm
SONY_URL = "https://lens-db.com/system/sony-e/"


@dataclass
class Lens:
    """Specification record for one lens, built from scraped spec strings.

    The plain string fields hold the text as scraped. ``__post_init__`` derives
    the numeric aperture and focal-length bounds and the prime/zoom flags from
    ``speed`` / ``speed_range`` and ``focal_length`` / ``focal_length_range``.
    When a value cannot be parsed, the derived bounds are ``None`` and an
    error is logged; construction itself does not fail.
    """
    closest_focusing_distance: str
    filters: str
    lens_construction: str
    maximum_magnification: str
    original_name: str
    production_status: str
    system: str
    weight: str
    focusing_modes: str
    number_of_blades: Any

    # derived fields (set in __post_init__, not accepted by __init__)
    min_aperture: Optional[float] = field(init=False)
    max_aperture: Optional[float] = field(init=False)
    min_focal_length: Optional[float] = field(init=False)
    max_focal_length: Optional[float] = field(init=False)
    is_prime: bool = field(init=False)
    is_zoom: bool = field(init=False)

    # due to inconsistency between descriptions, have to parse alternative features as well
    speed: Optional[str] = ""
    focal_length: Optional[str] = ""
    speed_range: Optional[str] = ""
    focal_length_range: Optional[str] = ""

    def __post_init__(self):
        # Apertures first, then focal lengths: get_focal_lengths() may
        # upgrade is_zoom after get_apertures() has set it.
        self.min_aperture, self.max_aperture = self.get_apertures()
        self.min_focal_length, self.max_focal_length = self.get_focal_lengths()

    @property
    def filter_size(self) -> Optional[float]:
        """First ``<number>mm`` value found in ``filters``, or None."""
        found = re.search(r'(\d+(?:\.\d+)?)\s*mm', self.filters or "")
        return float(found.group(1)) if found else None

    @property
    def min_focusing_distance(self) -> Optional[float]:
        """First number found in ``closest_focusing_distance``, or None."""
        found = re.search(r'\d+(?:\.\d+)?', self.closest_focusing_distance or "")
        return float(found.group(0)) if found else None

    @property
    def no_of_blades(self) -> int:
        """Leading integer of ``number_of_blades`` (e.g. "11 (eleven)" -> 11)."""
        # str() so that a value that is already an int is accepted too.
        return int(str(self.number_of_blades).split(' ')[0])

    @property
    def is_autofocus(self) -> bool:
        """True when ``focusing_modes`` mentions "auto" (case-insensitive)."""
        return "auto" in self.focusing_modes.lower()

    def get_apertures(self) -> Tuple[Optional[float], Optional[float]]:
        """Parse the aperture bounds from ``speed`` or, if empty, ``speed_range``.

        Also sets ``is_zoom`` to True when two or more apertures are found.

        Returns:
            ``(first, second)`` aperture values; both equal when only one is
            found; ``(None, None)`` (with an error logged) when none is found.
        """
        source = self.speed or self.speed_range or ""
        apertures = [float(value) for value in re.findall(RE_SPEED, source)]
        self.is_zoom = len(apertures) >= 2

        if not apertures:
            logging.error('Error parsing apertures; none found for %s', self.original_name)
            return (None, None)
        if len(apertures) == 1:
            return (apertures[0], apertures[0])
        return (apertures[0], apertures[1])

    def get_focal_lengths(self) -> Tuple[Optional[float], Optional[float]]:
        """Parse the focal-length bounds from ``focal_length`` or, if empty,
        ``focal_length_range``.

        Sets ``is_prime`` to True when exactly one focal length is found. When
        two or more are found, ``is_zoom`` is set to True as well, so that a
        zoom with a single constant aperture is still flagged as a zoom.

        Returns:
            ``(first, second)`` focal lengths; both equal when only one is
            found; ``(None, None)`` (with an error logged) when none is found.
        """
        source = self.focal_length or self.focal_length_range or ""
        focal_lengths = [float(value) for value in re.findall(RE_FOCAL, source)]
        self.is_prime = len(focal_lengths) == 1

        if not focal_lengths:
            logging.error('Error parsing focal lengths; none found for %s', self.original_name)
            return (None, None)
        if len(focal_lengths) == 1:
            return (focal_lengths[0], focal_lengths[0])
        self.is_zoom = True
        return (focal_lengths[0], focal_lengths[1])

In [406]:
def parse_lens_link(lens_link: str) -> pl.DataFrame:
    """Download one lens page and return its specs as a polars DataFrame.

    Every ``<td>`` text on the page is cleaned with ``clean_str_list``. Each
    cleaned text found in the module-level ``feature_keys`` is treated as a
    label whose value is the next entry. The label/value pairs are passed as
    keyword arguments to ``Lens`` and the instance's attributes become the
    DataFrame columns.

    Args:
        lens_link: URL of the lens page.

    Raises:
        requests.HTTPError: the server answered with an error status.
        TypeError: the page did not provide every required ``Lens`` field.
    """
    # The timeout keeps a stalled connection from hanging; raise_for_status()
    # stops an error page from being parsed as if it were a lens page.
    page = requests.get(lens_link, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    features_cleaned = [clean_str_list(cell.get_text()) for cell in soup.find_all('td')]

    lens_feature = {}
    # Stop one entry short so that a label in the last position cannot index
    # past the end of the list.
    for feature_index, feature in enumerate(features_cleaned[:-1]):
        if feature in feature_keys:
            lens_feature[camel_to_snake(feature)] = features_cleaned[feature_index + 1]

    return pl.from_dict(Lens(**lens_feature).__dict__)

In [427]:
# Parse a single zoom lens page as a spot check. A dedicated constant is used
# so the `lenses` list built from the system page is not overwritten.
ZOOM_LENS_URL = 'https://lens-db.com/sony-fe-12-24mm-f4-g-sel1224g-2017/'
df_lens_zoom = parse_lens_link(lens_link=ZOOM_LENS_URL)

In [430]:
# Parse a single prime lens page as a spot check. A dedicated constant is used
# so the `lenses` list built from the system page is not overwritten.
PRIME_LENS_URL = 'https://lens-db.com/sony-fe-50mm-f14-gm-sel50f14gm-2023/'
df_lens = parse_lens_link(lens_link=PRIME_LENS_URL)


In [432]:
# Show the raw `speed` column parsed for the zoom lens.
df_lens_zoom.select(pl.col('speed'))

speed
str
""""""


In [431]:
# Show the raw `speed` column parsed for the prime lens.
df_lens.select(pl.col('speed'))

speed
str
"""F/1.4"""


In [394]:
# Start from an empty frame with no columns.
df_lenses = pl.DataFrame()

In [396]:
# Stack the parsed lens onto the accumulator frame (vertical is the default).
pl.concat([df_lenses, df_lens], how="vertical")

closest_focusing_distance,filters,lens_construction,maximum_magnification,original_name,production_status,speed,system,weight,focal_length,focusing_modes,number_of_blades,is_zoom,min_aperture,max_aperture,is_prime,min_focal_length,max_focal_length
str,str,str,str,str,str,str,str,str,str,str,str,bool,f64,f64,bool,f64,f64
"""0.41m [AF]""","""Screw-type 67m…","""14 elements in…","""1:6.25 [AF] at…","""SONY FE 1.4/50…","""""","""F/1.4""","""Sony E (2013)""","""516g""","""50mm""","""Autofocus, man…","""11 (eleven)""",False,1.4,1.4,True,50.0,50.0
