In [42]:
import html
import imghdr
import json
import os
import re
import subprocess
import time
import unicodedata
import urllib
import urllib.parse
import zipfile
from collections import defaultdict, Counter
from html.parser import HTMLParser
from urllib.parse import urljoin, urlsplit, urlunsplit

import pandas as pd
import requests
import tqdm
import tqdm.notebook
from googletrans import Translator
from langdetect import detect
from pandas_profiling import ProfileReport
from PIL import Image

pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
pd.set_option('display.max_colwidth', None)  # or 199

In [43]:
def clean_string(s):
    """Normalize a raw device string so it is safe to store in the CSV outputs.

    Steps, in order:
      1. decode HTML entities (``&amp;`` -> ``&``);
      2. apply Unicode NFKD normalization;
      3. pad parentheses with spaces and blank out em dashes / degree signs;
      4. collapse every run of spaces to a single space and strip the ends;
      5. replace ``;`` with ``,`` (``;`` is reserved for safe CSV saving).

    Entities are decoded *before* the ``;`` replacement, otherwise the
    terminating semicolon of an entity would be turned into a stray comma.

    Args:
        s: the raw string.

    Returns:
        The cleaned string.
    """
    # html.unescape replaces HTMLParser().unescape, which was deprecated and
    # then removed in Python 3.9.
    text = unicodedata.normalize('NFKD', html.unescape(s))

    replacements = (
        ('(', ' ('),
        (')', ') '),
        ('（', ' ('),
        ('）', ') '),
        ('—', ' '),
        ('°', ' '),
    )
    for old, new in replacements:
        text = text.replace(old, new)

    # Collapse any run of two or more spaces (the padding above creates them).
    text = re.sub(' {2,}', ' ', text).strip()

    # Replace for safe csv saving
    return text.replace(';', ',')

def is_english(s):
    """Tell whether `s` consists solely of ASCII characters.

    Despite the name this is an ASCII check, used by the notebook as a cheap
    "already English, no translation needed" heuristic.

    Args:
        s: the string to inspect.

    Returns:
        True when every UTF-8 byte of `s` is below 0x80, False otherwise.
    """
    # A UTF-8 byte >= 0x80 only ever appears inside a non-ASCII character.
    return all(byte < 0x80 for byte in s.encode(encoding='utf-8'))
    
def remove_query_params_and_fragment(url):
    """Return `url` with its query string and fragment removed.

    Scheme, network location and path are kept as parsed by ``urlsplit``.
    """
    scheme, netloc, path, _query, _fragment = urlsplit(url)
    return urlunsplit((scheme, netloc, path, "", ""))

def reverse_dict(mydict):
    """Invert a mapping, grouping the original keys under each value.

    Args:
        mydict: mapping whose values are hashable.

    Returns:
        A ``defaultdict(list)`` mapping every value of `mydict` to the list of
        keys that pointed at it, in the iteration order of `mydict`.
    """
    inverted = defaultdict(list)
    for original_key in mydict:
        inverted[mydict[original_key]].append(original_key)
    return inverted
    
# Single googletrans client shared by every translation cell; requests are
# restricted to the 'translate.google.com' service URL.
translator = Translator(service_urls=['translate.google.com'])

In [44]:
# --- Remote APK source -------------------------------------------------------
VEVS_BASE_URL = 'https://ru.kapiba.ru/mihome/files/old/'
# The password can be overridden through the VEVS_PASSWORD environment
# variable so it does not have to live in the notebook; the previous literal
# is kept as the default for backward compatibility.
VEVS_PASSWORD = os.environ.get('VEVS_PASSWORD', '7549845563')

# --- Directory layout (relative to the notebook) -----------------------------
ASSETS_PATH = '../assets'
ASSETS_ICONS_RAW_PATH = os.path.join(ASSETS_PATH, 'icons', 'raw')
ASSETS_ICONS_PROCESSED_PATH = os.path.join(ASSETS_PATH, 'icons', 'processed')

DATA_PATH_ROOT = '../data'
DATA_PATH_RAW = os.path.join(DATA_PATH_ROOT, 'raw')
DATA_PATH_PROCESSED = os.path.join(DATA_PATH_ROOT, 'processed')
DATA_PATH_MANUAL = os.path.join(DATA_PATH_ROOT, 'manual')

# --- Raw files extracted from the APK ----------------------------------------
FILE_NAME_RAW_DEVICES = 'plugin_config_all'
FILE_PATH_RAW_DEVICES = os.path.join(DATA_PATH_RAW, FILE_NAME_RAW_DEVICES)
# NOTE: 'deivce' mirrors the member name inside the archive (see
# ARCHIVE_PATH_CATEGORIES below); do not "fix" the spelling.
FILENAME_RAW_CATEGORIES = 'recommend_deivce_name.json'
FILE_PATH_RAW_CATEGORIES = os.path.join(DATA_PATH_RAW, FILENAME_RAW_CATEGORIES)

# --- Processed outputs -------------------------------------------------------
FILE_NAME_PROCESSED_DEVICES = 'devices.csv'
FILE_PATH_PROCESSED_DEVICES = os.path.join(DATA_PATH_PROCESSED, FILE_NAME_PROCESSED_DEVICES)
FILE_NAME_PROCESSED_CATEGORIES = 'categories.csv'
FILE_PATH_PROCESSED_CATEGORIES = os.path.join(DATA_PATH_PROCESSED, FILE_NAME_PROCESSED_CATEGORIES)

FILE_NAME_PROCESSED_SUBCATE_CATE_DICT = 'subcate_cate.json'
FILE_PATH_PROCESSED_SUBCATE_CATE_DICT = os.path.join(DATA_PATH_PROCESSED, FILE_NAME_PROCESSED_SUBCATE_CATE_DICT)

# Translation caches (Chinese text -> English text)
FILE_NAME_PROCESSED_NAME_TRANSLATIONS = 'name_translations.json'
FILE_PATH_PROCESSED_NAME_TRANSLATIONS = os.path.join(DATA_PATH_PROCESSED, FILE_NAME_PROCESSED_NAME_TRANSLATIONS)
FILE_NAME_PROCESSED_DESC_TRANSLATIONS = 'desc_translations.json'
FILE_PATH_PROCESSED_DESC_TRANSLATIONS = os.path.join(DATA_PATH_PROCESSED, FILE_NAME_PROCESSED_DESC_TRANSLATIONS)

# --- Product numbers ---------------------------------------------------------
FILE_NAME_MANUAL_PRODUCT_NUMBERS_RAW = 'product_number_raw.csv'
FILE_PATH_MANUAL_PRODUCT_NUMBERS_RAW = os.path.join(DATA_PATH_MANUAL, FILE_NAME_MANUAL_PRODUCT_NUMBERS_RAW)

FILE_NAME_PROCESSED_PRODUCT_NUMBERS = 'product_number.csv'
FILE_PATH_PROCESSED_PRODUCT_NUMBERS = os.path.join(DATA_PATH_PROCESSED, FILE_NAME_PROCESSED_PRODUCT_NUMBERS)

# --- Member paths inside the APK archive -------------------------------------
ARCHIVE_PATH_DEVICES = 'assets/device_config/plugin_config_all'
ARCHIVE_PATH_CATEGORIES = 'assets/cardControl/recommend_deivce_name.json'

# --- Regular expressions -----------------------------------------------------
# Raw strings: the patterns are unchanged, but '\-', '\/' and '\.' are invalid
# escape sequences in ordinary string literals and trigger warnings.
REGEXP_STR_PRODUCT_CODE = r"[A-Z]{2,}[a-z]{0,2}[0-9\-\/]{2,}[a-z]{0,2}[a-zA-Z0-9\-\/]+"
REGEXP_STR_MODEL_ID = r"^[a-z0-9_]+\.[a-z0-9_]+\.[a-z0-9_]+$"

# Extract Data

##### Download MiHome APK

In [45]:
# Fetch the password-protected directory listing of available packages.
session = requests.Session()
session.auth = ('', VEVS_PASSWORD)
page = session.get(VEVS_BASE_URL)
page.raise_for_status()  # fail early instead of parsing an error page

# Select latest file: the last APK row of the first HTML table on the page.
vevs_listing_pd = pd.read_html(page.content)[0]
apk_names = vevs_listing_pd.loc[
    vevs_listing_pd['Type'] == 'application/vnd.android.package-archive',
    'Name',
].tolist()
if not apk_names:
    raise RuntimeError(f'No APK entry found in the listing at {VEVS_BASE_URL}')
vevs_package_filename = apk_names[-1]

# The file name comes from a remote page, so the command is passed as an
# argument list (no shell) to rule out shell injection and quoting problems.
wget_cmd = [
    'wget',
    '--user', '',
    '--password', VEVS_PASSWORD,
    '--read-timeout=5',
    '--tries=0',
    '--no-clobber',  # keep an already downloaded APK
    '-P', DATA_PATH_RAW,
    urljoin(VEVS_BASE_URL, vevs_package_filename),
]
output = subprocess.check_output(wget_cmd)

##### Extract Data from APK

In [46]:
# Show the APK file name picked by the download cell above.
vevs_package_filename

'MiHome_6.1.701_63411_vevs.apk'

In [47]:
# The members could also be read straight from the archive without writing
# them to disk first, e.g.:
#   archive = zipfile.ZipFile(os.path.join(DATA_PATH_RAW, vevs_package_filename), 'r')
#   archive.read(ARCHIVE_PATH_DEVICES)

# (member inside the APK, destination file on disk)
extraction_plan = (
    (ARCHIVE_PATH_DEVICES, FILE_PATH_RAW_DEVICES),
    (ARCHIVE_PATH_CATEGORIES, FILE_PATH_RAW_CATEGORIES),
)

apk_path = os.path.join(DATA_PATH_RAW, vevs_package_filename)
with zipfile.ZipFile(apk_path, 'r') as apk:
    for member_name, target_path in extraction_plan:
        with open(target_path, 'wb') as target_file:
            target_file.write(apk.read(member_name))

##### Load Devices data

In [48]:
# Read the device list extracted from the APK. The encoding is pinned so the
# result does not depend on the platform default.
# NOTE(review): assumes the file is UTF-8 encoded JSON — confirm against the APK.
with open(FILE_PATH_RAW_DEVICES, 'r', encoding='utf-8') as f:
    devices_json = json.load(f)
devices_pd = pd.DataFrame.from_dict(devices_json['result']['devices']['list'])

# Normalize the free-text columns with the shared cleaning helper.
for text_column in ('name', 'desc', 'model'):
    devices_pd[text_column] = devices_pd[text_column].apply(clean_string)

# The raw description is kept under 'desc_cn'; 'desc_en' is added by the
# translation cells.
devices_pd = devices_pd.rename(columns={"desc": "desc_cn"})
# devices_pd.head()

  return HTMLParser().unescape(


# Translate

##### Devices names

In [49]:
# Load the cache of already translated device names (original -> English).
# The encoding is explicit so a hand-edited, non-ASCII cache file is read the
# same way on every platform.
with open(FILE_PATH_PROCESSED_NAME_TRANSLATIONS, 'r', encoding='utf-8') as f:
    devices_name_translation_dict = json.load(f)

In [50]:
# Split every device name into an original (non-ASCII) version and an English
# one. Translations are looked up in the cache first; only cache misses hit
# the translation service, followed by a 2 second pause between requests.
devices_name_cn_list = []
devices_name_en_list = []

for raw_name in tqdm.notebook.tqdm(devices_pd['name'], total=devices_pd.shape[0]):
    if not raw_name or is_english(raw_name):
        # Empty or ASCII-only names are taken as already English.
        name_cn = ''
        name_en = raw_name
    elif raw_name in devices_name_translation_dict:
        name_cn = raw_name
        name_en = devices_name_translation_dict[raw_name]
    else:
        name_cn = raw_name
        name_en = translator.translate(name_cn, src='zh-cn', dest='en').text
        devices_name_translation_dict[name_cn] = name_en
        time.sleep(2)

    devices_name_cn_list.append(name_cn)
    devices_name_en_list.append(name_en)

devices_pd['name_cn'] = devices_name_cn_list
devices_pd['name_en'] = devices_name_en_list

  0%|          | 0/3033 [00:00<?, ?it/s]

In [51]:
# for name_cn, name_en in zip(devices_name_cn_list, devices_name_en_list):
#     if name_cn and name_cn not in devices_name_translation_dict:
#         devices_name_translation_dict[name_cn] = name_en

In [52]:
# Persist the (possibly extended) name translation cache for the next run.
with open(FILE_PATH_PROCESSED_NAME_TRANSLATIONS, 'w') as cache_file:
    cache_file.write(json.dumps(devices_name_translation_dict))

##### Devices descriptions

In [53]:
# Load the cache of already translated device descriptions (original -> English).
# The encoding is explicit so a hand-edited, non-ASCII cache file is read the
# same way on every platform.
with open(FILE_PATH_PROCESSED_DESC_TRANSLATIONS, 'r', encoding='utf-8') as f:
    devices_desc_translation_dict = json.load(f)

In [54]:
# Build the English description for every device: empty stays empty,
# ASCII-only text is copied as is, everything else goes through the cache and,
# on a miss, through the translation service (2 second pause per request).
devices_desc_en_list = []

for desc_cn in tqdm.notebook.tqdm(devices_pd['desc_cn'], total=devices_pd.shape[0]):
    if not desc_cn:
        desc_en = ''
    elif is_english(desc_cn):
        desc_en = desc_cn
    elif desc_cn in devices_desc_translation_dict:
        desc_en = devices_desc_translation_dict[desc_cn]
    else:
        desc_en = translator.translate(desc_cn, src='zh-cn', dest='en').text
        devices_desc_translation_dict[desc_cn] = desc_en
        time.sleep(2)

    devices_desc_en_list.append(desc_en)

devices_pd['desc_en'] = devices_desc_en_list

  0%|          | 0/3033 [00:00<?, ?it/s]

In [55]:
# devices_desc_translation_dict = {}

# for desc_cn, desc_en in zip(list(devices_pd['desc_cn'].values), list(devices_pd['desc_en'].values)):
#     if desc_cn and desc_cn not in devices_desc_translation_dict:
#         devices_desc_translation_dict[desc_cn] = desc_en

In [56]:
# Persist the (possibly extended) description translation cache for the next run.
with open(FILE_PATH_PROCESSED_DESC_TRANSLATIONS, 'w') as cache_file:
    cache_file.write(json.dumps(devices_desc_translation_dict))

# Create Metacategories

**The original dependency hell looks like this:**
- subcategory_id (one) -> microcategory_name (many)
- microcategory_name (one) -> devices (many) -> subcategory_id (many)
- cate_name (one) -> subcategory_id (many)

**Our Metasubcategory transformation:**

If a device does not exist in:
- microcategory_name (one) -> devices (many)

Then:
- microcategory_name (one) -> subcategory_id (one, most popular)
- subcategory_id (one) -> microcategory_name (one, most popular)

In [57]:
def _category_name_en(product_type):
    """Return the English default name of a category entry.

    Uses ``defaultName['en']`` when present; otherwise the ``zh_CN`` default
    name is sent to the translation service.
    """
    default_name = product_type['defaultName']
    if 'en' in default_name:
        return default_name['en']
    return translator.translate(default_name.get('zh_CN'), src='zh-cn', dest='en').text


# Read the category definitions extracted from the APK. The encoding is
# pinned so the result does not depend on the platform default.
# NOTE(review): assumes the file is UTF-8 encoded JSON — confirm against the APK.
with open(FILE_PATH_RAW_CATEGORIES, 'r', encoding='utf-8') as f:
    category_name_json = json.load(f)
category_name_pd = pd.DataFrame.from_dict(category_name_json['data'])
category_name_pd['models'] = category_name_pd['product_type'].apply(lambda x: list(x['model']))
category_name_pd['name_en'] = (
    category_name_pd['product_type']
    .apply(_category_name_en)
    .apply(lambda x: clean_string(x).capitalize())
)
category_name_pd.to_csv(FILE_PATH_PROCESSED_CATEGORIES)
# category_name_pd.head()

  return HTMLParser().unescape(


In [58]:
# Per-model lookups: model -> subcategory_id and model -> cate_name.
# When a model occurs more than once, the last row wins.
model_subcategory_id_dict = devices_pd.set_index('model')['subcategory_id'].to_dict()
model_cate_name_dict = devices_pd.set_index('model')['cate_name'].to_dict()

In [59]:
# Outputs of this cell:
#   model_microcategory_name_dict          model          -> microcategory name
#   subcategory_id_microcategory_name_dict subcategory_id -> most common microcategory name
#   subcategory_id_cate_name_dict          subcategory_id -> most common cate_name (ignoring 'Other')
model_microcategory_name_dict = {}
subcategory_id_microcategory_names_dict = defaultdict(list)
subcategory_id_cate_names_dict = defaultdict(list)

for _, row in category_name_pd.iterrows():
    microcategory_name = row['name_en']

    subcategories_ids = []
    cate_names = []
    microcategories_names = []
    for m in row['models']:
        if m not in model_subcategory_id_dict:
            continue

        subcategories_ids.append(model_subcategory_id_dict[m])

        if model_cate_name_dict[m] != 'Other':
            cate_names.append(model_cate_name_dict[m])

        model_microcategory_name_dict[m] = microcategory_name
        microcategories_names.append(microcategory_name)

    if not subcategories_ids:
        # None of this category's models is present in devices_pd; there is
        # no subcategory to vote for (previously this raised IndexError).
        continue

    # Attribute the whole microcategory to its most frequent subcategory_id.
    dominant_subcategory_id = Counter(subcategories_ids).most_common()[0][0]
    subcategory_id_microcategory_names_dict[dominant_subcategory_id].extend(microcategories_names)
    subcategory_id_cate_names_dict[dominant_subcategory_id].extend(cate_names)

subcategory_id_microcategory_name_dict = {
    subcategory_id: Counter(names).most_common()[0][0]
    for subcategory_id, names in subcategory_id_microcategory_names_dict.items()
}

# Subcategories whose devices are all in the 'Other' category have no votes
# and are left out.
subcategory_id_cate_name_dict = {
    subcategory_id: Counter(names).most_common()[0][0]
    for subcategory_id, names in subcategory_id_cate_names_dict.items()
    if names
}

In [60]:
def get_metasubcategory_name(s):
    """Pick the metasubcategory name for one device row.

    Lookup order: the device's model in ``model_microcategory_name_dict``,
    then its subcategory_id in ``subcategory_id_microcategory_name_dict``,
    and finally the literal ``'Other'``.

    Args:
        s: a row-like object exposing ``'model'`` and ``'subcategory_id'``.
    """
    model = s['model']
    if model in model_microcategory_name_dict:
        return model_microcategory_name_dict[model]
    return subcategory_id_microcategory_name_dict.get(s['subcategory_id'], 'Other')

def get_metacategory_name(s):
    """Pick the metacategory name for one device row.

    Returns the category voted for the row's subcategory_id in
    ``subcategory_id_cate_name_dict`` when there is one, otherwise the row's
    own ``cate_name``.

    Args:
        s: a row-like object exposing ``'cate_name'`` and ``'subcategory_id'``.
    """
    own_cate_name = s['cate_name']
    return subcategory_id_cate_name_dict.get(s['subcategory_id'], own_cate_name)

devices_pd['metasubcategory_name'] = devices_pd.apply(get_metasubcategory_name, axis=1)
devices_pd['metacategory_name'] = devices_pd.apply(get_metacategory_name, axis=1)

# Share of devices still labelled 'Other' in, respectively, the new
# metasubcategory, the original category and the new metacategory.
# `.get` keeps the report working when a column has no 'Other' rows at all.
print(*(
    devices_pd[column].value_counts(normalize=True).get('Other', 0.0)
    for column in ('metasubcategory_name', 'cate_name', 'metacategory_name')
))

0.008242664029014177 0.23112429937355752 0.09990108803165183


In [61]:
# devices_pd[['name_en', 'cate_name', 'metacategory_name', 'metasubcategory_name', 'subcategory_id']][devices_pd['cate_name'] == 'Other']

In [62]:
# Wrong matching name ! LEGACY

# # Model - Subcategory ID
# model_subcategory_id_dict = pd.Series(devices_pd.subcategory_id.values, index=devices_pd.model).to_dict()

# # Microcategory Name - Subcategory ID
# microcategory_name_subcategory_ids_dict = defaultdict(list)

# for index, row in category_name_pd.iterrows():
#     for m in row['models']:
#         if m in model_subcategory_id_dict:
#             microcategory_name_subcategory_ids_dict[row['name_en']].append(model_subcategory_id_dict[m])

# microcategory_name_subcategory_id_dict = {}    
# for subcategory_name, subcategory_ids in subcategory_name_subcategory_ids_dict.items():
#     microcategory_name_subcategory_id_dict[subcategory_name] = Counter(subcategory_ids).most_common()[0][0]
    
# # Subcategory ID - Subcategory Name
# subcategory_id_subcategory_name_dict = reverse_dict(subcategory_name_subcategory_id_dict)
    
# # Subcategory Name - Category Name
# devices_pd['subcategory_name'] = devices_pd['subcategory_id'].apply(lambda x: subcategory_id_subcategory_name_dict.get(x, 'Unknown'))
# subcate_cate_dict = (
#     devices_pd[devices_pd['subcategory_name'] != 'Unknown']
#         .groupby('subcategory_name').apply(lambda x: Counter(x['cate_name']).most_common()[0][0]).to_dict()
# )

In [63]:
# Frame dimensions before the cleaning step that follows.
devices_pd.shape

(3033, 59)

# Clean devices

In [64]:
# Remove all virtual devices
is_virtual = (
    devices_pd['name_en'].str.contains('virtual', case=False)
    | devices_pd['desc_en'].str.contains('virtual', case=False)
    | devices_pd['model'].str.contains('virtual', case=False)
    | devices_pd['model'].str.contains('vtl', case=False)
)
devices_pd = devices_pd[~is_virtual]

# Remove strange devices that contain foreign model_id in description.
# The model set is built once instead of once per row; `.copy()` makes the
# result an independent frame so the assignments below do not write into a
# slice of the previous one.
known_models = set(devices_pd['model'].unique())
devices_pd = devices_pd[~devices_pd['desc_en'].isin(known_models)].copy()

# Swap desc_en -> model_id for others: when the description itself looks like
# a model id, it replaces the model and the description is cleared.
condition = devices_pd['desc_en'].str.contains(REGEXP_STR_MODEL_ID, case=True, regex=True)
devices_pd.loc[condition, 'model'] = devices_pd.loc[condition, 'desc_en']
devices_pd.loc[condition, 'desc_en'] = ''

devices_pd.shape

(2971, 59)

# Product Number

### Automatic (from descriptions)

In [65]:
# Models whose regexp-extracted product number must be discarded (the
# automatic extraction cell blanks the product number for these models).
# The duplicated 'rotai.massage.rt5870' entry was removed.
regexp_exclusion_product_numbers = [
    'ikecin.airfresh.95',
    'rotai.magic_touch.sx300',
    'rotai.massage.rt5870',
    'viomi.dishwasher.v12',
    'viomi.aircondition.v21',
    'isleep.blanket.hs2001',
    'lemesh.switch.sw2a02',
    'viomi.airpurifier.v2',
    'lemesh.switch.sw1a02',
    'tsd.light.test01',
    'tsd.light.tp1',
    'linked.light.ft155q',
    'innolinks.plug.ap3200',
    'ezhome.switch.z4002',
    'wainft.light.wy0a01',
    'viomi.airpurifier.v1',
    'linked.light.ft179',
    'linked.airpurifier.800fac',
    'ezhome.switch.z4003',
    'viomi.bhf_light.v4',
    'yeelink.light.ceiling15',
    'kuju.plug.36',
    'linp.gateway.n2',
    'tsd.light.tsl001',
    'imou99.camera.tp2',
    'jihisi.light.wy0a01',
    'lemesh.switch.sw3a02',
    'linked.light.ft154d',
    'ezhome.switch.z4001',
    'viomi.fan.v5',
    'knx.light.ktexxd',
    'scds.light.wy0a01',
    'xiaomi.aircondition.ma5',
    'ikea.light.led1537r6',
    'lumi.acpartner.v3',
    'scmkcz.switch.cr2p',
    'ikea.light.led1650r5',
]

In [66]:
# REGEXP_STR_PRODUCT_CODE is defined once, in the configuration cell.

# Extract every product-code-looking token from the English description.
# Codes are de-duplicated and sorted so the joined string is identical on
# every run (plain set iteration order is not stable for strings).
devices_pd['product_number'] = (
    devices_pd['desc_en']
    .str.findall(REGEXP_STR_PRODUCT_CODE)
    .apply(lambda codes: ', '.join(sorted(set(codes))))
)

# Discard matches for explicitly excluded models and anything containing 'ESP'.
is_excluded = (
    devices_pd['model'].isin(regexp_exclusion_product_numbers)
    | devices_pd['product_number'].str.contains('ESP', regex=False)
)
devices_pd.loc[is_excluded, 'product_number'] = ''

# model -> automatically detected product number(s)
product_number_auto_dict = dict(
    devices_pd.loc[
        devices_pd['product_number'] != '',
        ['model', 'product_number'],
    ].values
)
len(product_number_auto_dict)

87

### Manual

In [67]:
# Load the 'manual' product-number sheet; the first CSV column is used as the index.
# NOTE(review): a later cell writes devices_pd columns back to this same path
# (FILE_PATH_MANUAL_PRODUCT_NUMBERS_RAW), so the file is both an input and an
# output of this notebook.
product_number_manual_raw_pd = pd.read_csv(FILE_PATH_MANUAL_PRODUCT_NUMBERS_RAW, index_col=0)

In [68]:
# Rows whose product_number is only a placeholder ('-' or '?'): keep the
# marker in product_additional and blank the product_number itself.
placeholder_mask = product_number_manual_raw_pd['product_number'].isin(['-', '?'])
product_number_manual_raw_pd.loc[placeholder_mask, 'product_additional'] = (
    product_number_manual_raw_pd.loc[placeholder_mask, 'product_number']
)
product_number_manual_raw_pd.loc[placeholder_mask, 'product_number'] = pd.NA

In [69]:
# Keep only the rows that carry some manual information: a product number,
# an additional note, or both.
has_product_number = product_number_manual_raw_pd['product_number'].notnull()
has_product_additional = product_number_manual_raw_pd['product_additional'].notnull()

product_number_manual_pd = (
    product_number_manual_raw_pd
    .loc[
        has_product_number | has_product_additional,
        ['model_id', 'product_number', 'product_additional'],
    ]
    .reset_index(drop=True)
)
product_number_manual_pd['product_number_source_type'] = 'manual'
len(product_number_manual_pd)

298

In [70]:
# model_id -> manually entered product number (rows without a number are skipped).
_manual_with_number = product_number_manual_pd.dropna(subset=['product_number'])
product_number_manual_dict = dict(
    zip(_manual_with_number['model_id'], _manual_with_number['product_number'])
)
len(product_number_manual_dict)

287

### Hardcoded (temporarily manual)

In [71]:
# model_id -> product number, maintained by hand directly in the notebook.
# These entries are applied last when the sources are merged.
product_number_hardcode_dict = dict([
    ('lumi.switch.b2naus01', 'WS-USC04'),
    ('lumi.light.cwjwcn01', 'JWSP001A'),
    ('lumi.switch.b1laus01', 'WS-USC01'),
    ('lumi.light.cbacn1', 'HLQDQ01LM'),
    ('lumi.light.rgbac1', 'ZNTGMK11LM'),
    ('lumi.switch.b2laus01', 'WS-USC02'),
    ('lumi.switch.l2aeu1', 'WS-EUK02'),
    ('lumi.light.cwjwcn02', 'JWDL001A'),
])

### Union

In [72]:
# Merge the three sources; later sources override earlier ones for the same
# model (auto < manual < hardcode).
product_number_dict = {
    **product_number_auto_dict,
    **product_number_manual_dict,
    **product_number_hardcode_dict,
}
len(product_number_dict)

288

In [77]:
# One row per model that has a product number from any source.
product_number_pd = pd.DataFrame(
    list(product_number_dict.items()),
    columns=['model_id', 'product_number'],
)

def product_number_source_type(d):
    """Label where the product number of a row comes from.

    The automatic dictionary is checked first, so a model present in both the
    automatic and the manual/hardcoded dictionaries is labelled ``'auto'``.

    Args:
        d: a row-like object exposing ``'model_id'``.

    Returns:
        ``'auto'``, ``'manual'`` or ``''`` when the model is in none of the
        dictionaries.
    """
    model_id = d['model_id']
    if model_id in product_number_auto_dict:
        return 'auto'
    if model_id in product_number_manual_dict or model_id in product_number_hardcode_dict:
        return 'manual'
    return ''

product_number_pd['product_number_source_type'] = product_number_pd.apply(
    product_number_source_type,
    axis=1
)

# The outer merge brings in manual rows that only carry 'product_additional'
# (no product number); they have no source type yet and are labelled 'manual'.
product_number_pd = product_number_pd.merge(
    product_number_manual_pd[['model_id', 'product_additional']],
    how='outer',
    on='model_id',
)
# Assign the result back instead of `fillna(inplace=True)` on a column
# selection: the in-place form is chained assignment and does not update the
# frame under pandas Copy-on-Write.
product_number_pd['product_number_source_type'] = (
    product_number_pd['product_number_source_type'].fillna('manual')
)
product_number_pd = product_number_pd.sort_values('product_number_source_type', ascending=False).reset_index(drop=True)

# assert product_number_pd.shape[0] == len(product_number_pd['model_id'].unique())

In [78]:
# Preview only the first rows: display.max_rows is unlimited in this notebook,
# so a bare frame would be rendered in full.
product_number_pd.head(10)

Unnamed: 0,model_id,product_number,product_number_source_type,product_additional
0,lumi.sensor_86sw1.v1,WXKG03LM,manual,2016 rev1
1,lumi.sensor_86sw2.v1,WXKG02LM,manual,"2016 rev1, Double Wall Button"
2,lumi.weather.v1,WSDCGQ11LM,manual,
3,lumi.remote.b486opcn01,WXCJKG12LM,manual,Opple Four Button
4,ikea.light.led1649c5,LED1649C5,manual,
5,lumi.sensor_wleak.aq1,SJCGQ11LM,manual,
6,lumi.relay.c2acn01,LLKZMK11LM,manual,
7,lumi.airrtc.vrfegl01,KTBL01LM,manual,
8,lumi.switch.b2nacn02,QBKG24LM,manual,Double Wall Switch D1
9,zimi.mosq.v1,WX08ZM,manual,


In [79]:
# Persist the merged product-number table (auto + manual + hardcoded sources).
product_number_pd.to_csv(FILE_PATH_PROCESSED_PRODUCT_NUMBERS)

In [80]:
# Replace the regexp-only 'product_number' column with the merged table.
# Every column the merge is about to add is dropped first (when present), so
# re-running this cell does not create '_x'/'_y' suffixed duplicates, and
# nothing is modified in place on a filtered frame.
incoming_columns = [c for c in product_number_pd.columns if c != 'model_id']
devices_pd = (
    devices_pd
    .drop(columns=incoming_columns, errors='ignore')
    .merge(
        product_number_pd,
        how='left',
        left_on=['model'],
        right_on=['model_id']
    )
    .drop('model_id', axis=1)
)
# assert devices_pd[~devices_pd['product_number_source_type'].isnull()].shape[0] == len(product_number_dict)

In [81]:
# Write the current product-number view of every device back to the manual
# sheet (same path that is read at the start of the "Manual" section), with
# 'model' exported as 'model_id'.
(
    devices_pd[[
        'name_en',
        'desc_en',
        'model',
        'product_number',
        'product_additional',
        'product_number_source_type',
    ]]
    .rename(columns={'model': 'model_id'})
    .to_csv(FILE_PATH_MANUAL_PRODUCT_NUMBERS_RAW)
)

# Save enriched devices data

In [82]:
# Write the enriched device table to the processed-data directory.
devices_pd.to_csv(FILE_PATH_PROCESSED_DEVICES)

# Download icons

In [83]:
# wget -O does not create missing directories.
os.makedirs(ASSETS_ICONS_RAW_PATH, exist_ok=True)

for icon_url, model in tqdm.notebook.tqdm(
    zip(devices_pd['icon_real'], devices_pd['model']),
    total=devices_pd.shape[0],
):
    # Skip devices without an icon URL (empty string or missing value).
    if not isinstance(icon_url, str) or not icon_url:
        continue

    # The extension is taken from the URL path instead of the (slow) MIME
    # headers; it may not match the real image format.
    img_ext = os.path.splitext(urllib.parse.urlparse(icon_url).path)[1]
    img_path = f'{os.path.join(ASSETS_ICONS_RAW_PATH, model)}{img_ext}'

    # Keep files that already exist and are recognised as images.
    if os.path.isfile(img_path) and imghdr.what(img_path):
        continue

    try:
        # Argument list, no shell: URL and path need no quoting and cannot be
        # interpreted as shell syntax.
        output = subprocess.check_output(['wget', '--tries=10', '-O', img_path, icon_url])
    except (subprocess.CalledProcessError, OSError) as e:
        # Best effort: report the failure and carry on with the next icon.
        print(f'Failed to download icon for {model}: {e}')

  0%|          | 0/2973 [00:00<?, ?it/s]

In [None]:
# for img_name in os.listdir(ASSETS_ICONS_RAW_PATH):
#     try:
#         dest_img_path = os.path.join(ASSETS_ICONS_PROCESSED_PATH, img_name)
#         src_img_path = os.path.join(ASSETS_ICONS_RAW_PATH, img_name)
#         if (not os.path.isfile(dest_img_path) or not imghdr.what(dest_img_path)):
#             im = Image.open(src_img_path)
#             im.thumbnail([128, 128], Image.ANTIALIAS)
#             im.save(os.path.join(dest_img_path), "PNG")
#     except IOError:
#         print("cannot create thumbnail for '%s'" % item)

# Create Table

In [84]:
# Build the frame the README is rendered from:
#   1. keep the display columns plus the two grouping columns;
#   2. sort by category, subcategory, English name and model;
#   3. group by (metacategory_name, metasubcategory_name) and, per group, put
#      the five display columns back together;
#   4. rename the columns to their README headers and replace NaN with ''.
# The README cell iterates the result with groupby(level=0) / groupby(level=1)
# and drops index levels named 'metacategory_name' / 'metasubcategory_name'.
# NOTE(review): presumably groupby().apply() prepends the group keys as index
# levels here — verify against the installed pandas version.
df = (
    devices_pd[['icon_real', 'name_en', 'desc_en', 'metacategory_name', 'metasubcategory_name', 'model', 'product_number']]
        .sort_values(['metacategory_name', 'metasubcategory_name', 'name_en', 'model'])
        .reset_index()
        .groupby(['metacategory_name', 'metasubcategory_name'])
        .apply(lambda x: pd.concat([x['icon_real'], x['name_en'], x['desc_en'], x['model'], x['product_number']], axis=1))
        .rename({
            'icon_real': 'Pic', 
            'name_en': 'Name', 
            'desc_en': 'Description', 
            'model': 'Model',
            'product_number': 'Product Number'
        }, axis=1)
        .fillna('')
#         .rename({'metacategory_name': 'Category', 'metasubcategory_name': 'Subcategory'}, axis=0)
)

# Alternative: link straight to the remote icon.
# df['Pic'] = df['Pic'].apply(lambda x: f'<img src="{x}">')

img_pre = './assets/icons/raw/'
# img_pre = '/home/home/Projects/mizoo/assets/icons/raw/'


def _icon_img_tag(row):
    """Return the <img> tag pointing at the locally downloaded icon of a row.

    At this point 'Pic' still holds the icon URL. The icon download cell saves
    each file as <model> + <extension of the URL path>, so the same extension
    is derived here instead of assuming '.png'. Rows without a URL keep the
    previous '.png' name.
    """
    icon_url = row['Pic']
    if icon_url:
        img_ext = os.path.splitext(urllib.parse.urlparse(icon_url).path)[1]
    else:
        img_ext = '.png'
    return f'<img src="{img_pre}{row["Model"]}{img_ext}">'


df['Pic'] = df.apply(_icon_img_tag, axis=1)

# df.head()

In [85]:
# Render the README: one "# category" section per metacategory, one
# "## subcategory" heading per metasubcategory, each followed by an HTML table
# of its devices. The file is written as UTF-8 explicitly because names may
# contain non-ASCII characters and the platform default encoding may not be
# able to represent them.
with open('../README.md', 'w', encoding='utf-8') as f:
    print('# Mi-zoo\n', file=f)
    print('<sup><sub>Complete list of Xiaomi Mi Smart Home Ecosystem devices with descriptions and codes.</sub></sup>\n---', file=f)
    for category, df_sub in df.groupby(level=0):
        print(f'---\n# {category}', file=f)
        for subcategory, df_dev in df_sub.groupby(level=1):
            print(f'## {subcategory}', file=f)
            table_html = (
                df_dev[['Pic', 'Name', 'Model', 'Product Number']]
                .reset_index(level=['metasubcategory_name', 'metacategory_name'], drop=True)
                # escape=False keeps the <img> tags of the 'Pic' column as HTML
                .to_html(escape=False)
                # literal backslash-n sequences in the data become line breaks
                .replace('\\n', '<br />')
            )
            print(table_html, file=f)
            print('', file=f)