In [1]:
import requests
from urllib.parse import urljoin
import pandas as pd
pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', None)  # or 1000
pd.set_option('display.max_colwidth', None)  # or 199

import os
import subprocess
import zipfile
import json
from pandas_profiling import ProfileReport
from collections import defaultdict, Counter
import unicodedata
from googletrans import Translator
from langdetect import detect
import tqdm
import time
from urllib.parse import urlsplit, urlunsplit
import imghdr
import urllib
from html.parser import HTMLParser
from PIL import Image

In [2]:
def clean_string(s):
    """Normalize a raw string taken from the device/category data.

    Steps, in order: NFKD-normalize, pad parentheses with spaces, turn em
    dashes and degree signs into spaces, run one pass of double-space and one
    pass of triple-space replacement, strip surrounding whitespace and finally
    decode HTML character references such as ``&amp;``.

    Args:
        s: The string to clean.

    Returns:
        The cleaned string.
    """
    # html.unescape replaces HTMLParser().unescape, which was deprecated in
    # Python 3.4 and removed in Python 3.9. It is imported locally so that this
    # definition works without touching the notebook's import cell.
    from html import unescape

    return unescape(
        unicodedata.normalize('NFKD', s)
            .replace('(', ' (')
            .replace(')', ') ')
            # NFKD normally folds full-width parentheses to ASCII already;
            # these two replacements are kept as a safeguard.
            .replace('（', ' (')
            .replace('）', ') ')
            .replace('—', ' ')
            .replace('°', ' ')
            .replace('  ', ' ')
            .replace('   ', ' ')
            .strip()
    )

def is_english(s):
    """Report whether ``s`` consists of ASCII characters only.

    The string is encoded as UTF-8 and the resulting bytes are decoded as
    ASCII; a decoding failure means at least one non-ASCII character is present.

    Args:
        s: The string to test.

    Returns:
        True when every character is ASCII (including the empty string),
        False otherwise.
    """
    utf8_bytes = s.encode(encoding='utf-8')
    try:
        utf8_bytes.decode('ascii')
    except UnicodeDecodeError:
        return False
    return True
    
def remove_query_params_and_fragment(url):
    """Return ``url`` with its query string and fragment removed.

    Args:
        url: The URL to strip.

    Returns:
        The URL rebuilt from its scheme, network location and path only.
    """
    parts = urlsplit(url)
    return urlunsplit((parts.scheme, parts.netloc, parts.path, "", ""))

def reverse_dict(mydict):
    """Invert a mapping, collecting all keys that share a value.

    Args:
        mydict: Mapping whose values are hashable.

    Returns:
        A ``defaultdict(list)`` mapping each value of ``mydict`` to the list of
        keys that pointed to it, in the iteration order of ``mydict``.
    """
    keys_by_value = defaultdict(list)
    for original_key in mydict:
        keys_by_value[mydict[original_key]].append(original_key)
    return keys_by_value
    
# Shared googletrans client used by the translation cells below.
translator = Translator(service_urls=['translate.google.com'])

In [3]:
# --- Download source -------------------------------------------------------
VEVS_BASE_URL = 'https://ru.kapiba.ru/mihome/files/old/'
# The password can be supplied through the VEVS_PASSWORD environment variable
# so it does not have to live in the notebook; the literal is only the fallback
# that keeps "Restart & Run All" working without any configuration.
VEVS_PASSWORD = os.environ.get('VEVS_PASSWORD', '7549845563')

# --- Directory layout (relative to the notebook) ----------------------------
ASSETS_PATH = '../assets'
ASSETS_ICONS_RAW_PATH = os.path.join(ASSETS_PATH, 'icons', 'raw')
ASSETS_ICONS_PROCESSED_PATH = os.path.join(ASSETS_PATH, 'icons', 'processed')

DATA_PATH_ROOT = '../data'
DATA_PATH_RAW = os.path.join(DATA_PATH_ROOT, 'raw')
DATA_PATH_PROCESSED = os.path.join(DATA_PATH_ROOT, 'processed')

# --- Raw files extracted from the APK ----------------------------------------
FILE_NAME_RAW_DEVICES = 'plugin_config_all'
FILE_PATH_RAW_DEVICES = os.path.join(DATA_PATH_RAW, FILE_NAME_RAW_DEVICES)
# The "deivce" spelling matches the member name in ARCHIVE_PATH_CATEGORIES
# below; do not "correct" it.
FILENAME_RAW_CATEGORIES = 'recommend_deivce_name.json'
FILE_PATH_RAW_CATEGORIES = os.path.join(DATA_PATH_RAW, FILENAME_RAW_CATEGORIES)

# --- Processed outputs --------------------------------------------------------
FILE_NAME_PROCESSED_DEVICES = 'devices.csv'
FILE_PATH_PROCESSED_DEVICES = os.path.join(DATA_PATH_PROCESSED, FILE_NAME_PROCESSED_DEVICES)
FILE_NAME_PROCESSED_CATEGORIES = 'categories.csv'
FILE_PATH_PROCESSED_CATEGORIES = os.path.join(DATA_PATH_PROCESSED, FILE_NAME_PROCESSED_CATEGORIES)

FILE_NAME_PROCESSED_SUBCATE_CATE_DICT = 'subcate_cate.json'
FILE_PATH_PROCESSED_SUBCATE_CATE_DICT = os.path.join(DATA_PATH_PROCESSED, FILE_NAME_PROCESSED_SUBCATE_CATE_DICT)

# --- Translation caches (source text -> English text) --------------------------
FILE_NAME_PROCESSED_NAME_TRANSLATIONS = 'name_translations.json'
FILE_PATH_PROCESSED_NAME_TRANSLATIONS = os.path.join(DATA_PATH_PROCESSED, FILE_NAME_PROCESSED_NAME_TRANSLATIONS)
FILE_NAME_PROCESSED_DESC_TRANSLATIONS = 'desc_translations.json'
FILE_PATH_PROCESSED_DESC_TRANSLATIONS = os.path.join(DATA_PATH_PROCESSED, FILE_NAME_PROCESSED_DESC_TRANSLATIONS)

# --- Member paths inside the APK archive ----------------------------------------
ARCHIVE_PATH_DEVICES = 'assets/device_config/plugin_config_all'
ARCHIVE_PATH_CATEGORIES = 'assets/cardControl/recommend_deivce_name.json'

# Extract Data

##### Download MiHome APK

In [4]:
# Fetch the password-protected directory listing of the download host.
session = requests.Session()
session.auth = ('', VEVS_PASSWORD)
page = session.get(VEVS_BASE_URL)
# Fail here on an HTTP error instead of trying to parse an error page below.
page.raise_for_status()

# Select latest file: the last APK row of the listing table
# (presumably the listing is ordered oldest-first -- verify if the host changes).
listing_pd = pd.read_html(page.content)[0]
vevs_package_filename = (
    listing_pd.loc[listing_pd['Type'] == 'application/vnd.android.package-archive', 'Name']
    .iloc[-1]
)

# The file name comes from a remote page, so wget is invoked with an argument
# list (no shell) to keep it from being interpreted as shell syntax.
# --no-clobber leaves an already downloaded APK untouched.
wget_command = [
    'wget',
    '--user', '',
    '--password', VEVS_PASSWORD,
    '--read-timeout=5',
    '--tries=0',
    '--no-clobber',
    '-P', DATA_PATH_RAW,
    urljoin(VEVS_BASE_URL, vevs_package_filename),
]
output = subprocess.check_output(wget_command)

##### Extract Data from APK

In [5]:
# Copy the devices and categories files out of the downloaded APK (opened as a
# zip archive) into the raw data directory.
# The members could also be read directly from the archive without writing
# them to disk:
#     archive = zipfile.ZipFile(vevs_package_filename, 'r')
#     archive.read(ARCHIVE_PATH_DEVICES)
apk_path = os.path.join(DATA_PATH_RAW, vevs_package_filename)
extraction_plan = (
    (ARCHIVE_PATH_DEVICES, FILE_PATH_RAW_DEVICES),
    (ARCHIVE_PATH_CATEGORIES, FILE_PATH_RAW_CATEGORIES),
)

with zipfile.ZipFile(apk_path, 'r') as apk:
    for member_name, target_path in extraction_plan:
        with open(target_path, 'wb') as target_file:
            target_file.write(apk.read(member_name))

##### Load Devices data

In [6]:
# Load the raw device list into a DataFrame and clean its text columns.
# The encoding is pinned to UTF-8 (the JSON default) so that reading does not
# depend on the platform's default codec.
with open(FILE_PATH_RAW_DEVICES, 'r', encoding='utf-8') as f:
    devices_json = json.load(f)
devices_pd = pd.DataFrame.from_dict(devices_json['result']['devices']['list'])
for text_column in ('name', 'desc', 'model'):
    devices_pd[text_column] = devices_pd[text_column].apply(clean_string)
# The raw description is kept as the source text; the translation cell adds "desc_en".
devices_pd = devices_pd.rename(columns={"desc": "desc_cn"})
# devices_pd.head()

  return HTMLParser().unescape(


# Translate

##### Devices names

In [7]:
# Load the cache of already translated device names (source name -> English).
# On a first run the cache file does not exist yet; start with an empty cache
# instead of failing, and let the translation loop fill it.
if os.path.isfile(FILE_PATH_PROCESSED_NAME_TRANSLATIONS):
    with open(FILE_PATH_PROCESSED_NAME_TRANSLATIONS, 'r', encoding='utf-8') as f:
        devices_name_translation_dict = json.load(f)
else:
    devices_name_translation_dict = {}

In [8]:
# Split every device name into "name_cn" (the original, when it is not pure
# ASCII) and "name_en" (its English form). Translations are looked up in the
# cache first; only cache misses hit the translation service, followed by a
# 2 second pause between requests.
devices_name_cn_list = []
devices_name_en_list = []

for raw_name in tqdm.notebook.tqdm(devices_pd['name'], total=devices_pd.shape[0]):
    needs_translation = bool(raw_name) and not is_english(raw_name)

    if not needs_translation:
        # Empty or ASCII-only names are used as the English name unchanged.
        name_cn, name_en = '', raw_name
    elif raw_name in devices_name_translation_dict:
        name_cn, name_en = raw_name, devices_name_translation_dict[raw_name]
    else:
        name_cn = raw_name
        name_en = translator.translate(raw_name, src='zh-cn', dest='en').text
        devices_name_translation_dict[raw_name] = name_en
        time.sleep(2)

    devices_name_cn_list.append(name_cn)
    devices_name_en_list.append(name_en)

devices_pd['name_cn'] = devices_name_cn_list
devices_pd['name_en'] = devices_name_en_list

  0%|          | 0/3033 [00:00<?, ?it/s]

In [9]:
# for name_cn, name_en in zip(devices_name_cn_list, devices_name_en_list):
#     if name_cn and name_cn not in devices_name_translation_dict:
#         devices_name_translation_dict[name_cn] = name_en

In [10]:
# Persist the name translation cache, including entries added by the loop above.
with open(FILE_PATH_PROCESSED_NAME_TRANSLATIONS, 'w') as cache_file:
    cache_file.write(json.dumps(devices_name_translation_dict))

##### Devices descriptions

In [11]:
# Load the cache of already translated device descriptions (source -> English).
# On a first run the cache file does not exist yet; start with an empty cache
# instead of failing, and let the translation loop fill it.
if os.path.isfile(FILE_PATH_PROCESSED_DESC_TRANSLATIONS):
    with open(FILE_PATH_PROCESSED_DESC_TRANSLATIONS, 'r', encoding='utf-8') as f:
        devices_desc_translation_dict = json.load(f)
else:
    devices_desc_translation_dict = {}

In [12]:
# Build the English description column "desc_en" from "desc_cn".
# Empty descriptions stay empty, ASCII-only ones are copied as they are, and
# everything else is taken from the cache or translated (with a 2 second pause
# after each request to the translation service).
devices_desc_en_list = []

for desc_cn in tqdm.notebook.tqdm(devices_pd['desc_cn'], total=devices_pd.shape[0]):
    if not desc_cn:
        desc_en = ''
    elif is_english(desc_cn):
        desc_en = desc_cn
    elif desc_cn in devices_desc_translation_dict:
        desc_en = devices_desc_translation_dict[desc_cn]
    else:
        desc_en = translator.translate(desc_cn, src='zh-cn', dest='en').text
        devices_desc_translation_dict[desc_cn] = desc_en
        time.sleep(2)

    devices_desc_en_list.append(desc_en)

devices_pd['desc_en'] = devices_desc_en_list

  0%|          | 0/3033 [00:00<?, ?it/s]

In [13]:
# devices_desc_translation_dict = {}

# for desc_cn, desc_en in zip(list(devices_pd['desc_cn'].values), list(devices_pd['desc_en'].values)):
#     if desc_cn and desc_cn not in devices_desc_translation_dict:
#         devices_desc_translation_dict[desc_cn] = desc_en

In [14]:
# Persist the description translation cache, including entries added by the loop above.
with open(FILE_PATH_PROCESSED_DESC_TRANSLATIONS, 'w') as cache_file:
    cache_file.write(json.dumps(devices_desc_translation_dict))

# Create Metacategories

**The original dependency hell looks like this:**
- subcategory_id (one) -> microcategory_name (many)
- microcategory_name (one) -> devices (many) -> subcategory_id (many)
- cate_name (one) -> subcategory_id (many)

**Our Metasubcategory transformation:**

If a device does not exist in:
- microcategory_name (one) -> devices (many)

Then:
- microcategory_name (one) -> subcategory_id (one, most popular)
- subcategory_id (one) -> microcategory_name (one, most popular)

In [15]:
def _category_name_en(product_type):
    """Return the English name of one ``product_type`` record.

    The ``en`` entry of ``defaultName`` is used when present; otherwise the
    ``zh_CN`` entry is sent to the translation service.
    """
    default_name = product_type['defaultName']
    if 'en' in default_name:
        return default_name['en']
    return translator.translate(default_name.get('zh_CN'), src='zh-cn', dest='en').text


# Load the category ("microcategory") definitions and derive, per category,
# its list of device models and a cleaned, capitalized English name.
# The encoding is pinned to UTF-8 (the JSON default) so that reading does not
# depend on the platform's default codec.
with open(FILE_PATH_RAW_CATEGORIES, 'r', encoding='utf-8') as f:
    category_name_json = json.load(f)
category_name_pd = pd.DataFrame.from_dict(category_name_json['data'])
category_name_pd['models'] = category_name_pd['product_type'].apply(lambda x: list(x['model']))
category_name_pd['name_en'] = category_name_pd['product_type'].apply(_category_name_en)
category_name_pd['name_en'] = category_name_pd['name_en'].apply(lambda x: clean_string(x).capitalize())
category_name_pd.to_csv(FILE_PATH_PROCESSED_CATEGORIES)
# category_name_pd.head()

  return HTMLParser().unescape(


In [16]:
# Lookup tables keyed by device model: model -> subcategory id and
# model -> category name (for a repeated model the last row wins).
model_subcategory_id_dict = devices_pd.set_index('model')['subcategory_id'].to_dict()
model_cate_name_dict = devices_pd.set_index('model')['cate_name'].to_dict()

In [17]:
# Build the lookup tables behind the metacategory assignment:
#   model_microcategory_name_dict           model          -> microcategory name
#   subcategory_id_microcategory_name_dict  subcategory id -> most frequent microcategory name
#   subcategory_id_cate_name_dict           subcategory id -> most frequent category name (ignoring 'Other')
# Every microcategory votes for the subcategory id that is most common among
# its known models.
model_microcategory_name_dict = {}
subcategory_id_microcategory_names_dict = defaultdict(list)
subcategory_id_cate_names_dict = defaultdict(list)

for index, row in category_name_pd.iterrows():
    microcategory_name = row['name_en']

    # Only models that also appear in the devices table can take part.
    known_models = [m for m in row['models'] if m in model_subcategory_id_dict]
    if not known_models:
        # No known device in this microcategory: there is nothing to vote with.
        # Skipping avoids the IndexError that most_common()[0] raised on an
        # empty Counter.
        continue

    subcategories_ids = [model_subcategory_id_dict[m] for m in known_models]
    cate_names = [
        model_cate_name_dict[m] for m in known_models if model_cate_name_dict[m] != 'Other'
    ]
    for m in known_models:
        model_microcategory_name_dict[m] = microcategory_name

    top_subcategory_id = Counter(subcategories_ids).most_common(1)[0][0]
    # One vote per known model, so larger microcategories weigh more.
    subcategory_id_microcategory_names_dict[top_subcategory_id].extend(
        [microcategory_name] * len(known_models)
    )
    subcategory_id_cate_names_dict[top_subcategory_id].extend(cate_names)

subcategory_id_microcategory_name_dict = {
    subcategory_id: Counter(names).most_common(1)[0][0]
    for subcategory_id, names in subcategory_id_microcategory_names_dict.items()
}

# Subcategories whose devices are all in category 'Other' collected no votes
# and are left out, so the caller falls back to the device's own category.
subcategory_id_cate_name_dict = {
    subcategory_id: Counter(names).most_common(1)[0][0]
    for subcategory_id, names in subcategory_id_cate_names_dict.items()
    if names
}

In [18]:
def get_metasubcategory_name(s):
    """Pick the metasubcategory name for one device row.

    Lookup order: the device's model in ``model_microcategory_name_dict``, then
    its subcategory id in ``subcategory_id_microcategory_name_dict``, and
    ``'Other'`` when neither is known.

    Args:
        s: A device row providing ``'model'`` and ``'subcategory_id'``.

    Returns:
        The metasubcategory name.
    """
    model = s['model']
    if model in model_microcategory_name_dict:
        return model_microcategory_name_dict[model]
    return subcategory_id_microcategory_name_dict.get(s['subcategory_id'], 'Other')

def get_metacategory_name(s):
    """Pick the metacategory name for one device row.

    The category recorded for the row's subcategory id in
    ``subcategory_id_cate_name_dict`` wins; without such an entry the row's own
    ``'cate_name'`` is kept.

    Args:
        s: A device row providing ``'cate_name'`` and ``'subcategory_id'``.

    Returns:
        The metacategory name.
    """
    own_category = s['cate_name']
    return subcategory_id_cate_name_dict.get(s['subcategory_id'], own_category)

devices_pd['metasubcategory_name'] = devices_pd.apply(get_metasubcategory_name, axis=1)
devices_pd['metacategory_name'] = devices_pd.apply(get_metacategory_name, axis=1)

# Share of devices labelled 'Other' by the metasubcategory, the original
# category and the metacategory. .get() reports 0.0 instead of raising a
# KeyError when a column has no 'Other' at all.
print(
    *(
        devices_pd[column].value_counts(normalize=True).get('Other', 0.0)
        for column in ('metasubcategory_name', 'cate_name', 'metacategory_name')
    )
)

0.008242664029014177 0.23112429937355752 0.09990108803165183


In [19]:
# devices_pd[['name_en', 'cate_name', 'metacategory_name', 'metasubcategory_name', 'subcategory_id']][devices_pd['cate_name'] == 'Other']

In [20]:
# Wrong matching name ! LEGACY

# # Model - Subcategory ID
# model_subcategory_id_dict = pd.Series(devices_pd.subcategory_id.values, index=devices_pd.model).to_dict()

# # Microcategory Name - Subcategory ID
# microcategory_name_subcategory_ids_dict = defaultdict(list)

# for index, row in category_name_pd.iterrows():
#     for m in row['models']:
#         if m in model_subcategory_id_dict:
#             microcategory_name_subcategory_ids_dict[row['name_en']].append(model_subcategory_id_dict[m])

# microcategory_name_subcategory_id_dict = {}    
# for subcategory_name, subcategory_ids in subcategory_name_subcategory_ids_dict.items():
#     microcategory_name_subcategory_id_dict[subcategory_name] = Counter(subcategory_ids).most_common()[0][0]
    
# # Subcategory ID - Subcategory Name
# subcategory_id_subcategory_name_dict = reverse_dict(subcategory_name_subcategory_id_dict)
    
# # Subcategory Name - Category Name
# devices_pd['subcategory_name'] = devices_pd['subcategory_id'].apply(lambda x: subcategory_id_subcategory_name_dict.get(x, 'Unknown'))
# subcate_cate_dict = (
#     devices_pd[devices_pd['subcategory_name'] != 'Unknown']
#         .groupby('subcategory_name').apply(lambda x: Counter(x['cate_name']).most_common()[0][0]).to_dict()
# )

# Save enriched devices data

In [21]:
# Write the devices table, with all columns added above, to the processed data directory.
devices_pd.to_csv(path_or_buf=FILE_PATH_PROCESSED_DEVICES)

# Download icons

In [22]:
# Download every device icon into ASSETS_ICONS_RAW_PATH as "<model><ext>".
# Icons already on disk and recognised as images by imghdr are skipped, so the
# cell can be re-run to fetch only missing or broken files.
# NOTE: imghdr is deprecated since Python 3.11 and removed in 3.13.
os.makedirs(ASSETS_ICONS_RAW_PATH, exist_ok=True)  # wget -O does not create missing directories

for index, row in tqdm.notebook.tqdm(devices_pd.iterrows(), total=devices_pd.shape[0]):
    icon_url = row['icon_real']
    if not icon_url:
        continue

    # The extension is taken from the URL path, which avoids a slower MIME
    # header request, but it does not always match the real image format.
    img_ext = os.path.splitext(urllib.parse.urlparse(icon_url).path)[1]
    img_path = os.path.join(ASSETS_ICONS_RAW_PATH, row['model'] + img_ext)

    if os.path.isfile(img_path) and imghdr.what(img_path):
        continue

    try:
        # Argument list instead of a shell string: model names and URLs are
        # passed to wget verbatim, whatever characters they contain.
        subprocess.check_output(['wget', '--tries=10', '-O', img_path, icon_url])
    except (subprocess.CalledProcessError, OSError) as e:
        # Best effort: report the failure and continue with the next icon.
        print(e)

  0%|          | 0/3033 [00:00<?, ?it/s]

In [23]:
# for img_name in os.listdir(ASSETS_ICONS_RAW_PATH):
#     try:
#         dest_img_path = os.path.join(ASSETS_ICONS_PROCESSED_PATH, img_name)
#         src_img_path = os.path.join(ASSETS_ICONS_RAW_PATH, img_name)
#         if (not os.path.isfile(dest_img_path) or not imghdr.what(dest_img_path)):
#             im = Image.open(src_img_path)
#             im.thumbnail([128, 128], Image.ANTIALIAS)
#             im.save(os.path.join(dest_img_path), "PNG")
#     except IOError:
#         print("cannot create thumbnail for '%s'" % item)

# Create Table

In [24]:
def _icon_extension(icon_url):
    """Return the file extension under which the icon of ``icon_url`` was saved.

    Mirrors the download cell, which names files after the extension found in
    the URL path. Falls back to ``'.png'`` when there is no usable URL.
    """
    if not isinstance(icon_url, str) or not icon_url:
        return '.png'
    return os.path.splitext(urllib.parse.urlparse(icon_url).path)[1]


# Per-category table for the README: devices sorted and grouped by
# (metacategory, metasubcategory), with display column names.
df = (
    devices_pd[['icon_real', 'name_en', 'desc_en', 'metacategory_name', 'metasubcategory_name', 'model']]
        .sort_values(['metacategory_name', 'metasubcategory_name', 'name_en', 'model'])
        .reset_index()
        .groupby(['metacategory_name', 'metasubcategory_name'])
        .apply(lambda x: pd.concat([x['icon_real'], x['name_en'], x['desc_en'], x['model']], axis=1))
        .rename({'icon_real': 'Pic', 'name_en': 'Name', 'desc_en': 'Description', 'model': 'Model'}, axis=1)
)

# Replace the icon URL by an <img> tag pointing at the downloaded file. The
# extension is derived from the URL exactly as in the download cell, so the
# link matches the file on disk even when the icon is not a ".png".
img_pre = './assets/icons/raw/'
df['Pic'] = df.apply(
    lambda x: f'<img src="{os.path.join(img_pre, x["Model"] + _icon_extension(x["Pic"]))}">',
    axis=1
)

# df.head()

In [25]:
# Render the README: one section per category, one HTML table per subcategory.
# The file is written as UTF-8 explicitly; the table contains translated free
# text, which the platform's default codec may be unable to encode.
with open('../README.md', 'w', encoding='utf-8') as f:
    print('# Mi-zoo\n', file=f)
    print('<sup><sub>Complete list of Xiaomi Mi Smart Home Ecosystem devices with descriptions and codes.</sub></sup>\n---', file=f)
    for category, df_sub in df.groupby(level=0):
        print(f'---\n# {category}', file=f)
        for subcategory, df_dev in df_sub.groupby(level=1):
            print(f'## {subcategory}', file=f)
            table_html = (
                df_dev[['Pic', 'Name', 'Model', 'Description']]
                .reset_index(level=['metasubcategory_name', 'metacategory_name'], drop=True)
                # escape=False keeps the <img> tags of the "Pic" column as markup.
                .to_html(escape=False)
                # Turn literal backslash-n sequences in the text into line breaks.
                .replace('\\n', '<br />')
            )
            print(table_html, file=f)
            print('', file=f)