In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from downcfg import USHER_CFG

# Directory containing the media files and the metadata_db.csv table that the cells below read.
MEDIA_DIR = USHER_CFG.dest_dir 

In [None]:
# Load the metadata table stored in MEDIA_DIR, highest-starred rows first.
dbpath = os.path.join(MEDIA_DIR, "metadata_db.csv")
# Fail with the offending path in the message instead of a bare AssertionError.
assert os.path.exists(dbpath), f"metadata database not found: {dbpath}"
# keep_default_na=False stops pandas from turning empty cells (and strings
# such as "NA") into NaN -- presumably so text columns stay plain strings.
df = pd.read_csv(dbpath, keep_default_na=False).sort_values('stars', ascending=False)
df

In [None]:
# Count files per extension: lower-case the name, split once at the last dot,
# keep the right-hand part (column 1) and tally the values.
extensions = (
    df['name']
    .str.lower()
    .str.rsplit('.', n=1, expand=True)[1]
    .value_counts()
)
extensions.plot.bar(title="file extensions")

In [None]:
# Overview plot of the frame's columns, each drawn in its own subplot.
df.plot(
    figsize=(14, 12),
    subplots=True,
)

In [None]:
# Tag frequency: split each whitespace-separated 'tags' cell into single tags,
# flatten to one tag per row and count occurrences.
toptags = (
    df['tags']
    .str.split()
    .explode()
    .value_counts()
)
# Chart only the first 60 entries to keep the bar plot readable.
toptags[:60].plot.bar(title="tag frequency", figsize=(12, 4))

In [None]:
# Award frequency: split each whitespace-separated 'awards' cell into single
# awards, flatten to one award per row and count occurrences.
topawards = (
    df['awards']
    .str.split()
    .explode()
    .value_counts()
)
# Chart only the first 60 entries to keep the bar plot readable.
topawards[:60].plot.bar(title="award frequency", figsize=(12, 4))

In [None]:
# Disabled by default: flip the guard to True to regenerate ./tags_vocab.py
# from the tags currently present in the table (in value_counts order).
if False:
    # Python decodes source files as UTF-8 by default, so write the generated
    # module as UTF-8 explicitly instead of relying on the locale encoding;
    # otherwise non-ASCII tags could produce a module that fails to import.
    with open("./tags_vocab.py", "w", encoding="utf-8") as f:
        data = "VOCAB = [\n" + ''.join(f'  "{t}",\n' for t in toptags.index) + "]"
        f.write(data)

In [None]:
# Histogram of each rating-related column, 100 bins per histogram.
df.hist(
    column=['Glicko_pts', 'Glicko_rd', 'ELO_pts', 'stars', 'nmatches'],
    bins=100,
    figsize=(20, 12),
)

## Health checks

Metadata and file checks

In [None]:
import unittest
from tags_vocab import VOCAB
from metadata import ManualMetadata, get_metadata


class TestMeidaItem(unittest.TestCase):
    """Health check for one row of the metadata table.

    Every instance is bound to a single row. It verifies that the file named
    by the row exists in MEDIA_DIR, that the metadata read from the file equals
    the metadata stored in the row, and that the on-disk metadata is
    well-formed (non-empty tags, non-negative stars, only known vocabulary).
    """

    def __init__(self, row):
        # No methodName is passed, so unittest falls back to runTest().
        super().__init__()
        self.row = row

    def setUp(self):
        """Locate the file on disk and read its embedded metadata."""
        fullname = os.path.join(MEDIA_DIR, self.row['name'])
        self.assertTrue(os.path.exists(fullname), f"file is missing: {fullname}")
        self.disk_meta = get_metadata(fullname)

    def test_row(self):
        """The row has no NaN cells and agrees with the metadata stored in the file."""
        self.assertFalse(self.row.isna().any(), self.row)
        df_meta = ManualMetadata.from_str(self.row['tags'], int(self.row['stars']), self.row['awards'])
        self.assertEqual(df_meta, self.disk_meta)

    def test_disk_meta(self):
        """The file's own metadata is non-empty and uses only vocabulary words."""
        self.assertTrue(self.disk_meta.tags)
        self.assertGreaterEqual(self.disk_meta.stars, 0)
        unknown_tags = [t for t in self.disk_meta.tags if t not in VOCAB]
        self.assertEqual(unknown_tags, [], "tags missing from VOCAB")
        # Awards prefixed with "e_" must name a vocabulary word after the prefix.
        unknown_awards = [a for a in self.disk_meta.awards if a.startswith("e_") and a[2:] not in VOCAB]
        self.assertEqual(unknown_awards, [], "e_ awards missing from VOCAB")

    def runTest(self):
        """Run both checks for this row, each in its own subTest.

        subTest lets the second check run and be reported even when the first
        one fails, instead of being hidden behind the first failure.
        """
        for check in (self.test_row, self.test_disk_meta):
            with self.subTest(check=check.__name__):
                check()

    def shortDescription(self):
        return f"test for file {self.row['name']}"


# Build one test case per row for the first 301 rows of df and run them all.
suite = unittest.TestSuite()
suite.addTests(TestMeidaItem(record) for _idx, record in df[:301].iterrows())
unittest.TextTestRunner().run(suite)

Troublesome filenames:

In [None]:
import re
import random

def trouble_lvl(s:str) -> int:
    """Rate how troublesome a file name is: 0 is best, larger is worse.

    Rules are tried from most to least severe and the first one that applies
    decides the score:
      99  empty or whitespace-only name
      98  contains non-printable characters
      97  contains one of the forbidden characters  < > : " / \\ | ? * ,
      89  starts with '-'
      16  contains a character outside the "okay" set
      15  contains a character outside the "good" set
       3  contains a character outside the "excellent" set
       1  shorter than 10 characters
       0  none of the above
    """
    forbidden = re.escape(r'<>:"/\|?*,')
    excellent = r'_0-9a-zA-Z\.'
    good = excellent + r' АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдеёжзийклмнопрстуфхцчшщъыьэюя\+\-'
    okay = good + r'\[\]\(\)='

    def has_char_outside(allowed):
        # True when some character of s is not in the given character set.
        return re.search('[^' + allowed + ']', s) is not None

    # (predicate, score) pairs; predicates are evaluated lazily and in order,
    # so a later rule only runs if every earlier one did not apply.
    rules = (
        (lambda: not s or s.isspace(), 99),
        (lambda: not s.isprintable(), 98),
        (lambda: re.search('[' + forbidden + ']', s) is not None, 97),
        (lambda: s.startswith('-'), 89),
        (lambda: has_char_outside(okay), 16),
        (lambda: has_char_outside(good), 15),
        (lambda: has_char_outside(excellent), 3),
        (lambda: len(s) < 10, 1),
    )
    for applies, score in rules:
        if applies():
            return score
    return 0

def better_fname(fname:str) -> str:
    """Suggest a tamer file name made of ASCII letters, digits, '_' and '.'.

    The extension (as split by os.path.splitext) is kept untouched. In the
    remaining root each character is handled as follows:
      * ASCII letters, digits, '_' and '.' are kept;
      * space and - , + [ ] ( ) become '_';
      * Russian letters are transliterated (upper case stays upper case);
      * anything else is replaced by a random two-digit number.
    A resulting root shorter than 5 characters gets six random digits appended.

    Because of the random parts the result is not deterministic for names that
    contain unknown characters or have a very short root.
    """
    class Table:
        """Mapping handed to str.translate: code point -> replacement string."""
        rus = {'а':'a', 'б':'b', 'в':'v', 'г':'g', 'д':'d', 'е':'e', 'ё':'e', 'ж':'zh', 'з':'z', 'и':'i', 'й':'j', 'к':'k', 'л':'l', 'м':'m', 'н':'n', 'о':'o', 'п':'p', 'р':'r', 'с':'s', 'т':'t', 'у':'u', 'ф':'f', 'х':'h', 'ц':'c', 'ч':'ch', 'ш':'sh', 'щ':'sh', 'ъ':'_', 'ы':'i', 'ь':'_', 'э':'e', 'ю':'ju', 'я':'ya'}
        def __getitem__(self, i):
            c = chr(i)
            # '.' is kept too: trouble_lvl rates it among the best characters,
            # so an already clean name such as "a.b_file.jpg" must not have
            # its inner dot replaced by random digits.
            if re.fullmatch(r'[_0-9a-zA-Z.]', c): return c
            if c in " -,+[]()": return '_'
            if c in self.rus: return self.rus[c]
            if c.lower() in self.rus: return self.rus[c.lower()].upper()
            return str(random.randint(10,99))

    root, ext = os.path.splitext(fname)
    root = root.translate(Table())
    if len(root) < 5:
        # Pad very short roots with random digits.
        root += str(random.randint(100000,999999))
    return root + ext

def show_troubled_fnames():
    """Print every regular file in MEDIA_DIR with its trouble level and a suggested name.

    Files are listed from the highest trouble level to the lowest (ties are
    ordered by name, descending); the suggestion is printed on the next line.
    """
    scored = []
    for entry in os.listdir(MEDIA_DIR):
        # Skip directories and anything else that is not a regular file.
        if os.path.isfile(os.path.join(MEDIA_DIR, entry)):
            scored.append((trouble_lvl(entry), entry))
    scored.sort(reverse=True)
    for level, entry in scored:
        print(level, entry, "\n  ", better_fname(entry))

def fix_troubled_fnames(n):
    """Interactively offer a better name for the files listed in df.

    Walks the rows of df, asks for each file whether it should be renamed to
    the suggestion from better_fname, and stops after n confirmations.
    The actual rename and the CSV export are still commented out, so the
    function currently only counts confirmations.
    """
    for _,row in df.iterrows():
        if n<1: break
        f = row['name']
        better_f = better_fname(f)
        answer = input(f"do you wanna rename {f} to {better_f}? ")
        # Compare against the explicit answer: `answer in "Yy"` is a substring
        # test, which is also true for an empty answer (just pressing Enter).
        if answer.strip().lower() == "y":
            n -= 1
            # row['name'] = better_f ????
            # os.rename(os.path.join(MEDIA_DIR, f), os.path.join(MEDIA_DIR, better_f))
    # df.to_csv(os.path.join(MEDIA_DIR, "metadata_db_fixed.csv"))

# Print the trouble report for the files currently in MEDIA_DIR.
show_troubled_fnames()


Rating systems disagree too much:

In [None]:
from rating_backends import Glicko, ELO
from ae_rater_types import Rating

# Gap (in stars) between the two rating systems above which a file is listed.
TOO_MUCH_DISAGREEMENT = 0.9

# Select the needed columns first and copy only those, rather than copying the
# whole table and then discarding most of it.
df_consensus = df[['name', 'Glicko_pts', 'Glicko_rd', 'ELO_pts', 'stars']].copy()
for cls in Glicko, ELO:
    system = cls.__name__  # also the column prefix: '<system>_pts' -> '<system>_exp_stars'
    # cls is bound as a default argument so the callable does not depend on the
    # loop variable's later value.
    # NOTE(review): a backend instance is built for every value; hoist it out of
    # the lambda if the constructor is cheap and stateless -- TODO confirm.
    df_consensus[f'{system}_exp_stars'] = df_consensus[f'{system}_pts'].map(
        lambda pts, cls=cls: cls().rating_to_stars(Rating(pts))
    )
df_consensus['stars_disagreement'] = (df_consensus['Glicko_exp_stars'] - df_consensus['ELO_exp_stars']).abs()
# Keep only the strongly disagreeing rows, worst first. Returning a new frame
# (no inplace=True) avoids mutating a filtered slice and keeps the cell
# re-runnable.
df_consensus = (
    df_consensus[df_consensus['stars_disagreement'] > TOO_MUCH_DISAGREEMENT]
    .sort_values('stars_disagreement', ascending=False, ignore_index=True)
)
df_consensus['stars_disagreement'].plot(title="stars disagreement (Glicko vs ELO)")
df_consensus