In [None]:
import json
import os

import matplotlib.pyplot as plt
import pandas as pd

from downcfg import USHER_CFG

# Base directory taken from the project config. This notebook resolves both the
# metadata CSV and every media file name in the table relative to it.
MEDIA_DIR = USHER_CFG.dest_dir 

In [None]:
# Load the metadata table stored inside MEDIA_DIR and order it by Glicko
# points, highest first.
dbpath = os.path.join(MEDIA_DIR, "metadata_db.csv")
assert os.path.exists(dbpath)

# keep_default_na=False: no string is parsed as NaN, so empty text cells stay
# empty strings.
df = (
    pd.read_csv(dbpath, keep_default_na=False)
    .sort_values(by='Glicko_pts', ascending=False)
)
df

In [None]:
# Count files per extension: lower-case the name, split once at the last dot
# and tally whatever follows it (column 1 of the expanded split).
extensions = (
    df['name']
    .str.lower()
    .str.rsplit('.', n=1, expand=True)[1]
    .value_counts()
)
extensions.plot.bar(title="file extensions")

In [None]:
# Overview: one subplot per plotted column in a 10x10 inch figure.
# NOTE(review): the x-axis is the DataFrame index, and df was sorted by
# Glicko_pts without resetting the index, so x is not monotonic - confirm
# this is the intended view.
df.plot(figsize=(10, 10), subplots=True)

In [None]:
# Tag frequency: each 'tags' cell is a whitespace-separated list, so split it,
# flatten to one tag per row and count occurrences (most frequent first).
toptags = (
    df['tags']
    .str.split()
    .explode()
    .value_counts()
)
# Only the 60 most frequent tags are drawn.
toptags.iloc[:60].plot.bar(title="tag frequency", figsize=(11, 4))

In [None]:
# Award frequency: each 'awards' cell is a whitespace-separated list, so split
# it, flatten to one award per row and count occurrences (most frequent first).
topawards = (
    df['awards']
    .str.split()
    .explode()
    .value_counts()
)
# Only the 60 most frequent awards are drawn.
topawards.iloc[:60].plot.bar(title="award frequency", figsize=(11, 4))

In [None]:
# One-off generator for tags_vocab.py, the module the health checks import
# VOCAB from. Deliberately disabled: flip the guard to True and run the cell by
# hand to rewrite the vocabulary from the tags currently present in df.
if False:
    # Python source files are decoded as UTF-8 by default, so write UTF-8
    # explicitly instead of relying on the platform's locale encoding.
    with open("./tags_vocab.py", "w", encoding="utf-8") as f:
        # json.dumps yields a double-quoted literal with quotes and backslashes
        # escaped, so an unusual tag cannot break the generated module;
        # ensure_ascii=False keeps non-ASCII tags readable in the file.
        data = "VOCAB = [\n" + ''.join(
            f'  {json.dumps(t, ensure_ascii=False)},\n' for t in toptags.index
        ) + "]"
        f.write(data)

In [None]:
# Histograms (100 bins each) of the listed columns on a 20x12 inch canvas.
df.hist(
    column=['Glicko_pts', 'Glicko_rd', 'ELO_pts', 'stars', 'nmatches'],
    bins=100,
    figsize=(20, 12),
)

## Health checks

In [None]:
import unittest
from tags_vocab import VOCAB
from metadata import ManualMetadata, get_metadata


class TestMeidaItem(unittest.TestCase):
    def __init__(self, row):
        super().__init__()
        self.row = row

    def setUp(self):
        fullname = os.path.join(MEDIA_DIR, self.row['name'])
        self.assertTrue(os.path.exists(fullname))
        self.disk_meta = get_metadata(fullname)
    
    def test_row(self):
        self.assertFalse(any(self.row.isna()), self.row)
        df_meta = ManualMetadata.from_str(self.row['tags'], int(self.row['stars']), self.row['awards'])
        self.assertEqual(df_meta, self.disk_meta)
    
    def test_disk_meta(self):
        self.assertTrue(self.disk_meta.tags)
        self.assertTrue(self.disk_meta.stars >= 0)
        self.assertFalse([t for t in self.disk_meta.tags if t not in VOCAB])
        self.assertFalse([a for a in self.disk_meta.awards if a.startswith("e_") and a[2:] not in VOCAB])
    
    # Idk how to make it beautiful parametrized :(
    def runTest(self):
        self.test_row()
        self.test_disk_meta()
    def shortDescription(self):
        return f"test for file {self.row['name']}"


# Build one health-check case per row for the first 301 rows of df (the
# highest-ranked ones, given the earlier sort) and run them with the text runner.
suite = unittest.TestSuite(
    TestMeidaItem(item) for _idx, item in df.iloc[:301].iterrows()
)
unittest.TextTestRunner().run(suite)