# Combine

In [None]:
import re
import csv
import difflib
from glob import glob
from typing import IO, Callable, Dict, List

def sanitize_tag(tag: str) -> str:
    """Normalize a raw tag into the canonical form used as a counting key.

    The steps are applied in this order:

    1. If any underscore is directly next to a word character, every
       underscore in the tag is replaced by a space. Tags such as ``^_^``,
       where the underscore only touches punctuation, are left alone.
       (``\\w`` also matches ``_`` itself, so two adjacent underscores
       trigger the replacement as well.)
    2. The tag is lower-cased.
    3. Line-feed and carriage-return characters are removed.
    4. ``(``, ``)``, ``[`` and ``]`` are backslash-escaped. A bracket that
       is already preceded by a backslash is left as is, so sanitizing an
       already sanitized tag does not double the escapes.

    Args:
        tag: The raw tag text.

    Returns:
        The normalized tag.
    """
    if re.match(r'.*_\w.*', tag) or re.match(r'.*\w_.*', tag):
        tag = tag.replace('_', ' ')
    tag = tag.lower().replace('\n', '').replace('\r', '')
    # One pass instead of eight chained replaces: consume an optional
    # existing backslash together with the bracket and emit exactly one
    # backslash followed by that bracket. Raw strings keep the backslashes
    # literal and avoid the invalid-escape-sequence warnings.
    return re.sub(r'\\?([()\[\]])', r'\\\1', tag)

def explode_tags(tags: str) -> List[str]:
    """Split a comma-separated string into its individual tags.

    Each piece is stripped of surrounding whitespace; pieces that are empty
    after stripping are dropped. Order of appearance is preserved.
    """
    pieces: List[str] = []
    for raw in tags.split(','):
        trimmed = raw.strip()
        if len(trimmed) > 0:
            pieces.append(trimmed)
    return pieces

def show_similar(tags: List[str]) -> None:
    """Print pairs of distinct tags that look alike.

    Each tag is compared with every tag that comes after it, so an
    unordered pair is examined once. A pair is printed as
    ``first -> second : ratio`` when the ``difflib.SequenceMatcher`` ratio
    of the two strings is above 0.8.

    The second tag of a pair is skipped when it contains an escaped
    parenthesised part (``\\(`` followed later by ``\\)``).

    Args:
        tags: The tags to compare. Any iterable of strings works; it is
            copied into a list first so it can be sliced.
    """
    candidates = list(tags)
    # Compiled once instead of being looked up for every pair.
    escaped_parens = re.compile(r'.*?\\\(.*?\\\).*?')
    for i, i_val in enumerate(candidates):
        # Only look at later elements. enumerate(..., start=i) would merely
        # offset the counter and still walk the whole list, reporting each
        # pair twice.
        for j_val in candidates[i + 1:]:
            if i_val == j_val:
                continue
            if escaped_parens.match(j_val):
                continue
            ratio = difflib.SequenceMatcher(None, i_val, j_val).ratio()
            if ratio > 0.8:
                print(f"{i_val} -> {j_val} : {ratio}")

def tags_from_txt(
        file: IO,
        filter:Callable[[str], bool] = None
    ) -> Dict[str, int]:
    """Count tag occurrences in a text file of comma-separated tags.

    Args:
        file: An iterable of text lines, each holding comma-separated tags.
        filter: Optional predicate called with each raw line; lines for
            which it returns a falsy value are ignored. When ``None``,
            every line is processed.

    Returns:
        A dict mapping each tag (as produced by ``explode_tags``) to the
        number of times it was seen.
    """
    counts: Dict[str, int] = {}
    for line in file:
        # Guard clause: drop lines rejected by the caller's predicate.
        if filter is not None and not filter(line):
            continue
        for tag in explode_tags(line):
            if tag in counts:
                counts[tag] += 1
            else:
                counts[tag] = 1
    return counts

def tags_from_csv(file: IO, column_index: int = 0) -> Dict[str, int]:
    """Count tag occurrences in one column of a CSV file.

    The first row is treated as a header and skipped. Rows that do not have
    a value at ``column_index`` (blank lines, short rows) are ignored.

    Args:
        file: An iterable of CSV text lines, e.g. an open text file.
        column_index: Index of the column that holds the tag.

    Returns:
        A dict mapping each value found in the column to the number of rows
        it appeared in. Empty input yields an empty dict.
    """
    reader = csv.reader(file)
    # The default keeps an empty file from raising StopIteration.
    next(reader, None)  # skip header
    result: Dict[str, int] = {}
    for row in reader:
        try:
            tag = row[column_index]
        except IndexError:
            # csv.reader yields [] for blank lines, and ragged rows may
            # simply lack the requested column.
            continue
        result[tag] = result.get(tag, 0) + 1
    return result

def tags_to_csv(tags: Dict[str, int], output: str) -> None:
    """Write tag counts to a CSV file, most frequent tag first.

    The file starts with a ``tag,count`` header row followed by one row per
    tag, ordered by descending count. Tags with equal counts keep the order
    they have in ``tags``.

    Args:
        tags: Mapping of tag to occurrence count.
        output: Path of the file to create or overwrite.
    """
    header = ['tag', 'count']
    sorted_tags = sorted(tags.items(), key=lambda x: x[1], reverse=True)
    # The context manager guarantees the file is flushed and closed;
    # newline='' is what the csv module expects so it can control line
    # endings itself (otherwise rows gain an extra \r on Windows).
    with open(output, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(header)
        writer.writerows(sorted_tags)

def tags_to_txt(tags: Dict[str, int], output: str) -> None:
    """Write the tag names to a text file, one per line, sorted.

    Only the keys of ``tags`` are written; the counts are ignored. Lines
    are joined with ``\\n`` and no trailing newline is added.

    Args:
        tags: Mapping of tag to occurrence count.
        output: Path of the file to create or overwrite.
    """
    # A separate name avoids rebinding the dict parameter to a list.
    names = sorted(tags)
    # The context manager guarantees the file is flushed and closed.
    with open(output, 'w') as file:
        file.write('\n'.join(names))

def show_sorted_tags(tags: Dict[str, int], limit: int = 100) -> None:
    """Print the most frequent tags together with their counts.

    A heading line is printed first, followed by up to ``limit`` lines of
    the form ``  tag : count`` in descending order of count.

    Args:
        tags: Mapping of tag to occurrence count.
        limit: Maximum number of tags to list.
    """
    ranked = list(tags.items())
    # list.sort is stable, so tags with equal counts keep their dict order.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    print(f'Top {limit} most used tags')
    for name, count in ranked[:limit]:
        print(f'  {name} : {count}')


if __name__ == '__main__':
    # Merge the tag counts of every .txt and .csv file found directly in
    # input/, print a summary, and write the combined result to tags.txt
    # and tags.csv in the current directory.
    files = glob('input/*', recursive=False)
    tags: Dict[str, int] = dict()

    for file in files:
        if file.endswith('.txt'):
            with open(file, 'r') as handle:
                tags_found = tags_from_txt(handle)
        elif file.endswith('.csv'):
            # newline='' lets the csv module handle line endings (and
            # newlines embedded in quoted fields) itself. The tag is read
            # from the second column.
            with open(file, 'r', newline='') as handle:
                tags_found = tags_from_csv(handle, 1)
        else:
            # Any other file type is ignored.
            continue

        for key, value in tags_found.items():
            tag = sanitize_tag(key)
            # Accumulate under the sanitized name. Looking up the raw key
            # here would restart the count whenever sanitizing changed the
            # tag, losing everything merged so far for it.
            tags[tag] = tags.get(tag, 0) + value

        print(f'{file} - {len(tags_found)} tags found')

    print(f'from {len(files)} files - {len(tags)} tags found')

    show_sorted_tags(tags)
    # Optional: uncomment to list tags that are near-duplicates.
    # show_similar(tags)

    tags_to_txt(tags, 'tags.txt')
    tags_to_csv(tags, 'tags.csv')