In [None]:
from context import sniff

In [None]:
from pathlib import Path
import re
import itertools

import pypandoc
import spacy
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from tqdm import tqdm

In [None]:
def load_txt_competence_profile(file_path: Path) -> list[str]:
    """Read a plain-text competence profile and return its lines.

    Args:
        file_path: Location of the text file to read.

    Returns:
        The lines of the file in order, exactly as produced by
        ``readlines()`` (each line keeps its trailing newline). Note that
        this is a list of lines, not one joined string.
    """
    with file_path.open('r') as file_handle:
        return file_handle.readlines()

In [None]:
# Folder holding the 'Competence Profile_<short name>.docx' files to analyse.
# NOTE(review): this is an absolute, machine-specific path - adjust per environment.
BASE_PATH = Path('/home/matt/competence_profiles/')

In [None]:
def get_all_short_names_of_files(path_to_folder: Path) -> list[str]:
    """Collect the ipt short names encoded in the competence profile file names.

    Only files directly inside path_to_folder whose name matches
    'Competence Profile_???.docx' (three arbitrary characters in place of
    the question marks) are considered.

    Args:
        path_to_folder: Folder containing the .docx competence profiles.

    Returns:
        One short name per matching file, in the order the files are yielded
        by ``Path.glob``.
    """
    short_names = []
    for profile_path in path_to_folder.glob('Competence Profile_???.docx'):
        # 'Competence Profile_ABC.docx' -> 'ABC.docx' -> 'ABC'
        name_after_underscore = profile_path.name.split('_')[1]
        short_names.append(name_after_underscore.split('.')[0])
    return short_names

def convert_competence_profile_to_plain_text(competence_profile_file_path: Path) -> str:
    """Convert a .docx competence profile into a single line of plain text.

    The document is converted to org format with pandoc; the markup symbols
    that the tables of the Word document leave behind are then blanked out.

    Args:
        competence_profile_file_path: Path to the .docx file to convert.

    Returns:
        The text of the profile with markup characters and line breaks
        replaced by spaces and surrounding whitespace stripped.
    """
    org_text = pypandoc.convert_file(str(competence_profile_file_path), 'org')

    # Drop the first line (it comes from the profile image).
    body = org_text.split('\n', 1)[1]

    # Table and markup symbols are blanked out in one pass. This must happen
    # before the quote markers are removed: they only take the form
    # '# begin_quote' once the '+' of '#+begin_quote' has become a space.
    markup_to_space = str.maketrans('|-=+*/', ' ' * 6)
    body = body.translate(markup_to_space)

    for quote_marker in ('# begin_quote', '# end_quote'):
        body = body.replace(quote_marker, ' ')

    # Line breaks are flattened last so the text ends up on a single line.
    return body.replace('\n', ' ').strip()
    
def extract_all_words(competence_profile_plain_txt: str) -> list[str]:
    """Extract all words contained in unstructured plain text.

    A word is a maximal run of letters that may additionally contain the
    characters '-', '.', "'" and '/'. Any Unicode letter counts as a letter,
    so German words containing umlauts or 'ß' stay in one piece instead of
    being cut apart at the non-ASCII character. Digits, underscores and all
    other symbols act as separators.

    Args:
        competence_profile_plain_txt: The text to split into words.

    Returns:
        The words in order of appearance; an empty list if there are none.
    """
    # [^\W\d_] reads "a word character that is neither a digit nor an
    # underscore", i.e. a letter in any script.
    word_pattern = r"(?:[^\W\d_]|[-.'/])+"
    return re.findall(word_pattern, competence_profile_plain_txt)

def extract_core_competence_words(competence_profile_words: list[str]) -> list[str]:
    """Keep only the words of the core competence section of a profile.

    The section starts after the first occurrence of 'Kernkompetenzen' and
    ends right before the first subsequent pair 'ipt', 'Projekte'.

    Args:
        competence_profile_words: All words of a profile in document order.

    Returns:
        The words between the two markers (markers excluded). If the start
        marker is missing the result is empty; if the end marker is missing
        everything after the start marker is returned.
    """
    word_count = len(competence_profile_words)
    arrived_at_core_competences = False
    core_competence_words = []

    for idx, word in enumerate(competence_profile_words):
        if word == 'Kernkompetenzen':
            # The marker itself (also a repeated one) is never part of the result.
            arrived_at_core_competences = True
            continue
        if not arrived_at_core_competences:
            continue

        # Look ahead safely: 'ipt' may be the very last word of the profile,
        # in which case there is no following word to compare against.
        is_followed_by_projekte = (
            idx + 1 < word_count
            and competence_profile_words[idx + 1] == 'Projekte'
        )
        if word == 'ipt' and is_followed_by_projekte:
            break
        core_competence_words.append(word)

    return core_competence_words

def to_lower(words: list[str]) -> list[str]:
    """Return a new list containing the lower-cased form of every word.

    Args:
        words: Words in arbitrary casing; the list itself is not modified.

    Returns:
        The words in their original order, each converted with ``str.lower``.
    """
    return [word.lower() for word in words]


def filter_out_stop_words(words: list[str], stop_words=None) -> list[str]:
    """Remove German stop words from a list of words.

    Args:
        words: Words to filter. Casing is ignored for the comparison but
            preserved in the result.
        stop_words: Optional collection of lower-case stop words. When
            omitted, the stop words of spaCy's 'de_core_news_sm' model are
            used, which requires loading the model on every call; callers
            that filter many lists should load the stop words once and pass
            them in.

    Returns:
        The words that are not stop words, in their original order.
    """
    if stop_words is None:
        stop_words = spacy.load('de_core_news_sm').Defaults.stop_words
    return [word for word in words if word.lower() not in stop_words]


def build_competence_dict(base_path: Path) -> dict[str, list[str]]:
    """
    Build a competence_dict from all .docx competence profiles in a folder.

    The competence_dict's structure is as follows
    {'ABC': ['masta_skill', 'epic_skill'], ...}

    Args:
        base_path: Folder in which the 'Competence Profile_<short name>.docx'
            files reside.

    Returns:
        A mapping from each short name to the lower-cased core competence
        words of that profile with German stop words removed. Empty if the
        folder contains no matching profile.
    """
    short_names = get_all_short_names_of_files(base_path)
    if not short_names:
        return {}

    # Load the spaCy model once instead of once per profile.
    stop_words = spacy.load('de_core_news_sm').Defaults.stop_words

    competence_dict = {}
    for short_name in tqdm(short_names):
        file_path = base_path / f'Competence Profile_{short_name}.docx'
        txt = convert_competence_profile_to_plain_text(file_path)
        words = extract_all_words(txt)
        core_competence_words = extract_core_competence_words(words)
        core_competence_words = filter_out_stop_words(core_competence_words, stop_words)
        competence_dict[short_name] = to_lower(core_competence_words)

    return competence_dict


def generate_word_cloud_from_competence_dict(
    competence_dict: dict,
    output_path: str = 'ipt_competence_word_cloud.png',
) -> None:
    """Generate, store and show a word cloud built from a competence_dict.

    All words of all profiles are joined into one text and handed to
    WordCloud, so the size of a word reflects how often it occurs.

    Args:
        competence_dict: Mapping from short name to a list of competence words.
        output_path: File the rendered figure is saved to.
    """
    # These words occur so often that they would clutter the whole cloud.
    hardcoded_filter_words = ['Software', 'Architektur', 'Technologien', 'f']
    # Compare case-insensitively so the filter works no matter how the words
    # in competence_dict are cased.
    lowered_filter_words = {word.lower() for word in hardcoded_filter_words}

    all_words = [
        word
        for word in itertools.chain.from_iterable(competence_dict.values())
        if word.lower() not in lowered_filter_words
    ]
    wordcloud = WordCloud(
        width=1600,
        height=1000,
        background_color='white',
        min_font_size=10,
    ).generate(' '.join(all_words))

    # Plot the WordCloud image without axes or padding.
    plt.figure(figsize=(10, 10), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)

    plt.savefig(output_path)
    plt.show()

# Executable Code Section

In [None]:
# Parse every competence profile found under BASE_PATH into a dict that maps
# each short name to the list of its core competence words.
competence_dict = build_competence_dict(BASE_PATH)

In [None]:
# Render the word cloud over all profiles, save it as
# 'ipt_competence_word_cloud.png' and display it.
generate_word_cloud_from_competence_dict(competence_dict)