# Importing and preprocessing of data

In [None]:
from google.colab import drive

drive.mount('/content/drive/')


Mounted at /content/drive/


In [None]:
TXTS_PATH = '/content/drive/My Drive/Becode/Kpmg/selection_fr'

In [None]:
import os

In [None]:
import pandas as pd

In [None]:
RESPONSES_PATH = '/content/drive/My Drive/Becode/Kpmg/responses_2018_now.json'

In [None]:
txt_filenames = [f for f in os.listdir(TXTS_PATH) if f.endswith(".txt")]

In [None]:
#reading json file to retrieve tags
responses = pd.read_json(RESPONSES_PATH)

In [None]:
#filtering only relevant columns
columns = ['jcId', 'jcFr', 'titleFr', 'themesFr', 'scopeFr', 'noScopeFr', 'documentLink']
responses2 = responses.copy(deep=True).loc[:,columns]
#getting txt_name from original pdf name
responses2["txt_name"] = responses2["documentLink"]
responses2["txt_name"] = responses2["txt_name"].str.replace("/","-")
responses2["txt_name"] = responses2["txt_name"].str.replace(".pdf","_FR.txt")
#filtering json only for selected files and columns
columns += ['txt_name']
responses2 = responses2.loc[responses2.txt_name.isin(txt_filenames), columns]
responses2.head(2)

Unnamed: 0,jcId,jcFr,titleFr,themesFr,scopeFr,noScopeFr,documentLink,txt_name
34,1110000,COMMISSION PARITAIRE DES CONSTRUCTIONS METALLI...,modification du régime de pension sectoriel so...,[PENSIONS COMPÉMENTAIRES ET ASSURANCES GROUPES],,[les employeurs et ouvriers des entreprises ex...,111/111-2018-013525.pdf,111-111-2018-013525_FR.txt
35,1110000,COMMISSION PARITAIRE DES CONSTRUCTIONS METALLI...,allocation spéciale compensatoire,[PRIME SYNDICALE],,,111/111-2018-012196.pdf,111-111-2018-012196_FR.txt


In [None]:
#generating all possible FR themes
themes_fr = []
for r in  responses2.themesFr:
    if r is not None:
        for t in r:
            if t not in themes_fr:
                themes_fr += [t]
len(themes_fr)


53

In [None]:
#code to retrieve the entire body
#%%
for filename in responses2.txt_name.to_list():
    # filename = responses2.txt_name.to_list()[10] #test
    file_path = os.path.join(TXTS_PATH, filename)
    with open(file_path, 'r', encoding="utf8") as f:
        #to retrieve entire body not necessary
        responses2.loc[responses2.txt_name == filename, "doc_bodies"] = f.read()
        f.close()

In [None]:
t = themes_fr[0]
mask = responses2.dropna(axis=0, subset=['themesFr']).themesFr.map(lambda x: t in x)

In [None]:
len(responses2)

549

In [None]:
 #len(responses2.dropna(axis=0, subset=['themesFr'])[map])

# Preprocessing for NLP

In [3]:
from pandas import Series

from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer

# import entirely spacy to create Doc objects through nlp
import spacy
from spacy import load, lang

from wordcloud import WordCloud

from collections import Counter

from typing import List
from typing import Dict

# WARNINGS
# W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
# I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine

# GLOBAL VARIABLES
NAMED_ENTITIES = ['microsoft']

#testing
from os import getcwd as cwd
from os.path import dirname as dir
from os.path import join
import pandas as pd


def lemmatize(text_tokens: List[str]) -> List[str]:
    def get_wordnet_pos(word):
        """Map POS tag to first character lemmatize() accepts"""
        tag = pos_tag([word])[0][1][0].upper()
        tag_dict = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}
        return tag_dict.get(tag, wordnet.NOUN)

    # Instantiate the WordNetLemmatizer
    wordnet_lemmatizer = WordNetLemmatizer()
    # Lemmatize all tokens into a new list: lemmatized
    texts_lemmatized = [wordnet_lemmatizer.lemmatize(t, get_wordnet_pos(t)) for t in text_tokens]
    return texts_lemmatized


#DEV NOTE: not used
def filter_words(texts_lemmatized: List[List[str]], freq_min=None, freq_max=None) -> List[List[str]]:
    frequency_absolute = Counter([item for elem in texts_lemmatized for item in elem])
    wordcloud = WordCloud(width=1000, height=500).generate_from_frequencies(frequency_absolute)
    frequency_relative = wordcloud.words_
    if freq_min is not None and freq_min > 0 and freq_min < 1:
        rel_freq_filtered = {k: v for k, v in frequency_relative.items() if v > freq_min}
    if freq_max is not None and freq_max > 0 and freq_max < 1:
        rel_freq_filtered = {k: v for k, v in frequency_relative.items() if v < freq_max}
    texts_filtered = [[t for t in pub_lem if t in rel_freq_filtered.keys()] for pub_lem in texts_lemmatized]
    return texts_filtered


class Preprocess:
    def __init__(self, nlp_model='en_core_web_md'):
        self.nlp = load(nlp_model)
        if 'en_' in nlp_model:
          self.stop_words = lang.en.stop_words.STOP_WORDS
        elif 'fr_' in nlp_model:
          self.stop_words = lang.fr.stop_words.STOP_WORDS
        self.named_entities = set(NAMED_ENTITIES)

    def get_named_entities(self, texts: Series, inplace=True) -> set:
        # creating a single ner set
        nes = set()
        # function to extract NER from text
        def get_named_entities(text) -> set:
            doc = self.nlp(text)
            named_entities = set([ent.text for ent in doc.ents])
            return named_entities
        [[nes.add(n) for n in get_named_entities(text)] for text in texts]
        # adding predefined NER
        [nes.add(n) for n in self.named_entities]
        if inplace:
            self.named_entities = nes
        return nes

    def tokenize_text(self, text:str, stop_words: List[str] = None, named_entities: List[str] = None,
                   lenght_min: int=2) -> List[str]:
        if stop_words is None:
            stop_words = self.stop_words
        if named_entities is None:
            named_entities = self.named_entities
        text = text.replace("\n", " ")
        # split string into words (tokens)
        tokens = word_tokenize(text.lower())
        # keep strings with only alphabets
        tokens = [t for t in tokens if t.isalpha()]
        tokens = lemmatize(tokens)
        # remove short words, they're probably not useful
        tokens = [t for t in tokens if len(t) > lenght_min]
        # remove stopwords
        tokens = [t for t in tokens if t not in stop_words]
        # remove
        tokens = [t for t in tokens if t not in named_entities]
        return tokens

    def clean_text(self, text:str, stop_words: List[str] = None, named_entities: List[str] = None,
                   lenght_min: int=2) -> str:
        tokens = self.tokenize_text(text, stop_words, named_entities, lenght_min)
        text_cleaned = " ".join(tokens)
        return text_cleaned

    def tokenize_texts(self, texts:Series, stop_words: List[str] = None, named_entities: List[str] = None,
                   lenght_min: int=2) -> List[List[str]]:
        texts_tokens = []
        for text in texts:
            texts_tokens = texts_tokens.append(self.tokenize_text(text, stop_words, named_entities, lenght_min))
        return texts_tokens

In [None]:
! python -m spacy download fr_core_news_md

Collecting fr_core_news_md==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_md-2.2.5/fr_core_news_md-2.2.5.tar.gz (88.6MB)
[K     |████████████████████████████████| 88.6MB 1.3MB/s 
Building wheels for collected packages: fr-core-news-md
  Building wheel for fr-core-news-md (setup.py) ... [?25l[?25hdone
  Created wheel for fr-core-news-md: filename=fr_core_news_md-2.2.5-cp37-none-any.whl size=90338490 sha256=b795eafecf77beb60abacab5ab2980db9e586ea4bfcfa8646c04fbd57187e33e
  Stored in directory: /tmp/pip-ephem-wheel-cache-ir98s7j9/wheels/c6/18/b6/f628642acc7872a53cf81269dd1c394d96da69564ccfac5425
Successfully built fr-core-news-md
Installing collected packages: fr-core-news-md
Successfully installed fr-core-news-md-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('fr_core_news_md')


In [1]:
import spacy
nlp = spacy.load('fr_core_news_md')

In [None]:
! python --version

In [4]:
preprocess = Preprocess(nlp_model='fr_core_news_md')

AttributeError: ignored

In [6]:
lang.fr.stop_words.STOP_WORDS

{'a',
 'abord',
 'absolument',
 'afin',
 'ah',
 'ai',
 'aie',
 'ailleurs',
 'ainsi',
 'ait',
 'allaient',
 'allo',
 'allons',
 'allô',
 'alors',
 'anterieur',
 'anterieure',
 'anterieures',
 'apres',
 'après',
 'as',
 'assez',
 'attendu',
 'au',
 'aucun',
 'aucune',
 'aujourd',
 "aujourd'hui",
 'aupres',
 'auquel',
 'aura',
 'auraient',
 'aurait',
 'auront',
 'aussi',
 'autre',
 'autrefois',
 'autrement',
 'autres',
 'autrui',
 'aux',
 'auxquelles',
 'auxquels',
 'avaient',
 'avais',
 'avait',
 'avant',
 'avec',
 'avoir',
 'avons',
 'ayant',
 'bah',
 'bas',
 'basee',
 'bat',
 'beau',
 'beaucoup',
 'bien',
 'bigre',
 'boum',
 'bravo',
 'brrr',
 "c'",
 'car',
 'ce',
 'ceci',
 'cela',
 'celle',
 'celle-ci',
 'celle-là',
 'celles',
 'celles-ci',
 'celles-là',
 'celui',
 'celui-ci',
 'celui-là',
 'cent',
 'cependant',
 'certain',
 'certaine',
 'certaines',
 'certains',
 'certes',
 'ces',
 'cet',
 'cette',
 'ceux',
 'ceux-ci',
 'ceux-là',
 'chacun',
 'chacune',
 'chaque',
 'cher',
 'chers',
