In [261]:
#imports

import numpy as np
import pandas as pd
import collections
import re

# Chargement des données

Chargement du fichier texte et des fréquences d'apparition dans un dataframe Pandas

In [262]:
filepath="../../frontend/dataset/Stage EasyBroadcast 2019.pdf.txt"

fileContent = ""
with open(filepath, encoding="utf-8") as file:
    fileContent = file.read().lower()

words=re.findall("[\w']+", fileContent)

# count times of appearance of each word
frequency = dict(collections.Counter(words))

In [263]:
dataframe = pd.DataFrame(frequency.items(), columns=["word", "n_appear"])

In [264]:
dataframe[:10]

Unnamed: 0,word,n_appear
0,proposition,1
1,de,29
2,stage,4
3,2019,1
4,durée,2
5,6,1
6,mois,2
7,e,1
8,mail,1
9,job,1


In [265]:
dataframe.dtypes

word        object
n_appear     int64
dtype: object

# Nettoyage


## Stopwords

In [266]:
stopwords = set()
stopwords_path = "stopWordsFR.txt"
with open(stopwords_path) as file:
    for word in file:
        stopwords.add(word[:-1])

In [267]:
dataframe['word'] = dataframe['word'].astype(str)

In [268]:
dataframe.dtypes

word        object
n_appear     int64
dtype: object

In [269]:
filtered_df = dataframe[~dataframe["word"].isin(stopwords)]

In [270]:
filtered_df[:20]

Unnamed: 0,word,n_appear
0,proposition,1
1,de,29
2,stage,4
3,2019,1
4,durée,2
5,6,1
6,mois,2
7,e,1
8,mail,1
9,job,1


# Attribution des catégories

On donne des mots clés pour chaque catégorie numérotée de 0 à n

In [271]:
from collections import defaultdict

keywords = {
    "développement": 0,
    "web" : 0,
    "développeur" : 0,
    "machine" : 1,
    "learning" : 1,
    "data" : 1
}

categories = {
    "Developpement",
    "Machine Learning",
}

keyword_df = pd.DataFrame(keywords.items(), columns=["word", "category"])
categories_df = pd.DataFrame(categories, columns=["label"])

In [272]:
categories_df

Unnamed: 0,label
0,Developpement
1,Machine Learning


In [273]:
keyword_df

Unnamed: 0,word,category
0,développement,0
1,web,0
2,développeur,0
3,machine,1
4,learning,1
5,data,1


In [274]:
# Inner join

in_text_kw_df = keyword_df.merge(filtered_df, left_on='word', right_on='word')
in_text_kw_df

Unnamed: 0,word,category,n_appear
0,développement,0,3
1,web,0,2
2,développeur,0,1


## Calcul du softmax

A l'image du tutoriel hyperplan.io/docs

In [275]:
cat_count = in_text_kw_df.groupby(['category']).sum()

In [276]:
cat_count

Unnamed: 0_level_0,n_appear
category,Unnamed: 1_level_1
0,6


In [277]:
categories_df['n_appear'] = cat_count['n_appear']

In [278]:
categories_df

Unnamed: 0,label,n_appear
0,Developpement,6.0
1,Machine Learning,


In [279]:
categories_df['exponential'] = np.exp(categories_df['n_appear'])
exp_sum = np.sum(categories_df['exponential'])
if exp_sum != 0: 
    categories_df['probability'] = categories_df['exponential'] / exp_sum
else:
    categories_df['probability'] = 1 / categories_df.shape[0]

In [280]:
categories_df

Unnamed: 0,label,n_appear,exponential,probability
0,Developpement,6.0,403.428793,1.0
1,Machine Learning,,,


In [281]:
json_df = categories_df[['label', 'probability']]
json_df = json_df.fillna(0)

In [282]:
result = json_df.to_json(orient='records')
print(result)

[{"label":"Developpement","probability":1.0},{"label":"Machine Learning","probability":0.0}]
