In [2]:
from google.colab import drive 
drive.mount('/content/drive/', force_remount=True)
directory = '/content/drive/My Drive/'

Mounted at /content/drive/


In [3]:
!pip install num2words



In [1]:
import json
import os
from os import listdir
from os.path import isfile, join
import numpy as np
import pandas as pd
import pathlib

# Functions

In [2]:
# -*- coding: utf-8 -*-

import pandas as pd
import pathlib


def read_data(filename: str):
    """ Read the json file and convert it into a pd.DataFrame
    preprocessing: format datetime

    Args:
        filename: json filename

    Returns:
        pd.DataFrame
    """
    assert filename.endswith('.json'), 'filename should be a json file'
    assert pathlib.Path(filename).exists(), 'filename does not exists'

    df = pd.read_json(filename, encoding='UTF-8')
    df = df.assign(createdAt=pd.to_datetime(df.createdAt),
                   publishedAt=pd.to_datetime(df.publishedAt),
                   updatedAt=pd.to_datetime(df.updatedAt))

    # remove trashed data
    df = df[~df.trashed]

    return df

In [3]:
def extract_responses_by_id(responses: list, key: str='138'):
    """ Extract a specific question

    Args:
        responses: list (example df.iloc[0].responses)
        key: questionId (example '142')

    Returns:
        responses as a string
    """

    response = [x['formattedValue'] for x in responses
                if x['questionId'] == key]
    if len(response):
        return response[0]
    else:
        return None

In [4]:
def get_responses(df: pd.DataFrame):
    """ Extract responses and return a pd.DataFrame
    with columns: authorId, questionId, formattedValue

    Args:
        df: dataframe from read_data
    
    Returns:
        pd.DataFrame with responses
    """
    responses = []
    for i, x in df.iterrows():
        df_tmp = (pd.DataFrame(x.responses).
                  filter(['questionId', 'formattedValue']).
                  assign(authorId=x.authorId))
        responses.append(df_tmp)

    return pd.concat(responses, ignore_index=True)

In [5]:
def get_ids_open_reponses(df: pd.DataFrame):
    """ Return the ids of open questions
    i.e does not have a predefined set of possible responses
    """
    list_questions = df.iloc[0].responses

    ids_open_questions = [x['questionId'] for x in list_questions
                          if x['value'] is None or
                          '{"labels"' not in x['value']]
    return ids_open_questions

In [6]:
def get_open_reponses(df: pd.DataFrame):
    """ Filter the data to only return non empty open responses

    Args:
        df: dataframe from read_data

    Returns:
        pd.DataFrame
    """

    df_open = get_responses(df)
    ids_open = get_ids_open_reponses(df)

    df_open = df_open[df_open.questionId.isin(ids_open)]
    df_open = df_open[~pd.isnull(df_open.formattedValue)]
    return df_open

# Read data

In [7]:
data_fiscalisation = read_data('LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES.json')

In [8]:
responses = get_open_reponses(data_fiscalisation)

In [15]:
responses.to_csv('responses_ficalisation.csv')

In [22]:
responses = pd.read_csv('responses_ficalisation.csv')

# Preprocessing

## Preprocessing - Stopwords

In [9]:
def get_stopswords(type="veronis"):
    '''returns the veronis stopwords in unicode, or if any other value is passed, it returns the default nltk french stopwords'''
    if type=="veronis":
        #VERONIS STOPWORDS
        raw_stopword_list = ["Ap.", "Apr.", "GHz", "MHz", "USD", "a", "afin", "ah", "ai", "aie", "aient", "aies", "ait", "alors", "après", "as", "attendu", "au", "au-delà", "au-devant", "aucun", "aucune", "audit", "auprès", "auquel", "aura", "aurai", "auraient", "aurais", "aurait", "auras", "aurez", "auriez", "aurions", "aurons", "auront", "aussi", "autour", "autre", "autres", "autrui", "aux", "auxdites", "auxdits", "auxquelles", "auxquels", "avaient", "avais", "avait", "avant", "avec", "avez", "aviez", "avions", "avons", "ayant", "ayez", "ayons", "b", "bah", "banco", "ben", "bien", "bé", "c", "c'", "c'est", "c'était", "car", "ce", "ceci", "cela", "celle", "celle-ci", "celle-là", "celles", "celles-ci", "celles-là", "celui", "celui-ci", "celui-là", "celà", "cent", "cents", "cependant", "certain", "certaine", "certaines", "certains", "ces", "cet", "cette", "ceux", "ceux-ci", "ceux-là", "cf.", "cg", "cgr", "chacun", "chacune", "chaque", "chez", "ci", "cinq", "cinquante", "cinquante-cinq", "cinquante-deux", "cinquante-et-un", "cinquante-huit", "cinquante-neuf", "cinquante-quatre", "cinquante-sept", "cinquante-six", "cinquante-trois", "cl", "cm", "cm²", "comme", "contre", "d", "d'", "d'après", "d'un", "d'une", "dans", "de", "depuis", "derrière", "des", "desdites", "desdits", "desquelles", "desquels", "deux", "devant", "devers", "dg", "différentes", "différents", "divers", "diverses", "dix", "dix-huit", "dix-neuf", "dix-sept", "dl", "dm", "donc", "dont", "douze", "du", "dudit", "duquel", "durant", "dès", "déjà", "e", "eh", "elle", "elles", "en", "en-dehors", "encore", "enfin", "entre", "envers", "es", "est", "et", "eu", "eue", "eues", "euh", "eurent", "eus", "eusse", "eussent", "eusses", "eussiez", "eussions", "eut", "eux", "eûmes", "eût", "eûtes", "f", "fait", "fi", "flac", "fors", "furent", "fus", "fusse", "fussent", "fusses", "fussiez", "fussions", "fut", "fûmes", "fût", "fûtes", "g", "gr", "h", "ha", "han", "hein", "hem", "heu", "hg", "hl", "hm", "hm³", "holà", "hop", "hormis", "hors", "huit", "hum", "hé", "i", "ici", "il", "ils", "j", "j'", "j'ai", "j'avais", "j'étais", "jamais", "je", "jusqu'", "jusqu'au", "jusqu'aux", "jusqu'à", "jusque", "k", "kg", "km", "km²", "l", "l'", "l'autre", "l'on", "l'un", "l'une", "la", "laquelle", "le", "lequel", "les", "lesquelles", "lesquels", "leur", "leurs", "lez", "lors", "lorsqu'", "lorsque", "lui", "lès", "m", "m'", "ma", "maint", "mainte", "maintes", "maints", "mais", "malgré", "me", "mes", "mg", "mgr", "mil", "mille", "milliards", "millions", "ml", "mm", "mm²", "moi", "moins", "mon", "moyennant", "mt", "m²", "m³", "même", "mêmes", "n", "n'avait", "n'y", "ne", "neuf", "ni", "non", "nonante", "nonobstant", "nos", "notre", "nous", "nul", "nulle", "nº", "néanmoins", "o", "octante", "oh", "on", "ont", "onze", "or", "ou", "outre", "où", "p", "par", "par-delà", "parbleu", "parce", "parmi", "pas", "passé", "pendant", "personne", "peu", "plus", "plus_d'un", "plus_d'une", "plusieurs", "pour", "pourquoi", "pourtant", "pourvu", "près", "puisqu'", "puisque", "q", "qu", "qu'", "qu'elle", "qu'elles", "qu'il", "qu'ils", "qu'on", "quand", "quant", "quarante", "quarante-cinq", "quarante-deux", "quarante-et-un", "quarante-huit", "quarante-neuf", "quarante-quatre", "quarante-sept", "quarante-six", "quarante-trois", "quatorze", "quatre", "quatre-vingt", "quatre-vingt-cinq", "quatre-vingt-deux", "quatre-vingt-dix", "quatre-vingt-dix-huit", "quatre-vingt-dix-neuf", "quatre-vingt-dix-sept", "quatre-vingt-douze", "quatre-vingt-huit", "quatre-vingt-neuf", "quatre-vingt-onze", "quatre-vingt-quatorze", "quatre-vingt-quatre", "quatre-vingt-quinze", "quatre-vingt-seize", "quatre-vingt-sept", "quatre-vingt-six", "quatre-vingt-treize", "quatre-vingt-trois", "quatre-vingt-un", "quatre-vingt-une", "quatre-vingts", "que", "quel", "quelle", "quelles", "quelqu'", "quelqu'un", "quelqu'une", "quelque", "quelques", "quelques-unes", "quelques-uns", "quels", "qui", "quiconque", "quinze", "quoi", "quoiqu'", "quoique", "r", "revoici", "revoilà", "rien", "s", "s'", "sa", "sans", "sauf", "se", "seize", "selon", "sept", "septante", "sera", "serai", "seraient", "serais", "serait", "seras", "serez", "seriez", "serions", "serons", "seront", "ses", "si", "sinon", "six", "soi", "soient", "sois", "soit", "soixante", "soixante-cinq", "soixante-deux", "soixante-dix", "soixante-dix-huit", "soixante-dix-neuf", "soixante-dix-sept", "soixante-douze", "soixante-et-onze", "soixante-et-un", "soixante-et-une", "soixante-huit", "soixante-neuf", "soixante-quatorze", "soixante-quatre", "soixante-quinze", "soixante-seize", "soixante-sept", "soixante-six", "soixante-treize", "soixante-trois", "sommes", "son", "sont", "sous", "soyez", "soyons", "suis", "suite", "sur", "sus", "t", "t'", "ta", "tacatac", "tandis", "te", "tel", "telle", "telles", "tels", "tes", "toi", "ton", "toujours", "tous", "tout", "toute", "toutefois", "toutes", "treize", "trente", "trente-cinq", "trente-deux", "trente-et-un", "trente-huit", "trente-neuf", "trente-quatre", "trente-sept", "trente-six", "trente-trois", "trois", "très", "tu", "u", "un", "une", "unes", "uns", "v", "vers", "via", "vingt", "vingt-cinq", "vingt-deux", "vingt-huit", "vingt-neuf", "vingt-quatre", "vingt-sept", "vingt-six", "vingt-trois", "vis-à-vis", "voici", "voilà", "vos", "votre", "vous", "w", "x", "y", "z", "zéro", "à", "ç'", "ça", "ès", "étaient", "étais", "était", "étant", "étiez", "étions", "été", "étée", "étées", "étés", "êtes", "être", "ô"]
    else:
        #get French stopwords from the nltk kit
        raw_stopword_list = stopwords.words('french') #create a list of all French stopwords
    #stopword_list = [word.decode('utf8x') for word in raw_stopword_list] #make to decode the French stopwords as unicode objects rather than ascii
    return raw_stopword_list

List_stop_words = get_stopswords()
#A mettre a jour!!!!!!!!!!

List_ponctuation = [',',';',':','!','?','.','/','%','*','§','^','¨','$','£','=',')','°',']','+','}','_','&','\t','(']


def lister_mots(phrase):
    for i in range (len(List_ponctuation)): #Enleve la ponctuation de la phrase
        if List_ponctuation[i] in phrase:
            phrase = phrase.replace(List_ponctuation[i],"")
        if('\n' in phrase):
            phrase = phrase.replace('\n'," ")
    List_mots = phrase.split(" ")
    return List_mots



def enlever_les_stops_words(L):
    for mot in List_stop_words:
        if mot in L:
            L.remove(mot)
    return L

## Preprocessing - Tokenization

We put the words in lower case and convert the numbers (ints) to their text form

In [10]:
#WARNING you must have the num2words library installed
from num2words import num2words
num2words(169, lang='fr')

'cent soixante-neuf'

In [11]:
def tokenization(word_list):
    tokenized_list = []
    for word in word_list:
        try:
            word = int(word) #if the word is the string of a number (i.e '99'), we convert it back to an int
            word = num2words(word,lang='fr')
        except:
            pass
        word = word.lower() #We put the word in lower case
        tokenized_list.append(word)
    return tokenized_list
      

## Preprocessing - Stemming

In [12]:
from nltk.stem import SnowballStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

ps = SnowballStemmer('french')
ps.stem('constitutionelles')

'constitutionel'

## Exemple avec une bdd simple

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
'on doit supprimer isf',
'il faut supprimer l isf',
'isf doit etre enlever',
'enlever isf',
'repousser retraite',
'retraite plus tard',
'allonger retraite'
]

vectorizer = CountVectorizer()
matrice = np.array(vectorizer.fit_transform(corpus).todense())
dictionnaire = vectorizer.vocabulary_ 

In [15]:
import numpy as np
u, s, vh = np.linalg.svd(matrice.T, full_matrices=False)

In [20]:
from scipy.sparse.linalg import svds
matrice = matrice.astype(float)
u, s, vh = svds(matrice.T, k=2)

In [25]:
u

array([[-2.37981575e-01, -5.51787292e-17],
       [-8.22187216e-17,  3.94588107e-01],
       [-1.02736381e-16,  3.15304138e-01],
       [-4.97240466e-17,  1.93420395e-01],
       [-9.13641095e-17,  1.70069595e-01],
       [-9.13641095e-17,  1.70069595e-01],
       [-2.26595166e-16,  6.86541446e-01],
       [-3.24946749e-17,  2.01167712e-01],
       [-3.36556771e-01, -5.86296509e-17],
       [-2.37981575e-01, -3.75439531e-17],
       [-8.12519920e-01, -1.51352333e-16],
       [-1.23858784e-16,  3.71237307e-01],
       [-3.36556771e-01, -5.86296509e-17]])

In [26]:
indices = [i for i, x in enumerate(np.array(abs(u[:,0])>10e-2).reshape(u.shape[0]).tolist()) if x]
word_topic = [x for x in dictionnaire.keys() if (dictionnaire[x] in indices)]
print(word_topic)

['repousser', 'retraite', 'plus', 'tard', 'allonger']


In [27]:
indices = [i for i, x in enumerate(np.array(abs(u[:,1])>10e-2).reshape(u.shape[0]).tolist()) if x]
word_topic = [x for x in dictionnaire.keys() if (dictionnaire[x] in indices)]
print(word_topic)

['on', 'doit', 'supprimer', 'isf', 'il', 'faut', 'etre', 'enlever']


In [115]:
dictionnaire

{'allonger': 0,
 'doit': 1,
 'enlever': 2,
 'etre': 3,
 'faut': 4,
 'il': 5,
 'isf': 6,
 'on': 7,
 'plus': 8,
 'repousser': 9,
 'retraite': 10,
 'supprimer': 11,
 'tard': 12}

In [28]:
# permet de récuperer les indices des mots triés par fréquence
indexes_sorted = np.argsort((matrice.T).sum(axis=1))

# indices des 5 mots les plus importants
max_indexes = indexes_sorted[-5:]

In [105]:
indices = [i for i, x in enumerate(np.array(u[:,1]>0.1).reshape(u.shape[0]).tolist()) if x]

On introduit un seuil de O.1 pour déterminer si un mot appartient a un topic ou non 

In [0]:
[x for x in dictionnaire.keys() if (dictionnaire[x] in indices)]

['like', 'sunshine', 'loves', 'to', 'sit', 'this', 'for', 'some', 'reason']

In [0]:
index_topic = 0
indices = [i for i, x in enumerate(np.array(u[:,index_topic]>0.1).reshape(u.shape[0]).tolist()) if x]
word_topic = [x for x in dictionnaire.keys() if (dictionnaire[x] in indices)]
print(word_topic)

[]


## Preprocessing function

In [29]:
from nltk.stem import SnowballStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

def preprocessing(doc):
    mots = list(lister_mots(doc))
    mots = enlever_les_stops_words(mots)
    mots = tokenization(mots)
    ps = SnowballStemmer('french')
    for indice in range(len(mots)) : 
        mots[indice] = ps.stem(mots[indice])
    return ' '.join(mots)

def preprocessing_list(liste):
    for i in range(len(liste)):
        liste[i]=preprocessing(liste[i])
    return liste

In [30]:
liste = list((responses[responses['questionId']=='163'])['formattedValue'])

In [0]:
del responses
del data_fiscalisation

In [31]:
corpus = preprocessing_list(liste)

In [32]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
matrix_topic =  vectorizer.fit_transform(corpus).todense() 
vocab =  vectorizer.vocabulary_ 

In [33]:
matrix = np.array(matrix_topic)

In [34]:
# permet de récuperer les indices des mots triés par fréquence
indexes_sorted = np.argsort((matrice.T).sum(axis=1))

# indices des 5 mots les plus importants
max_indexes = indexes_sorted[-200:]

In [None]:
new_dic = {key,vocab[key] for value in max_indexes if }

In [33]:
np.save("matrix_163.npy",matrix)

In [3]:
matrix = np.load("matrix_163.npy")

In [67]:
matrix = matrix[:1000,28000:]

In [60]:
for mot in vocab.keys():
    if 29000 <vocab[mot]  :
        print (mot)

étranger
élément
égal
évolu
équilibr
éven
énerg
être
établ
économ
énonc
éolien
électr
île
équip
électromécaniques
éliminon
énigmat
édit
éduc
élèvent
écrit
écoul
été
étrang
êtr
évalu
économis
émolu
œuvr
écolog
équit
était
état
échapp
équivalent
époqu
échang
élect
étendr
éventail
zéro
élimin
émiss
épargn
économiepuis
étud
élev
équtabl
énerget
élus
éthiqu
échel
évas
écol
évade
étrangl
élarg
éligibil
évident
évit
éventuel
ça
évoqu
évoluent
égar
électorat
électeur
éternel
égalite
électoral
écart
écout
étag
épanou
échelon
énorm
éduqu
étre
zon
émetteur
échappent
étudi
éloign
âgé
âg
yeux
évad
égard
émis
yatch
éclat
écotax
échu
écrir
éradiqu
éligibl
égalitair
épicer
états
œil
étalon
yc
étrger
équivoqu
éco
éhont
éclair
équivaut
évélu
éducationl
âm
énumer
échapper
éditorial
égarementset
étap
évasionl
échouent
élis
étendu
œuvres
âgeil
étion
émouvoir
îl
éman
élu
étrier
çel
équilibrag
électron
érig
édifice
éché
zonepay
évaporationoptimis
étant
éconnom
établiess
étatsancien
étatsl
écolesrestaur
échéa

In [22]:
matrix_topic.shape

(53716, 29638)

In [18]:
def latent_decomposition(k, matrice, dictionnaire):
  topics = []
  u, s, vh = np.linalg.svd(matrice.T, full_matrices=False)
  for index_topic in range(k):
    indices = [i for i, x in enumerate(np.array(u[:,index_topic]>0.1).reshape(u.shape[0]).tolist()) if x]
    word_topic = [x for x in dictionnaire.keys() if (dictionnaire[x] in indices)]
    topics.append(word_topic)
  return topics

def latent_decomposition_from_corpus(corpus,k):
  vectorizer = CountVectorizer()
  matrix_topic =  vectorizer.fit_transform(corpus).todense() 
  vocab =  vectorizer.vocabulary_ 
  return latent_decomposition(k,matrix_topic,vocab)

def latent_decomposition_from_question_id(id,k):
  liste = list((responses[responses['questionId']=='163'])['formattedValue'])
  corpus = preprocessing_list(liste)
  return latent_decomposition_from_corpus(corpus,k)

In [84]:
matrix = np.array(matrix).astype(float)

In [86]:
matrix.T.shape

(1881, 1000)

In [68]:
from scipy.sparse.linalg import svds

In [98]:
u, s, vh = svds((matrix.T), k=5)

In [99]:
indices = [i for i, x in enumerate(np.array(u[:,1]>10e-6).reshape(u.shape[0]).tolist()) if x]
word_topic = [x for x in dictionnaire.keys() if (dictionnaire[x] in indices)]

In [100]:
print(word_topic)

[]


In [101]:
u.shape

(1881, 5)