In [36]:
import json
import os
from os import listdir
from os.path import isfile, join

import pandas as pd
import pathlib

# Reading and understanding the Data

In [37]:
#The data is contained in the json files: 'DEMOCRATIE_ET_CITOYENNETE.json','LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES.json','LA_TRANSITION_ECOLOGIQUE.json','ORGANISATION_DE_LETAT_ET_DES_SERVICES_PUBLICS.json'
mypath = os.getcwd()
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
onlyfiles

['DEMOCRATIE_ET_CITOYENNETE.json',
 'desktop.ini',
 'LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES.json',
 'LA_TRANSITION_ECOLOGIQUE.json',
 'ORGANISATION_DE_LETAT_ET_DES_SERVICES_PUBLICS.json',
 'Tchibozo_notebook_Grand_Debat.ipynb']

In [38]:
#Example of a post (each post is a dictionary).
with open('LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES.json',encoding="UTF-8") as json_data:
    d = json.load(json_data)
    print(d[1])

{'reference': '3-3', 'title': "Augmentation du pouvoir d'achat", 'createdAt': '2019-01-22 09:36:50', 'publishedAt': '2019-01-22 09:38:01', 'updatedAt': None, 'trashed': False, 'trashedStatus': None, 'authorId': 'VXNlcjo5NmNhYWM4ZS0xZTIwLTExZTktOTRkMi1mYTE2M2VlYjExZTE=', 'authorType': 'Citoyen / Citoyenne', 'authorZipCode': '27500', 'responses': [{'questionId': '162', 'questionTitle': "Quelles sont toutes les choses qui pourraient être faites pour améliorer l'information des citoyens sur l'utilisation des impôts ?", 'value': None, 'formattedValue': None}, {'questionId': '163', 'questionTitle': 'Que faudrait-il faire pour rendre la fiscalité plus juste et plus efficace ?', 'value': None, 'formattedValue': None}, {'questionId': '164', 'questionTitle': "Quels sont selon vous les impôts qu'il faut baisser en priorité ?", 'value': None, 'formattedValue': None}, {'questionId': '206', 'questionTitle': 'Afin de financer les dépenses sociales, faut-il selon vous...', 'value': '{"labels":[],"othe

Each dictionary contains 11 keys:


    -'reference' (string) : This is a reference to each post within the database. i.e '4-6','4-7', ...,'4-196'
    The format is : 'x-y' where x is an int representing which dataset/file the post is from (1 represents 'DEMOCRATIE_ET_CITOYENNETE.json', 2 represents 'LA_TRANSITION_ECOLOGIQUE.json', 3 represents 'LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES.json', 4 represents 'ORGANISATION_DE_LETAT_ET_DES_SERVICES_PUBLICS.json') , and y is the rank of the dictionary within each x file.
    This is useful, it means we can group all the json files together in one large database and we will still be able to know which post came from which file
    
    
    -'title' (string) : The title that the user has chosen for their post. i.e : 'Cesser de tirer sur les retraités !', 'Fiscalité plus juste', 'Gestion participative',
    
    -'createdAt' (Timestamp) : Timestamp of the creation of the post. i.e : '2019-01-24 21:44:13'
    
    -'publishedAt' (Timestamp) : Timestamp of the publication of the post : i.e '2019-01-24 21:44:13'
    NB : Most of the time, the 'publishedAt' time is equal to the 'createdAt' time, but it is not always the case
    
    -'updatedAt' (Timestamp/None) : Timestamp of when the user modified their post.
    
    -'trashed' (True/False) : Boolean of ???
    
    -'trashedStatus' (None/'Visible) : ???
    
    -'authorId' (string) : A 56-character code which uniquely identifies each user. i.e: 'VXNlcjo2Nzc5MjE4OC0xZTIxLTExZTktOTRkMi1mYTE2M2VlYjExZTE='
    
    -'authorType' (string/None) : Represents the type of user: 'Citoyen / Citoyenne', None,'Organisation à but lucratif', 'Organisation à but non lucratif', 'Élu / élue et Institution'
    
    -'authorZipCode' (string) : The Zip code of the user's "Departement". i.e : '92200', '75013','69540'
    NB: There are some mistakes in 'authorZipCode', it seems some users misunderstood what to write (i.e: '3', '0325829580'). This key is not entirely reliable.
    
    -'responses' (dictionary) : Dictionary containing the answer to specific questions. The dictionary has a 'questionId', 'questionTitle', 'value', 'formattedValue'. This is the most interesting part, it contains all of the opinions of the users. And we can filter out by question. There are more than 206 questions

In [4]:
#Examples of responses to the Grand Debat questions
author_type_list = []
for i in range(len(d)):
    author_type_list.append(d[i]['responses'])
author_type_list


[[{'questionId': '162',
   'questionTitle': "Quelles sont toutes les choses qui pourraient être faites pour améliorer l'information des citoyens sur l'utilisation des impôts ?",
   'value': None,
   'formattedValue': None},
  {'questionId': '163',
   'questionTitle': 'Que faudrait-il faire pour rendre la fiscalité plus juste et plus efficace ?',
   'value': None,
   'formattedValue': None},
  {'questionId': '164',
   'questionTitle': "Quels sont selon vous les impôts qu'il faut baisser en priorité ?",
   'value': None,
   'formattedValue': None},
  {'questionId': '206',
   'questionTitle': 'Afin de financer les dépenses sociales, faut-il selon vous...',
   'value': '{"labels":[],"other":"diminuer la CSG de 2 points et en contrepartie, augmenter le taux de TVA de 2 points"}',
   'formattedValue': 'diminuer la CSG de 2 points et en contrepartie, augmenter le taux de TVA de 2 points'},
  {'questionId': '205',
   'questionTitle': "S'il faut selon vous revoir les conditions d'attribution de

# Grouping and Formatting the data

We will join the dictionaries of each of the 4 files, and store them in a DataFrame (our new database)

In [5]:
import pandas as pd
pd.DataFrame(d[0])

Unnamed: 0,reference,title,createdAt,publishedAt,updatedAt,trashed,trashedStatus,authorId,authorType,authorZipCode,responses
0,3-2,TVA sociale,2019-01-22 09:35:18,2019-01-22 09:35:18,,False,,VXNlcjo3ZTVjYTUwMi0xZDZlLTExZTktOTRkMi1mYTE2M2...,Citoyen / Citoyenne,94800,"{'questionId': '162', 'questionTitle': 'Quelle..."
1,3-2,TVA sociale,2019-01-22 09:35:18,2019-01-22 09:35:18,,False,,VXNlcjo3ZTVjYTUwMi0xZDZlLTExZTktOTRkMi1mYTE2M2...,Citoyen / Citoyenne,94800,"{'questionId': '163', 'questionTitle': 'Que fa..."
2,3-2,TVA sociale,2019-01-22 09:35:18,2019-01-22 09:35:18,,False,,VXNlcjo3ZTVjYTUwMi0xZDZlLTExZTktOTRkMi1mYTE2M2...,Citoyen / Citoyenne,94800,"{'questionId': '164', 'questionTitle': 'Quels ..."
3,3-2,TVA sociale,2019-01-22 09:35:18,2019-01-22 09:35:18,,False,,VXNlcjo3ZTVjYTUwMi0xZDZlLTExZTktOTRkMi1mYTE2M2...,Citoyen / Citoyenne,94800,"{'questionId': '206', 'questionTitle': 'Afin d..."
4,3-2,TVA sociale,2019-01-22 09:35:18,2019-01-22 09:35:18,,False,,VXNlcjo3ZTVjYTUwMi0xZDZlLTExZTktOTRkMi1mYTE2M2...,Citoyen / Citoyenne,94800,"{'questionId': '205', 'questionTitle': 'S'il f..."
5,3-2,TVA sociale,2019-01-22 09:35:18,2019-01-22 09:35:18,,False,,VXNlcjo3ZTVjYTUwMi0xZDZlLTExZTktOTRkMi1mYTE2M2...,Citoyen / Citoyenne,94800,"{'questionId': '165', 'questionTitle': 'Quels ..."
6,3-2,TVA sociale,2019-01-22 09:35:18,2019-01-22 09:35:18,,False,,VXNlcjo3ZTVjYTUwMi0xZDZlLTExZTktOTRkMi1mYTE2M2...,Citoyen / Citoyenne,94800,"{'questionId': '166', 'questionTitle': 'Pour q..."
7,3-2,TVA sociale,2019-01-22 09:35:18,2019-01-22 09:35:18,,False,,VXNlcjo3ZTVjYTUwMi0xZDZlLTExZTktOTRkMi1mYTE2M2...,Citoyen / Citoyenne,94800,"{'questionId': '167', 'questionTitle': 'Y a-t-..."


We observe that pandas separates each user's post into several posts, each corresponding to one question. 

We need to choose whether we will group our data by user (in which case all of the questions will be in one row of the dataframe) or by question (in which case each user will appear several times in the database: once for every question he has answered).

We keep the default pandas separation for the time being (see below).

In [6]:
pd.read_json('LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES.json',encoding="UTF-8")

Unnamed: 0,authorId,authorType,authorZipCode,createdAt,publishedAt,reference,responses,title,trashed,trashedStatus,updatedAt
0,VXNlcjo3ZTVjYTUwMi0xZDZlLTExZTktOTRkMi1mYTE2M2...,Citoyen / Citoyenne,94800,2019-01-22 09:35:18,2019-01-22 09:35:18,3-2,"[{'questionId': '162', 'questionTitle': 'Quell...",TVA sociale,False,,
1,VXNlcjo5NmNhYWM4ZS0xZTIwLTExZTktOTRkMi1mYTE2M2...,Citoyen / Citoyenne,27500,2019-01-22 09:36:50,2019-01-22 09:38:01,3-3,"[{'questionId': '162', 'questionTitle': 'Quell...",Augmentation du pouvoir d'achat,False,,
2,VXNlcjo3ZTVjYTUwMi0xZDZlLTExZTktOTRkMi1mYTE2M2...,Citoyen / Citoyenne,94800,2019-01-22 09:38:03,2019-01-22 09:38:03,3-4,"[{'questionId': '162', 'questionTitle': 'Quell...",le patrimoine en or dans l'IFI,False,,
3,VXNlcjpjNDY0ZjllMy0xZDk4LTExZTktOTRkMi1mYTE2M2...,Citoyen / Citoyenne,35000,2019-01-22 09:40:03,2019-01-22 09:40:03,3-6,"[{'questionId': '162', 'questionTitle': 'Quell...",Pouvoir d achat,False,,
4,VXNlcjo3MDdkM2IzOC0xZDYxLTExZTktOTRkMi1mYTE2M2...,Citoyen / Citoyenne,22680,2019-01-22 09:40:53,2019-01-22 09:40:53,3-8,"[{'questionId': '162', 'questionTitle': 'Quell...",droits et devoirs,False,,
5,VXNlcjo3NmI3NTM2MS0xYjI2LTExZTktOTRkMi1mYTE2M2...,Citoyen / Citoyenne,66540,2019-01-22 09:41:00,2019-01-22 09:41:00,3-9,"[{'questionId': '162', 'questionTitle': 'Quell...",Proposition de Refondation du Capitalisme et d...,False,,
6,VXNlcjo2YTgwZTNmYi0xZTIxLTExZTktOTRkMi1mYTE2M2...,,92200,2019-01-22 09:41:58,2019-01-22 09:42:23,3-10,"[{'questionId': '162', 'questionTitle': 'Quell...",Indexation retraites,False,,
7,VXNlcjoxZTNlOTExYi0xZTIwLTExZTktOTRkMi1mYTE2M2...,Citoyen / Citoyenne,13390,2019-01-22 09:42:29,2019-01-22 09:42:29,3-11,"[{'questionId': '162', 'questionTitle': 'Quell...",Réduire la fracture sociale,False,,
8,VXNlcjo1ODljMWRiMy0xZDVhLTExZTktOTRkMi1mYTE2M2...,Citoyen / Citoyenne,18100,2019-01-22 09:42:31,2019-01-22 09:42:31,3-12,"[{'questionId': '162', 'questionTitle': 'Quell...",impots,False,,
9,VXNlcjo0OTUzNmNmYy0xZTIwLTExZTktOTRkMi1mYTE2M2...,Citoyen / Citoyenne,61130,2019-01-22 09:42:36,2019-01-22 09:42:36,3-13,"[{'questionId': '162', 'questionTitle': 'Quell...",PROPOSITION 1,False,,


In [7]:
#The df dataframe will contain all of our structured data
df_fiscalite = pd.read_json('LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES.json',encoding="UTF-8")
df_democratie = pd.read_json('DEMOCRATIE_ET_CITOYENNETE.json',encoding='UTF-8')
df_ecologie = pd.read_json('LA_TRANSITION_ECOLOGIQUE.json',encoding='UTF-8')
df_organisation = pd.read_json('ORGANISATION_DE_LETAT_ET_DES_SERVICES_PUBLICS.json',encoding='UTF-8')

frames = [df_fiscalite,df_democratie,df_ecologie,df_organisation]
df = pd.concat(frames)
df=df[~df.trashed] #We remove the trashed elements
df

Unnamed: 0,authorId,authorType,authorZipCode,createdAt,publishedAt,reference,responses,title,trashed,trashedStatus,updatedAt
0,VXNlcjo3ZTVjYTUwMi0xZDZlLTExZTktOTRkMi1mYTE2M2...,Citoyen / Citoyenne,94800,2019-01-22 09:35:18,2019-01-22 09:35:18,3-2,"[{'questionId': '162', 'questionTitle': 'Quell...",TVA sociale,False,,
1,VXNlcjo5NmNhYWM4ZS0xZTIwLTExZTktOTRkMi1mYTE2M2...,Citoyen / Citoyenne,27500,2019-01-22 09:36:50,2019-01-22 09:38:01,3-3,"[{'questionId': '162', 'questionTitle': 'Quell...",Augmentation du pouvoir d'achat,False,,
2,VXNlcjo3ZTVjYTUwMi0xZDZlLTExZTktOTRkMi1mYTE2M2...,Citoyen / Citoyenne,94800,2019-01-22 09:38:03,2019-01-22 09:38:03,3-4,"[{'questionId': '162', 'questionTitle': 'Quell...",le patrimoine en or dans l'IFI,False,,
3,VXNlcjpjNDY0ZjllMy0xZDk4LTExZTktOTRkMi1mYTE2M2...,Citoyen / Citoyenne,35000,2019-01-22 09:40:03,2019-01-22 09:40:03,3-6,"[{'questionId': '162', 'questionTitle': 'Quell...",Pouvoir d achat,False,,
4,VXNlcjo3MDdkM2IzOC0xZDYxLTExZTktOTRkMi1mYTE2M2...,Citoyen / Citoyenne,22680,2019-01-22 09:40:53,2019-01-22 09:40:53,3-8,"[{'questionId': '162', 'questionTitle': 'Quell...",droits et devoirs,False,,
5,VXNlcjo3NmI3NTM2MS0xYjI2LTExZTktOTRkMi1mYTE2M2...,Citoyen / Citoyenne,66540,2019-01-22 09:41:00,2019-01-22 09:41:00,3-9,"[{'questionId': '162', 'questionTitle': 'Quell...",Proposition de Refondation du Capitalisme et d...,False,,
6,VXNlcjo2YTgwZTNmYi0xZTIxLTExZTktOTRkMi1mYTE2M2...,,92200,2019-01-22 09:41:58,2019-01-22 09:42:23,3-10,"[{'questionId': '162', 'questionTitle': 'Quell...",Indexation retraites,False,,
7,VXNlcjoxZTNlOTExYi0xZTIwLTExZTktOTRkMi1mYTE2M2...,Citoyen / Citoyenne,13390,2019-01-22 09:42:29,2019-01-22 09:42:29,3-11,"[{'questionId': '162', 'questionTitle': 'Quell...",Réduire la fracture sociale,False,,
8,VXNlcjo1ODljMWRiMy0xZDVhLTExZTktOTRkMi1mYTE2M2...,Citoyen / Citoyenne,18100,2019-01-22 09:42:31,2019-01-22 09:42:31,3-12,"[{'questionId': '162', 'questionTitle': 'Quell...",impots,False,,
9,VXNlcjo0OTUzNmNmYy0xZTIwLTExZTktOTRkMi1mYTE2M2...,Citoyen / Citoyenne,61130,2019-01-22 09:42:36,2019-01-22 09:42:36,3-13,"[{'questionId': '162', 'questionTitle': 'Quell...",PROPOSITION 1,False,,


Our database contains 188996 posts.

# Preprocessing - Stopwords

In [7]:
def get_stopswords(type="veronis"):
    '''returns the veronis stopwords in unicode, or if any other value is passed, it returns the default nltk french stopwords'''
    if type=="veronis":
        #VERONIS STOPWORDS
        raw_stopword_list = ["Ap.", "Apr.", "GHz", "MHz", "USD", "a", "afin", "ah", "ai", "aie", "aient", "aies", "ait", "alors", "après", "as", "attendu", "au", "au-delà", "au-devant", "aucun", "aucune", "audit", "auprès", "auquel", "aura", "aurai", "auraient", "aurais", "aurait", "auras", "aurez", "auriez", "aurions", "aurons", "auront", "aussi", "autour", "autre", "autres", "autrui", "aux", "auxdites", "auxdits", "auxquelles", "auxquels", "avaient", "avais", "avait", "avant", "avec", "avez", "aviez", "avions", "avons", "ayant", "ayez", "ayons", "b", "bah", "banco", "ben", "bien", "bé", "c", "c'", "c'est", "c'était", "car", "ce", "ceci", "cela", "celle", "celle-ci", "celle-là", "celles", "celles-ci", "celles-là", "celui", "celui-ci", "celui-là", "celà", "cent", "cents", "cependant", "certain", "certaine", "certaines", "certains", "ces", "cet", "cette", "ceux", "ceux-ci", "ceux-là", "cf.", "cg", "cgr", "chacun", "chacune", "chaque", "chez", "ci", "cinq", "cinquante", "cinquante-cinq", "cinquante-deux", "cinquante-et-un", "cinquante-huit", "cinquante-neuf", "cinquante-quatre", "cinquante-sept", "cinquante-six", "cinquante-trois", "cl", "cm", "cm²", "comme", "contre", "d", "d'", "d'après", "d'un", "d'une", "dans", "de", "depuis", "derrière", "des", "desdites", "desdits", "desquelles", "desquels", "deux", "devant", "devers", "dg", "différentes", "différents", "divers", "diverses", "dix", "dix-huit", "dix-neuf", "dix-sept", "dl", "dm", "donc", "dont", "douze", "du", "dudit", "duquel", "durant", "dès", "déjà", "e", "eh", "elle", "elles", "en", "en-dehors", "encore", "enfin", "entre", "envers", "es", "est", "et", "eu", "eue", "eues", "euh", "eurent", "eus", "eusse", "eussent", "eusses", "eussiez", "eussions", "eut", "eux", "eûmes", "eût", "eûtes", "f", "fait", "fi", "flac", "fors", "furent", "fus", "fusse", "fussent", "fusses", "fussiez", "fussions", "fut", "fûmes", "fût", "fûtes", "g", "gr", "h", "ha", "han", "hein", "hem", "heu", "hg", "hl", "hm", "hm³", "holà", "hop", "hormis", "hors", "huit", "hum", "hé", "i", "ici", "il", "ils", "j", "j'", "j'ai", "j'avais", "j'étais", "jamais", "je", "jusqu'", "jusqu'au", "jusqu'aux", "jusqu'à", "jusque", "k", "kg", "km", "km²", "l", "l'", "l'autre", "l'on", "l'un", "l'une", "la", "laquelle", "le", "lequel", "les", "lesquelles", "lesquels", "leur", "leurs", "lez", "lors", "lorsqu'", "lorsque", "lui", "lès", "m", "m'", "ma", "maint", "mainte", "maintes", "maints", "mais", "malgré", "me", "mes", "mg", "mgr", "mil", "mille", "milliards", "millions", "ml", "mm", "mm²", "moi", "moins", "mon", "moyennant", "mt", "m²", "m³", "même", "mêmes", "n", "n'avait", "n'y", "ne", "neuf", "ni", "non", "nonante", "nonobstant", "nos", "notre", "nous", "nul", "nulle", "nº", "néanmoins", "o", "octante", "oh", "on", "ont", "onze", "or", "ou", "outre", "où", "p", "par", "par-delà", "parbleu", "parce", "parmi", "pas", "passé", "pendant", "personne", "peu", "plus", "plus_d'un", "plus_d'une", "plusieurs", "pour", "pourquoi", "pourtant", "pourvu", "près", "puisqu'", "puisque", "q", "qu", "qu'", "qu'elle", "qu'elles", "qu'il", "qu'ils", "qu'on", "quand", "quant", "quarante", "quarante-cinq", "quarante-deux", "quarante-et-un", "quarante-huit", "quarante-neuf", "quarante-quatre", "quarante-sept", "quarante-six", "quarante-trois", "quatorze", "quatre", "quatre-vingt", "quatre-vingt-cinq", "quatre-vingt-deux", "quatre-vingt-dix", "quatre-vingt-dix-huit", "quatre-vingt-dix-neuf", "quatre-vingt-dix-sept", "quatre-vingt-douze", "quatre-vingt-huit", "quatre-vingt-neuf", "quatre-vingt-onze", "quatre-vingt-quatorze", "quatre-vingt-quatre", "quatre-vingt-quinze", "quatre-vingt-seize", "quatre-vingt-sept", "quatre-vingt-six", "quatre-vingt-treize", "quatre-vingt-trois", "quatre-vingt-un", "quatre-vingt-une", "quatre-vingts", "que", "quel", "quelle", "quelles", "quelqu'", "quelqu'un", "quelqu'une", "quelque", "quelques", "quelques-unes", "quelques-uns", "quels", "qui", "quiconque", "quinze", "quoi", "quoiqu'", "quoique", "r", "revoici", "revoilà", "rien", "s", "s'", "sa", "sans", "sauf", "se", "seize", "selon", "sept", "septante", "sera", "serai", "seraient", "serais", "serait", "seras", "serez", "seriez", "serions", "serons", "seront", "ses", "si", "sinon", "six", "soi", "soient", "sois", "soit", "soixante", "soixante-cinq", "soixante-deux", "soixante-dix", "soixante-dix-huit", "soixante-dix-neuf", "soixante-dix-sept", "soixante-douze", "soixante-et-onze", "soixante-et-un", "soixante-et-une", "soixante-huit", "soixante-neuf", "soixante-quatorze", "soixante-quatre", "soixante-quinze", "soixante-seize", "soixante-sept", "soixante-six", "soixante-treize", "soixante-trois", "sommes", "son", "sont", "sous", "soyez", "soyons", "suis", "suite", "sur", "sus", "t", "t'", "ta", "tacatac", "tandis", "te", "tel", "telle", "telles", "tels", "tes", "toi", "ton", "toujours", "tous", "tout", "toute", "toutefois", "toutes", "treize", "trente", "trente-cinq", "trente-deux", "trente-et-un", "trente-huit", "trente-neuf", "trente-quatre", "trente-sept", "trente-six", "trente-trois", "trois", "très", "tu", "u", "un", "une", "unes", "uns", "v", "vers", "via", "vingt", "vingt-cinq", "vingt-deux", "vingt-huit", "vingt-neuf", "vingt-quatre", "vingt-sept", "vingt-six", "vingt-trois", "vis-à-vis", "voici", "voilà", "vos", "votre", "vous", "w", "x", "y", "z", "zéro", "à", "ç'", "ça", "ès", "étaient", "étais", "était", "étant", "étiez", "étions", "été", "étée", "étées", "étés", "êtes", "être", "ô"]
    else:
        #get French stopwords from the nltk kit
        raw_stopword_list = stopwords.words('french') #create a list of all French stopwords
    #stopword_list = [word.decode('utf8x') for word in raw_stopword_list] #make to decode the French stopwords as unicode objects rather than ascii
    return raw_stopword_list

In [11]:
List_stop_words = get_stopswords()

In [32]:
List_ponctuation=[',',';',':','!','?','.','/','%','*','§','^','¨','$','£','=',')','°',']','+','}','_','&']


def lister_mots(phrase):
    for i in range (len(List_ponctuation)): #Enleve la ponctuation de la phrase
        if List_ponctuation[i] in phrase:
            phrase = phrase.replace(List_ponctuation[i],"")
    List_mots = phrase.split(" ")
    return List_mots



def enlever_les_stops_words(L):
    for mot in List_stop_words:
        if mot in L:
            L.remove(mot)
    return L

In [33]:
reponse='Jaime supprimer les mots inutiles des phrases! !'
mots = list(lister_mots(reponse))
mots = enlever_les_stops_words(mots)
print(mots)

Jaime supprimer les mots inutiles des phrases 
['Jaime', 'supprimer', 'mots', 'inutiles', 'phrases', '']


# Preprocessing - Tokenization

We put the words in lower case and convert the numbers (ints) to their text form

In [4]:
#WARNING you must have the num2words library installed
from num2words import num2words
num2words(169, lang='fr')

'cent soixante-neuf'

In [40]:
def tokenization(word_list):
    tokenized_list = []
    for word in word_list:
        try:
            word = int(word) #if the word is the string of a number (i.e '99'), we convert it back to an int
            word = num2words(word,lang='fr')
        except:
            pass
        word = word.lower() #We put the word in lower case
        tokenized_list.append(word)
    return tokenized_list

tokenization('Améliorer le pouvoir d’achat en augmentant les salaires et notamment en montant le SMIC à 1500 €.'.split(' '))
        
        
        
        

['améliorer',
 'le',
 'pouvoir',
 'd’achat',
 'en',
 'augmentant',
 'les',
 'salaires',
 'et',
 'notamment',
 'en',
 'montant',
 'le',
 'smic',
 'à',
 'mille cinq cents',
 '€.']

# Preprocessing - Stemming

In [34]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

ps = FrenchStemmer()
ps.stem('livres')

'livr'

# Embedding