In [361]:
import json
import os
from os import listdir
from os.path import isfile, join
import numpy as np
import pandas as pd
import pathlib
from num2words import num2words
import numpy as np
from scipy.sparse.linalg import svds
from sklearn.feature_extraction.text import CountVectorizer

# Fonctions données

In [59]:
# -*- coding: utf-8 -*-

import pandas as pd
import pathlib


def read_data(filename: str):
    """Load a consultation export and return it as a DataFrame.

    The three timestamp columns (``createdAt``, ``publishedAt``,
    ``updatedAt``) are parsed with ``pd.to_datetime`` and every row whose
    ``trashed`` flag is true is discarded.

    Args:
        filename: path to the json file (must end with ``.json`` and exist)

    Returns:
        pd.DataFrame without the trashed rows
    """
    assert filename.endswith('.json'), 'filename should be a json file'
    assert pathlib.Path(filename).exists(), 'filename does not exists'

    raw = pd.read_json(filename, encoding='UTF-8')

    # Parse each timestamp column from the freshly loaded frame.
    parsed_dates = {
        column: pd.to_datetime(raw[column])
        for column in ('createdAt', 'publishedAt', 'updatedAt')
    }
    with_dates = raw.assign(**parsed_dates)

    # Keep only the rows that have not been trashed.
    not_trashed = ~with_dates['trashed']
    return with_dates[not_trashed]


def extract_responses_by_id(responses: list, key: str='138'):
    """Return the formatted answer given to one question.

    Args:
        responses: list of response dicts (example df.iloc[0].responses),
            each providing 'questionId' and 'formattedValue'
        key: questionId to look for (example '142')

    Returns:
        the 'formattedValue' of the first entry whose 'questionId' equals
        ``key``, or None when no entry matches
    """
    matches = []
    for entry in responses:
        if entry['questionId'] == key:
            matches.append(entry['formattedValue'])

    return matches[0] if matches else None
    
def get_responses(df: pd.DataFrame):
    """Flatten the per-author response lists into one long DataFrame.

    Each row of ``df`` carries a list of response dicts in its ``responses``
    column; every dict becomes one output row.

    Args:
        df: dataframe from read_data

    Returns:
        pd.DataFrame with columns questionId, formattedValue, authorId
        and authorZipCode (index reset)
    """
    frames = []
    for _, row in df.iterrows():
        per_author = pd.DataFrame(row['responses']).filter(
            ['questionId', 'formattedValue'])
        # Tag every answer with the author it belongs to.
        per_author = per_author.assign(authorId=row['authorId'],
                                       authorZipCode=row['authorZipCode'])
        frames.append(per_author)

    return pd.concat(frames, ignore_index=True)

def get_ids_open_reponses(df: pd.DataFrame):
    """Return the ids of the open questions.

    A question counts as open when its raw ``value`` is None or does not
    contain the marker '{"labels"' (i.e. it has no predefined set of
    possible responses). Only the responses of the first row of ``df`` are
    inspected.
    """
    first_row_responses = df.iloc[0].responses

    open_ids = []
    for question in first_row_responses:
        raw_value = question['value']
        if raw_value is None or '{"labels"' not in raw_value:
            open_ids.append(question['questionId'])
    return open_ids

def get_open_reponses(df: pd.DataFrame):
    """Return only the answered open questions.

    Args:
        df: dataframe from read_data

    Returns:
        pd.DataFrame of responses restricted to open question ids and to
        rows whose formattedValue is not null
    """
    all_responses = get_responses(df)
    open_ids = get_ids_open_reponses(df)

    is_open = all_responses['questionId'].isin(open_ids)
    open_responses = all_responses[is_open]

    is_answered = open_responses['formattedValue'].notnull()
    return open_responses[is_answered]

# Preprocessing

In [450]:
def get_stopswords(type="veronis"):
    """Return a list of French stop words.

    Args:
        type: "veronis" (default) returns the hard-coded Veronis list below;
            any other value returns nltk's French stop word list.

    Returns:
        list of str
    """
    if type=="veronis":
        #VERONIS STOPWORDS
        raw_stopword_list = ["un","une","la","le","les","Ap.", "Apr.", "GHz", "MHz", "USD", "a", "afin", "ah", "ai", "aie", "aient", "aies", "ait", "alors", "après", "as", "attendu", "au", "au-delà", "au-devant", "aucun", "aucune", "audit", "auprès", "auquel", "aura", "aurai", "auraient", "aurais", "aurait", "auras", "aurez", "auriez", "aurions", "aurons", "auront", "aussi", "autour", "autre", "autres", "autrui", "aux", "auxdites", "auxdits", "auxquelles", "auxquels", "avaient", "avais", "avait", "avant", "avec", "avez", "aviez", "avions", "avons", "ayant", "ayez", "ayons", "b", "bah", "banco", "ben", "bien", "bé", "c", "c'", "c'est", "c'était", "car", "ce", "ceci", "cela", "celle", "celle-ci", "celle-là", "celles", "celles-ci", "celles-là", "celui", "celui-ci", "celui-là", "celà", "cent", "cents", "cependant", "certain", "certaine", "certaines", "certains", "ces", "cet", "cette", "ceux", "ceux-ci", "ceux-là", "cf.", "cg", "cgr", "chacun", "chacune", "chaque", "chez", "ci", "cinq", "cinquante", "cinquante-cinq", "cinquante-deux", "cinquante-et-un", "cinquante-huit", "cinquante-neuf", "cinquante-quatre", "cinquante-sept", "cinquante-six", "cinquante-trois", "cl", "cm", "cm²", "comme", "contre", "d", "d'", "d'après", "d'un", "d'une", "dans", "de", "depuis", "derrière", "des", "desdites", "desdits", "desquelles", "desquels", "deux", "devant", "devers", "dg", "différentes", "différents", "divers", "diverses", "dix", "dix-huit", "dix-neuf", "dix-sept", "dl", "dm", "donc", "dont", "douze", "du", "dudit", "duquel", "durant", "dès", "déjà", "e", "eh", "elle", "elles", "en", "en-dehors", "encore", "enfin", "entre", "envers", "es", "est", "et", "eu", "eue", "eues", "euh", "eurent", "eus", "eusse", "eussent", "eusses", "eussiez", "eussions", "eut", "eux", "eûmes", "eût", "eûtes", "f", "fait", "fi", "flac", "fors", "furent", "fus", "fusse", "fussent", "fusses", "fussiez", "fussions", "fut", "fûmes", "fût", "fûtes", "g", "gr", "h", "ha", "han", "hein", "hem", "heu", "hg", "hl", 
"hm", "hm³", "holà", "hop", "hormis", "hors", "huit", "hum", "hé", "i", "ici", "il", "ils", "j", "j'", "j'ai", "j'avais", "j'étais", "jamais", "je", "jusqu'", "jusqu'au", "jusqu'aux", "jusqu'à", "jusque", "k", "kg", "km", "km²", "l", "l'", "l'autre", "l'on", "l'un", "l'une", "la", "laquelle", "le", "lequel", "les", "lesquelles", "lesquels", "leur", "leurs", "lez", "lors", "lorsqu'", "lorsque", "lui", "lès", "m", "m'", "ma", "maint", "mainte", "maintes", "maints", "mais", "malgré", "me", "mes", "mg", "mgr", "mil", "mille", "milliards", "millions", "ml", "mm", "mm²", "moi", "moins", "mon", "moyennant", "mt", "m²", "m³", "même", "mêmes", "n", "n'avait", "n'y", "ne", "neuf", "ni", "non", "nonante", "nonobstant", "nos", "notre", "nous", "nul", "nulle", "nº", "néanmoins", "o", "octante", "oh", "on", "ont", "onze", "or", "ou", "outre", "où", "p", "par", "par-delà", "parbleu", "parce", "parmi", "pas", "passé", "pendant", "personne", "peu", "plus", "plus_d'un", "plus_d'une", "plusieurs", "pour", "pourquoi", "pourtant", "pourvu", "près", "puisqu'", "puisque", "q", "qu", "qu'", "qu'elle", "qu'elles", "qu'il", "qu'ils", "qu'on", "quand", "quant", "quarante", "quarante-cinq", "quarante-deux", "quarante-et-un", "quarante-huit", "quarante-neuf", "quarante-quatre", "quarante-sept", "quarante-six", "quarante-trois", "quatorze", "quatre", "quatre-vingt", "quatre-vingt-cinq", "quatre-vingt-deux", "quatre-vingt-dix", "quatre-vingt-dix-huit", "quatre-vingt-dix-neuf", "quatre-vingt-dix-sept", "quatre-vingt-douze", "quatre-vingt-huit", "quatre-vingt-neuf", "quatre-vingt-onze", "quatre-vingt-quatorze", "quatre-vingt-quatre", "quatre-vingt-quinze", "quatre-vingt-seize", "quatre-vingt-sept", "quatre-vingt-six", "quatre-vingt-treize", "quatre-vingt-trois", "quatre-vingt-un", "quatre-vingt-une", "quatre-vingts", "que", "quel", "quelle", "quelles", "quelqu'", "quelqu'un", "quelqu'une", "quelque", "quelques", "quelques-unes", "quelques-uns", "quels", "qui", "quiconque", "quinze", "quoi", 
"quoiqu'", "quoique", "r", "revoici", "revoilà", "rien", "s", "s'", "sa", "sans", "sauf", "se", "seize", "selon", "sept", "septante", "sera", "serai", "seraient", "serais", "serait", "seras", "serez", "seriez", "serions", "serons", "seront", "ses", "si", "sinon", "six", "soi", "soient", "sois", "soit", "soixante", "soixante-cinq", "soixante-deux", "soixante-dix", "soixante-dix-huit", "soixante-dix-neuf", "soixante-dix-sept", "soixante-douze", "soixante-et-onze", "soixante-et-un", "soixante-et-une", "soixante-huit", "soixante-neuf", "soixante-quatorze", "soixante-quatre", "soixante-quinze", "soixante-seize", "soixante-sept", "soixante-six", "soixante-treize", "soixante-trois", "sommes", "son", "sont", "sous", "soyez", "soyons", "suis", "suite", "sur", "sus", "t", "t'", "ta", "tacatac", "tandis", "te", "tel", "telle", "telles", "tels", "tes", "toi", "ton", "toujours", "tous", "tout", "toute", "toutefois", "toutes", "treize", "trente", "trente-cinq", "trente-deux", "trente-et-un", "trente-huit", "trente-neuf", "trente-quatre", "trente-sept", "trente-six", "trente-trois", "trois", "très", "tu", "u", "un", "une", "unes", "uns", "v", "vers", "via", "vingt", "vingt-cinq", "vingt-deux", "vingt-huit", "vingt-neuf", "vingt-quatre", "vingt-sept", "vingt-six", "vingt-trois", "vis-à-vis", "voici", "voilà", "vos", "votre", "vous", "w", "x", "y", "z", "zéro", "à", "ç'", "ça", "ès", "étaient", "étais", "était", "étant", "étiez", "étions", "été", "étée", "étées", "étés", "êtes", "être", "ô"]
    else:
        # `stopwords` was never imported anywhere in the notebook, so this
        # branch used to fail with a NameError; import it where it is needed.
        from nltk.corpus import stopwords
        #get French stopwords from the nltk kit
        raw_stopword_list = stopwords.words('french') #create a list of all French stopwords
    return raw_stopword_list

# Stop words removed from every response (default: the Veronis list).
List_stop_words = get_stopswords()
# TODO: to be updated!

# Characters deleted from a sentence by lister_mots before it is split into
# words. Apostrophes, quotes, hyphens, '[' and '{' are not in this list.
List_ponctuation = [',',';',':','!','?','.','/','%','*','§','^','¨','$','£','=',')','°',']','+','}','_','&','\t','(']


def lister_mots(phrase):
    """Strip punctuation from a sentence and split it into words.

    Every character listed in ``List_ponctuation`` is deleted, except
    whitespace characters (such as the tab), which are turned into a space
    so that the words on either side are not glued together. The sentence
    is then split on any run of whitespace, so newlines act as separators
    and no empty token is produced.

    Args:
        phrase: the raw sentence (str)

    Returns:
        list of str, possibly empty
    """
    for signe in List_ponctuation:
        # A whitespace "punctuation" mark separates words: keep the gap.
        phrase = phrase.replace(signe, " " if signe.isspace() else "")
    # split() without argument handles newlines and repeated spaces.
    return phrase.split()


def enlever_les_stops_words(L):
    """Remove every stop word of ``List_stop_words`` from the list ``L``.

    The list is filtered in place (the same list object is returned), as
    before, but in a single pass using a set lookup instead of repeatedly
    scanning and shrinking the list once per stop word.

    Args:
        L: list of words (str)

    Returns:
        the same list ``L``, without the stop words
    """
    stop_words = set(List_stop_words)
    # Slice assignment keeps the caller's list object and updates it in place.
    L[:] = [mot for mot in L if mot not in stop_words]
    return L

def tokenization(word_list):
    """Normalise a list of words: spell out integers in French, lower-case.

    A word that parses as an integer (e.g. '99') is replaced by its French
    spelling through ``num2words``. If the word is not an integer, or if
    ``num2words`` cannot convert it, the original word is kept. Every token
    is then lower-cased.

    Args:
        word_list: iterable of str

    Returns:
        list of str of the same length as ``word_list``
    """
    tokenized_list = []
    for word in word_list:
        try:
            # Build the spelled-out form in a separate name: the original
            # word must survive a failure of either int() or num2words(),
            # otherwise .lower() below would be called on an int.
            token = num2words(int(word), lang='fr')
        except Exception:
            # Best effort: not a number, or not convertible -> keep as is.
            token = word
        tokenized_list.append(token.lower())
    return tokenized_list


from nltk.stem import SnowballStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

def preprocessing(doc):
    """Turn one raw answer into a string of stemmed, space-separated tokens.

    Steps, in order: split into words (lister_mots), normalise them
    (tokenization), drop the stop words (enlever_les_stops_words), then stem
    each remaining word with the French Snowball stemmer.

    Args:
        doc: the raw answer (str)

    Returns:
        str made of the stemmed tokens joined by single spaces
    """
    tokens = enlever_les_stops_words(tokenization(list(lister_mots(doc))))
    stemmer = SnowballStemmer('french')
    return ' '.join([stemmer.stem(token) for token in tokens])

def preprocessing_list(liste):
    """Apply ``preprocessing`` to every document of ``liste``, in place.

    Args:
        liste: list of raw answers (str); each entry is overwritten by its
            preprocessed version

    Returns:
        the same list object
    """
    for position, document in enumerate(liste):
        liste[position] = preprocessing(document)
    return liste

# Latent semantic analysis

In [190]:
class Response :
    """All the answers given to one question, plus their bag-of-words matrix.

    Attributes set by ``fit``:
        responses: the DataFrame the answers were taken from
        liste: raw answers (formattedValue) to the question
        liste_zip: authorZipCode of each answer, aligned with ``liste``
    Attributes set by ``preprocess``:
        matrix_topic: dense count matrix, one row per answer
        vocab: CountVectorizer vocabulary (word -> column index)
    """
    def __init__(self,questionId):
        self.questionId = questionId

    def fit(self,responses):
        """Select the answers to ``self.questionId`` from the responses table.

        Args:
            responses: DataFrame with columns questionId, formattedValue and
                authorZipCode (as returned by get_open_reponses)
        """
        self.responses = responses
        # The zip codes must come from the same rows as the answers; they
        # used to be read from the hard-coded question '163' regardless of
        # the question requested.
        selection = responses[responses['questionId']==str(self.questionId)]
        self.liste = list(selection['formattedValue'])
        self.liste_zip = list(selection['authorZipCode'])

        print("longueur de liste : "+str(len(self.liste)))
        print("longueur de la liste des zips : "+str(len(self.liste_zip)))

    def fit_from_file(self,filename):
        """Build the response directly from the name of the json file."""
        self.filename = filename
        data = read_data(filename)
        responses = get_open_reponses(data)
        self.fit(responses)

    def preprocess(self):
        """Preprocess the answers and build the document-term count matrix."""
        # preprocessing_list rewrites its argument in place: hand it a copy
        # so that self.liste keeps the raw answers and re-running this
        # method does not preprocess already-stemmed text.
        corpus = preprocessing_list(list(self.liste))
        vectorizer = CountVectorizer()
        # NOTE: the matrix is densified (answers x vocabulary), which is
        # memory hungry for large corpora.
        self.matrix_topic =  np.array(vectorizer.fit_transform(corpus).todense())
        print("taille de la matrice : "+str(self.matrix_topic.shape))
        self.vocab =  vectorizer.vocabulary_

In [464]:
class Response_Analysis_LSA :
    """Latent semantic analysis (truncated SVD) of the answers to a question.

    Args:
        k: number of topics, i.e. number of singular triplets computed
        number_words: number of most frequent words kept for the analysis
    """
    def __init__(self,k=10,number_words=1000):
        self.k = k
        self.number_words =number_words
        self.max_indexes =[]

    @staticmethod
    def _departement_from_zip(zip_code):
        """Map a zip code to its department number (1..95), or 0 if invalid.

        Numeric zip codes lose their leading zero (01000 is stored as 1000),
        so the department is obtained by integer division rather than by
        taking the first two characters. Anything that is not a number in
        [0, 96000) falls in bucket 0.
        """
        try:
            code = int(zip_code)
        except (TypeError, ValueError, OverflowError):
            return 0
        if not 0 <= code < 96000:
            return 0
        return code // 1000

    def fit(self,response):
        """Compute the LSA for ``response``.

        Args:
            response: object exposing ``matrix_topic`` (answers x words count
                matrix), ``vocab`` (word -> column index) and ``liste_zip``
                (one zip code per answer), e.g. a preprocessed Response
        """
        self.response = response
        matrix_topic = response.matrix_topic
        # Word indices sorted by increasing total frequency
        indexes_sorted = np.argsort((matrix_topic.T).sum(axis=1))
        # Indices of the most frequent words
        self.max_indexes = indexes_sorted[-self.number_words:]

        # words x answers matrix restricted to the retained words
        self.matrix  = (matrix_topic.T[self.max_indexes]).astype(float)

        # Use the configured number of topics (it used to be fixed at 10,
        # whatever k was given to the constructor).
        self.u, self.s, self.vh = svds((self.matrix), k=self.k)

    def retrieve_words_topic(self,topic):
        """Return the words of one particular topic.

        A word belongs to the topic when its coefficient in the topic's left
        singular vector is greater than 10e-2.

        Args:
            topic: index of the topic

        Returns:
            list of str
        """
        # Read the vocabulary of the fitted response, not a notebook global.
        vocab = self.response.vocab
        indices_matrice = np.flatnonzero(np.asarray(self.u[:,topic]) > 10e-2)
        # max_indexes is an ndarray after fit: test its length rather than
        # comparing it with [].
        if len(self.max_indexes) > 0 :
            # Rows of u follow max_indexes; map them back to vocabulary ids.
            true_indices = {int(self.max_indexes[i]) for i in indices_matrice}
        else :
            true_indices = {int(i) for i in indices_matrice}
        return [word for word, index in vocab.items() if index in true_indices]

    def get_all_topics(self):
        """Return the list of words of each of the ``k`` topics."""
        return [self.retrieve_words_topic(i) for i in range(self.k)]

    def retrieve_importance_by_region(self,seuil = 10e-4):
        """Share of each department's participants that relate to each topic.

        An answer is counted for a topic when the absolute value of its
        coefficient in ``vh`` exceeds ``seuil``.

        Args:
            seuil: threshold on the absolute coefficient

        Returns:
            list (one entry per topic) of dicts mapping the department number
            (0..95, 0 gathering invalid or out-of-range zip codes) to the
            proportion of that department's participants counted for the
            topic. Also stored in ``self.liste_dictionnaire_proportions``.
        """
        # Department of every answer, read from the fitted response.
        liste_dep = [self._departement_from_zip(code)
                     for code in self.response.liste_zip]

        # Number of participants per department
        dictionnaire_departement = {i: 0 for i in range(0,96)}
        for departement in liste_dep:
            dictionnaire_departement[departement] += 1

        vh = self.vh
        nb_topics = vh.shape[0]

        liste_dictionnaire_proportions = []
        for k in range(nb_topics):
            proportions = {i: 0 for i in range(0,96)}
            # Answers whose coefficient for this topic passes the threshold
            for ind in np.flatnonzero(np.abs(vh[k]) > seuil):
                proportions[liste_dep[ind]] += 1
            # Turn the counts into proportions where there are participants
            for i, total in dictionnaire_departement.items():
                if total != 0:
                    proportions[i] /= total
            liste_dictionnaire_proportions.append(proportions)

        self.liste_dictionnaire_proportions = liste_dictionnaire_proportions
        return(liste_dictionnaire_proportions)

    def return_importance_for_topic(self,topic):
        """Return the per-department ratios of one topic as a DataFrame.

        ``retrieve_importance_by_region`` must have been called first.

        Args:
            topic: index of the topic

        Returns:
            pd.DataFrame with columns 'zip' (department number as str, not
            zero-padded) and 'ratio'
        """
        d = self.liste_dictionnaire_proportions[topic]
        s = pd.Series(d, name='ratio')
        s.index.name = 'zip'
        df =s.reset_index()
        df['zip'] =df['zip'].astype(str)
        return(df)


In [30]:
# Load the JSON export; read_data parses the dates and drops trashed entries.
data_fiscalisation = read_data('LA_FISCALITE_ET_LES_DEPENSES_PUBLIQUES.json')

In [61]:
# Keep only the non-empty answers to open questions.
responses = get_open_reponses(data_fiscalisation)

## Préparation de la réponse

In [459]:
# PARAMETER: id of the question
response = Response(163)

In [460]:
# Select the answers and zip codes for the question; prints both list lengths.
response.fit(responses)

longueur de liste : 53716
longueur de la liste des zips : 53716


In [461]:
# Preprocess the answers and build the count matrix; prints its shape.
response.preprocess()

taille de la matrice : (53716, 29844)


## LSA 

In [465]:
# PARAMETERS:
# k : number of topics
# number_words : number of (most frequent) words used
LSA = Response_Analysis_LSA(k=10,number_words=3000)

In [466]:
# Run the truncated SVD on the most frequent words of the response.
LSA.fit(response)

In [467]:
# One list of words per topic.
topics = LSA.get_all_topics()



In [471]:
# words making up the different topics
print(topics)

[['franc', 'csg', 'retrait', 'supprim', 'revenu'], ['franc', 'entrepris', 'social', 'revenus'], ['franc', 'retrait', 'pai', 'impot', 'revenus', 'tranch'], ['tax', 'retrait', 'pai', 'fiscal'], ['tax', 'franc', 'pai', 'cent', 'impôt'], ['franc', 'retrait', 'pai', 'qu', 'il', 'entrepris', 'fair', 'est'], ['fiscal', 'cent', 'impôt'], ['fiscal', 'entrepris', 'nich', 'lutt', 'évas'], ['pai', 'impôt', 'revenus', 'revenu'], ['tax', 'franc', 'pai', 'fiscal', 'cent', 'impôt', 'qu', 'entrepris', 'un', 'fair', 'social', 'est', 'revenus', 'revenu']]


In [468]:
# PARAMETER: threshold used to decide whether a response is about the topic or not

importance_region = LSA.retrieve_importance_by_region(seuil = 10e-4)

## Folium

In [346]:
import json
import folium
import numpy as np
import geopandas
import os

In [475]:
# PARAMETER: id of the topic

topic = 1
# Two-column frame ('zip', 'ratio') for the chosen topic.
df = LSA.return_importance_for_topic(topic)

# os.path.join with a single argument returns it unchanged: the GeoJSON file
# is looked up relative to the current working directory.
geo_str = os.path.join('geo_fr.json')
gdf = geopandas.read_file(geo_str)

m = folium.Map([47, 2], zoom_start=6)

# NOTE(review): the 'zip' column holds department numbers as unpadded strings
# ('1', '2', ...). If the 'code' property of geo_fr.json is zero-padded
# ('01', '02', ...), departments 1-9 will not be matched — confirm.
# show=False is passed for this layer.
choropleth = folium.Choropleth(
    geo_data=gdf,
    data=df,
    columns=['zip','ratio'],
    key_on='feature.properties.code',
    fill_color='YlGn',
    name='pourcentages',
    show=False,
).add_to(m)

m