In [1]:
import xml.etree.ElementTree as ET

In [2]:
def lireDocs():
    docs = {}
    
    fichier_tree = ET.parse("corpusIrisVersion4.xml")
    fichier_docs = fichier_tree.getroot()
    compteur_docs = 0
    
    for document in fichier_docs:
        compteur_docs += 1
        doc_id = document.attrib['id']
        questions = ''
        answers = ''
        doc_class = document.attrib['class']
        doc_subclass = document.attrib['subclass']
        for child in document:
            if child.tag == 'question':
                questions += child.text.rstrip()
            
            if child.tag == 'answer':
                answers += child.text.rstrip()
                
        doc = {}
        doc['id']= doc_id
        doc['class'] = doc_class
        doc['subclass'] = doc_subclass
        doc['questions'] = questions
        doc['answers'] = answers
        docs[doc_id] = doc
        
    print("Total docs : ", compteur_docs)
        
    return docs

In [3]:
documents_Corpus = lireDocs()

Total docs :  12308


In [4]:
import nltk

from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

from nltk.corpus import stopwords
stopWords = set(stopwords.words('french'))

# Exemple en bas
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer("french")

In [5]:
def preprocessingDocs(documents):
    print("Preprocessing documents")
    for element in documents:
        document = documents[element]
        
        # Suppression de signes de ponctuation avec tokenizer
        questions = document['questions']
        q_words = ' '.join(tokenizer.tokenize(questions))
        document['words_questions'] = q_words
        answers = document['answers']
        aw_words = ' '.join(tokenizer.tokenize(answers))
        document['words_answers'] = aw_words
        
    mots = [[]]
    for element in documents:
        document = documents[element]
        q_texte = document['words_questions']
        questionPreprocess = []
        
        for mot in tokenizer.tokenize(q_texte):
            if mot.lower() not in stopWords:
                mot_stem = stemmer.stem(mot.lower())       
                questionPreprocess.append(mot_stem)
                mots.append([(mot_stem)])
        document['words_questions'] = ' '.join(questionPreprocess)
        
        aw_texte = document['words_answers']
        answerPreprocess = []
        
        for mot in tokenizer.tokenize(aw_texte):
            if mot.lower() not in stopWords:
                mot_stem = stemmer.stem(mot.lower())
                answerPreprocess.append(mot_stem)
                
        document['words_answers'] = ' '.join(answerPreprocess)
        
    return mots

In [6]:
mots_questions_Corpus = preprocessingDocs(documents_Corpus)

Preprocessing documents


In [7]:
mots_questions_Corpus

[[],
 ['bonjour'],
 ['depuis'],
 ['loi'],
 ['alur'],
 ['tout'],
 ['le'],
 ['copropriet'],
 ['devront'],
 ['être'],
 ['immatricul'],
 ['avant'],
 ['2019'],
 ['registr'],
 ['copropriet'],
 ['consult'],
 ['tout'],
 ['mond'],
 ['cet'],
 ['immatricul'],
 ['échelon'],
 ['suiv'],
 ['nombr'],
 ['lot'],
 ['principal'],
 ['200'],
 ['lot'],
 ['entre'],
 ['50'],
 ['200'],
 ['lot'],
 ['moin'],
 ['50'],
 ['lot'],
 ['premi'],
 ['immatricul'],
 ['factur'],
 ['le'],
 ['mis'],
 ['jour'],
 ['annuel'],
 ['inclus'],
 ['forf'],
 ['syndic'],
 ['professionnel'],
 ['hor'],
 ['cet'],
 ['factur'],
 ['hor'],
 ['budget'],
 ['prévisionnel'],
 ['conduit'],
 ['frais'],
 ['factur'],
 ['factur'],
 ['tré'],
 ['variabl'],
 ['80'],
 ['4000'],
 ['lor'],
 ['vérif'],
 ['compt'],
 ['surtout'],
 ['cas'],
 ['désign'],
 ['renouvel'],
 ['syndic'],
 ['fait'],
 ['bien'],
 ['attent'],
 ['le'],
 ['contrat'],
 ['typ'],
 ['post'],
 ['7'],
 ['2'],
 ['7'],
 ['autr'],
 ['prestat'],
 ['immatricul'],
 ['initial'],
 ['syndicat'],
 ['le'],
 [

In [34]:
question_utilisateur = ['Je veux annuler une caution solidaire']

In [35]:
def getWordsQuestions(documents):
    liste_questions_stems = []
    for element in documents:
        document = documents[element]       
        liste_questions_stems.append(document['words_questions'])
    return liste_questions_stems

In [36]:
liste_questions = getWordsQuestions(documents_Corpus)

In [37]:
liste_questions[2]

'bonjour problémat concern ancien log dont compagn locatair 14 11 2016 30 03 2018 lor départ provis charg 20 a déduit rembours caution agenc conform disposit vigueur jour reçu aucun régularis charg le anné 2016 2017 2018 contact syndic afin demand dat laquel le assembl général valid compt lieu il souhaitent répondr prétext propriétair égal contact agenc presqu mainten 10 repris depuis départ il indiquent il inform autr fait troubl appris compteur eau install 2017 alor inform person a relev le index comment débloqu situat exist procédur simplifi peut demand dommag intérêt retard merc aid bonjour merc répons envoi demand rar sort log agenc charg locat log 10 avril 2018 attend encor répons bonjour tous effet malheur envoi courri bailleur agenc don trait tout transact fait quelqu découvert malheur regard si pouv trouv numéro propriétair internet afin inform situat malheur sembl societ serv achet log liquid début 2018 sav comment peux retrouv nouvel adress postal propriétair tout cas merc t

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics, feature_extraction

In [39]:
vectorizer = TfidfVectorizer()
TfIdfQuestions = vectorizer.fit_transform(liste_questions)

In [40]:
print(TfIdfQuestions.shape)

(12308, 33992)


In [41]:
TfIdfQuestions_utilisateur = vectorizer.transform(question_utilisateur, True)

In [None]:
class = 'Entreprise'

In [1]:
for question_utilisateurIndex in range (len(question_utilisateur)):
    print("Question utilisateur = ",question_utilisateur[question_utilisateurIndex])

    bestQuestionScore = None
    bestQuestionIndex = None

    for questionIndex in range (len(liste_questions)):
        
        #print("    Question: ",liste_questions[questionIndex])
    
        simScore = metrics.pairwise.cosine_similarity(TfIdfQuestions[questionIndex], TfIdfQuestions_utilisateur[question_utilisateurIndex])
        #print ("        simScore: ",simScore)
        if not bestQuestionScore or simScore > bestQuestionScore:
            bestQuestionScore = simScore
            bestQuestionIndex = questionIndex

    print("  ===> Best Question: ")
    if (bestQuestionIndex):
        index = iris + str(bestQuestionIndex)
           
        print("       QuestionIndex: ",bestQuestionIndex," => ", documents_Corpus.get(index))
        #

NameError: name 'question_utilisateur' is not defined