# Projet 5 - Catégorisez automatiquement des questions

## Importation des librairies et des données

In [8]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.pipeline import make_pipeline
import joblib
import csv
from sklearn import preprocessing

In [88]:
# Load the initial dataset (semicolon-separated, first column used as the index)
# and replace every missing value with a single space.
df = pd.read_csv('filtered_df.csv', sep=';', index_col=0).fillna(' ')
df['Body'].shape

(34802,)

In [86]:
# Load the tags array from disk
TAGS_PATH = 'tags.npy'
tags = np.load(TAGS_PATH)

In [11]:
# Load every fitted object produced by the project analysis, in this order:
# the final model, the PCA and the standard scaler.
# NOTE(review): joblib.load unpickles the files, so only load files from a trusted source.
model_final, pca, std_scale = [
    joblib.load(path)
    for path in ('model_final.plk', 'model_pca.plk', 'std_scale.plk')
]

In [13]:
# Load the stop words from the CSV file.
# newline='' lets the csv module handle line endings itself, as its
# documentation requires for file objects passed to csv.reader.
with open('list_stop_words.csv', 'r', newline='') as data:
    list_sw = list(csv.reader(data))

# Flatten the list of rows into a single list of words, then deduplicate it
sw = [item for sublist in list_sw for item in sublist]
stop_words = set(sw)
len(stop_words)

88373

## Traitement des données et du modèle

In [36]:
# Cleaning, tokenizing and lemmatizing
class LemmaTokenizer:
    """Callable tokenizer that lowercases, tokenizes, filters and lemmatizes a document.

    Tokens are the runs of ASCII letters found in the lowercased document.
    Tokens present in the module-level ``stop_words`` set are dropped, and the
    remaining ones are lemmatized with a ``WordNetLemmatizer``.
    """

    def __init__(self):
        self.wnl = WordNetLemmatizer()
        # Built once here instead of on every call (the tokenizer is invoked once per document)
        self.token = RegexpTokenizer(r'[a-zA-Z]+')

    def __call__(self, doc):
        """Return the list of lemmas of the non-stop-word tokens of ``doc``.

        The stop-word test is applied to the raw lowercased token, i.e. before
        lemmatization.
        """
        return [self.wnl.lemmatize(t) for t in self.token.tokenize(doc.lower()) if t not in stop_words]
wnl = WordNetLemmatizer()
# Tokenizer used by the function below; it keeps runs of ASCII letters only.
_word_tokenizer = RegexpTokenizer(r'[a-zA-Z]+')


def tokenizer(doc):
    """Return the lemmas of the tokens of ``doc`` that are not in ``stop_words``.

    The document is lowercased, split into runs of ASCII letters, filtered
    against the module-level ``stop_words`` set (before lemmatization) and
    lemmatized with the module-level ``wnl`` lemmatizer.
    """
    return [wnl.lemmatize(t) for t in _word_tokenizer.tokenize(doc.lower()) if t not in stop_words]


# Tf-Idf
# stop_words is passed as a list (its documented type); recent scikit-learn
# versions validate this parameter. Sorting only makes the argument deterministic,
# the vectorizer treats it as a set of words either way.
count = CountVectorizer(tokenizer=LemmaTokenizer(),
                        stop_words=sorted(stop_words), analyzer='word')
tfidf = TfidfTransformer()
pipe = make_pipeline(count, tfidf)


# Fit and transform in one pass so the corpus is tokenized and lemmatized only once
X_features = pipe.fit_transform(df['Body'])

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# when it exists and fall back to the old method on older versions.
if hasattr(count, 'get_feature_names_out'):
    feature_names = count.get_feature_names_out().tolist()
else:
    feature_names = count.get_feature_names()
len(feature_names)

997

## Entrée de l'utilisateur

In [161]:
# User's title input: read the question title interactively from standard input
title = input("Title: ")

Title: Interesting download error when trying to download html canvas as image with button


In [162]:
# User's body input: read the question body interactively from standard input
body = input("Body: ")

Body: When I first click the button it doesn't work. On my second click it downloads 1 picture. My 3rd click it downloads 2 pictures. On my 4th click it downloads 3 pictures. So 1-0, 2-1, 3-2, 4-3. They are also downloaded immediately, it doesn't ask where to save.  js:    function xyz(){   const text =canvas.api.getCanvasAsImage();   const download = document.getElementById('download');   download.addEventListener('click', function(e) {   var link = document.createElement('a');   link.download = 'download.png';   link.href = text;   link.click();   link.delete; }); } html:  <button  onclick="xyz()" id="download">Download</button> I have just started learning javascript. I'm trying to learn by examining an application. I did not understand why these is happening and therefore could not solve the problem.


## Affichage du tag

In [164]:
# Join the title and the body, separated by a space, to build the question
question = f"{title} {body}"

# Wrap the question in a one-row dataframe
df_question = pd.DataFrame({'Question': [question]})

# Cleaning and processing: vectorize with the fitted pipeline, rebuild a dense
# dataframe labelled with the feature names, then apply the scaler and the PCA
features = pipe.transform(df_question['Question'])
mx_feature = pd.DataFrame(data=features.toarray(), columns=feature_names)
std_features = std_scale.transform(mx_feature)
acp = pca.transform(std_features)

# Prediction
predicted_tags = model_final.predict(acp)

# Display the tag: the prediction is used to index the tags array
print(tags[predicted_tags])

['ruby-on-rails']
