# Projet 5 - Catégorisez automatiquement des questions

## Importation des librairies et des données

In [18]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.pipeline import make_pipeline
import joblib
import csv
from sklearn import preprocessing

In [91]:
X_df = pd.read_csv('filtered_df.csv', sep = ';', index_col = 0)
X_df.fillna(' ', inplace = True)
X_df['Body'].shape

(34805,)

In [5]:
# Importation de tous les modèles issus de l'analyse du projet
model_final = joblib.load('model_final.plk')
pca = joblib.load('model_pca.plk')
std_scale = joblib.load('std_scale.plk')

In [6]:
# Importation des termes de la matrice tf-idf
feature_name = []
with open('feature_name.csv', 'r') as data:
    for line in csv.reader(data):
        feature_name.append(line)

    # Transforme la liste de listes en liste
feature_names = [item for sublist in feature_name for item in sublist]

In [50]:
# Importation des stopwords
list_sw = []
with open('list_stop_words.csv', 'r') as data:
    for line in csv.reader(data):
        list_sw.append(line)

    # Transforme la liste de listes en liste
sw = [item for sublist in list_sw for item in sublist]
stop_words = set(sw)
len(stop_words)

86969

## Traitement des données et du modèle

In [97]:
# Cleaning, tokenizing and lemmatizing
class LemmaTokenizer:
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        token = RegexpTokenizer(r'[a-zA-Z]+')
        return [self.wnl.lemmatize(t) for t in token.tokenize(doc.lower()) if t not in stop_words]
wnl = WordNetLemmatizer()    
tokenizer = lambda x: [wnl.lemmatize(x) for x in token.tokenize(x.lower()) if x not in stop_words]


#Tf-Idf
count = CountVectorizer(tokenizer=LemmaTokenizer(),
                        stop_words=stop_words, analyzer='word')
tfidf = TfidfTransformer()
pipe = make_pipeline(count, tfidf)

#pipe.fit(X_df)
pipe.fit(X_df['Body'])
X_features = pipe.transform(X_df['Body'])
X_features

<34805x981 sparse matrix of type '<class 'numpy.float64'>'
	with 1055095 stored elements in Compressed Sparse Row format>

In [98]:
mX_feature =  pd.DataFrame(X_features.toarray(), columns = feature_names)
mx_feature

ValueError: Shape of passed values is (34805, 981), indices imply (34805, 977)

## Entrée de l'utilisateur

In [8]:
# User's title input
title = input("Title: ")

Title: 'Hello, these are 3 examples to show the different steps of the cleaning process.'


In [9]:
# User's body imput
body = input("Body: ")

Body: 'Hello, these are 3 examples to show the


In [10]:
# Jointure du titre et du corps pour créer la question
question = title + " " + body

# Créer un dataframe contenant la question
df_question = pd.DataFrame({'Question': [question]})
df_question.head()

Unnamed: 0,Question
0,"'Hello, these are 3 examples to show the diffe..."


## Affichage du tags

In [87]:
features = pipe.transform(df_question)
train_mx =  pd.DataFrame(features.toarray())
train_mx
#X_scaled = std_scale.transform(train_mx)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,971,972,973,974,975,976,977,978,979,980
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [88]:
features = pipe.transform(df_question)
mx_feature = pd.DataFrame(features.toarray())
std_features = std_scale.transform(mx_feature)
acp=pca.transform(std_features)
predicted_tags = model_final.predict(acp)

ValueError: X has 981 features, but this StandardScaler is expecting 977 features as input.