# Objectif

Dans ce notebook nous allons aborder la construction de notre API. Ce notebook sera le point d'entrée vers la conception de notre API qui sera stocké dans le dossier API.

## Import du modèle retenu

In [1]:
import joblib
import os
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

workingDir = os.getcwd()
filePath = os.path.join(workingDir,'API/model/tags_lr_compressed.joblib')
# load
mPredict = joblib.load(filePath)

filePath = os.path.join(workingDir,'API/model/tfidf.joblib')
vectorizer =  joblib.load(filePath)

In [2]:
from sklearn.preprocessing import MultiLabelBinarizer
filePath = os.path.join(workingDir,'API/model/multilabel.joblib')
multilabel = joblib.load(filePath)


In [3]:
mPredict

OneVsRestClassifier(estimator=LogisticRegression(C=10))

## Import d'un jeu de donnée

In [4]:
import pandas as pd
import numpy as np
workingDir = os.getcwd()
dataDir = os.path.join(workingDir,'Data','posts')
filePath = os.path.join(dataDir,'QueryResults_03.csv')
df = pd.read_csv(filePath,sep=',')

In [5]:
df.columns

Index(['Id', 'PostTypeId', 'AcceptedAnswerId', 'ParentId', 'CreationDate',
       'DeletionDate', 'Score', 'ViewCount', 'Body', 'OwnerUserId',
       'OwnerDisplayName', 'LastEditorUserId', 'LastEditorDisplayName',
       'LastEditDate', 'LastActivityDate', 'Title', 'Tags', 'AnswerCount',
       'CommentCount', 'FavoriteCount', 'ClosedDate', 'CommunityOwnedDate',
       'ContentLicense', 'rank'],
      dtype='object')

In [6]:
dfQuestions = df[['Title','Body']]
dfTags = df.Tags

In [7]:
dfQuestions.head()

Unnamed: 0,Title,Body
0,Is it possible to use C# Object Initializers w...,<p>I'm looking at the new object initializers ...
1,Excel Conditional Formatting Self Reference,<p>I'm trying to do some conditional formattin...
2,Best way to check for current date in where cl...,<p>I'm trying to find out the most efficient (...
3,Detect an internet connection activation with ...,<p>I've been using a 3G wireless card for a wh...
4,Improve asp script performance that takes 3+ m...,<p>I use an SQL statement to remove records th...


In [8]:
dfTags.head()

0                                    <c#-3.0><factory>
1    <excel><worksheet-function><conditional-format...
2          <sql><tsql><stored-procedures><performance>
3       <delphi><winapi><wininet><internet-connection>
4                                   <sql><asp-classic>
Name: Tags, dtype: object

In [9]:
def Tag_Transform(x):
    _str = x.replace(' ', '-').replace('<','').replace('>',' ').strip(' ')
    return _str.split()

In [10]:
dfTags = dfTags.apply(Tag_Transform)

In [11]:
dfTags.head()

0                                    [c#-3.0, factory]
1    [excel, worksheet-function, conditional-format...
2          [sql, tsql, stored-procedures, performance]
3       [delphi, winapi, wininet, internet-connection]
4                                   [sql, asp-classic]
Name: Tags, dtype: object

In [12]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import MWETokenizer
from nltk.stem import WordNetLemmatizer
import gensim
from gensim.models.phrases import Phrases

In [13]:
tokenizer = MWETokenizer()
tokenizer.add_mwe(('c', '#'))

In [14]:
tokenizer.tokenize(word_tokenize(dfQuestions.Title[0].lower()))

['is',
 'it',
 'possible',
 'to',
 'use',
 'c_#',
 'object',
 'initializers',
 'with',
 'factories']

In [15]:
new_stop_words=['would','want', 'please', 'help', 'can', "can't",'shall','thanks','thank','may',
                'seem','understand','error','warning','require','rather']
stop_words = stopwords.words('english')
stop_words.extend(new_stop_words)
stop_words = set(stop_words)
lemmatize=WordNetLemmatizer()
tokenizer = MWETokenizer()
tokenizer.add_mwe(('c', '#'))

file_path = os.path.join(workingDir,'API','model','bigram.pkl')
bigram_mod = Phrases.load(file_path)

def clean_text(text):
    clean_text = re.sub(r'[^A-Za-z0-9+#.\-]',' ',text.lower())
    words=word_tokenize(str(clean_text.lower()))
    words = tokenizer.tokenize(words)
    
    clean_words = [str(lemmatize.lemmatize(j)) for j in words if j not in stop_words]
    bigram_words = bigram_mod[clean_words]
    
    clean_text = ' '.join(bigram_words)
    clean_text = clean_text.replace('c_#','c#')
    
    return clean_text.strip()

In [16]:
def clean_body(body):
    _txt = BeautifulSoup(body).get_text()
    return clean_text(_txt)

In [17]:
dfFinal = dfQuestions.Title.apply(clean_text) +  dfQuestions.Body.apply(clean_body) 

In [18]:
dfFinal.head()

0    possible use c# object initializers factoryloo...
1    excel conditional formatting self referencetry...
2    best way check current date clause sql querytr...
3    detect internet connection activation delphius...
4    improve asp script performance take 3+ minute ...
dtype: object

In [19]:
X_tfidf = vectorizer.transform(dfFinal)

In [20]:
X_tfidf

<5000x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 137579 stored elements in Compressed Sparse Row format>

In [21]:
yPred = mPredict.predict(X_tfidf)

In [22]:
mPredict.classes_

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])

In [23]:
multilabel.classes_

array(['.net', 'actionscript-3', 'ajax', 'algorithm', 'android', 'apache',
       'apache-flex', 'arrays', 'asp.net', 'asp.net-mvc', 'asp.net-mvc-2',
       'bash', 'bug', 'c', 'c#', 'c++', 'class', 'cocoa', 'cocoa-touch',
       'css', 'database', 'debugging', 'delphi', 'design-patterns',
       'discussion', 'django', 'eclipse', 'email', 'entity-framework',
       'events', 'excel', 'facebook', 'feature-request', 'file', 'flash',
       'forms', 'generics', 'git', 'google-app-engine', 'hibernate',
       'html', 'http', 'image', 'internet-explorer', 'iphone', 'java',
       'javascript', 'jquery', 'json', 'linq', 'linq-to-sql', 'linux',
       'macos', 'multithreading', 'mysql', 'nhibernate', 'objective-c',
       'oop', 'oracle', 'parsing', 'performance', 'perl', 'php', 'python',
       'qt', 'regex', 'ruby', 'ruby-on-rails', 'security', 'sharepoint',
       'silverlight', 'spring', 'sql', 'sql-server', 'sql-server-2005',
       'sql-server-2008', 'sqlite', 'string', 'support', 'svn

In [24]:
yPred[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [25]:
dfRes = pd.DataFrame(yPred)

In [26]:
dfRes.columns = multilabel.classes_

In [31]:
dfFinal['Tag_pred'] = multilabel.inverse_transform(yPred)

In [32]:
dfFinal['Tag_orig'] = dfTags

In [36]:
dfTags

0                                       [c#-3.0, factory]
1       [excel, worksheet-function, conditional-format...
2             [sql, tsql, stored-procedures, performance]
3          [delphi, winapi, wininet, internet-connection]
4                                      [sql, asp-classic]
                              ...                        
4995                                 [javascript, jquery]
4996                                      [c#, mono, xna]
4997                   [wpf, animation, navigationwindow]
4998                           [apache-flex, flexbuilder]
4999                             [python, math, nth-root]
Name: Tags, Length: 5000, dtype: object

In [37]:
 multilabel.inverse_transform(yPred)

[('c#',),
 ('excel',),
 ('sql',),
 (),
 ('sql', 'sql-server'),
 (),
 (),
 (),
 (),
 (),
 ('unit-testing',),
 ('debugging', 'javascript', 'visual-studio'),
 ('c++',),
 (),
 ('.net',),
 ('forms', 'html'),
 ('windows',),
 (),
 ('java',),
 (),
 ('regex',),
 (),
 (),
 ('asp.net',),
 ('asp.net-mvc', 'javascript'),
 (),
 ('c++',),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 ('javascript',),
 (),
 (),
 ('c++',),
 ('.net',),
 ('oop',),
 ('html', 'javascript'),
 (),
 (),
 ('c#',),
 ('php', 'regex'),
 (),
 ('eclipse',),
 (),
 (),
 (),
 (),
 (),
 (),
 (),
 ('c#',),
 ('c++', 'oop'),
 (),
 (),
 ('asp.net', 'winforms'),
 ('css', 'javascript'),
 (),
 ('database',),
 (),
 (),
 (),
 ('c#',),
 ('asp.net',),
 (),
 (),
 (),
 ('python',),
 ('.net', 'windows'),
 ('ruby-on-rails',),
 (),
 ('c#', 'generics'),
 (),
 (),
 ('java', 'swing'),
 ('c#', 'multithreading'),
 ('c#',),
 ('windows',),
 (),
 ('wcf', 'web-services'),
 ('unit-testing',),
 (),
 (),
 ('database',),
 (),
 ('.net',),
 ('database',),
 ('wpf',),
 ('php',)