# Projet 5 - Catégorisez automatiquement des questions

## Importation des librairies et des données

In [18]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.pipeline import make_pipeline
import joblib
import csv
from sklearn import preprocessing

In [91]:
X_df = pd.read_csv('filtered_df.csv', sep = ';', index_col = 0)
X_df.fillna(' ', inplace = True)
X_df['Body'].shape

(34805,)

In [5]:
# Importation de tous les modèles issus de l'analyse du projet
model_final = joblib.load('model_final.plk')
pca = joblib.load('model_pca.plk')
std_scale = joblib.load('std_scale.plk')

In [106]:
# Importation des termes de la matrice tf-idf
feature_name = []
with open('feature_name.csv', 'r') as data:
    for line in csv.reader(data):
        feature_name.append(line)

    # Transforme la liste de listes en liste
feature_names = [item for sublist in feature_name for item in sublist]
len(feature_names)

977

In [118]:
imported_features = ['able', 'accept', 'access', 'according','account', 'achieve', 'across', 'action', 'active', 'activity', 'activitythread', 'actual', 'actually', 'add', 'added','adding', 'address', 'admin', 'advance', 'ajax', 'alert', 'algorithm',
 'align', 'alloc', 'allow', 'allowed', 'already', 'also', 'alternative', 'always', 'android', 'androidruntime', 'angular', 'animation', 'annotation', 'another', 'answer', 'anyone', 'anything', 'apache', 'api', 'apk', 'app', 'appear', 'appears', 'append', 'apple', 'application', 'apply', 'appreciated', 'approach', 'apps', 'area', 'arg', 'args', 'argument', 'around', 'array', 'arraylist', 'article', 'artifactid', 'asp', 'assembly', 'assert', 'assume', 'async', 'attempt', 'attr', 'attribute', 'auth', 'authentication', 'auto', 'automatically', 'available', 'avoid', 'await',
 'b', 'back', 'background', 'bad', 'bar', 'base', 'based', 'basic', 'basically', 'bean', 'begin', 'behavior', 'best', 'better', 'big', 'bin', 'binary', 'bind', 'binding', 'bit', 'bitmap','black', 'block', 'blog', 'body', 'book', 'bool', 'boolean','boost','boot', 'bootstrap', 'border', 'bottom', 'box', 'br', 'branch', 'break', 'browser', 'buffer', 'bug', 'build', 'builder', 'building', 'built', 'bundle', 'button', 'byte', 'c', 'cache', 'call', 'callback', 'called', 'calling','cannot', 'canvas', 'card', 'case', 'cast', 'catch', 'category', 'cause', 'cell', 'center', 'certain', 'certificate', 'change', 'changed', 'changing', 'char', 'character', 'check', 'checked', 'checking', 'child', 'chrome', 'class', 'clean', 'clear', 'click', 'client', 'close', 'code', 'col', 'collection', 'color', 'column', 'com', 'come', 'command','comment', 'commit', 'common', 'compare', 'compile', 'compiler', 'complete', 'component', 'condition', 'config', 'configuration', 'connect', 'connection',
 'console', 'const', 'constant', 'constraint', 'constructor', 'contain', 'container', 'contains', 'content', 'context', 'control', 'controller', 'convert', 'copy', 'core', 'correct', 'correctly', 'could', 'count', 'counter', 'course', 'cout', 'cpp', 'crash', 'create', 'created', 'creating', 'crypto', 'cs', 'csv', 'current', 'currently', 'cursor', 'custom', 'data', 'database', 'dataframe', 'date', 'datetime', 'day','db', 'debug', 'decimal', 'declaration', 'declare', 'def', 'default', 'define', 'defined', 'definition', 'delegate', 'delete', 'dependency', 'description', 'design', 'detail', 'detect', 'determine', 'dev', 'developer', 'development',
 'device', 'df', 'dialog', 'dict','dictionary', 'difference', 'different', 'dir', 'directly', 'directory', 'disable', 'display', 'dist', 'div', 'django', 'dll', 'doc', 'document', 'documentation', 'dom', 'domain', 'done', 'double', 'download', 'dp', 'drawable', 'driver', 'due', 'duplicate', 'dynamic', 'dynamically', 'e', 'easy', 'echo', 'eclipse', 'edit', 'effect', 'either', 'element', 'else', 'email', 'empty', 'emulator', 'en', 'enable', 'encoding', 'end', 'engine', 'enough', 'enter','entire', 'entity', 'entry', 'enum', 'env', 'environment', 'equal', 'equivalent', 'err', 'error', 'etc', 'even','event', 'every', 'everything', 'exactly', 'example', 'except', 'exception', 'exe', 'execute', 'execution', 'exist', 'existing', 'exists', 'exit', 'expect','expected', 'explain', 'export', 'express', 'expression', 'extends', 'extension', 'external', 'extra', 'f', 'facebook', 'factory', 'failed', 'fails','false', 'far', 'faster', 'feature', 'feel', 'fetch', 'field', 'figure', 'file', 'filename', 'fill', 'filter', 'final', 'find', 'findviewbyid', 'fine', 'firefox', 'first', 'fix', 'fixed', 'flag', 'float', 'folder', 'following', 'follows', 'font', 'foo', 'force', 'foreach', 'form', 'format', 'found', 'fragment', 'frame', 'framework', 'free', 'full', 'func', 'function', 'functionality', 'g', 'gcc', 'gem', 'general', 'generate',
 'generated', 'generic','get', 'getting', 'git', 'github', 'give', 'given', 'global', 'go', 'going','good', 'google', 'got', 'gradle','graph', 'great', 'group', 'groupid', 'guess', 'h', 'handle', 'handler', 'happens', 'hard', 'hash', 'head', 'header', 'height', 'hello', 'help', 'helper', 'hibernate', 'hidden', 'hide', 'home', 'host','however', 'href', 'html', 'http', 'icon', 'id', 'idea', 'identity', 'ie', 'ignore', 'image', 'imageview', 'img', 'implement', 'implementation',
 'implemented', 'import', 'important', 'include', 'index', 'info', 'information', 'init', 'initialize', 'inline', 'inner', 'input', 'insert', 'inside', 'install', 'installed', 'instance', 'instead', 'int', 'integer', 'intent', 'interface', 'internal', 'invalid', 'invoke', 'io','ip', 'iphone', 'issue', 'item', 'j', 'jar', 'java', 'javascript', 'jdk', 'job', 'join', 'jpg', 'jquery', 'json', 'junit', 'k', 'keep', 'key', 'keyboard', 'kind', 'know', 'l', 'label', 'lambda', 'lang', 'language', 'large', 'last', 'later', 'latest', 'layer', 'layout', 'layoutinflater', 'le','least', 'left', 'length', 'let', 'level', 'li', 'lib', 'library', 'like', 'limit', 'line', 'linearlayout', 'link', 'linq', 'linux', 'list', 'listview', 'little', 'load', 'loaded', 'loading', 'local', 'localhost', 'location', 'lock', 'log', 'logger', 'logging', 'login', 'long', 'look', 'looked','looking', 'loop', 'lot', 'mac', 'machine',
 'made', 'main', 'make', 'making', 'manager', 'manifest', 'manually', 'many', 'map', 'mapping', 'margin', 'master','match', 'math','matter', 'maven', 'max', 'may','maybe', 'mean', 'medium', 'member', 'memory', 'menu', 'merge', 'message', 'meta', 'method', 'microsoft', 'might', 'min', 'missing', 'mm', 'mobile', 'modal', 'mode', 'model', 'module', 'month', 'move', 'much', 'multiple', 'must', 'mvc', 'myapp', 'myclass', 'mysql', 'n', 'name', 'named', 'namespace', 'native', 'navigation', 'need', 'needed', 'net', 'network', 'never', 'new', 'next', 'ng', 'nice', 'nil', 'node', 'non', 'none', 'normal', 'note', 'nothing', 'notification', 'np', 'npm', 'nsstring', 'null', 'num', 'number', 'numpy', 'obj','object', 'objective', 'ok', 'old', 'onclick', 'oncreate', 'one', 'open', 'operation', 'operator', 'option', 'order', 'org', 'orientation', 'origin', 'original', 'output', 'overflow', 'override', 'p', 'package', 'padding', 'page', 'panda', 'param', 'parameter', 'params', 'parent', 'parse', 'part', 'particular', 'passed', 'password', 'path', 'pattern', 'pdf', 'people', 'per', 'perform', 'performance', 'permission', 'person', 'phone', 'php', 'pip', 'place', 'platform', 'play', 'please', 'plot', 'plugin', 'plugins', 'png', 'point', 'pointer', 'port','position', 'possible', 'post', 'practice', 'pretty', 'prevent', 'previous', 'primary', 'print', 'println', 'private', 'probably', 'problem', 'process', 'product', 'profile', 'program', 'programmatically', 'programming', 'project', 'prop', 'properly', 'property', 'protected', 'provide', 'provider', 'ptr', 'public', 'pull', 'purpose', 'push', 'put','px', 'py', 'python', 'q', 'query', 'question', 'queue', 'quite', 'r', 'rail', 'random', 'range', 'rather', 'rb', 'react', 'read', 'reading', 'ready', 'real', 'really', 'reason', 'recently', 'record', 'recyclerview',
 'red', 'ref', 'reference', 'reflect', 'regex', 'related', 'release', 'remote', 'remove', 'render', 'replace', 'repo', 'report', 'repository', 'request', 'require', 'required', 'requirement', 'requires', 'reset', 'resolve', 'resource', 'response','rest', 'result', 'return', 'returned', 'right', 'role', 'root','route', 'row', 'ruby', 'rule','run', 'running', 'runtime', 'rvm','sample', 'save', 'savedinstancestate', 'say', 'schema', 'scope', 'screen', 'script', 'scroll', 'sdk', 'search', 'second', 'section', 'security',
 'see', 'seem', 'seems', 'seen', 'select', 'selected', 'selector', 'self', 'send', 'separate', 'server', 'service', 'servlet', 'session', 'set', 'setting', 'setup', 'several', 'shape', 'share', 'shared', 'short', 'show', 'shown', 'side', 'similar', 'simple', 'simply', 'since', 'single', 'site', 'situation', 'size', 'small', 'socket', 'solution', 'solve', 'someone', 'something', 'sometimes', 'sort', 'source', 'space', 'span', 'specific', 'specified', 'specify', 'split', 'spring', 'springframework', 'sql', 'src', 'ssl', 'stack', 'standard', 'start', 'started','starting', 'state', 'statement', 'static','status', 'std', 'step', 'still', 'stop', 'store', 'stored', 'str', 'stream', 'string', 'struct', 'structure', 'studio', 'stuff', 'style', 'sub', 'submit', 'success', 'suggestion', 'sum', 'sun', 'super', 'support', 'sure', 'svg', 'swift', 'switch', 'symbol', 'symfony', 'syntax', 'system', 'tab', 'table', 'tag', 'take', 'target',
 'task', 'td', 'tell', 'temp', 'template', 'test', 'testing', 'text', 'textview', 'th', 'thank', 'thanks', 'theme', 'thing', 'think', 'though', 'thought', 'thread', 'three', 'throw', 'time', 'timeout', 'timer', 'title', 'tmp', 'token', 'tool', 'top', 'tostring', 'total', 'tr', 'transaction', 'transform', 'tree', 'tried', 'trigger', 'true', 'try', 'trying', 'tuple', 'turn', 'tutorial', 'two', 'txt', 'type', 'u', 'ui', 'ul', 'unable', 'undefined', 'understand', 'understanding', 'unique', 'unit', 'unknown', 'unsigned', 'update', 'updated', 'upload', 'uri', 'url', 'us', 'usage', 'use', 'used', 'useful', 'user', 'username', 'using', 'usr', 'utf', 'util', 'v', 'val', 'valid', 'validation', 'value', 'var', 'varchar', 'variable', 'vector', 'version', 'vertical', 'via', 'video', 'view', 'virtual', 'visible','visual', 'void', 'w', 'wait', 'want', 'wanted', 'warning',
 'way', 'web', 'website', 'well', 'whether', 'white', 'whole', 'widget', 'width', 'win', 'window', 'within', 'without', 'wondering', 'word', 'work', 'worked', 'worker', 'working', 'world', 'would', 'wrap', 'write', 'writeline', 'writing', 'written', 'wrong', 'www', 'x', 'xcode', 'xml', 'xmlns', 'year', 'yes', 'yet','z', 'zero', 'zygoteinit']

In [123]:
len(imported_features)

996

In [120]:
# mots qui ne s'importe pas par csv
words=[]
for i in imported_features:
    if i not in feature_names:
        words.append(i)
print(words)

['alloc', 'allowed', 'area', 'assert', 'auth', 'authentication', 'await', 'bitmap', 'black', 'boot', 'card', 'clean', 'counter', 'crypto', 'decimal', 'determine', 'dict', 'dist', 'emulator', 'entire', 'express', 'feel', 'graph', 'helper', 'identity', 'important', 'initialize', 'inner', 'ip', 'jpg', 'manifest', 'mapping', 'matter', 'modal', 'myapp', 'ng', 'npm', 'objective', 'plugins', 'ready', 'recyclerview', 'requirement', 'ssl', 'svg', 'temp', 'timeout', 'timer', 'trigger', 'tuple', 'varchar', 'win', 'worker']


In [50]:
# Importation des stopwords
list_sw = []
with open('list_stop_words.csv', 'r') as data:
    for line in csv.reader(data):
        list_sw.append(line)

    # Transforme la liste de listes en liste
sw = [item for sublist in list_sw for item in sublist]
stop_words = set(sw)
len(stop_words)

86969

## Traitement des données et du modèle

In [121]:
# Cleaning, tokenizing and lemmatizing
class LemmaTokenizer:
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        token = RegexpTokenizer(r'[a-zA-Z]+')
        return [self.wnl.lemmatize(t) for t in token.tokenize(doc.lower()) if t not in stop_words]
wnl = WordNetLemmatizer()    
tokenizer = lambda x: [wnl.lemmatize(x) for x in token.tokenize(x.lower()) if x not in stop_words]


#Tf-Idf
count = CountVectorizer(tokenizer=LemmaTokenizer(),
                        stop_words=stop_words, analyzer='word')
tfidf = TfidfTransformer()
pipe = make_pipeline(count, tfidf)


pipe.fit(X_df['Body'])
X_features = pipe.transform(X_df['Body'])
X_features

<34805x981 sparse matrix of type '<class 'numpy.float64'>'
	with 1055095 stored elements in Compressed Sparse Row format>

In [122]:
mX_feature =  pd.DataFrame(X_features.toarray(), columns = imported_features)
mx_feature

ValueError: Shape of passed values is (34805, 981), indices imply (34805, 996)

## Entrée de l'utilisateur

In [8]:
# User's title input
title = input("Title: ")

Title: 'Hello, these are 3 examples to show the different steps of the cleaning process.'


In [9]:
# User's body imput
body = input("Body: ")

Body: 'Hello, these are 3 examples to show the


In [10]:
# Jointure du titre et du corps pour créer la question
question = title + " " + body

# Créer un dataframe contenant la question
df_question = pd.DataFrame({'Question': [question]})
df_question.head()

Unnamed: 0,Question
0,"'Hello, these are 3 examples to show the diffe..."


## Affichage du tags

In [87]:
features = pipe.transform(df_question)
train_mx =  pd.DataFrame(features.toarray())
train_mx
#X_scaled = std_scale.transform(train_mx)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,971,972,973,974,975,976,977,978,979,980
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [88]:
features = pipe.transform(df_question)
mx_feature = pd.DataFrame(features.toarray())
std_features = std_scale.transform(mx_feature)
acp=pca.transform(std_features)
predicted_tags = model_final.predict(acp)

ValueError: X has 981 features, but this StandardScaler is expecting 977 features as input.