# Model to return stackoverflow tags from a clean text

In [1]:
# import libraries
import pandas as pd
import numpy as np
import itertools
from bs4 import BeautifulSoup
import nltk, re, pprint
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords 
from collections import defaultdict 
from nltk.corpus import wordnet as wn
from itertools import chain
import re
from textblob import TextBlob, Word
import time
from collections import Counter

import spacy 
from spacy.lang.en.stop_words import STOP_WORDS


from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import yake

import pickle as p
import json
from flask import Flask, jsonify
import joblib
from joblib import dump, load

In [2]:
df_base = pd.read_csv('/media/marco/DATA/OC_Machine_learning/section_5/tags_stackoverflow/data-output/stackoverflow_processed_sample.csv', encoding='utf-8')
df_base.head()

df_tags = pd.read_csv('/media/marco/DATA/OC_Machine_learning/section_5/tags_stackoverflow/data-output/stackoverflow_processed_tags.csv', encoding='utf-8') # load the taglist in order to perform a tags selection
df_tags = df_tags.dropna()
tags = df_tags.tag #tags ordered by popularity

Number_tags = 50 # chosen number of most popular tags
popular_tags = tags[:Number_tags].tolist() # get the list of n most popular tags

In [3]:
with open('tagsRF.pkl', 'wb') as pickle_out:
    p.dump(popular_tags, pickle_out)
print(popular_tags)

['javascript', 'python', 'java', 'c#', 'android', 'html', 'git', 'css', 'jquery', 'c++', 'ios', '.net', 'php', 'string', 'sql', 'mysql', 'node.js', 'bash', 'arrays', 'c', 'linux', 'objective-c', 'sql-server', 'ruby', 'swift', 'json', 'shell', 'ruby-on-rails', 'iphone', 'angularjs', 'list', 'windows', 'xcode', 'regex', 'r', 'visual-studio', 'performance', 'asp.net', 'database', 'macos', 'asp.net-mvc', 'eclipse', 'django', 'github', 'datetime', 'angular', 'unix', 'postgresql', 'vim', 'reactjs']


In [4]:
# create model target from tags
one_hot = MultiLabelBinarizer() # encoder for the  tags 
y = df_base['taglist']
y_onehot = one_hot.fit_transform(y.str.split(' ')) 
y_bin = pd.DataFrame(y_onehot, columns=one_hot.classes_ ) # transform it to Pandas object
y_bin = y_bin.filter(items=popular_tags)

In [5]:
tfidfVectorizer = TfidfVectorizer(norm=None,analyzer='word',min_df = 5, max_df = 0.8, ngram_range=(1,2),max_features = 220, use_idf=True)
# TF-IDF matrices
tfidfvect = tfidfVectorizer.fit(df_base['Lemma'])
TF_IDF = tfidfvect.transform(df_base['Lemma'])

TF_IDF_dense = TF_IDF.todense()

In [6]:
joblib.dump(tfidfvect, 'vectorizer.pkl')

['vectorizer.pkl']

In [7]:
# 80/20 split TF-IDF
X_train, X_test,X_tfidf_train, X_tfidf_test, y_train, y_test, y_train_bin, y_test_bin = train_test_split(df_base['Lemma'],TF_IDF_dense, y, y_bin,  test_size=0.2,train_size=0.8, random_state=0)

In [8]:
#rf = RandomForestClassifier(max_depth=30000, random_state=42, n_jobs = -1, n_estimators=100)#, parameters optimized to balance consumption and accuracy
lr = LogisticRegression()
clf = MultiOutputClassifier(lr)
lr_clf = clf.fit(X_tfidf_train, y_train_bin)

In [9]:
joblib.dump(lr_clf , 'model.pkl')

['model.pkl']

In [10]:
def preprocess_text(text):
    text = text.lower() # lowercase
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub(r"[0-9]", " ", text)
    text = re.sub(r"[?.!/;:']", " ", text)
    text = re.sub(r"[<>\@%*=]", " ", text)
    text = re.sub(r"[\ |\]|\[|\|\/|\#|\:]", " ", text)
    text = re.sub(r"\'\n", " ", text) #line breaks
    text = re.sub(r"\'\xa0", " ", text) # xa0 Unicode representing spaces
    text = re.sub('\s+', ' ', text) # one or more whitespace characters
    text = text.strip(' ') # spaces
    list_tokens = word_tokenize(text)
    return list_tokens



In [11]:
query = 'peppio python java function'
tokens = preprocess_text(query)
tfidfVectorizer = load("vectorizer.pkl")

def vectorize_query(tokens):
    vectorized_query = tfidfVectorizer.transform(tokens).todense()
    return vectorized_query
vectorize_query(tokens)
print(tokens[:10])

['peppio', 'python', 'java', 'function']


In [12]:
#def vectorize_query(tokens):
#    vectorized_query = tfidfVectorizer.transform(tokens).todense()
#    return vectorized_query

In [14]:
def predict_tags(vectors):
    y_preds= lr_clf.predict(vectorized_query)
    popular_tags = ['javascript', 'python', 'java', 'c#', 'android', 'html', 'git', 'css', 'jquery', 'c++', 'ios', '.net', 'php', 'string', 'sql', 'mysql', 'node.js', 'bash', 'arrays', 'c', 'linux', 'objective-c', 'sql-server', 'ruby', 'swift', 'json', 'shell', 'ruby-on-rails', 'iphone', 'angularjs', 'list', 'windows', 'xcode', 'regex', 'r', 'visual-studio', 'performance', 'asp.net', 'database', 'macos', 'asp.net-mvc', 'eclipse', 'django', 'github', 'datetime', 'angular', 'unix', 'postgresql', 'vim', 'reactjs', 'pandas', 'algorithm', 'multithreading', 'date', 'dictionary', 'http', 'file', 'tsql', 'cocoa-touch', 'docker', 'android-studio', 'typescript', 'unit-testing', 'command-line', 'twitter-bootstrap', 'oop', 'google-chrome', 'version-control', 'xml', 'syntax', 'debugging', 'intellij-idea', 'dataframe', 'c++11', 'ajax', 'linq', 'python-3.x', 'rest', 'wpf', 'spring', 'npm', 'language-agnostic', 'exception', 'numpy', 'mongodb', 'function', 'sorting', 'object', 'collections', 'go', 'generics', 'scala', 'class', 'forms', 'android-layout', 'maven', 'security', 'gradle', 'url', 'image', 'types', 'java-8', 'dom', 'svn', 'entity-framework', 'variables', 'matplotlib', 'sql-server-2008', 'logging', 'express']
    df_probs = pd.DataFrame(y_preds, columns= popular_tags).T
    df_probs["probability"] = df_probs.sum(axis=1)
    df_probs.reset_index(inplace=True)
    
    df_probs = df_probs.sort_values(by='probability', ascending=False)
    tags = df_probs['index'][:5].tolist()
    return tags

In [15]:
vectorized_query =  tfidfVectorizer.transform(tokens).todense()

In [16]:
y_preds= rf_clf.predict(vectorized_query)

In [18]:

df_probs = pd.DataFrame(y_preds, columns= popular_tags).T
#df_probs.loc[:, 'probability'] = df_probs[0].map(lambda x: x[1]) # get out only positive probability
df_probs["probability"] = df_probs.sum(axis=1)
df_probs.reset_index(inplace=True)
print(df_probs.shape)
df_probs = df_probs.sort_values(by='probability', ascending=False)
tags = df_probs['index'][:5].tolist()
print(tags)
df_probs.head()

(50, 6)
['python', 'java', 'c++', 'database', 'iphone']


Unnamed: 0,index,0,1,2,3,probability
1,python,0,1,0,0,1
2,java,0,0,1,0,1
9,c++,0,0,0,1,1
38,database,0,0,0,0,0
28,iphone,0,0,0,0,0


In [19]:
prob_preds = rf_clf.predict_proba(vectorized_query)

df_probs = pd.DataFrame.from_records(prob_preds, index= popular_tags)
df_probs.loc[:, 'probability'] = df_probs[0].map(lambda x: x[1]) # get out only positive probability

df_probs.reset_index(inplace=True)
print(df_probs.shape)
df_probs

ValueError: Shape of passed values is (50, 4), indices imply (110, 4)

In [None]:
df_probs = df_probs.sort_values(by='probability', ascending=False)
tags = df_probs['index'][:5].tolist()
print(tags)
df_probs