In [33]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import joblib
import os
from ast import literal_eval

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.decomposition import NMF, LatentDirichletAllocation

import xgboost as xgb
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multiclass import OneVsRestClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV

import sklearn.metrics as metrics

import warnings

from IPython.display import Markdown, display

def printmd(string):
    display(Markdown(string))


mpl.rcParams["figure.figsize"] = (15, 10)
warnings.filterwarnings('ignore')

# Preprocessing

In [9]:
%time df = pd.read_csv('cleanDfStackOverflow.csv', index_col=0, converters={'Corpus': literal_eval, 'Tags': literal_eval})


CPU times: user 1.06 s, sys: 36.7 ms, total: 1.09 s
Wall time: 1.12 s


In [10]:
X = df['Corpus']
y = df['Tags']

In [11]:
def preprocessing(feature_extraction, X, y):
    # Vectorize X
    vectorizer = feature_extraction(analyzer="word",
                             max_df=.6,
                             min_df=0.005,
                             tokenizer=None,
                             preprocessor=' '.join,
                             stop_words=None,
                             lowercase=False)
    vectorized_data = vectorizer.fit_transform(X)
    feature_names = vectorizer.get_feature_names()
    print("Shape of X: {}".format(vectorized_data.shape))
    
    # Multi label binarize y
    multilabel_binarizer = MultiLabelBinarizer()
    multilabel_binarizer.fit(y)
    y_binarized = multilabel_binarizer.transform(y)
    print("Shape of y: {}".format(y_binarized.shape))
    
    # Create train and test split (30%) => NFM
    X_train, X_test, y_train, y_test = train_test_split(vectorized_data, y_binarized, test_size=0.3, random_state=8)
    print("X_train shape : {}".format(X_train.shape))
    print("X_test shape : {}".format(X_test.shape))
    print("y_train shape : {}".format(y_train.shape))
    print("y_test shape : {}".format(y_test.shape))
    
    return [X_train, X_test, y_train, y_test, vectorized_data, feature_names]


# Unsupervised Models

## LDA = Latent Dirichlet Allocation

LDA can only use raw term counts for LDA because it is a probabilistic graphical model

In [12]:
X_train_tf, X_test_tf, y_train_ft, y_test_tf, tf, tf_feature_names = preprocessing(CountVectorizer, X, y)

Shape of X: (11244, 1023)
Shape of y: (11244, 50)
X_train shape : (7870, 1023)
X_test shape : (3374, 1023)
y_train shape : (7870, 50)
y_test shape : (3374, 50)


In [13]:
%time lda = LatentDirichletAllocation(max_iter=5, learning_method='online', learning_offset=50, random_state=0).fit(tf)


CPU times: user 21.3 s, sys: 64.6 ms, total: 21.4 s
Wall time: 21.3 s


In [14]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

no_top_words = 5

In [15]:
print('LDA :')
display_topics(lda, tf_feature_names, no_top_words)

LDA :
Topic 0:
docker run command python version
Topic 1:
use question time code difference
Topic 2:
use type function value object
Topic 3:
file error try get use
Topic 4:
string request api data native
Topic 5:
use image java like want
Topic 6:
project build error use net
Topic 7:
android app test flutter xcode
Topic 8:
error get js use try
Topic 9:
component angular react use page


## NMF = Non-Negative Matrix Factorization

In [16]:
X_train, X_test, y_train, y_test, tfidf, tfidf_feature_names = preprocessing(TfidfVectorizer, X, y)

Shape of X: (11244, 1023)
Shape of y: (11244, 50)
X_train shape : (7870, 1023)
X_test shape : (3374, 1023)
y_train shape : (7870, 50)
y_test shape : (3374, 50)


In [17]:
%time nmf = NMF(n_components=10, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

CPU times: user 1.58 s, sys: 26.6 ms, total: 1.61 s
Wall time: 852 ms


In [18]:
print("NMF :")
display_topics(nmf, tfidf_feature_names, no_top_words)

NMF :
Topic 0:
use function like value code
Topic 1:
android studio gradle app build
Topic 2:
docker image container compose run
Topic 3:
angular component cli ts module
Topic 4:
error run get try install
Topic 5:
net core asp project framework
Topic 6:
file import project visual json
Topic 7:
react component native render prop
Topic 8:
type typescript object string property
Topic 9:
difference vs use two one


# Supervised Models

In [19]:
def metrics_score(model, df, y_true, y_pred):
    if(df is not None):
        dataframe = df
    else:
        dataframe = pd.DataFrame(index=["Accuracy", "F1", "Jaccard", "Recall", "Precision"],
                               columns=[model])
        
    scores = []
    
    scores.append(metrics.accuracy_score(y_true, y_pred))
    scores.append(metrics.f1_score(y_pred, y_true, average='weighted'))
    scores.append(metrics.jaccard_score(y_true, y_pred, average='weighted'))
    scores.append(metrics.recall_score(y_true, y_pred, average='weighted'))
    scores.append(metrics.precision_score(y_true, y_pred, average='weighted'))
    
    dataframe[model] = scores
    
    return dataframe

In [20]:
def multiLabelModelClassifierPredictions(estimator, params, name, metrics_compare_df):    
    # GridSearchCV
    multi_label_cv = GridSearchCV(estimator=estimator,
                              param_grid=params,
                              n_jobs=-1,
                              cv=5,
                              scoring="f1_weighted",
                              return_train_score = True,
                              refit=True)
    
    # fit the model
    %time multi_label_cv.fit(X_train, y_train)
    
    # Best params
    logit_cv_results = pd.DataFrame.from_dict(multi_label_cv.cv_results_)
    print("-"*50)
    printmd(f'Best params for : **{name}**')
    print("-" * 50)
    logit_best_params = multi_label_cv.best_params_
    print(logit_best_params)
    
    # Results with best params
    logit_cv_results[logit_cv_results["params"] == logit_best_params]
    
    # Predict
    y_test_predicted_labels_tfidf = multi_label_cv.predict(X_test)

    # Inverse transform
    multilabel_binarizer = MultiLabelBinarizer()
    multilabel_binarizer.fit(y)
    y_test_pred_inversed = multilabel_binarizer.inverse_transform(y_test_predicted_labels_tfidf)
    y_test_inversed = multilabel_binarizer.inverse_transform(y_test)

    print("-"*50)
    print("Print 10 first predicted Tags vs true Tags")
    print("-" * 50)
    print("Predicted:", y_test_pred_inversed[0:10])
    print("True:", y_test_inversed[0:10])
    
    # Metrics compare
    df_metrics_compare = metrics_score(name, df=metrics_compare_df, y_true = y_test, y_pred = y_test_predicted_labels_tfidf)
    return df_metrics_compare


## Logistic Regression

In [21]:
param_logit = {"estimator__C": [100, 10, 1.0, 0.1],
               "estimator__penalty": ["l1", "l2"],
               "estimator__dual": [False],
               "estimator__solver": ["liblinear", 'sag']}

df_metrics_compare = multiLabelModelClassifierPredictions(
    OneVsRestClassifier(LogisticRegression()),
    param_logit,
    'Logistic Regression',
    None
)

CPU times: user 5.95 s, sys: 177 ms, total: 6.13 s
Wall time: 2min 9s
--------------------------------------------------


Best params for : **Logistic Regression**

--------------------------------------------------
{'estimator__C': 10, 'estimator__dual': False, 'estimator__penalty': 'l1', 'estimator__solver': 'liblinear'}
--------------------------------------------------
Print 10 first predicted Tags vs true Tags
--------------------------------------------------
Predicted: [('node.js',), ('python',), (), ('angular',), ('c#',), (), (), (), ('css', 'html'), ()]
True: [('javascript',), ('python',), ('javascript',), ('angular',), ('.net', '.net-core', 'c#'), ('angular',), ('java',), ('dart', 'flutter'), ('css', 'html'), ('asp.net-core', 'c#')]


In [22]:
df_metrics_compare

Unnamed: 0,Logistic Regression
Accuracy,0.371962
F1,0.681528
Jaccard,0.52546
Recall,0.620716
Precision,0.727564


## XGBoost

In [23]:
params_XGBoost = {
    'max_depth': range (2, 10, 1),
    'n_estimators': range(60, 220, 40),
    'learning_rate': [0.1, 0.01, 0.05]
}

df_metrics_compare = multiLabelModelClassifierPredictions(
    OneVsRestClassifier(xgb.XGBClassifier(
        objective= 'binary:logistic',
        nthread=4,
        seed=42
    )),
    {},
    'XGBoost Classifier',
    df_metrics_compare
)





CPU times: user 7min 10s, sys: 2.93 s, total: 7min 13s
Wall time: 8min 57s
--------------------------------------------------


Best params for : **XGBoost Classifier**

--------------------------------------------------
{}
--------------------------------------------------
Print 10 first predicted Tags vs true Tags
--------------------------------------------------
Predicted: [('node.js', 'reactjs'), ('python',), (), ('angular',), ('c#',), (), ('java',), (), ('css', 'html'), ()]
True: [('javascript',), ('python',), ('javascript',), ('angular',), ('.net', '.net-core', 'c#'), ('angular',), ('java',), ('dart', 'flutter'), ('css', 'html'), ('asp.net-core', 'c#')]


In [24]:
df_metrics_compare

Unnamed: 0,Logistic Regression,XGBoost Classifier
Accuracy,0.371962,0.415827
F1,0.681528,0.737897
Jaccard,0.52546,0.566339
Recall,0.620716,0.633403
Precision,0.727564,0.797657


## KNeighborsClassifier

In [25]:
param_knn = {"n_neighbors": list(range(1, 11))}

df_metrics_compare = multiLabelModelClassifierPredictions(
    KNeighborsClassifier(),
    param_knn,
    'K Neighbors Classifier',
    df_metrics_compare
)

CPU times: user 279 ms, sys: 86.2 ms, total: 366 ms
Wall time: 5min 31s
--------------------------------------------------


Best params for : **K Neighbors Classifier**

--------------------------------------------------
{'n_neighbors': 5}
--------------------------------------------------
Print 10 first predicted Tags vs true Tags
--------------------------------------------------
Predicted: [(), ('node.js',), (), ('angular',), (), (), (), ('pandas', 'python'), ('css',), ()]
True: [('javascript',), ('python',), ('javascript',), ('angular',), ('.net', '.net-core', 'c#'), ('angular',), ('java',), ('dart', 'flutter'), ('css', 'html'), ('asp.net-core', 'c#')]


In [26]:
df_metrics_compare

Unnamed: 0,Logistic Regression,XGBoost Classifier,K Neighbors Classifier
Accuracy,0.371962,0.415827,0.28364
F1,0.681528,0.737897,0.573383
Jaccard,0.52546,0.566339,0.374841
Recall,0.620716,0.633403,0.426434
Precision,0.727564,0.797657,0.732024


## Random Forest Classifier

In [27]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1000, num = 5)]

# Maximum number of levels in tree
max_depth = list(range(10, 15))
max_depth.append(None)

param_rfc = {'n_estimators': n_estimators,
               'max_depth': max_depth}

df_metrics_compare = multiLabelModelClassifierPredictions(
    RandomForestClassifier(),
    {},
    'Random Forest Classifier',
    df_metrics_compare
)

CPU times: user 45.3 s, sys: 687 ms, total: 46 s
Wall time: 1min 36s
--------------------------------------------------


Best params for : **Random Forest Classifier**

--------------------------------------------------
{}
--------------------------------------------------
Print 10 first predicted Tags vs true Tags
--------------------------------------------------
Predicted: [(), ('python',), (), ('angular',), ('c#',), (), ('java',), (), (), ()]
True: [('javascript',), ('python',), ('javascript',), ('angular',), ('.net', '.net-core', 'c#'), ('angular',), ('java',), ('dart', 'flutter'), ('css', 'html'), ('asp.net-core', 'c#')]


In [28]:
df_metrics_compare

Unnamed: 0,Logistic Regression,XGBoost Classifier,K Neighbors Classifier,Random Forest Classifier
Accuracy,0.371962,0.415827,0.28364,0.399526
F1,0.681528,0.737897,0.573383,0.729671
Jaccard,0.52546,0.566339,0.374841,0.50041
Recall,0.620716,0.633403,0.426434,0.535505
Precision,0.727564,0.797657,0.732024,0.835297


f1 score penalise si une prédiction est fausse (prédiction que sur ne seule classe)
précision => pas mal
accuracy =>  pas ouf

# Export fitted model and preprocessor

#### TF-idf Vectorizer

In [30]:
tfidf_vectorizer = TfidfVectorizer(analyzer="word",
                             max_df=.6,
                             min_df=0.005,
                             tokenizer=None,
                             preprocessor=' '.join,
                             stop_words=None,
                             lowercase=False)
tfidf_vectorizer.fit(X)

TfidfVectorizer(lowercase=False, max_df=0.6, min_df=0.005,
                preprocessor=<built-in method join of str object at 0x7fe803470f30>)

#### Multi label binarizer 

In [31]:
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(y)

MultiLabelBinarizer()

#### Final model

In [32]:
final_model = OneVsRestClassifier(xgb.XGBClassifier(objective= 'binary:logistic', nthread=4, seed=42 ))
%time final_model.fit(X_train, y_train)





CPU times: user 7min 10s, sys: 2.87 s, total: 7min 12s
Wall time: 1min 56s


OneVsRestClassifier(estimator=XGBClassifier(base_score=None, booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None, gamma=None,
                                            gpu_id=None, importance_type='gain',
                                            interaction_constraints=None,
                                            learning_rate=None,
                                            max_delta_step=None, max_depth=None,
                                            min_child_weight=None, missing=nan,
                                            monotone_constraints=None,
                                            n_estimators=100, n_jobs=None,
                                            nthread=4, num_parallel_tree=None,
                                            random_state=None, reg_alpha=None,
                           

#### Export

In [39]:
dirname = os.path.join(os.getcwd(), 'exported_models/')

In [40]:
print(dirname)

/Users/arminarnautovic/Documents/P5_Anautovic_Armin/exported_models/


In [41]:
joblib.dump(tfidf_vectorizer, dirname+'tfidf_vectorizer.pkl')

['/Users/arminarnautovic/Documents/P5_Anautovic_Armin/exported_models/tfidf_vectorizer.pkl']

In [42]:
joblib.dump(final_model, dirname+'xgboost_classifier_model.pkl')
joblib.dump(multilabel_binarizer, dirname+'multilabel_binarizer.pkl')

['/Users/arminarnautovic/Documents/P5_Anautovic_Armin/exported_models/multilabel_binarizer.pkl']

# Tests

In [45]:
from bs4 import BeautifulSoup
import re
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/arminarnautovic/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/arminarnautovic/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [44]:
def preprocess_text(text):
    # remove code tag
    soup = BeautifulSoup(text,"lxml")
    code_to_remove = soup.findAll("code")
    for code in code_to_remove:
        code.replace_with(" ")
    # remove html tag
    txt_without_html_tag = soup.get_text()
    # only letters
    letters_only = re.sub("[^a-zA-Z]", " ", txt_without_html_tag)
    #lowercase
    lowercase = letters_only.lower()
    # Create an instance of RegexpTokenizer for alphanumeric tokens
    tokeniser = RegexpTokenizer(r'\w+')
    # Tokenise string
    tokens = tokeniser.tokenize(lowercase)
    # Create an instance of WordNetLemmatizer
    lemmatiser = WordNetLemmatizer()
    # lemmatise tokens
    lemmas = [lemmatiser.lemmatize(token, pos='v') for token in tokens]
    # Remove stops words
    keywords = [lemma for lemma in lemmas if lemma not in stopwords.words('english')]
    return keywords

In [43]:
title_test = "How to Modify CSS Module Style via Javascript"
body_test = "I'm fairly new to javascript and react, but I'm diving in and creating my first react app. I'm tying to modify the amount of 'gridTemplateRows' that display on my screen via a variable and modify it with a new numbers based on the results from the list (I've hard coded this as a 6 right now), however I'm using CSS Modules and i can't seem to get the grid to change its row count and display properly."

In [46]:
cleaned_title = preprocess_text(title_test)
cleaned_body = preprocess_text(body_test)

In [47]:
corpus_test = cleaned_title + cleaned_body

In [56]:
corpus_test

['modify',
 'css',
 'module',
 'style',
 'via',
 'javascript',
 'fairly',
 'new',
 'javascript',
 'react',
 'dive',
 'create',
 'first',
 'react',
 'app',
 'tie',
 'modify',
 'amount',
 'gridtemplaterows',
 'display',
 'screen',
 'via',
 'variable',
 'modify',
 'new',
 'number',
 'base',
 'result',
 'list',
 'hard',
 'cod',
 'right',
 'however',
 'use',
 'css',
 'modules',
 'seem',
 'get',
 'grid',
 'change',
 'row',
 'count',
 'display',
 'properly']

In [48]:
X_to_test = [corpus_test]

In [57]:
X_to_test

[['modify',
  'css',
  'module',
  'style',
  'via',
  'javascript',
  'fairly',
  'new',
  'javascript',
  'react',
  'dive',
  'create',
  'first',
  'react',
  'app',
  'tie',
  'modify',
  'amount',
  'gridtemplaterows',
  'display',
  'screen',
  'via',
  'variable',
  'modify',
  'new',
  'number',
  'base',
  'result',
  'list',
  'hard',
  'cod',
  'right',
  'however',
  'use',
  'css',
  'modules',
  'seem',
  'get',
  'grid',
  'change',
  'row',
  'count',
  'display',
  'properly']]

In [49]:
# Preprocessing
X_final_test = tfidf_vectorizer.transform(X_to_test)

In [50]:
df_test = pd.DataFrame.sparse.from_spmatrix(X_final_test)
col_map = {v:k for k, v in tfidf_vectorizer.vocabulary_.items()}
for col in df_test.columns:
    df_test.rename(columns={col: col_map[col]}, inplace=True)
df_test

Unnamed: 0,able,accept,access,accomplish,accord,account,achieve,across,action,activity,...,would,wrap,write,wrong,www,xcode,xml,yes,yet,zero
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [51]:
# Prediction
predicted_labels_test = final_model.predict(X_final_test)

In [52]:
# Inverse transform
pred_inversed_test = multilabel_binarizer.inverse_transform(predicted_labels_test)

In [53]:
# Print predicted tags
pred_inversed_test

[('javascript', 'reactjs')]