# <center> Feature selection </center>


## Necessary downloads and library imports

In [1]:
# Mount Google Drive so that the project folder under /content/drive can be
# read from and written to by this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Root folder of the project on the mounted Drive. Every data/ and models/ path
# below is built by string concatenation, so the trailing '/' must be kept.
project_path = '/content/drive/My Drive/Colab Notebooks/MATF_ML_project/'

In [3]:
!pip install scikit-multilearn
!pip install ipynb



In [4]:
from ipynb.fs.full.utility import *

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import pickle

import skmultilearn
from skmultilearn.model_selection import iterative_train_test_split

from scipy import sparse

from sklearn import feature_extraction
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## Load data

In [5]:
# Load the preprocessed question table. Despite the .csv suffix the file is
# read with pickle, so it must contain a pickled object.
# The context manager closes the handle even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# this project produced itself.
with open(project_path + "data/data_preprocessed.csv", "rb") as file:
    data = pickle.load(file)

In [6]:
# Load the preprocessed tag lists. Despite the .csv suffix the file is read
# with pickle, so it must contain a pickled object.
# The context manager closes the handle even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# this project produced itself.
with open(project_path + "data/tags_preprocessed.csv", "rb") as file:
    tags_preprocessed = pickle.load(file)

In [7]:
# Preview the first rows to check that the table loaded as expected.
data.head()

Unnamed: 0,Id,Title,Body,Tags
0,1,"[how, check, upload, file, imag, without, mime...","[like, check, upload, file, imag, file, e, .g,...",[php]
1,2,"[how, prevent, firefox, close, press, ctrl-w]","[favorit, editor, vim, regular, use, ctrl-w, e...",[firefox]
2,3,"[r, error, invalid, type, list, variabl]","[import, matlab, file, construct, data, frame,...",[r]
3,4,"[how, replac, special, charact, url]","[probabl, simpl, simpli, cannot, find, answer,...",[c#]
4,5,"[how, modifi, whoi, contact, detail]","[function, modifi, mcontact, file_get_cont, ui...","[php, api]"


In [8]:
# (number of rows, number of columns) of the loaded table.
data.shape

(7628, 4)

In [9]:
# Inputs are the tokenised title and body columns; the target is the tag column.
feature_columns = ['Title', 'Body']
X = data[feature_columns]
y = data['Tags']

In [10]:
# Preview the separately stored tag lists.
tags_preprocessed.head()

0         [php]
1     [firefox]
2           [r]
3          [c#]
4    [php, api]
Name: Tags, dtype: object

## Tag vectorization

In [12]:
def tokenizer_none (x):
    """Identity tokenizer: return the input unchanged.

    Passed to the vectorizers as `tokenizer` because the documents in this
    notebook are already lists of tokens.
    """
    return x
    
def preprocessor_none (x):
    """Identity preprocessor: return the input unchanged.

    Passed to the vectorizers as `preprocessor` so that the already
    preprocessed token lists are handed on without modification.
    """
    return x

In [13]:
# Each tag entry is already a list of tokens, so tokenizer and preprocessor are
# identity functions and the regex token pattern is disabled.
tags_vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=tokenizer_none,
    preprocessor=preprocessor_none,
    token_pattern=None,
)
tags_vectorizer.fit(y)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1),
                preprocessor=<function preprocessor_none at 0x7ff89fb0de60>,
                stop_words=None, strip_accents=None, token_pattern=None,
                tokenizer=<function tokenizer_none at 0x7ff8a1d888c0>,
                vocabulary=None)

In [14]:
# Encode every question's tag list with the fitted tag vocabulary.
y_vectors = tags_vectorizer.transform(y)

In [15]:
# Display the sparse matrix summary (shape, dtype, number of stored elements).
y_vectors

<7628x100 sparse matrix of type '<class 'numpy.int64'>'
	with 12148 stored elements in Compressed Sparse Row format>

In [16]:
# One row per question, one column per entry of the tag vocabulary.
y_vectors.shape

(7628, 100)

## Train-Test split

In [17]:
# Multi-label stratified 80/20 split. Note the return order
# (X_train, y_train, X_test, y_test), which differs from scikit-learn's.
# iterative_train_test_split takes no random_state argument, so NumPy's global
# RNG is seeded first to make the split (and the files saved from it) repeatable.
# NOTE(review): assumes the installed scikit-multilearn draws its random
# choices from np.random — confirm for the version in use.
np.random.seed(42)
X_train, y_train, X_test, y_test = iterative_train_test_split(X.to_numpy(), y_vectors.toarray(),
                                                              test_size = 0.2)

In [18]:
# Shapes of the four arrays produced by the split.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((6053, 2), (6053, 100), (1575, 2), (1575, 100))

## Text Vectorization

For the text vectorization we choose **Tf-Idf** and **Bag-of-Words** vectorizations. 

**How to combine title and body?** The title seems to be an important feature because it is concise and contains the most important information about the question, so we will try to exploit it wisely.

---------

We examined two approaches:

1. **Vectorize -> Concatenate** (My idea)

  Vectorize title (0-1 Bag-of-Words), Vectorize body (Tf-Idf) -> Concatenate vectors

  For the title, word frequency does not seem to matter, only whether a word appears at all (for example, you probably won't repeat "in c#" many times in a question, but this token is very important for predicting the 'c#' tag).
  On the other hand, it is useful to "extract context" from the body because of tags such as 'design' that don't appear explicitly in the text. So we will use Tf-Idf (or some better approach in the future :D)


2. **Concatenate -> Vectorize** (Idea taken from blog <a href="https://medium.datadriveninvestor.com/predicting-tags-for-the-questions-in-stack-overflow-29438367261e">Predicting Tags for the Questions in Stack Overflow</a>)

  Concatenate the body with the title repeated 3 times -> Vectorize the combined text

----------


In [19]:
# Column 0 holds the titles and column 1 the bodies, matching the
# ['Title', 'Body'] order used when X was built.
X_train_titles, X_train_bodies = X_train[:, 0], X_train[:, 1]
X_test_titles, X_test_bodies = X_test[:, 0], X_test[:, 1]

### 1. Vectorize -> Concatenate

#### Title vectorization

In [20]:
# Binary bag-of-words for titles: only the presence of a token is recorded.
# The vocabulary is learned from the training titles only.
title_vectorizer = CountVectorizer(
    analyzer='word',
    binary=True,
    tokenizer=tokenizer_none,
    preprocessor=preprocessor_none,
    token_pattern=None,
    min_df=1,
)
title_vectorizer.fit(X_train_titles)

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1),
                preprocessor=<function preprocessor_none at 0x7ff89fb0de60>,
                stop_words=None, strip_accents=None, token_pattern=None,
                tokenizer=<function tokenizer_none at 0x7ff8a1d888c0>,
                vocabulary=None)

In [21]:
# Apply the vocabulary fitted on the training titles to both splits.
title_vectors_train, title_vectors_test = (
    title_vectorizer.transform(titles)
    for titles in (X_train_titles, X_test_titles)
)

In [22]:
# Both splits must share the same number of columns (the title vocabulary size).
title_vectors_train.shape, title_vectors_test.shape

((6053, 5581), (1575, 5581))

In [23]:
# Sanity check: with binary=True the title matrices should contain only 0 and 1.
# Prints the distinct values and how often each occurs, train first, then test.
for title_matrix in (title_vectors_train, title_vectors_test):
    values_and_counts = np.unique(title_matrix.toarray(), return_counts=True)
    print(values_and_counts)

(array([0, 1]), array([33746225,    35568]))
(array([0, 1]), array([8781555,    8520]))


#### Body vectorization

In [24]:
# Tf-Idf weighting for bodies, fitted on the training bodies only.
# min_df is a float, i.e. a proportion of documents: tokens whose document
# frequency falls below it are left out of the vocabulary.
body_vectorizer = TfidfVectorizer(
    analyzer='word',
    tokenizer=tokenizer_none,
    preprocessor=preprocessor_none,
    token_pattern=None,
    min_df=0.0003,
)
body_vectorizer.fit(X_train_bodies)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=0.0003, ngram_range=(1, 1), norm='l2',
                preprocessor=<function preprocessor_none at 0x7ff89fb0de60>,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern=None,
                tokenizer=<function tokenizer_none at 0x7ff8a1d888c0>,
                use_idf=True, vocabulary=None)

In [25]:
# Apply the vocabulary and idf weights fitted on the training bodies to both splits.
body_vectors_train, body_vectors_test = (
    body_vectorizer.transform(bodies)
    for bodies in (X_train_bodies, X_test_bodies)
)

In [26]:
# Both splits must share the same number of columns (the body vocabulary size).
body_vectors_train.shape, body_vectors_test.shape

((6053, 11819), (1575, 11819))

#### Concatenate vectors

In [27]:
# Place body features first and title features second, side by side.
# The column order must be the same for train and test.
train_blocks = [body_vectors_train, title_vectors_train]
test_blocks = [body_vectors_test, title_vectors_test]
X_train_concat = sparse.hstack(train_blocks)
X_test_concat = sparse.hstack(test_blocks)

In [28]:
# Column count should equal body vocabulary size + title vocabulary size.
X_train_concat.shape, X_test_concat.shape

((6053, 17400), (1575, 17400))

#### Save data

In [29]:
# Persist the "vectorize -> concatenate" features together with their label
# matrices. Every array is converted to COO format before being written.
vect_concat_outputs = [
    ("data/X_train_vect_concat.npz", X_train_concat),
    ("data/y_train_vect_concat.npz", y_train),
    ("data/X_test_vect_concat.npz", X_test_concat),
    ("data/y_test_vect_concat.npz", y_test),
]
for relative_path, matrix in vect_concat_outputs:
    sparse.save_npz(project_path + relative_path, sparse.coo_matrix(matrix))

### 2. Concatenate -> Vectorize

#### Question vectorization

In [30]:
# Build one token sequence per question: the body tokens followed by the title
# tokens repeated three times, so title words weigh more in the Tf-Idf counts.
# NOTE(review): relies on the array elements being Python lists, so that `3 *`
# repeats a list and `+` concatenates two lists — confirm the arrays have
# object dtype.
X_train_question = X_train_bodies + 3 * X_train_titles
X_test_question = X_test_bodies + 3 * X_test_titles

In [31]:
# Tf-Idf over the combined body + repeated-title tokens, fitted on the
# training questions only. min_df is a proportion of documents.
question_vectorizer = TfidfVectorizer(
    analyzer='word',
    tokenizer=tokenizer_none,
    preprocessor=preprocessor_none,
    token_pattern=None,
    min_df=0.0003,
)
question_vectorizer.fit(X_train_question)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=0.0003, ngram_range=(1, 1), norm='l2',
                preprocessor=<function preprocessor_none at 0x7ff89fb0de60>,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern=None,
                tokenizer=<function tokenizer_none at 0x7ff8a1d888c0>,
                use_idf=True, vocabulary=None)

In [32]:
# Apply the vocabulary and idf weights fitted on the training questions to both splits.
question_train_vectors, question_test_vectors = (
    question_vectorizer.transform(questions)
    for questions in (X_train_question, X_test_question)
)

In [33]:
# Both splits must share the same number of columns (the question vocabulary size).
question_train_vectors.shape, question_test_vectors.shape

((6053, 11992), (1575, 11992))

#### Save data

In [34]:
# Persist the "concatenate -> vectorize" features together with their label
# matrices. Every array is converted to COO format before being written.
concat_vect_outputs = [
    ("data/X_train_concat_vect.npz", question_train_vectors),
    ("data/y_train_concat_vect.npz", y_train),
    ("data/X_test_concat_vect.npz", question_test_vectors),
    ("data/y_test_concat_vect.npz", y_test),
]
for relative_path, matrix in concat_vect_outputs:
    sparse.save_npz(project_path + relative_path, sparse.coo_matrix(matrix))

# <center>The Heuristic (Count Tags in text)</center>

Tags such as 'c#', 'java', 'android', file extensions, etc. usually appear in the text. 
So our baseline (heuristic) approach will be counting tag appearances in the text.

In [35]:
# Vectorizer whose vocabulary is the set of known tags. Applied to question
# text later, it counts how often each tag token occurs there.
tag_counter = CountVectorizer(
    analyzer='word',
    tokenizer=tokenizer_none,
    preprocessor=preprocessor_none,
    token_pattern=None,
)
tag_counter.fit(tags_preprocessed)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1),
                preprocessor=<function preprocessor_none at 0x7ff89fb0de60>,
                stop_words=None, strip_accents=None, token_pattern=None,
                tokenizer=<function tokenizer_none at 0x7ff8a1d888c0>,
                vocabulary=None)

In [36]:
# Count, for every question, the occurrences of each known tag token in its
# title and body combined.
# NOTE(review): assumes `+` on these two columns concatenates the per-row
# token lists — confirm.
tags_in_question = tag_counter.transform(X['Title'] + X['Body'])

In [37]:
# For each question, count how many distinct tags occur at least once in its
# text, then report the mean and the maximum over all questions.
question_tag_matrix = tags_in_question.toarray()
tag_counts = [sum(row > 0) for row in question_tag_matrix]
average_tags = sum(tag_counts) / len(tag_counts)
print("Average number of tags predicted from a question: %.3f" % average_tags)
print("Max number of tags predicted from a question:", max(tag_counts))

Average number of tags predicted from a question: 2.183
Max number of tags predicted from a question: 13


In [38]:
# Binarise the counts: a tag is predicted for a question when it occurs at
# least once in the question's text.
predicated = [
    [int(count > 0) for count in row]
    for row in tags_in_question.toarray()
]

In [39]:
# Score the heuristic against the true tag indicators of all questions (the
# heuristic is not trained, so no train/test split is used here).
# NOTE(review): print_metrics is presumably provided by the starred import
# from the utility notebook — confirm.
print_metrics(y_vectors.toarray(), predicated)

Hamming Score ('Accuracy' by Keras):	97.852
Hamming Score (= 1 - Hamming Loss):	97.852
Exact match ratio (Subset Accuracy):	12.389
F1-Score Micro Averaged:		43.101
F1-Score Macro Averaged:		40.638
F1-Score Weighted Average:		48.195
Precision Score Micro Averaged:		37.270
Recall Score Micro Averaged:		51.095


Surprisingly good results!

# Save vectorizers

In [40]:
# Pickle every fitted vectorizer under models/.
# NOTE(review): the vectorizers hold references to tokenizer_none and
# preprocessor_none; pickle stores functions by name, so those functions
# presumably have to be defined wherever these files are loaded — verify.
vectorizers_to_save = {
    'models/vectorizer_tag.pk': tags_vectorizer,
    'models/vectorizer_title.pk': title_vectorizer,
    'models/vectorizer_body.pk': body_vectorizer,
    'models/vectorizer_question.pk': question_vectorizer,
    'models/vectorizer_heuristic_tag_counter.pk': tag_counter,
}
for relative_path, vectorizer in vectorizers_to_save.items():
    with open(project_path + relative_path, 'wb') as out_file:
        pickle.dump(vectorizer, out_file)

# <center> Experiment: Train-Test Split - Stratification problem </center>

In <a href="http://scikit.ml/stratification.html">Multi-label data stratification - Scikit-Multilearn</a> it is illustrated how `skmultilearn.model_selection.iterative_train_test_split()`, which is adapted to the problem of multi-label classification, preserves the ratio of counts of label combinations in train and test sets. Here we check how well it also preserves the count of each individual label, compared with a plain random split.



#### Testing scikit-multilearn method for train-test split (with stratification)

In [41]:
# 50/50 multi-label stratified split used only for this experiment. Note the
# return order (X_train, y_train, X_test, y_test).
# iterative_train_test_split takes no random_state argument, so NumPy's global
# RNG is seeded first to make the experiment repeatable.
# NOTE(review): assumes the installed scikit-multilearn draws its random
# choices from np.random — confirm for the version in use.
np.random.seed(42)
X_train, y_train, X_test, y_test = iterative_train_test_split(X.to_numpy(), y_vectors.toarray(),
                                                              test_size = 0.5)

In [42]:
# Column-wise totals: how many questions carry each tag in each half.
tag_counts_train = y_train.sum(axis=0)
tag_counts_test = y_test.sum(axis=0)

In [43]:
# Per-tag counts in the training half.
print(tag_counts_train)

[ 20 151  33  53  31 259  22  21  19  44 154  51  26  27  92 389 167  21
  16  22  23  20 113  57  31  36  25  27  19  20  40  17  18  16  30  21
  20  23  25  22 140  32  19  24 121  26 150 352 312 252  18  31  17  33
 104  16  38  19 139  38 102  28  40  16  38  18 350 150  25  24  21  43
  71 103  36  19  26  23 128  65  16  27  16  29  17  20  27  37  22  30
  29  25  88  40  17  26  24  57  41  59]


In [44]:
# Per-tag counts in the test half.
print(tag_counts_test)

[ 20 151  32  54  30 259  22  20  19  44 155  52  27  28  92 389 166  20
  16  22  24  20 113  56  32  36  25  28  20  21  40  16  19  16  30  21
  20  22  24  21 140  32  19  25 121  25 150 351 312 252  18  30  16  32
 104  17  37  14 139  37 102  29  40  17  38  17 351 150  25  23  20  43
  70 104  37  19  26  22 128  65  20  27  17  30  17  19  26  37  22  30
  29  24  87  40  18  29  25  57  42  58]


In [45]:
# (train, test) counts side by side for the first 15 tags.
print(list(zip(tag_counts_train, tag_counts_test))[0:15])

[(20, 20), (151, 151), (33, 32), (53, 54), (31, 30), (259, 259), (22, 22), (21, 20), (19, 19), (44, 44), (154, 155), (51, 52), (26, 27), (27, 28), (92, 92)]


In [46]:
# Absolute per-tag gap between the train count and the test count.
count_gaps = tag_counts_train - tag_counts_test
diff = [abs(gap) for gap in count_gaps]

In [47]:
# Largest and mean per-tag gap between the two halves.
largest_gap, mean_gap = max(diff), np.average(diff)
print("Max difference: ", largest_gap)
print("Avg. difference: ", mean_gap)

Max difference:  5
Avg. difference:  0.62


In [48]:
# Largest gap expressed as a percentage of the average per-tag train count.
largest_gap = max(diff)
mean_train_count = np.average(tag_counts_train)
print("%.3f %%" % (100 * largest_gap / mean_train_count))

8.230 %


#### Testing scikit-learn method for train-test split with no stratification



In [49]:
# Plain random 50/50 split for comparison with the stratified one. Note that
# scikit-learn returns (X_train, X_test, y_train, y_test).
# random_state is fixed so that the comparison is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X.to_numpy(), y_vectors.toarray(),
                                                    test_size=0.5, random_state=42)

In [50]:
# Shape of the training label matrix after the random split.
y_train.shape

(3814, 100)

In [51]:
# Column-wise totals: how many questions carry each tag in each half.
tag_counts_train = y_train.sum(axis=0)
tag_counts_test = y_test.sum(axis=0)

In [52]:
# Per-tag counts in the training half of the random split.
print(tag_counts_train)

[ 20 173  32  47  26 253  19  19  22  42 149  53  27  29  89 402 165  21
  16  21  25  22 116  59  30  32  27  33  19  19  42  15  20  13  29  20
  19  24  30  21 148  34  21  28 104  29 150 339 321 250  20  38  21  38
 112  23  39  21 147  41  88  22  35  21  33  15 379 147  30  30  24  52
  69  97  36  19  26  26 117  68  18  17  21  38  21  16  26  35  14  31
  20  22  87  36  17  29  30  60  33  60]


In [53]:
# Per-tag counts in the test half of the random split.
print(tag_counts_test)

[ 20 129  33  60  35 265  25  22  16  46 160  50  26  26  95 376 168  20
  16  23  22  18 110  54  33  40  23  22  20  22  38  18  17  19  31  22
  21  21  19  22 132  30  17  21 138  22 150 364 303 254  16  23  12  27
  96  10  36  12 131  34 116  35  45  12  43  20 322 153  20  17  17  34
  72 110  37  19  26  19 139  62  18  37  12  21  13  23  27  39  30  29
  38  27  88  44  18  26  19  54  50  57]


In [54]:
# (train, test) counts side by side for the first 15 tags.
print(list(zip(tag_counts_train, tag_counts_test))[0:15])

[(20, 20), (173, 129), (32, 33), (47, 60), (26, 35), (253, 265), (19, 25), (19, 22), (22, 16), (42, 46), (149, 160), (53, 50), (27, 26), (29, 26), (89, 95)]


In [55]:
# Absolute per-tag gap between the train count and the test count.
count_gaps = tag_counts_train - tag_counts_test
diff = [abs(gap) for gap in count_gaps]

In [56]:
# Largest and mean per-tag gap between the two halves.
largest_gap, mean_gap = max(diff), np.average(diff)
print("Max difference: ", largest_gap)
print("Avg. difference: ", mean_gap)

Max difference:  57
Avg. difference:  8.56


In [57]:
# Largest gap expressed as a percentage of the average per-tag train count.
largest_gap = max(diff)
mean_train_count = np.average(tag_counts_train)
print("%.3f %%" % (100 * largest_gap / mean_train_count))

93.000 %
