# <center> Feature selection </center>


## Necessary downloads and library imports

In [None]:
# Mount Google Drive at /content/drive so that the project files stored there
# can be read and written by the cells below (Colab-specific import).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Root folder of the project on the mounted Drive. Data paths are built from it
# by plain string concatenation, so the trailing slash is required.
project_path = '/content/drive/My Drive/Colab Notebooks/MATF_ML_project/'

In [None]:
!pip install scikit-multilearn
!pip install ipynb



In [None]:
from ipynb.fs.full.utility import *

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import pickle

import skmultilearn
from skmultilearn.model_selection import iterative_train_test_split

from scipy import sparse

from sklearn import feature_extraction
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## Load data

In [None]:
# Load the preprocessed questions. Despite the .csv extension the file is read as a pickle.
# The `with` block closes the handle even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open(project_path + "data/data_preprocessed.csv", "rb") as file:
    data = pickle.load(file)

In [None]:
# Load the preprocessed tags. Despite the .csv extension the file is read as a pickle.
# The `with` block closes the handle even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open(project_path + "data/tags_preprocessed.csv", "rb") as file:
    tags_preprocessed = pickle.load(file)

In [None]:
# Preview the first rows to check the loaded columns (Title, Body, Tags).
data.head()

Unnamed: 0,Id,Title,Body,Tags
0,1,"[how, check, upload, file, imag, without, mime...","[like, check, upload, file, imag, file, e, .g,...",[php]
1,2,"[how, prevent, firefox, close, press, ctrl-w]","[favorit, editor, vim, regular, use, ctrl-w, e...",[firefox]
2,3,"[r, error, invalid, type, list, variabl]","[import, matlab, file, construct, data, frame,...",[r]
3,4,"[how, replac, special, charact, url]","[probabl, simpl, simpli, cannot, find, answer,...",[c#]
4,5,"[how, modifi, whoi, contact, detail]","[function, modifi, mcontact, file_get_cont, ui...","[php, api]"


In [None]:
# (rows, columns) of the loaded frame.
data.shape

(7628, 4)

In [None]:
# Inputs are the two text columns; targets are the tag lists of the questions.
X = data[['Title', 'Body']]
y = data['Tags']

In [None]:
# Preview the separately loaded tag series.
tags_preprocessed.head()

0         [php]
1     [firefox]
2           [r]
3          [c#]
4    [php, api]
Name: Tags, dtype: object

## Tag vectorization

In [None]:
# Each entry of `y` is passed through unchanged by the tokenizer and preprocessor
# (both are identity functions), and the default token pattern is switched off.
# Fitting learns one output column per distinct tag.
tags_vectorizer = CountVectorizer(
    analyzer='word',
    token_pattern=None,
    tokenizer=lambda tag_list: tag_list,
    preprocessor=lambda tag_list: tag_list,
)
tags_vectorizer.fit(y)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1),
                preprocessor=<function <lambda> at 0x7f590a689c20>,
                stop_words=None, strip_accents=None, token_pattern=None,
                tokenizer=<function <lambda> at 0x7f590a689b90>,
                vocabulary=None)

In [None]:
# Encode every question's tag list as one row of per-tag counts.
y_vectors = tags_vectorizer.transform(y)

In [None]:
# Show the matrix summary (shape, dtype, number of stored entries).
y_vectors

<7628x100 sparse matrix of type '<class 'numpy.int64'>'
	with 12148 stored elements in Compressed Sparse Row format>

In [None]:
# (number of questions, number of distinct tags)
y_vectors.shape

(7628, 100)

## Text Vectorization

For the text vectorization we choose **Tf-Idf** and **Bag-of-Words** vectorizations. 

**How to combine title and body?** The title seems to be an important feature because it is concise and contains the most important information about the question, so we will try to exploit it wisely.

---------

We examined two approaches:

1. **Vectorize -> Concatenate** (My idea)

  Vectorize title (0-1 Bag-of-Words), Vectorize body (Tf-Idf) -> Concatenate vectors

  In the title, the frequency of a word does not seem to be important, only whether the word appears at all (for example, you probably won't repeat "in c#" many times in a question, but this token is very important for predicting the 'c#' tag).
  On the other hand, it is useful to "extract context" from the body because of tags such as 'design' that don't appear explicitly in the text. So we will use Tf-Idf (or some better approach in the future :D)


2. **Concatenate -> Vectorize** (Idea taken from blog <a href="https://medium.datadriveninvestor.com/predicting-tags-for-the-questions-in-stack-overflow-29438367261e">Predicting Tags for the Questions in Stack Overflow</a>)

  Concatenate the body with the title repeated 3 times -> Vectorize that text

----------


### 1. Vectorize -> Concatenate

#### Title vectorization

In [None]:
# Title entries are passed through unchanged by the tokenizer and preprocessor
# (both are identity functions), and the default token pattern is switched off.
# binary=True records only whether a word occurs in the title, not how often.
title_vectorizer = CountVectorizer(
    analyzer='word',
    binary=True,
    min_df=1,
    token_pattern=None,
    tokenizer=lambda tokens: tokens,
    preprocessor=lambda tokens: tokens,
)
title_vectorizer.fit(X['Title'])

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1),
                preprocessor=<function <lambda> at 0x7f590a636440>,
                stop_words=None, strip_accents=None, token_pattern=None,
                tokenizer=<function <lambda> at 0x7f590a636320>,
                vocabulary=None)

In [None]:
# Binary bag-of-words matrix of the titles.
title_vectors = title_vectorizer.transform(X['Title'])

In [None]:
# (number of questions, size of the title vocabulary)
title_vectors.shape

(7628, 6372)

In [None]:
# Sanity check: list the distinct values in the title matrix and how often each occurs.
# Note: .toarray() materialises the full dense matrix just for this count.
print(np.unique(title_vectors.toarray(), return_counts=True))

(array([0, 1]), array([48560701,    44915]))


#### Body vectorization

In [None]:
# Tf-idf over the body entries, which the identity tokenizer and preprocessor pass
# through unchanged. A float min_df is a fraction of documents, so very rare terms
# are dropped from the vocabulary.
# NOTE(review): the vectorizer is fitted on every row, before any train/test split,
# so the vocabulary and IDF weights also reflect the questions later used for testing.
body_vectorizer = TfidfVectorizer(
    analyzer='word',
    min_df=0.0003,
    token_pattern=None,
    tokenizer=lambda tokens: tokens,
    preprocessor=lambda tokens: tokens,
)
body_vectorizer.fit(X['Body'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=0.0003, ngram_range=(1, 1), norm='l2',
                preprocessor=<function <lambda> at 0x7f590a636d40>,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern=None,
                tokenizer=<function <lambda> at 0x7f590a636cb0>, use_idf=True,
                vocabulary=None)

In [None]:
# Tf-idf weighted matrix of the bodies.
body_vectors = body_vectorizer.transform(X['Body'])

In [None]:
# Show the matrix summary (shape, dtype, number of stored entries).
body_vectors

<7628x8976 sparse matrix of type '<class 'numpy.float64'>'
	with 351670 stored elements in Compressed Sparse Row format>

In [None]:
# (number of questions, size of the body vocabulary)
body_vectors.shape

(7628, 8976)

#### Concatenate vectors

In [None]:
# Place the feature blocks side by side: body columns first, title columns after them.
X_concatenated = sparse.hstack([body_vectors, title_vectors])

In [None]:
# The column count should equal body features + title features.
X_concatenated.shape

(7628, 15348)

#### Train-Test Split (with stratification)



In [None]:
# Stratified multi-label split. Note the return order (X_train, y_train, X_test, y_test),
# which differs from scikit-learn's train_test_split.
# The call takes no seed argument here, so NumPy's global RNG is seeded first to make
# the saved split repeatable across runs.
# NOTE(review): assumes the stratifier draws its random tie-breaks from np.random —
# confirm for the installed scikit-multilearn version.
np.random.seed(42)
X_train, y_train, X_test, y_test = iterative_train_test_split(X_concatenated.toarray(),
                                                              y_vectors.toarray(),
                                                              test_size = 0.2)

In [None]:
# Shapes of the four pieces returned by the split.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((6051, 15348), (6051, 100), (1577, 15348), (1577, 100))

#### Save data

In [None]:
# Persist the four pieces of the "vectorize -> concatenate" split.
# Each dense array is converted to COO sparse format before being written as .npz.
vect_concat_outputs = [
    ("data/X_train_vect_concat.npz", X_train),
    ("data/y_train_vect_concat.npz", y_train),
    ("data/X_test_vect_concat.npz", X_test),
    ("data/y_test_vect_concat.npz", y_test),
]
for relative_path, split_part in vect_concat_outputs:
    sparse.save_npz(project_path + relative_path, sparse.coo_matrix(split_part))

### 2. Concatenate -> Vectorize

#### Question vectorization

In [None]:
# Build one token sequence per question: the body followed by the title repeated three times.
# Assumes both columns hold Python lists of tokens (as the preview of `data` suggests),
# so `3 *` repeats a list and `+` concatenates lists element-wise over the rows.
X_question = X['Body'] + 3 * X['Title']

In [None]:
# Tf-idf over the combined question entries, which the identity tokenizer and
# preprocessor pass through unchanged. A float min_df is a fraction of documents,
# so very rare terms are dropped from the vocabulary.
# NOTE(review): the vectorizer is fitted on every row, before any train/test split,
# so the vocabulary and IDF weights also reflect the questions later used for testing.
question_vectorizer = TfidfVectorizer(
    analyzer='word',
    min_df=0.0003,
    token_pattern=None,
    tokenizer=lambda tokens: tokens,
    preprocessor=lambda tokens: tokens,
)
question_vectorizer.fit(X_question)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=0.0003, ngram_range=(1, 1), norm='l2',
                preprocessor=<function <lambda> at 0x7f590a64a8c0>,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern=None,
                tokenizer=<function <lambda> at 0x7f590a64a7a0>, use_idf=True,
                vocabulary=None)

In [None]:
# Tf-idf weighted matrix of the combined body + repeated-title token lists.
question_vectors = question_vectorizer.transform(X_question)

In [None]:
# (number of questions, size of the combined vocabulary)
question_vectors.shape

(7628, 9110)

#### Train-Test Split (with stratification)



In [None]:
# Stratified multi-label split for the "concatenate -> vectorize" features. Note the return
# order (X_train, y_train, X_test, y_test), which differs from scikit-learn's train_test_split.
# The call takes no seed argument here, so NumPy's global RNG is seeded first to make
# the saved split repeatable across runs (an unseeded run gives a different partition
# every time, which makes results on the saved files hard to compare).
# NOTE(review): assumes the stratifier draws its random tie-breaks from np.random —
# confirm for the installed scikit-multilearn version.
np.random.seed(42)
X_train, y_train, X_test, y_test = iterative_train_test_split(question_vectors.toarray(),
                                                              y_vectors.toarray(),
                                                              test_size = 0.2)

In [None]:
# Shapes of the four pieces returned by the split.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((6062, 9110), (6062, 100), (1566, 9110), (1566, 100))

#### Save data

In [None]:
# Persist the four pieces of the "concatenate -> vectorize" split.
# Each dense array is converted to COO sparse format before being written as .npz.
concat_vect_outputs = [
    ("data/X_train_concat_vect.npz", X_train),
    ("data/y_train_concat_vect.npz", y_train),
    ("data/X_test_concat_vect.npz", X_test),
    ("data/y_test_concat_vect.npz", y_test),
]
for relative_path, split_part in concat_vect_outputs:
    sparse.save_npz(project_path + relative_path, sparse.coo_matrix(split_part))

# <center>The Heuristic (Count Tags in text)</center>

Tags such as 'c#', 'java', 'android', file extensions, etc. usually appear in the text. 
So our baseline (heuristic) approach will be counting tag appearances in the text.

In [None]:
# A count vectorizer whose vocabulary is learned from the tag lists themselves, so that
# transforming a question later counts occurrences of tag names among its tokens.
# Tokenizer and preprocessor are identity functions; the default token pattern is off.
tag_counter = CountVectorizer(
    analyzer='word',
    token_pattern=None,
    tokenizer=lambda tokens: tokens,
    preprocessor=lambda tokens: tokens,
)
tag_counter.fit(tags_preprocessed)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1),
                preprocessor=<function <lambda> at 0x7f590a2e87a0>,
                stop_words=None, strip_accents=None, token_pattern=None,
                tokenizer=<function <lambda> at 0x7f590a2e8710>,
                vocabulary=None)

In [None]:
# Count, per question, how often each known tag name occurs among the title + body tokens.
# Assumes both columns hold token lists, so `+` concatenates them row by row.
tags_in_question = tag_counter.transform(X['Title'] + X['Body'])

In [None]:
# Number of distinct tags whose name occurs at least once in each question.
# Counted directly on the sparse matrix instead of looping in Python over a dense copy.
tag_counts = np.asarray((tags_in_question > 0).sum(axis=1)).ravel()
print("Average number of tags predicted from a question: %.3f" % tag_counts.mean())
print("Max number of tags predicted from a question:", tag_counts.max())

Average number of tags predicted from a question: 2.183
Max number of tags predicted from a question: 13


In [None]:
# Heuristic prediction: a tag is predicted (1) when its name occurs at least once
# in the question, otherwise 0. The result is a list of lists of plain ints.
predicated = (tags_in_question.toarray() > 0).astype(int).tolist()

In [None]:
# Score the heuristic predictions against the true tag matrix.
# NOTE(review): the columns of `predicated` follow tag_counter's vocabulary, while
# y_vectors follows tags_vectorizer's; the comparison assumes both vocabularies are
# identical and ordered the same way — confirm.
# print_metrics is presumably provided by the star import of the utility notebook — verify.
print_metrics(y_vectors.toarray(), predicated)

Hamming Score ('Accuracy' by Keras):	97.852
Hamming Score (= 1 - Hamming Loss):	97.852
Exact match ratio (Subset Accuracy):	12.389
F1-Score Micro Averaged:		43.101
F1-Score Macro Averaged:		40.638
F1-Score Weighted Average:		48.195
Precision Score Micro Averaged:		37.270
Recall Score Micro Averaged:		51.095


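Surprisingly good results! Keep in mind, though, that the Hamming score is dominated by the many absent labels: only 12148 of the 7628 × 100 label slots are positive, so predicting no tags at all would already score about 98.4%. The F1, precision and recall values are therefore the informative ones for this baseline.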

# <center> Experiment: Train-Test Split - Stratification problem </center>

In <a href="http://scikit.ml/stratification.html">Multi-label data stratification - Scikit-Multilearn</a> it is illustrated how `skmultilearn.model_selection.iterative_train_test_split()`, which is adapted to the problem of multi-label classification, preserves the ratio of counts of label combinations in the train and test sets. Here we check how well it also preserves the count of each individual label, compared with a plain random split.



#### Testing scikit-multilearn method for train-test split (with stratification)

In [None]:
# 50/50 stratified multi-label split used for the per-tag count comparison below.
# Note the return order (X_train, y_train, X_test, y_test).
# The call takes no seed argument here, so NumPy's global RNG is seeded first to make
# the reported numbers repeatable across runs.
# NOTE(review): assumes the stratifier draws its random tie-breaks from np.random —
# confirm for the installed scikit-multilearn version.
np.random.seed(42)
X_train, y_train, X_test, y_test = iterative_train_test_split(X_concatenated.toarray(),
                                                              y_vectors.toarray(),
                                                              test_size = 0.5)

In [None]:
# Per-tag totals: sum each label column over the questions of the train / test part.
# An explicit axis-0 reduction replaces the builtin sum(), which added the rows one by
# one in Python and would return the scalar 0 for an empty split.
tag_counts_train = y_train.sum(axis=0)
tag_counts_test = y_test.sum(axis=0)

In [None]:
# Per-tag counts in the train part of the stratified split.
print(tag_counts_train)

[ 20 151  32  54  31 259  22  20  19  44 155  51  26  27  92 389 167  20
  16  22  23  20 113  57  31  36  25  27  20  20  40  17  18  16  30  21
  20  22  24  22 140  32  20  25 121  24 150 351 312 252  18  31  16  33
 104  17  38  15 139  37 102  29  40  16  38  18 350 150  25  24  20  43
  71 103  36  19  26  23 128  65  18  27  16  29  17  19  26  37  22  30
  29  25  87  40  17  27  25  57  41  58]


In [None]:
# Per-tag counts in the test part of the stratified split.
print(tag_counts_test)

[ 20 151  33  53  30 259  22  21  19  44 154  52  27  28  92 389 166  21
  16  22  24  20 113  56  32  36  25  28  19  21  40  16  19  16  30  21
  20  23  25  21 140  32  18  24 121  27 150 352 312 252  18  30  17  32
 104  16  37  18 139  38 102  28  40  17  38  17 351 150  25  23  21  43
  70 104  37  19  26  22 128  65  18  27  17  30  17  20  27  37  22  30
  29  24  88  40  18  28  24  57  42  59]


In [None]:
# (train count, test count) pairs for the first 15 tags.
print(list(zip(tag_counts_train[:15], tag_counts_test[:15])))

[(20, 20), (151, 151), (32, 33), (54, 53), (31, 30), (259, 259), (22, 22), (20, 21), (19, 19), (44, 44), (155, 154), (51, 52), (26, 27), (27, 28), (92, 92)]


In [None]:
# Absolute gap, per tag, between its count in the train set and in the test set.
diff = list(np.abs(tag_counts_train - tag_counts_test))

In [None]:
# Worst-case and mean per-tag gap between the train and test counts.
print("Max difference: ", max(diff))
print("Avg. difference: ", np.average(diff))

Max difference:  3
Avg. difference:  0.58


In [None]:
# Largest per-tag gap expressed as a percentage of the average per-tag count in the train set.
print("%.3f %%" % (100 * max(diff) / np.average(tag_counts_train)))

4.945 %


#### Testing scikit-learn method for train-test split with no stratification



In [None]:
# Plain random 50/50 split without stratification, for comparison.
# Note the scikit-learn return order (X_train, X_test, y_train, y_test).
# A fixed random_state makes the reported numbers repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X_concatenated, y_vectors, test_size = 0.5,
                                                    random_state=42)

In [None]:
# (number of training questions, number of tags)
y_train.shape

(3814, 100)

In [None]:
# Per-tag totals in the train / test part of the random split.
# The label matrices are sparse here, so the columns are summed on the sparse matrix
# and flattened to a 1-D array instead of densifying and adding rows one by one.
tag_counts_train = np.asarray(y_train.sum(axis=0)).ravel()
tag_counts_test = np.asarray(y_test.sum(axis=0)).ravel()

In [None]:
# Per-tag counts in the train part of the random split.
print(tag_counts_train)

[ 18 156  33  51  31 248  19  15  21  39 154  43  19  24  86 409 176  22
  11  25  28  19 124  68  30  40  28  33  19  17  33  17  10  19  33  21
  22  20  26  19 152  35  17  22 124  28 157 351 330 248  19  28  17  30
  94  17  47  19 146  39 101  25  36  20  35  13 338 136  26  22  23  41
  80 109  29  19  30  23 122  57  22  24  14  29  12  20  26  39  27  35
  32  31  90  43  17  31  25  57  47  58]


In [None]:
# Per-tag counts in the test part of the random split.
print(tag_counts_test)

[ 22 146  32  56  30 270  25  26  17  49 155  60  34  31  98 369 157  19
  21  19  19  21 102  45  33  32  22  22  20  24  47  16  27  13  27  21
  18  25  23  24 128  29  21  27 118  23 143 352 294 256  17  33  16  35
 114  16  28  14 132  36 103  32  44  13  41  22 363 164  24  25  18  45
  61  98  44  19  22  22 134  73  14  30  19  30  22  19  27  35  17  25
  26  18  85  37  18  24  24  57  36  59]


In [None]:
# (train count, test count) pairs for the first 15 tags.
print(list(zip(tag_counts_train[:15], tag_counts_test[:15])))

[(18, 22), (156, 146), (33, 32), (51, 56), (31, 30), (248, 270), (19, 25), (15, 26), (21, 17), (39, 49), (154, 155), (43, 60), (19, 34), (24, 31), (86, 98)]


In [None]:
# Absolute gap, per tag, between its count in the train set and in the test set.
diff = list(np.abs(tag_counts_train - tag_counts_test))

In [None]:
# Worst-case and mean per-tag gap between the train and test counts.
print("Max difference: ", max(diff))
print("Avg. difference: ", np.average(diff))

Max difference:  40
Avg. difference:  8.36


In [None]:
# Largest per-tag gap expressed as a percentage of the average per-tag count in the train set.
print("%.3f %%" % (100 * max(diff) / np.average(tag_counts_train)))

65.466 %
