## Importing Libraries

In [31]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from ast import literal_eval
import pandas as pd
import numpy as np
import re
%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Importing Data

In [32]:
def read_data(filename):
    """Load a tab-separated file and parse its ``tags`` column.

    The ``tags`` column is stored on disk as the text of a Python list
    literal; each cell is converted back into a real list object.

    Parameters
    ----------
    filename : str or path-like
        Location of the TSV file. It must contain a ``tags`` column.

    Returns
    -------
    pandas.DataFrame
        The parsed frame with ``tags`` holding Python objects.
    """
    frame = pd.read_csv(filename, sep='\t')
    # literal_eval only evaluates literals, so it is safe on file contents.
    frame['tags'] = frame['tags'].map(literal_eval)
    return frame

In [33]:
# Train and validation go through read_data so their `tags` column is parsed.
train, validation = (
    read_data(path) for path in ('data/train.tsv', 'data/validation.tsv')
)
# The test file is read as-is, without the tag-parsing step.
test = pd.read_csv('data/test.tsv', sep='\t')

In [34]:
# Preview the first five rows of the training frame.
train.head(5)

Unnamed: 0,title,tags
0,How to draw a stacked dotplot in R?,[r]
1,mysql select all records where a datetime fiel...,"[php, mysql]"
2,How to terminate windows phone 8.1 app,[c#]
3,get current time in a specific country via jquery,"[javascript, jquery]"
4,Configuring Tomcat to Use SSL,[java]


## Separating titles (features) and tags (labels)

In [35]:
# Titles are the model inputs; tags are the multi-label targets.
x_train = train['title'].values
y_train = train['tags'].values

x_validation = validation['title'].values
y_validation = validation['tags'].values

# Only titles are taken from the test frame.
x_test = test['title'].values

In [36]:
# Peek at the first five training titles as read from the file.
x_train[:5]

array(['How to draw a stacked dotplot in R?',
       'mysql select all records where a datetime field is less than a specified value',
       'How to terminate windows phone 8.1 app',
       'get current time in a specific country via jquery',
       'Configuring Tomcat to Use SSL'], dtype=object)

## Preprocessing the data

In [37]:
# Separator-like symbols that are replaced by a single space.
# Raw strings keep the backslash escapes intact without triggering
# Python's "invalid escape sequence" warning.
TO_BE_REPLACED_BY_SPACE = re.compile(r'[/(){}\[\]\|@,;]')
# Everything except digits, lowercase letters, space, '#', '+' and '_' is dropped.
TO_BE_REMOVED = re.compile(r'[^0-9a-z #+_]')
# A set gives constant-time membership checks while filtering tokens.
STOPWORDS = set(stopwords.words('english'))


def text_process(text):
    """Normalise a title into a space-separated string of tokens.

    Steps: lowercase the text, turn separator symbols into spaces, strip
    every remaining character outside ``[0-9a-z #+_]``, split on whitespace
    and drop English stopwords.

    Parameters
    ----------
    text : str
        Raw title.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (may be empty).
    """
    text = text.lower()
    text = TO_BE_REPLACED_BY_SPACE.sub(" ", text)
    text = TO_BE_REMOVED.sub("", text)
    return ' '.join(word for word in text.split() if word not in STOPWORDS)

In [38]:
# Clean every title; each result is a plain Python list of strings.
x_train = list(map(text_process, x_train))
x_validation = list(map(text_process, x_validation))
x_test = list(map(text_process, x_test))

In [39]:
# Same five titles after text_process, to check the cleaning by eye.
x_train[:5]

['draw stacked dotplot r',
 'mysql select records datetime field less specified value',
 'terminate windows phone 81 app',
 'get current time specific country via jquery',
 'configuring tomcat use ssl']

In [43]:
from collections import defaultdict

# Number of occurrences of every token across the cleaned training titles.
words_count = defaultdict(int)
for word in (token for title in x_train for token in title.split()):
    words_count[word] += 1

# Number of training examples carrying each tag.
tags_count = defaultdict(int)
for tag in (label for labels in y_train for label in labels):
    tags_count[tag] += 1

### Transforming text to a vector

#### Bag of words method

In [44]:
# Rank tokens by descending frequency. The counter built earlier in the
# notebook is named `words_count`; referring to `words_counts` raises a
# NameError on a fresh kernel.
most_common_words = sorted(words_count.items(), key=lambda x: x[1], reverse=True)[:6000]
# Size of the bag-of-words vocabulary (and length of each feature vector).
DICT_SIZE = 5000
# token -> column index, for the DICT_SIZE most frequent tokens.
WORDS_TO_INDEX = {p[0]: i for i, p in enumerate(most_common_words[:DICT_SIZE])}
# column index -> token (inverse of WORDS_TO_INDEX).
INDEX_TO_WORDS = {index: word for word, index in WORDS_TO_INDEX.items()}
ALL_WORDS = WORDS_TO_INDEX.keys()

def my_bag_of_words(text, words_to_index, dict_size):
    """Count vocabulary tokens of ``text`` into a dense vector.

    Parameters
    ----------
    text : str
        Whitespace-separated tokens.
    words_to_index : dict
        Maps a token to its position in the output vector.
    dict_size : int
        Length of the output vector.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``dict_size``; entry ``k`` is the number of
        times the token mapped to ``k`` appears in ``text``. Tokens missing
        from ``words_to_index`` are ignored.
    """
    counts = np.zeros(dict_size)
    positions = (words_to_index[token] for token in text.split() if token in words_to_index)
    for position in positions:
        counts[position] += 1
    return counts

In [45]:
from scipy import sparse as sp_sparse

In [47]:
def _stack_bag_of_words(texts):
    """Stack the bag-of-words row of every text into one sparse matrix."""
    rows = [sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in texts]
    return sp_sparse.vstack(rows)


x_train_mybag = _stack_bag_of_words(x_train)
x_validation_mybag = _stack_bag_of_words(x_validation)
x_test_mybag = _stack_bag_of_words(x_test)

# Report the shape of each matrix as a sanity check.
print('x_train shape ', x_train_mybag.shape)
print('x_validation shape ', x_validation_mybag.shape)
print('x_test shape ', x_test_mybag.shape)

x_train shape  (100000, 5000)
x_validation shape  (30000, 5000)
x_test shape  (20000, 5000)


#### TF-IDF

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [49]:
def tfidf_features(x_train, x_validation, x_test):
    """Build TF-IDF features for the three corpora.

    The vectorizer is fitted on the training texts only; the validation and
    test texts are transformed with that fitted vectorizer.

    Parameters
    ----------
    x_train, x_validation, x_test : iterable of str
        Preprocessed texts.

    Returns
    -------
    tuple
        ``(x_train, x_validation, x_test, vocabulary)`` where the first three
        are the transformed matrices and ``vocabulary`` is the fitted
        vectorizer's ``vocabulary_`` mapping.
    """
    # Raw string: '\S' in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    # The pattern treats every run of non-whitespace as a token.
    tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.9, ngram_range=(1, 2), token_pattern=r'(\S+)')
    x_train = tfidf_vectorizer.fit_transform(x_train)
    x_validation = tfidf_vectorizer.transform(x_validation)
    x_test = tfidf_vectorizer.transform(x_test)
    return x_train, x_validation, x_test, tfidf_vectorizer.vocabulary_

In [50]:
(x_train_tfidf,
 x_validation_tfidf,
 x_test_tfidf,
 tfidf_vocab) = tfidf_features(x_train, x_validation, x_test)
# Inverse mapping: feature index -> term.
tfidf_reversed_vocab = {index: term for term, index in tfidf_vocab.items()}

In [56]:
# Look up the value stored for the term 'python' in the TF-IDF vocabulary.
tfidf_vocab['python']

12531

In [58]:
# Reverse lookup: which term is stored under index 14587.
tfidf_reversed_vocab[14587]

'size collection'

### MultiLabel classifier

In [59]:
from sklearn.preprocessing import MultiLabelBinarizer

In [61]:
# The label space is fixed to the tags counted in the training set, in
# sorted order, so column k always refers to the same tag.
mlb = MultiLabelBinarizer(classes=sorted(tags_count.keys()))
y_train = mlb.fit_transform(y_train)
# Held-out labels are only transformed with the already-fitted binarizer;
# the encoder must never be re-fitted on validation data.
y_validation = mlb.transform(y_validation)

In [62]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
# NMF is imported from the public package path; the private
# `sklearn.decomposition.nmf` module is not importable in current releases.
from sklearn.decomposition import NMF
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier

In [65]:
def train_classifier(X_train, y_train, C=1.0, penalty='l2'):
    """Fit a one-vs-rest logistic regression on the given features.

    Parameters
    ----------
    X_train : feature matrix passed to ``fit``.
    y_train : label matrix passed to ``fit``.
    C : float, default 1.0
        Forwarded to ``LogisticRegression``.
    penalty : str, default 'l2'
        Forwarded to ``LogisticRegression``.

    Returns
    -------
    OneVsRestClassifier
        The fitted classifier.
    """
    base_estimator = LogisticRegression(
        solver='newton-cg',
        C=C,
        penalty=penalty,
        n_jobs=-1,
    )
    model = OneVsRestClassifier(base_estimator)
    model.fit(X_train, y_train)
    return model

In [66]:
# One model per feature representation, trained on the same labels.
classifier_mybag, classifier_tfidf = (
    train_classifier(features, y_train)
    for features in (x_train_mybag, x_train_tfidf)
)

In [70]:
# Hard label predictions on the validation set for both models.
y_validation_predicted_labels_mybag = classifier_mybag.predict(x_validation_mybag)
y_validation_predicted_labels_tfidf = classifier_tfidf.predict(x_validation_tfidf)

# Raw decision-function scores for the same inputs.
y_validation_predicted_scores_mybag = classifier_mybag.decision_function(x_validation_mybag)
y_validation_predicted_scores_tfidf = classifier_tfidf.decision_function(x_validation_tfidf)

In [71]:
# Map the binary label matrices back to tag names for display.
y_validation_pred_inversed = mlb.inverse_transform(y_validation_predicted_labels_tfidf)
y_validation_inversed = mlb.inverse_transform(y_validation)

example_template = 'Title:\t{}\nTrue labels:\t{}\nPredicted labels:\t{}\n\n'
# Show the first three validation titles with true vs. predicted tags.
for i in range(3):
    true_tags = ','.join(y_validation_inversed[i])
    predicted_tags = ','.join(y_validation_pred_inversed[i])
    print(example_template.format(x_validation[i], true_tags, predicted_tags))

Title:	odbc_exec always fail
True labels:	php,sql
Predicted labels:	


Title:	access base classes variable within child class
True labels:	javascript
Predicted labels:	


Title:	contenttype application json required rails
True labels:	ruby,ruby-on-rails
Predicted labels:	json,ruby-on-rails




## Evaluation

In [72]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

In [74]:
def print_evaluation_scores(y_validation, predicted):
    """Print accuracy, weighted F1 and average precision, one per line.

    Parameters
    ----------
    y_validation : true label matrix.
    predicted : predicted label matrix compared against ``y_validation``.
    """
    # Each metric is computed lazily so it is printed before the next one
    # runs, keeping any warnings interleaved exactly as in a step-by-step run.
    metrics = (
        lambda: accuracy_score(y_validation, predicted),
        lambda: f1_score(y_validation, predicted, average='weighted'),
        lambda: average_precision_score(y_validation, predicted),
    )
    for compute in metrics:
        print(compute())

In [75]:
# Report the same three metrics for each feature representation.
for label, predictions in (
    ('Bag-of-words', y_validation_predicted_labels_mybag),
    ('Tfidf', y_validation_predicted_labels_tfidf),
):
    print(label)
    print_evaluation_scores(y_validation, predictions)

Bag-of-words
0.0


  'precision', 'predicted', average, warn_for)


0.0
0.01950333333333333
Tfidf
0.33393333333333336
0.6142668931088263
0.30181976655232984


## Analyzing Important Features

In [80]:
def print_words_for_tag(classifier, tag, tags_classes, index_to_words, all_words):
    """Print the words with the largest and smallest coefficients for a tag.

    Parameters
    ----------
    classifier : object exposing ``estimators_``, one estimator per tag, each
        with a 2-D ``coef_`` array.
    tag : tag whose estimator is inspected.
    tags_classes : list of tags; ``tags_classes.index(tag)`` selects the estimator.
    index_to_words : mapping from feature index to word.
    all_words : accepted for interface compatibility; not used here.
    """
    print('Tag:\t{}'.format(tag))
    estimator = classifier.estimators_[tags_classes.index(tag)]
    # Feature indices ordered by ascending coefficient (first row of coef_).
    ranked = estimator.coef_.argsort().tolist()[0]
    # Five largest coefficients, listed in ascending order.
    top_positive_words = [index_to_words[idx] for idx in ranked[-5:]]
    # Five smallest coefficients, listed in ascending order.
    top_negative_words = [index_to_words[idx] for idx in ranked[:5]]
    print('Top positive words:\t{}'.format(', '.join(top_positive_words)))
    print('Top negative words:\t{}\n'.format(', '.join(top_negative_words)))

In [81]:
# Inspect the most influential TF-IDF terms for a few representative tags.
for tag in ('c', 'ruby', 'python'):
    print_words_for_tag(classifier_tfidf, tag, mlb.classes, tfidf_reversed_vocab, ALL_WORDS)

Tag:	c
Top positive words:	gcc, printf, scanf, malloc, c
Top negative words:	java, php, python, javascript, c#

Tag:	ruby
Top positive words:	gem, sinatra, nokogiri, rails, ruby
Top negative words:	ruby rails, php, java, python, c#

Tag:	python
Top positive words:	flask, matplotlib, numpy, pandas, python
Top negative words:	php, java, c#, javascript, jquery

