# Text Preprocessing

In [98]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/bnitish/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [99]:
from ast import literal_eval
import pandas as pd
import numpy as np

In [100]:
def read_data(filename):
    data = pd.read_csv(filename,sep='\t')
    data['tags'] = data['tags'].apply(literal_eval)
    return data

In [101]:
train = read_data('data/train.tsv')
validation = read_data('data/validation.tsv')
test = pd.read_csv('data/test.tsv',sep='\t')

In [102]:
train.head()

Unnamed: 0,title,tags
0,How to draw a stacked dotplot in R?,[r]
1,mysql select all records where a datetime fiel...,"[php, mysql]"
2,How to terminate windows phone 8.1 app,[c#]
3,get current time in a specific country via jquery,"[javascript, jquery]"
4,Configuring Tomcat to Use SSL,[java]


In [103]:
x_train, y_train = train['title'].values, train['tags'].values
x_val, y_val = validation['title'].values, validation['tags'].values
x_test = test['title'].values

In [104]:
x_train

array(['How to draw a stacked dotplot in R?',
       'mysql select all records where a datetime field is less than a specified value',
       'How to terminate windows phone 8.1 app', ...,
       'Python Pandas Series of Datetimes to Seconds Since the Epoch',
       'jqGrid issue grouping - Duplicate rows get appended every time sort is changed',
       'Create a List of primitive int?'], dtype=object)

In [105]:
import re
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

In [106]:
def text_prepare(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub("", text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub("", text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join([word for word in text.split() if word not in STOPWORDS]) # delete stopwords from text
    return text

In [107]:
x_train = [text_prepare(x) for x in x_train]
x_val = [text_prepare(x) for x in x_val]
x_test = [text_prepare(x) for x in x_test]

In [108]:
x_train[:5]

['draw stacked dotplot r',
 'mysql select records datetime field less specified value',
 'terminate windows phone 81 app',
 'get current time specific country via jquery',
 'configuring tomcat use ssl']

In [121]:
# Dictionary of all words from train corpus with their counts.
from collections import Counter
words_counts = dict(Counter(' '.join([x for x in x_train]).split()))
len(words_counts)

34197

In [119]:
# Dictionary of all tags from train corpus with their counts.
tags_counts = dict(Counter(' '.join([''.join(t) for t in [' '.join(y) for y in y_train]]).split()))
tags_counts

{'.net': 3872,
 'ajax': 1767,
 'algorithm': 419,
 'android': 2818,
 'angularjs': 1353,
 'apache': 441,
 'arrays': 2277,
 'asp.net': 3939,
 'asp.net-mvc': 1244,
 'c': 3119,
 'c#': 19077,
 'c++': 6469,
 'class': 509,
 'cocoa-touch': 507,
 'codeigniter': 786,
 'css': 1769,
 'csv': 435,
 'database': 740,
 'date': 560,
 'datetime': 557,
 'django': 1835,
 'dom': 400,
 'eclipse': 992,
 'entity-framework': 649,
 'excel': 443,
 'facebook': 508,
 'file': 582,
 'forms': 872,
 'function': 487,
 'generics': 420,
 'google-maps': 408,
 'hibernate': 807,
 'html': 4668,
 'html5': 842,
 'image': 672,
 'ios': 3256,
 'iphone': 1909,
 'java': 18661,
 'javascript': 19078,
 'jquery': 7510,
 'json': 2026,
 'jsp': 680,
 'laravel': 525,
 'linq': 964,
 'linux': 793,
 'list': 693,
 'loops': 389,
 'maven': 432,
 'mongodb': 350,
 'multithreading': 1118,
 'mysql': 3092,
 'node.js': 771,
 'numpy': 502,
 'objective-c': 4338,
 'oop': 425,
 'opencv': 401,
 'osx': 490,
 'pandas': 479,
 'parsing': 403,
 'performance': 512

# Transforming text to a vector

In [138]:
sorted_keys = sorted(words_counts, key = words_counts.get, reverse = True)
for s in sorted_keys[:5000]:
    print s, words_counts[s]

using 8274
php 5422
java 5397
file 5022
javascript 4546
error 4359
get 4266
c# 4090
python 4008
string 3584
array 3500
data 3284
value 3135
jquery 3124
function 2871
object 2614
class 2516
use 2416
c++ 2143
method 2109
list 2104
multiple 2087
text 1881
page 1836
image 1824
form 1769
rails 1740
create 1738
type 1670
code 1650
html 1635
working 1633
database 1624
set 1619
add 1584
convert 1583
json 1533
variable 1528
values 1515
server 1499
android 1487
mysql 1486
aspnet 1472
way 1450
application 1433
change 1429
c 1409
button 1380
find 1370
one 1367
without 1367
django 1352
windows 1329
cannot 1313
table 1263
call 1254
files 1234
check 1213
ajax 1197
time 1174
two 1169
getting 1158
name 1153
ruby 1130
web 1126
make 1126
another 1115
app 1108
user 1102
xml 1100
input 1099
element 1094
custom 1093
date 1088
event 1084
api 1073
access 1072
url 1045
number 1043
spring 1006
sql 990
view 989
mvc 974
return 974
google 969
query 966
select 964
inside 946
exception 937
column 937
loop 937
displa

integration 93
increment 92
buffer 92
audio 92
letters 92
whole 92
modules 92
mock 92
combine 92
break 92
closed 92
lock 91
scripts 91
filtering 91
letter 91
crash 91
uploaded 90
substring 90
username 90
ant 90
bit 90
winform 90
poi 90
repository 90
2008 90
following 89
extra 89
series 89
identifier 89
bundle 89
iis 89
leak 89
mvvm 88
isnt 88
heap 88
entry 88
ftp 88
browsers 88
amazon 88
blob 88
camera 87
void 87
express 87
shows 87
communication 87
pip 87
safari 87
smtp 87
receive 87
dependencies 86
primary 86
success 86
solution 86
rake 86
around 86
extracting 86
macro 86
failing 85
invoke 85
ie8 85
tell 85
listener 85
failure 85
free 85
tableview 85
generator 84
depending 84
beautifulsoup 84
secure 84
storage 84
controllers 84
cross 84
negative 84
generics 84
solve 84
unity 84
provider 84
marker 84
localhost 84
deserialize 84
downloading 83
self 83
cocoa 83
recursion 83
require 83
directly 83
tutorial 83
divs 83
area 83
datasource 83
exe 82
requires 82
iterator 82
understanding 82
2

16 41
switching 41
deserialization 41
matlab 41
multiline 41
stdout 41
highlighting 41
init 41
formatted 41
provide 40
associative 40
constructors 40
whitespace 40
ee 40
completion 40
reactjs 40
querystring 40
understand 40
putting 40
includes 40
footer 40
phps 40
cart 40
subprocess 40
observable 40
serializing 40
enabled 40
scrollbar 40
spreadsheet 40
attachment 40
hangs 40
longer 40
arrow 39
swipe 39
outlook 39
cloud 39
simulator 39
ifelse 39
promise 39
wxpython 39
setinterval 39
positive 39
oop 39
resultset 39
fired 39
finish 39
slash 39
fullcalendar 39
examples 39
latest 39
cors 39
databinding 39
jbutton 39
triggered 39
initializer 39
mono 39
look 39
databases 39
implemented 39
managed 39
preview 39
arbitrary 39
seems 39
disk 39
capistrano 39
emberjs 39
longitude 39
simplexml 39
along 39
invoking 39
currently 39
querying 39
linear 39
detected 39
defining 39
ejb 39
screenshot 39
cordova 38
party 38
third 38
needs 38
shape 38
nsdate 38
nsdata 38
datatype 38
sure 38
meaning 38
compose

javasqlsqlexception 23
embedding 23
menus 23
fileupload 23
mat 23
lion 23
graphs 23
functional 23
phantomjs 23
thymeleaf 23
handshake 23
incomplete 23
maintaining 23
appropriate 23
unused 23
markdown 23
h 23
configured 23
schedule 23
nth 23
scrollview 23
rails3 23
responsive 23
attached 23
rounded 23
alphabetically 23
sleep 23
containers 23
trim 23
english 23
locked 22
intent 22
entering 22
locations 22
clock 22
ienumerablet 22
jsonobject 22
versa 22
forcing 22
producing 22
friendly 22
consolelog 22
iterators 22
supplied 22
subclassing 22
gps 22
solr 22
sizes 22
collect 22
unchecked 22
messagebox 22
x64 22
linqtosql 22
igniter 22
temporarily 22
cs 22
guid 22
ratio 22
fully 22
gd 22
draggable 22
websockets 22
margin 22
deadlock 22
retrieved 22
mq 22
concatenating 22
improve 22
wall 22
rate 22
injecting 22
lower 22
fragments 22
agility 22
foundation 22
mkmapview 22
launching 22
renaming 22
winrt 22
imagemagick 22
people 22
uninstall 22
docker 22
prime 22
18 22
forbidden 22
memcache 22
gu

operands 14
geocoding 14
nsinvalidargumentexception 14
curly 14
overhead 14
resized 14
beyond 14
hive 14
quality 14
todays 14
tips 14
restarting 14
threw 14
profiling 14
yield 14
caller 14
jmeter 14
cycle 14
compression 14
ignores 14
formatter 14
subdomains 14
hanging 13
rabbitmq 13
subplots 13
uiimagepickercontroller 13
inconsistent 13
preference 13
paper 13
indexhtml 13
placement 13
integrity 13
techniques 13
cons 13
opencl 13
requiring 13
systems 13
culture 13
density 13
overlap 13
multipartformdata 13
nonblocking 13
flags 13
associate 13
salesforce 13
openshift 13
expensive 13
dbmigrate 13
workflow 13
neural 13
gmt 13
numerical 13
connectionstring 13
intended 13
nspredicate 13
argumenterror 13
ssrs 13
transactional 13
jasperreports 13
activex 13
separately 13
retry 13
apk 13
posix 13
combined 13
__init__ 13
taskbar 13
osgi 13
mechanism 13
edition 13
volley 13
highlighted 13
transport 13
textures 13
aka 13
sends 13
rectangles 13
anyobject 13
digital 13
mp4 13
phphtml 13
timeline 13


pythondjango 9
args 9
conform 9
recv 9
rect 9
webapplication 9
minimal 9
jdbctemplate 9
behaving 9
ioerror 9
perfectly 9
scrolls 9
comparable 9
warn 9
instanceof 9
neo4j 9
suggestion 9
talk 9
isolate 9
rating 9
secondary 9
beforeafter 9
reserved 9
objectid 9
stdlist 9
inherits 9
autoscroll 9
46 9
leads 9
datarows 9
ios8 9
nsnumber 9
textfields 9
iterations 9
outofmemory 9
dist 9
selectedindex 9
reconnect 9
71 9
latin 9
reject 9
partialview 9
move_uploaded_file 9
tablet 9
reversed 9
godaddy 9
mapper 9
suspend 9
busy 9
prefer 9
subqueries 9
libs 9
ddl 9
consider 9
whitespaces 9
lifecycle 9
blog 9
monthly 9
purposes 9
pieces 9
backcolor 9
checkedlistbox 9
stringreplace 9
sscanf 9
monday 9
javalangunsatisfiedlinkerror 9
lag 9
phase 9
engines 9
mistake 9
reusable 9
xyz 9
ngif 9
recover 9
vb6 9
undo 9
playlist 9
networking 9
onblur 9
technique 9
cleanup 9
stable 9
yesterday 9
addremove 9
follows 9
affects 9
eliminate 9
getters 9
subscription 9
rearrange 9
size_t 9
mysql2 9
motion 9
fetched 9

In [145]:
DICT_SIZE = 5000
WORDS_TO_INDEX = {}
i = 0
for k in sorted_keys[:DICT_SIZE]:
    WORDS_TO_INDEX[k]=i
    i=i+1
    
WORDS_TO_INDEX

{'stock': 4210,
 'limited': 2574,
 'practise': 4513,
 'simplexmlelement': 4251,
 'todays': 3515,
 'anywhere': 4356,
 'raphaeljs': 3742,
 'dynamic': 155,
 'extends': 2201,
 'four': 3030,
 'prices': 4974,
 'prefix': 1615,
 'nsdate': 1898,
 'sleep': 2673,
 'nsdata': 1899,
 'semantic': 4831,
 'saved': 1606,
 'findall': 3724,
 'localized': 3289,
 'relationships': 1926,
 'looking': 1372,
 'typeerror': 345,
 'const': 552,
 'calculate': 551,
 'couple': 4811,
 'gcd': 4767,
 'tweet': 3740,
 'pager': 4719,
 'mpmovieplayercontroller': 3228,
 'imap': 2525,
 'entityframework': 2802,
 'initialise': 4605,
 'matlab': 1819,
 'mouseout': 4587,
 'mysql_connect': 3912,
 'spec': 3251,
 'placed': 3297,
 'preflight': 4853,
 'updated': 1343,
 'onsubmit': 4062,
 'fieldset': 4900,
 'flushing': 4867,
 'filenotfoundexception': 4298,
 'htmlagilitypack': 2471,
 'void': 1048,
 'maximized': 4265,
 'every': 417,
 'updates': 1719,
 'namespaces': 1815,
 'affect': 3913,
 'servlet': 415,
 'webinf': 4285,
 'preparedstatemen

In [160]:
def my_bag_of_words(text, words_to_index, dict_size):
    """
        text: a string
        dict_size: size of the dictionary
        
        return a vector which is a bag-of-words representation of 'text'
    """
    result_vector = np.zeros(dict_size)
    for word in text.split():
        if word in words_to_index:
            result_vector[words_to_index[word]]=1
    return result_vector

In [163]:
my_bag_of_words('mysql select all using records',WORDS_TO_INDEX,DICT_SIZE)

array([1., 0., 0., ..., 0., 0., 0.])

# Create Sparse Matrix

In [166]:
from scipy import sparse as sp_sparse

In [171]:
x_train_my_bag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text,WORDS_TO_INDEX,DICT_SIZE)) for text in x_train])
x_val_my_bag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text,WORDS_TO_INDEX,DICT_SIZE)) for text in x_val])
x_test_my_bag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text,WORDS_TO_INDEX,DICT_SIZE)) for text in x_test])

print('x_train shape ', x_train_my_bag.shape)
print('x_val shape ', x_val_my_bag.shape)
print('x_test shape ', x_test_my_bag.shape)

('x_train shape ', (100000, 5000))
('x_val shape ', (30000, 5000))
('x_test shape ', (20000, 5000))


# TF-IDF

In [172]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [193]:
def tfidf_features(x_train, x_val, x_test):
    """
        x_train, x_val, x_test — samples        
        return TF-IDF vectorized representation of each sample and vocabulary
    """
    # Create TF-IDF vectorizer with a proper parameters choice
    # Fit the vectorizer on the train set
    # Transform the train, test, and val sets and return the result
    
    
    tfidf_vectorizer = TfidfVectorizer(lowercase=True,min_df=5,sublinear_tf=True, token_pattern='(\S+)')
    x_train = tfidf_vectorizer.fit_transform(x_train)
    x_val = tfidf_vectorizer.fit_transform(x_val)
    x_test = tfidf_vectorizer.fit_transform(x_test)
    
    return x_train, x_val, x_test, tfidf_vectorizer.vocabulary_

In [194]:
x_train_tfidf, x_val_tfidf, x_test_tfidf, tfidf_vocab = tfidf_features(x_train, x_val, x_test)
tfidf_reversed_vocab = {i:word for word,i in tfidf_vocab.items()}

In [195]:
len(tfidf_vocab)
tfidf_reversed_vocab

{0: u'+',
 1: u'0',
 2: u'00',
 3: u'1',
 4: u'10',
 5: u'100',
 6: u'11',
 7: u'12',
 8: u'14',
 9: u'15',
 10: u'16',
 11: u'17',
 12: u'1d',
 13: u'2',
 14: u'20',
 15: u'2005',
 16: u'2008',
 17: u'2010',
 18: u'2012',
 19: u'2013',
 20: u'2015',
 21: u'21',
 22: u'22',
 23: u'23',
 24: u'25',
 25: u'27',
 26: u'2d',
 27: u'3',
 28: u'30',
 29: u'31',
 30: u'32',
 31: u'32bit',
 32: u'35',
 33: u'3d',
 34: u'4',
 35: u'40',
 36: u'400',
 37: u'403',
 38: u'404',
 39: u'41',
 40: u'42',
 41: u'45',
 42: u'5',
 43: u'50',
 44: u'500',
 45: u'51',
 46: u'52',
 47: u'6',
 48: u'60',
 49: u'64',
 50: u'64bit',
 51: u'7',
 52: u'8',
 53: u'81',
 54: u'9',
 55: u'_get',
 56: u'_post',
 57: u'able',
 58: u'absolute',
 59: u'abstract',
 60: u'accept',
 61: u'accepting',
 62: u'access',
 63: u'accesscontrolalloworigin',
 64: u'accessed',
 65: u'accessible',
 66: u'accessing',
 67: u'according',
 68: u'accordion',
 69: u'account',
 70: u'accounts',
 71: u'across',
 72: u'action',
 73: u'actio

In [196]:
x_train_tfidf

<100000x6292 sparse matrix of type '<type 'numpy.float64'>'
	with 512593 stored elements in Compressed Sparse Row format>

# MultiLabel Classifier

In [199]:
from sklearn.preprocessing import MultiLabelBinarizer

In [200]:
mlb = MultiLabelBinarizer(classes=sorted(tags_counts.keys()))
y_train = mlb.fit_transform(y_train)
y_val = mlb.fit_transform(y_val)

In [206]:
y_train[1]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [209]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier

In [210]:
def train_classifier(X_train, y_train):
    """
      X_train, y_train — training data
      
      return: trained classifier
    """
    
    # Create and fit LogisticRegression wraped into OneVsRestClassifier.

    base_clf = LogisticRegression()
    clf = OneVsRestClassifier(base_clf).fit(X_train, y_train)
    
    return clf


In [211]:
classifier_my_bag = train_classifier(x_train_my_bag,y_train)
classifier_tfidf = train_classifier(x_train_tfidf,y_train)

In [214]:
y_val_predicted_labels_my_bag = classifier_my_bag.predict(x_val_my_bag)
y_val_predicted_scores_my_bag = classifier_my_bag.decision_function(x_val_my_bag)

array([[-3.52187611, -5.3387789 , -5.92904112, ..., -6.60457381,
        -4.17927727, -5.96514857],
       [-3.10104852, -7.0994201 , -9.26086622, ..., -6.19537317,
        -4.68062802, -5.26412852],
       [-7.11878246, -4.31386447, -6.59257174, ..., -7.34850519,
        -6.50448705, -5.52109163],
       ...,
       [-2.74124079, -4.37320974, -6.53576968, ..., -6.34226145,
        -4.89737186, -6.21848741],
       [-3.34687033, -5.91928759, -6.47733579, ..., -7.33012657,
        -5.02950937, -7.2726027 ],
       [-3.79123789, -4.14067607, -7.13268177, ..., -6.88250609,
        -4.14811252, -4.89146118]])

In [218]:
y_val_predicted_labels_my_bag[3]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [220]:
y_val_predicted_scores_my_bag[3]

array([-4.73342083, -5.52428205, -6.22807009, -4.13026915, -6.36824019,
       -6.86993052, -4.75633879, -3.17184798, -3.8605352 , -4.49652795,
       -3.35180774, -3.46930425, -6.12291724, -4.54452504, -3.18563912,
       -6.54066724, -7.5486474 , -5.87821993, -6.93092549, -6.63195928,
       -6.59211856, -7.1462963 , -6.06569358, -5.40381075, -7.67153868,
       -7.19161645, -7.9237423 , -4.92966134, -5.89586618, -6.92684045,
       -5.29930918, -6.66038539, -4.54675849, -4.49173293, -7.34412605,
       -4.93238018, -4.4626199 , -2.68928766, -4.88894683, -4.9064266 ,
       -5.05324084, -4.3101184 , -7.21485234, -6.71294893, -4.81679034,
       -7.03939917, -6.85404657, -6.09524175, -6.42133462, -5.07558422,
       -4.52649099, -6.82372012, -6.01901141, -4.23187848, -6.06763703,
       -6.5985396 , -5.48103096, -8.39191526, -7.67767032, -7.70039398,
       -0.71413808, -6.57397963, -4.1673087 , -4.98288743, -6.02843259,
       -6.40935335, -5.03951444, -6.59558089, -5.50930129,  0.53

In [222]:
y_val_pred_inversed = mlb.inverse_transform(y_val_predicted_labels_my_bag)
y_val_inversed = mlb.inverse_transform(y_val)
for i in range(3):
    print('Title:\t{}\nTrue labels:\t{}\nPredicted labels:\t{}\n\n'.format(
        x_val[i],
        ','.join(y_val_inversed[i]),
        ','.join(y_val_pred_inversed[i])
    ))

Title:	odbc_exec always fail
True labels:	php,sql
Predicted labels:	


Title:	access base classes variable within child class
True labels:	javascript
Predicted labels:	class


Title:	contenttype applicationjson required rails
True labels:	ruby,ruby-on-rails
Predicted labels:	ruby-on-rails




In [None]:
y_val_predicted_labels_tfidf = classifier_tfidf.predict(x_val_tfidf)
y_val_predicted_scores_tfidf = classifier_tfidf.decision_function(x_val_tfidf)

In [None]:
y_val_pred_inversed = mlb.inverse_transform(y_val_predicted_labels_tfidf)
y_val_inversed = mlb.inverse_transform(y_val)
for i in range(3):
    print('Title:\t{}\nTrue labels:\t{}\nPredicted labels:\t{}\n\n'.format(
        x_val[i],
        ','.join(y_val_inversed[i]),
        ','.join(y_val_pred_inversed[i])
    ))

# Evaluation

### Classification Metrics for Evaluation
    Accuracy
    F1-Score
    Area Under ROC-curve
    Area Under Precision-recall curve

In [225]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

In [None]:
def print_evaluation_scores(y_val, predicted):
    
    ######################################
    ######### YOUR CODE HERE #############
    ######################################

In [None]:
print('Bag-of-words')
print_evaluation_scores(y_val, y_val_predicted_labels_mybag)
print('Tfidf')
print_evaluation_scores(y_val, y_val_predicted_labels_tfidf)

In [231]:
from sklearn.metrics import roc_curve
%matplotlib inline

In [232]:
n_classes = len(tags_counts)
roc_curve(y_val, y_val_predicted_scores_my_bag, n_classes)

ValueError: multilabel-indicator format is not supported