In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.multiclass import OneVsRestClassifier

In [6]:
import ast

In [4]:
df = pd.read_csv('../../Datasets/data/stackoverflow.csv', index_col=0)

In [5]:
df.head()

Unnamed: 0,Text,Tags
2,aspnet site maps has anyone got experience cre...,"['sql', 'asp.net']"
4,adding scripting functionality to net applicat...,"['c#', '.net']"
5,should i use nested classes in this case i am ...,['c++']
6,homegrown consumption of web services i have b...,['.net']
8,automatically update version number i would li...,['c#']


In [7]:
ast.literal_eval(df['Tags'].iloc[0])

['sql', 'asp.net']

In [8]:
df['Tags'] = df['Tags'].apply(lambda x: ast.literal_eval(x))
df.head()

Unnamed: 0,Text,Tags
2,aspnet site maps has anyone got experience cre...,"[sql, asp.net]"
4,adding scripting functionality to net applicat...,"[c#, .net]"
5,should i use nested classes in this case i am ...,[c++]
6,homegrown consumption of web services i have b...,[.net]
8,automatically update version number i would li...,[c#]


In [15]:
y = df['Tags']

In [16]:
y

2          [sql, asp.net]
4              [c#, .net]
5                   [c++]
6                  [.net]
8                    [c#]
                ...      
1262668             [c++]
1262834             [c++]
1262915          [python]
1263065          [python]
1263454             [c++]
Name: Tags, Length: 48976, dtype: object

In [17]:
multilabel = MultiLabelBinarizer()
y = multilabel.fit_transform(df['Tags'])

In [19]:
y.shape

(48976, 20)

In [20]:
multilabel.classes_

array(['.net', 'android', 'asp.net', 'c', 'c#', 'c++', 'css', 'html',
       'ios', 'iphone', 'java', 'javascript', 'jquery', 'mysql',
       'objective-c', 'php', 'python', 'ruby', 'ruby-on-rails', 'sql'],
      dtype=object)

In [22]:
pd.DataFrame(y,columns=multilabel.classes_)

Unnamed: 0,.net,android,asp.net,c,c#,c++,css,html,ios,iphone,java,javascript,jquery,mysql,objective-c,php,python,ruby,ruby-on-rails,sql
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48971,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
48972,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
48973,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
48974,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


# Text Vectorization

\begin{align*}
tfidf(t,d,D) = tf(t,d) \times idf(t,D)
\end{align*}

In [24]:
df.head()

Unnamed: 0,Text,Tags
2,aspnet site maps has anyone got experience cre...,"[sql, asp.net]"
4,adding scripting functionality to net applicat...,"[c#, .net]"
5,should i use nested classes in this case i am ...,[c++]
6,homegrown consumption of web services i have b...,[.net]
8,automatically update version number i would li...,[c#]


TfidfVectorizer principais parametros:
* analyzer: define se as features deveraos ser palavras or n_gram characters. 
* max_features: define um numero máximo de features a serem utilizadas. Define o vocabulario no qual serao utilizados somente os top x termos ordenados em ordem descrescente de frequencia ( maior -> menor)
* n_gram_range: limite inferior e superior do intervalo de n-valores para diferentes n-gramas a serem extraídos. 
* max_df: seleciona o limite superior para selecionar as palavras ( por ex: palavras com frequencia acima de x não serão selecionadas)
* min_df: seleciona o limite inferior para selecionar as palavras (por ex: palavras com frequencia abaixo de y não serão selecionadas)




In [109]:
ngram_range=(1,3)

In [131]:
tfidf = TfidfVectorizer(analyzer='word',max_features=10000,stop_words='english',ngram_range=(1,3))

X = tfidf.fit_transform(df['Text'])

In [132]:
X

<48976x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 2002390 stored elements in Compressed Sparse Row format>

In [133]:
# tfidf.vocabulary_

In [134]:
X.shape, y.shape

((48976, 10000), (48976, 20))

In [135]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

# Build Model

In [136]:
# Sotchastic gradient descente learning
sgd = SGDClassifier()

# LogisticRegression() -> check solvers 
lr = LogisticRegression(solver='lbfgs')

# Linear SVC
svc = LinearSVC()





Os problemas de classificação multilabel devem ser avaliados usando medidas de desempenho diferentes dos problemas de classificação de rótulo único. Duas das métricas de desempenho mais comuns são a perda de hamming e a similaridade de Jaccard. A perda de Hamming é a fração média de rótulos incorretos. Observe que a perda de hamming é uma função de perda e que a pontuação perfeita é 0. A similaridade de Jaccard, ou o índice de Jaccard, é o tamanho da interseção dos rótulos previstos e os rótulos verdadeiros dividido pelo tamanho da união do previsto e verdadeiro rótulos. Ele varia de 0 a 1, e 1 é a pontuação perfeita.

In [137]:
# for classifier in [sgd,lr,svc]:
#     clf = OneVsRestClassifier(classifier)
#     clf.fit(X_train,y_train)
    
    
    

In [138]:
def jaccard_score(y_true,y_pred):
    score = np.minimum(y_true,y_pred).sum(axis=1)/np.maximum(y_true,y_pred).sum(axis=1)
    return score.mean()*100

def print_score(y_pred,clf):
    print('CLF:',clf.__class__.__name__)
    print('Jaccard Score:{}'.format(jaccard_score(y_test,y_pred)))
    print('---')
    
    

In [139]:
# NO NGRAM RANGE

In [118]:
for classifier in [sgd,lr,svc]:
    clf = OneVsRestClassifier(classifier)
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print_score(y_pred,classifier)

CLF: SGDClassifier
Jaccard Score:51.834422213148216
---
CLF: LogisticRegression
Jaccard Score:50.79573295222539
---
CLF: LinearSVC
Jaccard Score:62.992888253709
---


In [129]:
# (1,2) N GRAM RANGE

In [130]:
for classifier in [sgd,lr,svc]:
    clf = OneVsRestClassifier(classifier)
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print_score(y_pred,classifier)

CLF: SGDClassifier
Jaccard Score:52.121954539267726
---
CLF: LogisticRegression
Jaccard Score:51.010106165781956
---
CLF: LinearSVC
Jaccard Score:62.224717571798
---


In [140]:
for classifier in [sgd,lr,svc]:
    clf = OneVsRestClassifier(classifier)
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print_score(y_pred,classifier)

CLF: SGDClassifier
Jaccard Score:52.36780318497346
---
CLF: LogisticRegression
Jaccard Score:51.07901184156799
---
CLF: LinearSVC
Jaccard Score:62.1898393902273
---


## Model Test with Real Data

In [95]:
x = ['how to write ml code in python and java i have data but do not know how to do it']

In [96]:
xt = tfidf.transform(x)
clf.predict(xt)

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0]])

In [97]:
multilabel.inverse_transform(clf.predict(xt))

[('java', 'python')]