In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/stackoverflow.csv", index_col=0)
df.head()

Unnamed: 0,Text,Tags
2,aspnet site maps has anyone got experience cre...,"['sql', 'asp.net']"
4,adding scripting functionality to net applicat...,"['c#', '.net']"
5,should i use nested classes in this case i am ...,['c++']
6,homegrown consumption of web services i have b...,['.net']
8,automatically update version number i would li...,['c#']


In [9]:
import ast
print(f"{df['Tags'].iloc[0]} type {type(df['Tags'].iloc[0])}")
print(f"{ast.literal_eval(df['Tags'].iloc[0])} type {type(ast.literal_eval(df['Tags'].iloc[0]))}")


['sql', 'asp.net'] type <class 'str'>
['sql', 'asp.net'] type <class 'list'>


In [10]:
df['Tags'] = df['Tags'].apply(lambda x: ast.literal_eval(x ))
df.head()

Unnamed: 0,Text,Tags
2,aspnet site maps has anyone got experience cre...,"[sql, asp.net]"
4,adding scripting functionality to net applicat...,"[c#, .net]"
5,should i use nested classes in this case i am ...,[c++]
6,homegrown consumption of web services i have b...,[.net]
8,automatically update version number i would li...,[c#]


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.multiclass import OneVsRestClassifier

In [12]:
multilabel = MultiLabelBinarizer()
y = multilabel.fit_transform(df['Tags'])
y

array([[0, 0, 1, ..., 0, 0, 1],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [14]:
multilabel.classes_

array(['.net', 'android', 'asp.net', 'c', 'c#', 'c++', 'css', 'html',
       'ios', 'iphone', 'java', 'javascript', 'jquery', 'mysql',
       'objective-c', 'php', 'python', 'ruby', 'ruby-on-rails', 'sql'],
      dtype=object)

In [15]:
y = pd.DataFrame(y, columns=multilabel.classes_)
y

Unnamed: 0,.net,android,asp.net,c,c#,c++,css,html,ios,iphone,java,javascript,jquery,mysql,objective-c,php,python,ruby,ruby-on-rails,sql
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48971,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
48972,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
48973,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
48974,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [33]:
tfidf = TfidfVectorizer(analyzer='word', max_features=5000, ngram_range=(1,2), stop_words='english')
X = tfidf.fit_transform(df['Text'])

In [34]:
X

<48976x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 1790339 stored elements in Compressed Sparse Row format>

In [18]:
tfidf.vocabulary_

{'aspnet': 79,
 'site': 785,
 'has': 395,
 'anyone': 57,
 'got': 385,
 'creating': 215,
 'have': 397,
 'the': 863,
 'default': 230,
 'xml': 995,
 'file': 335,
 'working': 986,
 'properly': 676,
 'with': 978,
 'my': 560,
 'menu': 541,
 'and': 48,
 'but': 131,
 'will': 975,
 'need': 568,
 'way': 957,
 'for': 351,
 'users': 929,
 'of': 597,
 'to': 890,
 'create': 213,
 'modify': 551,
 'pages': 624,
 'page': 623,
 'into': 452,
 'standard': 811,
 'system': 842,
 'as': 77,
 'well': 962,
 'adding': 29,
 'functionality': 364,
 'net': 572,
 'applications': 65,
 'little': 503,
 'game': 366,
 'written': 992,
 'in': 429,
 'it': 459,
 'uses': 930,
 'database': 222,
 'is': 456,
 'wanted': 954,
 'implement': 422,
 'function': 363,
 'mean': 537,
 'that': 862,
 'an': 47,
 'interface': 450,
 'which': 969,
 'class': 167,
 'implements': 426,
 'public': 682,
 'contains': 199,
 'are': 70,
 'called': 140,
 'by': 134,
 'make': 526,
 'thing': 870,
 'would': 989,
 'like': 493,
 'each': 272,
 'source': 800,
 'co

In [19]:
X.shape, y.shape

((48976, 1000), (48976, 20))

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

### Build Model

In [21]:
sgd = SGDClassifier()
lr = LogisticRegression(solver='lbfgs')
svc = LinearSVC()

In [25]:
def j_score(y_true, y_pred):
    jaccard = np.minimum(y_true, y_pred).sum(axis=1)/np.maximum(y_true, y_pred).sum(axis=1)
    return jaccard.mean()*100

def print_score(y_pred, clf):
    print("Clf: ", clf.__class__.__name__)
    print(f"Jaccard: {j_score(y_test, y_pred)}")
    print("----")

In [36]:
for classifier in [sgd, lr, svc]:
    clf = OneVsRestClassifier(classifier)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print_score(y_pred, classifier)
    
    # Origianl
    # Clf:  SGDClassifier
    # Jaccard: 44.500986797332224
    # ----
    # Clf:  LogisticRegression
    # Jaccard: 46.323499387505116
    # ----
    # Clf:  LinearSVC
    # Jaccard: 51.65560773104669
    # ----
    
    # Added ngram
    # Clf:  SGDClassifier
    # Jaccard: 48.17102218592622
    # ----
    # Clf:  LogisticRegression
    # Jaccard: 48.56778276847694
    # ----
    # Clf:  LinearSVC
    # Jaccard: 58.143289778140726
    # ----

Clf:  SGDClassifier
Jaccard: 52.440111610181006
----
Clf:  LogisticRegression
Jaccard: 51.67942697699742
----
Clf:  LinearSVC
Jaccard: 61.22192731727228
----


## Model Test

In [37]:
x = ['how to write ml code in python and java i have data but do not know how to do it']
xt = tfidf.transform(x)
multilabel.inverse_transform(clf.predict(xt))


[('java', 'python')]