In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the labelled comments from disk and preview the first 20 rows.
# NOTE(review): this is an absolute, machine-specific path; the cell will only
# run where this exact file location exists.
csv_path = r"C:\Users\talas\Downloads\NLP\train_fqin_revised_labels.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)
df.head(n=20)

Unnamed: 0,No,comments,issue
0,0,new activation power new device,"['activation', 'power']"
1,1,phone would turn activation apple logo doesnt ...,"['activation', 'power']"
2,2,rsc processed prepaid new line activation 1172...,"['activation', 'charge']"
3,3,activate apple server activation issue,['activation']
4,4,housing damage discover activation,"['activation', 'damage']"
5,5,phone continue activation mode get stuck looki...,['activation']
6,6,device cmpleted process activation,['activation']
7,7,phone would complete initial activation,['activation']
8,8,phone locked activation screen,['activation']
9,9,box activation failure doa,"['activation', 'doa']"


In [3]:
# Check the Python type of one raw 'issue' cell (row position 3) before parsing.
type(df['issue'].iloc[3])

str

In [4]:
import ast
# Try ast.literal_eval on the first 'issue' value to see what it parses into.
ast.literal_eval(df['issue'].iloc[0])

['activation', 'power']

In [5]:
# Convert the stringified labels (e.g. "['activation', 'power']") into real lists.
# Only values that are still strings are parsed, so this cell can be re-run
# safely: ast.literal_eval raises ValueError when it is handed a list.

df['issue'] = df['issue'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [6]:
# Preview the first rows to confirm the 'issue' column now holds lists.
df.head()

Unnamed: 0,No,comments,issue
0,0,new activation power new device,"[activation, power]"
1,1,phone would turn activation apple logo doesnt ...,"[activation, power]"
2,2,rsc processed prepaid new line activation 1172...,"[activation, charge]"
3,3,activate apple server activation issue,[activation]
4,4,housing damage discover activation,"[activation, damage]"


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

In [8]:
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.multiclass import OneVsRestClassifier


In [9]:
# Take the 'issue' column as the raw (not yet binarized) target and display it.
y = df['issue']
y

0           [activation, power]
1           [activation, power]
2          [activation, charge]
3                  [activation]
4          [activation, damage]
                  ...          
19350                 [speaker]
19351          [audio, speaker]
19352                 [speaker]
19353    [damage, doa, speaker]
19354                 [speaker]
Name: issue, Length: 19355, dtype: object

In [10]:
# Encode each row's label list as a binary indicator row (one column per label).
# This rebinds `y` from the raw label Series to the encoded array.
multilabel = MultiLabelBinarizer()
y = multilabel.fit_transform(df['issue'])

In [11]:
# Display the encoded label matrix.
y

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1]])

In [12]:
# Label names learned by the binarizer (28 unique classes in this dataset).

multilabel.classes_

array(['activation', 'antenna', 'audio', 'backlight', 'battery',
       'blacklist', 'bluetooth', 'button', 'camera', 'charge', 'charger',
       'connectivity', 'cosmetic', 'damage', 'display', 'doa', 'earphone',
       'encrypt', 'exchange', 'keypad', 'mic', 'network', 'password',
       'power', 'safe', 'service', 'software', 'speaker'], dtype=object)

In [13]:
# View the indicator matrix as a table with the label names as column headers.
pd.DataFrame(y,columns=multilabel.classes_)

Unnamed: 0,activation,antenna,audio,backlight,battery,blacklist,bluetooth,button,camera,charge,...,exchange,keypad,mic,network,password,power,safe,service,software,speaker
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19350,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
19351,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
19352,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
19353,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [14]:
# Turn the comment text into word-level TF-IDF features, capped at 1000 features.
# NOTE(review): the vectorizer is fitted on every row of df['comments'] before
# the train/test split is made, so held-out comments also contribute to the
# fitted vocabulary and IDF weights. Consider fitting on the training rows only.
tfidf = TfidfVectorizer(analyzer='word',max_features=1000)
X = tfidf.fit_transform(df['comments'])

In [15]:
# Display a summary of the TF-IDF feature matrix.
X

<19355x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 109083 stored elements in Compressed Sparse Row format>

In [43]:
# Uncomment to inspect the vocabulary learned by the TF-IDF vectorizer.
# tfidf.vocabulary_

In [16]:
# Confirm that features and labels have the same number of rows.
X.shape , y.shape

((19355, 1000), (19355, 28))

In [17]:
# Hold out 20% of the rows for evaluation (80-20 split); the fixed seed keeps
# the partition identical between runs.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Build models

In [18]:
# Base estimators to compare; each one is wrapped in OneVsRestClassifier when
# it is trained. Seeds are fixed on the estimators that randomise during
# fitting so the reported scores are reproducible after a kernel restart.
sgd = SGDClassifier(random_state=0)
lr = LogisticRegression(solver='lbfgs')
svc = LinearSVC(random_state=0)

In [19]:
def j_score(y_true, y_pred, empty_score=1.0):
  """Return the mean per-sample Jaccard similarity as a percentage.

  For every row the score is |intersection| / |union| of the labels set in
  `y_true` and `y_pred`; the row scores are averaged and multiplied by 100.

  Args:
    y_true: 2-D binary indicator array of ground-truth labels (samples x labels).
    y_pred: 2-D binary indicator array of predicted labels, same shape.
    empty_score: Score given to a row whose union is empty (no label set in
      either array). Such rows would otherwise evaluate 0/0 and turn the
      whole mean into NaN. Defaults to 1.0, i.e. an exact match.

  Returns:
    The mean Jaccard score scaled to the 0-100 range.
  """
  y_true = np.asarray(y_true)
  y_pred = np.asarray(y_pred)
  intersection = np.minimum(y_true, y_pred).sum(axis=1)
  union = np.maximum(y_true, y_pred).sum(axis=1)
  # Pre-fill with the fallback and divide only where the union is non-empty,
  # which avoids both the NaN result and the divide-by-zero warning.
  jaccard = np.full(union.shape, empty_score, dtype=float)
  np.divide(intersection, union, out=jaccard, where=union > 0)
  return jaccard.mean() * 100


def print_score(y_pred, clf, y_true=None):
  """Print an estimator's class name and the Jaccard score of its predictions.

  Args:
    y_pred: Predicted binary indicator matrix, one row per sample.
    clf: Estimator whose class name labels the report.
    y_true: Ground-truth indicator matrix to score against. When omitted, the
      notebook-level `y_test` is used, which is the original behaviour.
  """
  if y_true is None:
    # Fall back to the global test labels so existing two-argument calls work.
    y_true = y_test
  print("Clf: ", clf.__class__.__name__)
  print('Jacard score: {}'.format(j_score(y_true, y_pred)))
  print('----')

In [20]:
# Wrap each base estimator in a one-vs-rest scheme, fit it on the training
# split and report its score on the held-out split.
# After the loop `clf` still refers to the last fitted wrapper (the one built
# around `svc`); the prediction cells further down rely on that.
for classifier in (sgd, lr, svc):
  clf = OneVsRestClassifier(classifier).fit(X_train, y_train)
  y_pred = clf.predict(X_test)
  print_score(y_pred=y_pred, clf=classifier)

Clf:  SGDClassifier
Jacard score: 98.74063549470421
----
Clf:  LogisticRegression
Jacard score: 94.697752518729
----
Clf:  LinearSVC
Jacard score: 98.79876001033324
----


# Testing the model on real data

In [21]:
# One sample comment to classify, given as a single-element list of strings.
# The text is identical to the comment in row 0 of the dataset.
a = ['new activation power new device']

In [22]:
# Vectorise the sample with the already-fitted TF-IDF vectorizer (transform
# only, no refit) and predict its indicator row.
# `clf` is whatever the training loop assigned last, i.e. the one-vs-rest
# wrapper around `svc`.
at = tfidf.transform(a)
clf.predict(at)

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0]])

In [23]:
# Map the predicted indicator row back to label names.
multilabel.inverse_transform(clf.predict(at))

[('activation', 'power')]