## Multi-Label Auto-Tagger

### Automatic Comment tagging

The purpose of this document is to create an automated tagging system that tags a comment as belonging to a small set of predetermined categories.
Once comment can have multiple tags hence the multilabeling approach.


Example: {'Fantastic meals....quite good service':['food','service']}


In [1]:
###Importations
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.utils import shuffle
from sklearn import metrics

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer

import re
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from xgboost.sklearn import XGBClassifier
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from sklearn.pipeline import Pipeline
import seaborn as sns
import pickle

## Train

#### a.) Load the Data

In [2]:
#csvdata = pd.read_csv('datasets/Java-training-data_V1.csv')
csvdata = pd.read_csv('datasets/insuranceThemes_V1.csv')
csvdata.head()

Unnamed: 0,userId,phone,company,rating,comment,theme
0,5877440,254700001454,britam,10,No complains on my part...so far so good,service
1,33,254700001920,liberty,10,Good relationship with the staff. Talking poli...,staff
2,38,254700006348,liberty,9,It also provides asset management and property...,service
3,2817604,254700040400,britam,10,Carol Wambugu is a very pleasant Customer serv...,service
4,2817604,254700040400,britam,10,Carol Wambugu is a very pleasant Customer serv...,staff


In [3]:
#make everything in the themes column lower case.
csvdata['theme'] = [x.lower() for x in csvdata['theme']]

#Themes has speed and speed/efficiency that need to be combined to one (speed/efficiency)
csvdata['theme'] = [re.sub(r'speed$', 'speed/efficiency', x) for x in csvdata['theme']]


In [4]:
#Add an extra column for themes as a category
#Label encoding to represent each of the theme classes as numbers
theme_categories = csvdata['theme'].astype('category') #1. We first convert the column into a category
csvdata['theme_categories']  = theme_categories.cat.codes #2. assign the encoded variable to a new column using the cat.codes
target_names = list(theme_categories.cat.categories)
csvdata.head()

Unnamed: 0,userId,phone,company,rating,comment,theme,theme_categories
0,5877440,254700001454,britam,10,No complains on my part...so far so good,service,6
1,33,254700001920,liberty,10,Good relationship with the staff. Talking poli...,staff,8
2,38,254700006348,liberty,9,It also provides asset management and property...,service,6
3,2817604,254700040400,britam,10,Carol Wambugu is a very pleasant Customer serv...,service,6
4,2817604,254700040400,britam,10,Carol Wambugu is a very pleasant Customer serv...,staff,8


In [5]:
# df = pd.Series(csvdata)
# df.describe()

csvdata.describe()
csvdata['theme'].value_counts()


service             3394
speed/efficiency    1890
communication       1697
policy              1079
staff               1021
product              646
agents               316
premiums             295
claims               147
Name: theme, dtype: int64

In [6]:
target_names, len(target_names)

(['agents',
  'claims',
  'communication',
  'policy',
  'premiums',
  'product',
  'service',
  'speed/efficiency',
  'staff'],
 9)

In [7]:
processed_data = {}

for row in csvdata.iterrows():
    if row[1]['comment']  in processed_data.keys():  
        processed_data[row[1]['comment']].append(row[1]['theme_categories'])
    else:
        processed_data[row[1]['comment']] = [row[1]['theme_categories']]
        


In [8]:
my_data = {}
my_data['data'] = processed_data.keys()
my_data['target'] = processed_data.values()

X = my_data['data']
y = MultiLabelBinarizer().fit_transform(processed_data.values())
y.shape


(7000, 9)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42,shuffle=True )

Try out some classifiers:
1. Logistic Regression
2. Random Forest
3. SVM
4. Multinomial Naives Bayes
5. XGBoost

### Logistic Regression Classifier

In [10]:
from sklearn.linear_model import LogisticRegression

LogReg_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=1)),
            ])
print('... Processing')
LogReg_pipeline.fit(X_train, y_train)
# compute the testing accuracy
prediction = LogReg_pipeline.predict(X_test)
print('Test accuracy is {}'.format(accuracy_score(y_test, prediction)))


... Processing
Test accuracy is 0.765238095238


### Random Forest Classifier

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification


In [12]:
RandomForest_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', OneVsRestClassifier(RandomForestClassifier(max_depth=150, random_state=0), n_jobs=1)),
            ])
print('... Processing')
RandomForest_pipeline.fit(X_train, y_train)
# compute the testing accuracy
prediction = RandomForest_pipeline.predict(X_test)
print('Test accuracy is {}'.format(accuracy_score(y_test, prediction)))

... Processing
Test accuracy is 0.852857142857


### SVM Classifier

In [13]:
from sklearn import svm

SVM_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', OneVsRestClassifier(svm.SVC(decision_function_shape='ovr'), n_jobs=10)),
            ])
print('... Processing')
SVM_pipeline.fit(X_train, y_train)
# compute the testing accuracy
prediction = SVM_pipeline.predict(X_test)
print('Test accuracy is {}'.format(accuracy_score(y_test, prediction)))


... Processing
Test accuracy is 0.0


### Naives Bayes Classifier

In [14]:
from sklearn.naive_bayes import MultinomialNB
Naives_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', OneVsRestClassifier(MultinomialNB(), n_jobs=1)),
            ])
print('... Processing')
Naives_pipeline.fit(X_train, y_train)
# compute the testing accuracy
prediction = Naives_pipeline.predict(X_test)
print('Test accuracy is {}'.format(accuracy_score(y_test, prediction)))

... Processing
Test accuracy is 0.438571428571


### XGBoost Classifier

In [15]:
classes = len(csvdata['theme'].unique())#number of classes
reg_lambda = 2 #XG Boost's L2 regularization term on weights, increasing it makes the model more conservative.default=1


XGB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=stop_words)),
                ('clf', OneVsRestClassifier(
                    XGBClassifier(                  
                        objective = "multi:softmax", 
                        seed =27,
                        reg_lambda=reg_lambda,
                        num_class = classes
                ), n_jobs=1)),
            ])
print('... Processing')
XGB_pipeline.fit(X_train, y_train)
# compute the testing accuracy
prediction = XGB_pipeline.predict(X_test)
print('Test accuracy is {}'.format(accuracy_score(y_test, prediction)))


... Processing
Test accuracy is 0.910476190476


### Validation

In [16]:
#Validation using the best classifier 
comment1 = 'Poor at customer service!! Call them for dispute resolution, and they in the middle of the convo, they hung up on you!!' #
comment2 = 'claims (general)processing is shoddy!'
comment3 = 'Do you have a funeral insurance cover, and how does it work.'
comment4 = 'For education plan, how much am I supposed to pay monthly'
comment5 = 'My claim has taken too long'
comment6 = "The agent lied to me"
comment7 = 'Grace in Headoffice was really helpful'

comments_new = [comment1,comment2,comment3,comment4, comment5, comment6,comment7]

print('XGBoost')
predicted = XGB_pipeline.predict(comments_new)
predicted = pd.DataFrame(predicted, columns=target_names)
   

for tw, category in zip(comments_new, predicted.iterrows()):
    themes=[]
    for i in range(len(category[1])) :
        if category[1][i] == 1:
            themes.append(target_names[i])
    print('\n%r ===> %s' % (tw, themes))

XGBoost

'Poor at customer service!! Call them for dispute resolution, and they in the middle of the convo, they hung up on you!!' ===> ['service']

'claims (general)processing is shoddy!' ===> ['claims']

'Do you have a funeral insurance cover, and how does it work.' ===> []

'For education plan, how much am I supposed to pay monthly' ===> ['communication']

'My claim has taken too long' ===> ['claims']

'The agent lied to me' ===> ['agents']

'Grace in Headoffice was really helpful' ===> ['staff']


#### Model Persistence- Save the model for later


In [17]:
filename = 'insurance.sav'
pickle.dump(XGB_pipeline, open(filename, 'wb'))
