## Multi-Label Auto-Tagger

### Automatic Comment tagging

The purpose of this document is to create an automated tagging system that tags a comment as belonging to a small set of predetermined categories.
One comment can have multiple tags, hence the multi-label approach.


Example: {'Fantastic meals....quite good service':['food','service']}


In [1]:
###Importations
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.utils import shuffle
from sklearn import metrics

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer

import re
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import seaborn as sns

## Train

#### a.) Load the Data

In [42]:
# Load the labelled training set. Judging by the preview, it holds one row per
# (comment, keyword, theme) triple, so a comment with several themes appears
# on several rows.
training_csv_path = 'datasets/Java-training-data_V1.csv'
csvdata = pd.read_csv(training_csv_path)
csvdata.head()

Unnamed: 0,rating,cat,comment,amount,store,word,theme
0,10,promoter,Fantastic meals....quite good service,1260,GARDEN CITY,meals,food
1,10,promoter,Fantastic meals....quite good service,1260,GARDEN CITY,service,service
2,10,promoter,Good service,900,ROSSLYN,service,service
3,10,promoter,Joseph is a cool guy his services are spending...,1330,-,joseph,HR
4,10,promoter,Joseph is a cool guy his services are spending...,1330,-,services,service


In [43]:
# Label-encode the theme column: each distinct theme string is mapped to an
# integer code, and the codes are stored alongside the original labels.
theme_categories = csvdata['theme'].astype('category')

# Position i in target_names is the theme whose integer code is i.
target_names = theme_categories.cat.categories.tolist()

# New column holding the integer code of each row's theme.
csvdata['theme_categories'] = theme_categories.cat.codes
csvdata.head()

Unnamed: 0,rating,cat,comment,amount,store,word,theme,theme_categories
0,10,promoter,Fantastic meals....quite good service,1260,GARDEN CITY,meals,food,4
1,10,promoter,Fantastic meals....quite good service,1260,GARDEN CITY,service,service,9
2,10,promoter,Good service,900,ROSSLYN,service,service,9
3,10,promoter,Joseph is a cool guy his services are spending...,1330,-,joseph,HR,0
4,10,promoter,Joseph is a cool guy his services are spending...,1330,-,services,service,9


In [44]:
# df = pd.Series(csvdata)
# df.describe()

# NOTE(review): a notebook cell only renders its last expression, so the
# summary computed by describe() below is discarded. Wrap it in display(...)
# or move it to its own cell if the statistics are meant to be shown.
csvdata.describe()
# Number of rows per theme. The classes are heavily imbalanced: 'service'
# dominates, while 'security' has only a handful of rows.
csvdata['theme'].value_counts()


service     36649
food        19062
HR          17293
speed       10554
drink        2698
hygiene      1095
IT            670
menu          506
price         447
billing       302
security       14
Name: theme, dtype: int64

In [45]:
# Show the theme labels in integer-code order, together with how many there are.
target_names, len(target_names)

(['HR',
  'IT',
  'billing',
  'drink',
  'food',
  'hygiene',
  'menu',
  'price',
  'security',
  'service',
  'speed'],
 11)

In [46]:
# Collapse the one-row-per-(comment, theme) table into a mapping from each
# distinct comment text to the list of theme codes attached to it. A code may
# appear more than once in a list when several rows of the same comment share
# a theme.
processed_data = {}

for _, record in csvdata.iterrows():
    comment_text = record['comment']
    theme_code = record['theme_categories']
    # setdefault creates the empty list the first time a comment is seen.
    processed_data.setdefault(comment_text, []).append(theme_code)
        


In [47]:
# Assemble the model inputs: X is the list of distinct comment texts and y is
# the binary indicator matrix with one row per comment and one column per theme.
#
# The keys/values are materialised as lists: on Python 3 dict.keys() and
# dict.values() return views that cannot be indexed, and train_test_split
# needs an indexable sequence to shuffle and slice.
my_data = {}
my_data['data'] = list(processed_data.keys())
my_data['target'] = list(processed_data.values())

X = my_data['data']

# Keep the fitted binarizer so the column order of y can be recovered:
# column j of y corresponds to label_binarizer.classes_[j].
label_binarizer = MultiLabelBinarizer()
y = label_binarizer.fit_transform(my_data['target'])
y.shape


(36013, 11)

In [53]:


# Hold out 30% of the comments for testing. The split is shuffled with a fixed
# seed so that it is reproducible across runs.
# (An earlier version of this cell held out the last 1000 comments instead.)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
    shuffle=True,
)

In [54]:
#y_train

In [55]:
# import seaborn as sns
# sns.set_style("whitegrid")
# ytrain = sns.load_dataset(y_train)
# #ax = sns.boxplot(x=tips["total_bill"])

In [56]:
# Multi-label classifier: TF-IDF features feeding one independent logistic
# regression per theme (one-vs-rest), so a comment can receive several themes.
LogReg_pipeline = Pipeline([
    # stop_words is passed as a list: recent scikit-learn releases validate
    # this parameter and accept only 'english', a list, or None (not a set).
    # Sorting just makes the list order deterministic.
    ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words))),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=1)),
])
print('... Processing')
LogReg_pipeline.fit(X_train, y_train)

# Compute the testing accuracy. For multi-label targets accuracy_score is the
# exact-match (subset) accuracy: a comment counts as correct only when its
# whole set of predicted themes equals the true set.
prediction = LogReg_pipeline.predict(X_test)
print('Test accuracy is {}'.format(accuracy_score(y_test, prediction)))


... Processing
Test accuracy is 0.902350981118


In [60]:
#Validation
# Spot-check the fitted pipeline on a few hand-written comments and print the
# themes predicted for each one.
comment1 = 'The hospitality of the staff is so welcoming and they are very knowledgeable and helpful with questions and directions' #
comment2 = 'Perfect location, stellar customer service, and the rooms - so quaint, lovely, and cozy! The food was also great'
comment3 = 'LOVE the coffee here! They also make a mean affogato. Its really fun to watch them actually roast their coffee in the back'
comment4 = 'Wide variety of flavours, clean comfortable space'
comment5 = 'cynthia brought my bill late'
comment6 = "Quality coffee and good service"


comments_new = [comment1, comment2, comment3, comment4, comment5, comment6]

# predict() yields one 0/1 indicator row per comment; the columns are labelled
# with target_names, i.e. column i is taken to be the theme with integer code i.
prediction_matrix = LogReg_pipeline.predict(comments_new)
predicted = pd.DataFrame(prediction_matrix, columns=target_names)

# Pair each indicator with its theme name instead of indexing a string-labelled
# Series by integer position, which newer pandas versions deprecate.
for tw, flags in zip(comments_new, prediction_matrix):
    themes = [name for name, flag in zip(target_names, flags) if flag == 1]
    print('\n%r ===> %s' % (tw, themes))



'The hospitality of the staff is so welcoming and they are very knowledgeable and helpful with questions and directions' ===> ['HR']

'Perfect location, stellar customer service, and the rooms - so quaint, lovely, and cozy! The food was also great' ===> ['food', 'service']

'LOVE the coffee here! They also make a mean affogato. Its really fun to watch them actually roast their coffee in the back' ===> ['drink']

'Wide variety of flavours, clean comfortable space' ===> ['hygiene']

'cynthia brought my bill late' ===> ['HR', 'billing']

'Quality coffee and good service' ===> ['drink', 'service']
