## Validation: Multi-Label Automatic Comment Tagging

The purpose of this document is to compare the results of automatic tagging with those of a human annotator.

This comparison is done in three steps:
1. Load the human-annotated comments and themes
2. Use the saved model to annotate the same comments automatically
3. Compare the results of steps 1 and 2

#### 1.) Load human-annotated data

In [167]:
# Read the human-annotated validation set into a DataFrame and preview it.
validation_csv_path = 'datasets/Java-validationSet_v2.csv'
csvdata = pd.read_csv(validation_csv_path)
csvdata.head()

Unnamed: 0,userId,source,comment,theme
0,254728000000.0,PY,At least have customers care attendant to i...,HR
1,254728000000.0,PY,At least have customers care attendant to i...,service
2,254717000000.0,PY,Friendly service n. This is a unique disp...,food
3,254717000000.0,PY,Friendly service n. This is a unique disp...,service
4,254722000000.0,java,Good service from Francis,HR


In [168]:
# Sanity check: (number of rows, number of columns) of the loaded validation set.
csvdata.shape

(7653, 4)

In [169]:
# Label-encode the themes: converting the column to the pandas categorical
# dtype assigns every distinct theme an integer code.
theme_categories = csvdata['theme'].astype('category')
# Theme names in code order: position i holds the theme whose code is i.
target_names = theme_categories.cat.categories.tolist()
# Keep the integer code next to the original text label.
csvdata['theme_categories'] = theme_categories.cat.codes
csvdata.head()

Unnamed: 0,userId,source,comment,theme,theme_categories
0,254728000000.0,PY,At least have customers care attendant to i...,HR,0
1,254728000000.0,PY,At least have customers care attendant to i...,service,8
2,254717000000.0,PY,Friendly service n. This is a unique disp...,food,4
3,254717000000.0,PY,Friendly service n. This is a unique disp...,service,8
4,254722000000.0,java,Good service from Francis,HR,0


In [170]:
# Number of annotated rows per theme, most frequent first.
# (A notebook cell only displays its last expression, so the earlier
# describe() call whose result was discarded has been dropped.)
csvdata['theme'].value_counts()


service    2517
food       2412
HR         1110
speed       601
hygiene     413
price       312
drink       180
billing      47
IT           36
menu         25
Name: theme, dtype: int64

In [171]:
# Theme names in code order (index i is the theme encoded as i) and the number of classes.
target_names, len(target_names)

(['HR',
  'IT',
  'billing',
  'drink',
  'food',
  'hygiene',
  'menu',
  'price',
  'service',
  'speed'],
 10)

In [172]:
# Collapse the table into a mapping from each distinct comment text to the
# list of theme codes assigned to it, in row order.
processed_data = {}

for _, record in csvdata.iterrows():
    # setdefault does a single hashed lookup; testing membership against
    # .keys() is a linear list scan per row on Python 2.
    processed_data.setdefault(record['comment'], []).append(record['theme_categories'])
        


In [173]:
# Bundle the distinct comments with their label lists.
my_data = {
    'data': processed_data.keys(),
    'target': processed_data.values(),
}

X = my_data['data']
# Binary indicator matrix: one row per distinct comment, one column per
# theme code that occurs in the validation set.
y = MultiLabelBinarizer().fit_transform(my_data['target'])
y.shape


(4769, 10)

In [174]:
# Match the number of label columns in the two sets.
# The validation set does not always contain every theme; here it is missing
# the security theme, so `y` has one column fewer than the 11 used below.

# Pad `y` to 11 columns by leaving an all-zero column at index 8: columns 0-7
# keep their position and the last two columns shift right by one.
# NOTE(review): index 8 is presumably where the model places the security
# theme - confirm against the label order the model was trained with.
b = np.zeros((y.shape[0], 11))  # row count follows y instead of a hard-coded 4769
b[:, :8] = y[:, :8]
b[:, -2:] = y[:, -2:]
y = b

In [175]:
# Check the shape of the label matrix after padding.
y.shape

(4769, 11)

#### 2.) Use saved model for automated annotation

In [177]:
import pickle

filename = "LogReg.sav"
# Load the saved model from disk. The `with` block closes the file handle as
# soon as the model has been read instead of leaving it open.
# NOTE(review): unpickling can execute arbitrary code - only load model files
# that come from a trusted source.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
# Predict labels for every distinct comment. The keys are materialised as a
# list so the model receives a real sequence, in the dict's iteration order.
result = loaded_model.predict(list(processed_data.keys()))
result.shape

(4769, 11)

#### 3.) Compare human and computer annotations


In [154]:
from sklearn.metrics import confusion_matrix

# Reduce each multi-label row to one class index: argmax returns the position
# of the first maximal entry in the row.
# NOTE(review): a row with no label set therefore maps to class 0, and only the
# first of several labels is kept - confirm this is the intended comparison.
machine_labels = result.argmax(axis=1)
human_labels = y.argmax(axis=1)
# The first argument indexes the rows of the matrix and the second the columns,
# so rows are the model's labels and columns are the human annotator's.
conf_matrix = confusion_matrix(machine_labels, human_labels)
conf_matrix

array([[ 920,    4,    6,    4,   59,   12,    0,   39,   15,   17],
       [   1,   16,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    4,    0,    0,    0,    0,    0,    0,    0],
       [   6,    0,    0,   74,    8,    1,    0,    0,    0,    0],
       [  77,    5,    9,   63, 1852,    4,    1,   62,    3,    4],
       [   5,    0,    0,    2,    8,  129,    0,    0,    0,    0],
       [   0,    0,    0,    1,    1,    0,    6,    0,    0,    0],
       [   0,    0,    4,    0,    2,    1,    0,   79,    2,    0],
       [ 100,    2,    8,    5,   78,   20,    0,   17,  972,   25],
       [   1,    0,    1,    0,    0,    0,    0,    0,    0,   34]])

In [155]:
# Check the dimensions of the confusion matrix.
conf_matrix.shape

(10, 10)

In [156]:
# Wrap the confusion matrix in a DataFrame (default integer row and column labels).
df = pd.DataFrame(conf_matrix)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,920,4,6,4,59,12,0,39,15,17
1,1,16,0,0,0,0,0,0,0,0
2,0,0,4,0,0,0,0,0,0,0
3,6,0,0,74,8,1,0,0,0,0
4,77,5,9,63,1852,4,1,62,3,4
5,5,0,0,2,8,129,0,0,0,0
6,0,0,0,1,1,0,6,0,0,0
7,0,0,4,0,2,1,0,79,2,0
8,100,2,8,5,78,20,0,17,972,25
9,1,0,1,0,0,0,0,0,0,34


In [157]:
# Total of each column of the confusion matrix (axis=0 adds up the rows within every column).
column_sums = df.sum(axis=0)
column_sums

0    1110
1      27
2      32
3     149
4    2008
5     167
6       7
7     197
8     992
9      80
dtype: int64

In [158]:
# Total of each row of the confusion matrix (axis=1 adds up the columns within every row).
row_sums = df.sum(axis=1)
row_sums

0    1076
1      17
2       4
3      89
4    2080
5     144
6       8
7      88
8    1227
9      36
dtype: int64

In [159]:
# Observed agreement count: the trace of the confusion matrix, i.e. the number
# of comments where both annotations fall in the same class.
diagonal_sum = np.trace(conf_matrix)
diagonal_sum

4086

In [160]:
# Sum of every cell of the confusion matrix, i.e. the total number of compared comments.
entire_df_sum = conf_matrix.sum()
entire_df_sum

4769

In [161]:
# Make `/` perform true division even on Python 2.
from __future__ import division
# Observed agreement p_o: the share of comments on the confusion-matrix diagonal.
po = float(diagonal_sum/entire_df_sum)
po

0.8567833927448102

In [162]:
# Marginal class proportions: each row total and each column total of the
# confusion matrix divided by the overall number of comments.
# Built as real lists rather than `map` objects, so the values survive being
# read more than once on Python 3 (a `map` iterator is empty after one pass).
# float() keeps this true division without relying on a `__future__` import
# made in another cell.
lista = [count / float(entire_df_sum) for count in row_sums]
listb = [count / float(entire_df_sum) for count in column_sums]

In [163]:
# Expected chance agreement p_e: for every class, multiply its row proportion
# by its column proportion, then add the products up.
pe = sum(row_share * col_share for row_share, col_share in zip(lista, listb))
pe

0.2922326212031722

In [164]:
# Cohen's kappa: agreement beyond chance, kappa = (p_o - p_e) / (1 - p_e).
cohen = (po-pe)/(1-pe)
cohen

0.7976501721531001