# AMT Label for Sentences of Movie Reviews

## Data Results

The results of all labeling batches were saved in a CSV file that includes a document id and a sentence id (obtained by splitting the docid).

## Objective

We want to determine how each way of resolving conflicts between annotators affects the label distribution of the results.

## Methods

### 1. Default to Neutral

This method resolves conflicts by defaulting to the neutral label: whenever the two answers disagree, whether or not one of them is neutral, the label becomes neutral.

### 2. Default to Neutral with Coin Tie-breaker

This method returns neutral if at least one of the conflicting answers is neutral; if neither answer is neutral, it flips a coin to choose the label.

### 3. Default to Label with Coin Tie-breaker

This method returns the non-neutral label when one of the conflicting answers is neutral; when the two answers are both non-neutral (one positive, one negative), it flips a coin to choose the label.


In [1]:
## Imports 
%matplotlib inline

STRUCTURED = '/Users/maru/MyCode/structured'
DATA= 'C:/Users/mramire8/Dropbox/My Papers/Structured Reading/Code/Data/sample3_v3_merge/'

import sys
import os

import numpy as np
import nltk
from sklearn import metrics
import matplotlib.pyplot as plt
import matplotlib as mpl

mpl.style.use('bmh')

In [16]:
def load_data_results(filename):
    """Load a comma-separated results file into a column-oriented mapping.

    The first row of the file is used as the header. Every following row
    contributes one value to the list stored under each column name, so all
    lists are aligned by row.

    Parameters:
        filename: path of the CSV file to read.

    Returns:
        A ``defaultdict(list)`` mapping each column name to the list of raw
        (string) values of that column, in file order.
    """
    import csv
    import sys
    from collections import defaultdict

    # The csv module expects a binary-mode file on Python 2 but a text-mode
    # file opened with newline='' on Python 3.
    if sys.version_info[0] < 3:
        open_kwargs = {'mode': 'rb'}
    else:
        open_kwargs = {'mode': 'r', 'newline': ''}

    results = defaultdict(list)
    with open(filename, **open_kwargs) as csvfile:
        for row in csv.DictReader(csvfile, delimiter=',', quotechar='"'):
            for column, value in row.items():
                results[column].append(value)

    return results

# Read the AMT results CSV located under DATA into a column-name -> list-of-values mapping.
amt = load_data_results(DATA + "amt.results.csv")


In [21]:
# List the column names of the loaded results, one per line, in alphabetical order.
print "\n".join(sorted(amt.keys()))

Agreement
Answer
Answer1
Answer2
DOCID
Date
HITID
ID
SENTID
TARGET
TEXT
Worker1
Worker2


In [205]:
def label_distribution(targets, label=None):
    """Tally how many times each distinct value occurs in ``targets``.

    ``label`` is accepted for interface compatibility but is not used.
    Returns a ``collections.Counter`` keyed by the values found.
    """
    import collections
    return collections.Counter(targets)

def answer_to_label(ans):
    """Convert an answer into its integer label code.

    Integers are passed through untouched. Otherwise the answer is searched
    for the keywords 'Negative', 'Positive' and 'Neutral' (in that order) and
    the position of the first keyword found (0, 1 or 2) is returned. An
    answer containing none of them maps to 3.
    """
    if isinstance(ans, int):
        return ans
    # The tuple order fixes both the priority of the match and the code returned.
    for code, keyword in enumerate(('Negative', 'Positive', 'Neutral')):
        if keyword in ans:
            return code
    return 3

def to_label(targets):
    """Return the list of integer label codes for every answer in ``targets``."""
    labels = []
    for answer in targets:
        labels.append(answer_to_label(answer))
    return labels
    
def print_dist(dist):
    """Format a label -> count mapping as text, one line per label.

    Each line reads ``label: count - fraction`` where ``fraction`` is the
    count divided by the sum of all counts, shown with three decimals.
    Despite the name, nothing is printed: the formatted text is returned so
    the caller decides where it goes. An empty mapping yields an empty string.
    """
    # Sum the counts once up front instead of once per entry.
    total = sum(dist.values())
    return "\n".join(
        "%s: %s - %.3f" % (label, count, 1. * count / total)
        for label, count in dist.items()
    )


def to_answers(data, conflict_solver, rnd):
    """Resolve every (Answer1, Answer2) pair of ``data`` into a single label.

    ``data`` must provide the aligned sequences ``data['Answer1']`` and
    ``data['Answer2']``. Each pair is handed to ``solve_conflict`` together
    with ``conflict_solver`` and ``rnd``; the resolved labels are returned as
    a list in the same order.
    """
    resolved = []
    for first, second in zip(data['Answer1'], data['Answer2']):
        resolved.append(solve_conflict(first, second, conflict_solver, rnd))
    return resolved

def solve_conflict(a1, a2, conflict_fn, rnd):
    """Turn two annotator answers into one integer label.

    When the raw answers differ, both are converted with ``answer_to_label``
    and ``conflict_fn(label1, label2, rnd)`` decides the outcome. When they
    are equal, the label of the shared answer is returned and ``conflict_fn``
    is never called.
    """
    if a1 != a2:
        return conflict_fn(answer_to_label(a1), answer_to_label(a2), rnd)
    return answer_to_label(a1)

def solver_allneutral(a1, a2, rnd):
    """Resolve any disagreement to the neutral label.

    Returns ``a1`` when both labels agree, otherwise the code that
    ``answer_to_label`` assigns to 'Neutral'. ``rnd`` is unused; it is kept so
    all solvers share the same signature.
    """
    return answer_to_label('Neutral') if a1 != a2 else a1

def solver_neutral(a1, a2, rnd):
    """Resolve a conflict to neutral, flipping a coin between negative and positive.

    Parameters:
        a1, a2: integer label codes (0 negative, 1 positive, 2 neutral).
        rnd: random source exposing ``randint(n)``; the notebook passes a
            ``numpy.random.RandomState``, for which ``randint(2)`` yields 0 or 1.

    Returns:
        ``a1`` when both labels agree (same guard as ``solver_allneutral``),
        a coin flip when the pair is negative vs. positive, and 2 (neutral)
        in every other case.
    """
    if a1 == a2:
        # No conflict to resolve; without this guard (1, 1) would become
        # neutral and (0, 0) would be decided by a coin flip.
        return a1
    if (a1 + a2) < 2:
        # Two different codes summing below 2 can only be 0 and 1:
        # negative vs. positive, so flip a coin.
        return rnd.randint(2)
    # At least one of the answers is neutral (code 2) or higher.
    return 2

def solver_label(a1, a2, rnd):
    """Resolve a conflict in favour of the non-neutral label, with a coin tie-breaker.

    Parameters:
        a1, a2: integer label codes (0 negative, 1 positive, 2 neutral).
        rnd: random source exposing ``randint(n)``; the notebook passes a
            ``numpy.random.RandomState``, for which ``randint(2)`` yields 0 or 1.

    Returns:
        ``a1`` when both labels agree, a coin flip when the pair is negative
        vs. positive, and otherwise the smaller of the two codes (the
        non-neutral label when the other answer is neutral).
    """
    if a1 == a2:
        # No conflict to resolve; without this guard (0, 0) would be decided
        # by a coin flip and could come back as positive.
        return a1
    if (a1 + a2) < 2:
        # Two different codes summing below 2 can only be 0 and 1:
        # negative vs. positive, so flip a coin.
        return rnd.randint(2)
    # One answer is neutral (code 2): the smaller code is the actual label.
    return min(a1, a2)


In [77]:
# Count how often each value of the TARGET column occurs.
c = label_distribution(amt['TARGET'])

In [78]:

# Share of the rows that falls on each TARGET value.
print [1. * v/sum(c.values()) for v in c.values()]

[0.511921458625526, 0.48807854137447404]


In [85]:
# print amt['Answer1']
# Distribution of the 'Answer' column after mapping each answer to its integer
# code (0 negative, 1 positive, 2 neutral, 3 anything else).
base = label_distribution(to_label(amt['Answer']))
print base
# Show the TARGET distribution (computed in an earlier cell as `c`) next to it.
print "\n === True Distribution ==="
print print_dist(c)
print "\n=== Base distribution ==="
print print_dist(base)

Counter({3: 749, 2: 719, 1: 702, 0: 682})

 === True Distribution ===
1: 1460 - 0.512
0: 1392 - 0.488

=== Base distribution ===
0: 682 - 0.239
1: 702 - 0.246
2: 719 - 0.252
3: 749 - 0.263


In [227]:
# Label distribution produced by each conflict solver. A freshly seeded
# RandomState is created before every run; `neu` is reused for all three results.

# Method 1: every disagreement becomes neutral (the solver never uses rnd).
rnd = np.random.RandomState(123)
neu = label_distribution(to_answers(amt, solver_allneutral, rnd))
print "\n== All neutral =="
print print_dist(neu)

# Method 2: neutral wins; coin flip between negative and positive.
rnd = np.random.RandomState(555)
neu = label_distribution(to_answers(amt, solver_neutral, rnd))
print "\n== Neutral-Coin neutral =="
print print_dist(neu)


# Method 3: the non-neutral label wins; coin flip between negative and positive.
rnd = np.random.RandomState(555)
neu = label_distribution(to_answers(amt, solver_label, rnd))
print "\n== Label-Coin neutral =="
print print_dist(neu)


== All neutral ==
0: 682 - 0.239
1: 702 - 0.246
2: 1468 - 0.515

== Neutral-Coin neutral ==
0: 736 - 0.258
1: 753 - 0.264
2: 1363 - 0.478

== Label-Coin neutral ==
0: 1073 - 0.376
1: 1060 - 0.372
2: 719 - 0.252


In [179]:
# Sanity check: run solver_label over all nine pairs of integer codes 0-2.
# NOTE: this reuses whatever state `rnd` was left in by the previously executed cell.
to_answers({'Answer1':[0,0,0,1,1,1,2,2,2],'Answer2':[0,1,2,0,1,2,0,1,2]}, solver_label, rnd)

[0, 1, 0, 1, 1, 1, 0, 1, 2]

In [206]:
rnd = np.random.RandomState(555)
# One flag per row: True when both annotators gave exactly the same raw answer.
equal = [a == b for a, b in zip(amt['Answer1'], amt['Answer2'])]
# Per row: agreement flag, both raw answers, and the label chosen by solver_neutral.
res = [[a == b, a, b, solve_conflict(a,b,solver_neutral, rnd)] for a,b in zip(amt['Answer1'], amt['Answer2'])]
# Number of rows on which the annotators agree.
print sum(equal)
# Cross-check: adding the 749 rows coded 3 in the base distribution to the
# 2103 agreeing rows printed above should give the total number of rows.
print 2103 + 749


2103
2852


In [207]:
# Dump every row of `res` as comma-separated text, one row per line.
print "\n".join(", ".join("%s" % t for t in r) for r in res)
 

False, Negative Sentiment, Neutral Sentiment, 2
True, Neutral Sentiment, Neutral Sentiment, 2
False, Neutral Sentiment, Positive Sentiment, 2
True, Negative Sentiment, Negative Sentiment, 0
False, Negative Sentiment, Neutral Sentiment, 2
False, Negative Sentiment, Neutral Sentiment, 2
True, Positive Sentiment, Positive Sentiment, 1
True, Negative Sentiment, Negative Sentiment, 0
True, Neutral Sentiment, Neutral Sentiment, 2
True, Negative Sentiment, Negative Sentiment, 0
True, Positive Sentiment, Positive Sentiment, 1
True, Negative Sentiment, Negative Sentiment, 0
True, Negative Sentiment, Negative Sentiment, 0
True, Neutral Sentiment, Neutral Sentiment, 2
True, Negative Sentiment, Negative Sentiment, 0
True, Neutral Sentiment, Neutral Sentiment, 2
True, Positive Sentiment, Positive Sentiment, 1
True, Positive Sentiment, Positive Sentiment, 1
True, Negative Sentiment, Negative Sentiment, 0
True, Positive Sentiment, Positive Sentiment, 1
True, Neutral Sentiment, Neutral Sentiment, 2
Tr

In [220]:
# Equal answers never reach the solver, so resolving (0, 0) 10000 times must
# always give 0 and the sum must be 0.
np.sum([solve_conflict(0,0, solver_neutral, rnd) for _ in range(10000)])

0