In [1]:
#### Nomenclature
# B beginning of review
# L length of review (by characters)
# b beginning of unit (how many characters in does the highlight begin)
# l length of unit (by characters)
# categories assigned by c or k
# annotators identified by i or j
# sections identified by g or h (for annotators i and j respectively)
# gap or section defined by v (0 or 1 respectively)

In [2]:
# Phrases that have not been highlighted must be left as empty cells in Excel (do not enter 0).
# The same applies to the "Category" cell corresponding to each such phrase.

In [1]:
def krippendorff_alpha_u(xlsx_file_url, sheet_name):
    """Run the whole agreement pipeline for one sheet of an annotation workbook.

    The sheet is loaded with ``store_annotation_data_in_dictionary``, the
    observed and expected disagreements are derived with
    ``calculate_obsereved_and_expected_disagreement``, and the result is
    converted to alpha values with ``calculate_alpha``.

    Parameters
    ----------
    xlsx_file_url : location of the Excel workbook (forwarded unchanged).
    sheet_name : name of the sheet that holds the annotations.

    Returns
    -------
    Whatever ``calculate_alpha`` returns: the pair
    ``(alpha_category, alpha_combined)``.
    """
    annotation_data, n_categories, review_lengths = store_annotation_data_in_dictionary(
        xlsx_file_url, sheet_name
    )
    observed, expected = calculate_obsereved_and_expected_disagreement(
        annotation_data, n_categories, review_lengths
    )
    return calculate_alpha(observed, expected)

In [2]:
# Text-normalisation tooling (NLTK) shared by the cleaning code in this notebook.
import nltk
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
# Stemmer applied to reviews and phrases as the last cleaning step.
porter_stemmer = PorterStemmer()
from nltk.stem.snowball import SnowballStemmer
# NOTE(review): snowball_stemmer (and word_tokenize above) are not referenced by
# the cells visible here — confirm they are unused elsewhere before removing.
snowball_stemmer = SnowballStemmer("english")
# English stop words removed from reviews and phrases.
# NOTE(review): presumably needs the NLTK "stopwords" corpus downloaded — confirm.
stop = stopwords.words('english')
# Whitespace tokenizer and WordNet lemmatizer used by lemmatize_text().
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
    """Lemmatize each whitespace-separated token of ``text``.

    Tokens come from the module-level ``w_tokenizer``; each one is passed
    through the module-level ``lemmatizer`` and the results are joined back
    together with single spaces.
    """
    lemmas = []
    for token in w_tokenizer.tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

def _normalize_text_column(series):
    """Return ``series`` with the notebook's text clean-up applied to every cell.

    Steps, in order: lowercase, strip surrounding whitespace, drop digits,
    drop punctuation, drop stop words, lemmatize, Porter-stem.
    """
    cleaned = series.str.lower()
    cleaned = cleaned.str.strip()
    # regex=True is required: since pandas 2.0 Series.str.replace treats the
    # pattern as a literal string by default, which would leave digits and
    # punctuation untouched. Raw strings avoid invalid-escape warnings.
    cleaned = cleaned.str.replace(r'\d+', '', regex=True)
    cleaned = cleaned.str.replace(r'[^\w\s]', '', regex=True)
    # str(x) turns missing cells (NaN) into the token 'nan' rather than failing.
    cleaned = cleaned.apply(lambda x: ' '.join([word for word in str(x).split() if word not in (stop)]))
    cleaned = cleaned.apply(lemmatize_text)
    # stem every non-empty token and re-join with single spaces
    cleaned = cleaned.apply(lambda x: ' '.join(porter_stemmer.stem(y) for y in x.split(' ') if y))
    return cleaned


def store_annotation_data_in_dictionary(xlsx_file_url, sheet_name):
    """Load an annotation sheet and convert it to per-category section vectors.

    The sheet is expected to provide the columns ``ID``, ``Review`` and, for
    i = 1..n, ``Phrase<i>`` / ``Category<i>`` (these are the columns the code
    reads). Reviews and phrases are normalised with the same clean-up chain so
    that a phrase can be located inside its review with ``str.find``.

    For every category and every row whose ``Category<i>`` equals that
    category, three parallel lists are stored under
    ``annotation_data[category]['review<ID>']['coder<k>']``:

    * ``'b'`` - section start offsets (characters), beginning with 0,
    * ``'l'`` - section lengths,
    * ``'v'`` - 0 for a gap, 1 for a highlighted unit.

    The keys are inserted in the order b, l, v; the disagreement calculation
    iterates them positionally and relies on that order.

    Parameters
    ----------
    xlsx_file_url : path or URL handed to ``pandas.read_excel``.
    sheet_name : sheet to read.

    Returns
    -------
    tuple ``(annotation_data, n_categories, L_dict)`` where ``annotation_data``
    is a list indexed by category, ``n_categories`` is the largest category
    value found plus one, and ``L_dict`` maps ``'review<ID>'`` to the length
    of the cleaned review text.
    """
    #### read data into data frame
    import pandas as pd
    df = pd.read_excel(xlsx_file_url, sheet_name=sheet_name)
    df['Review'] = _normalize_text_column(df['Review'])

    # B: offset at which every review begins
    B = 0

    # determine maximum number of highlights per review
    n_highlights_max = 0
    for col in df.columns:
        if 'Phrase' in col:
            n_highlights_max += 1

    # clean every phrase column and determine the maximum category value
    n_categories = 0
    for i in range(1, n_highlights_max + 1):
        phrase_col = 'Phrase' + str(i)
        category_col = 'Category' + str(i)
        df[phrase_col] = _normalize_text_column(df[phrase_col])

        if df[category_col].isnull().all():
            continue
        n_category = int(df[category_col].max())
        if n_category > n_categories:
            n_categories = n_category
    # categories are used as 0-based indices, so the count is max + 1
    n_categories += 1
    print('number_of_categories: ' + str(n_categories))

    annotation_data = [None]*n_categories
    L_dict = {}

    # populate dictionary with data from Excel file
    for category in range(0, n_categories):
        annotation_data[category] = {}
        n_coder = 0
        for index, row in df.iterrows():
            # skip row if it does not contain an annotation
            if row['Phrase1'] == '0' or row['Phrase1'] == 0 or row['Phrase1'] == '':
                continue

            # add review to dictionary; a repeated ID is treated as the next coder
            # NOTE(review): n_coder continues from the previous row, so this
            # presumably expects the rows of one review to be adjacent - confirm.
            review_key = 'review' + str(row.ID)
            if review_key in annotation_data[category].keys():
                n_coder += 1
            else:
                n_coder = 0
                annotation_data[category][review_key] = {}
            annotations = []

            # determine number of annotations
            for i in range(1, n_highlights_max + 1):
                phrase_col = 'Phrase' + str(i)
                # remove space at the end of a phrase if present
                if str(row[phrase_col]).endswith(' '):
                    row[phrase_col] = str(row[phrase_col])[:-1]
                # standardize 0 format
                elif row[phrase_col] == '0' or row[phrase_col] == '0.0' or row[phrase_col] == 0.0:
                    row[phrase_col] = 0

                if row['Category' + str(i)] == category:
                    annotations.append(row[phrase_col])
                else:
                    annotations.append(0)

            n_highlights = n_highlights_max - annotations.count(0)

            review = row.Review
            L = len(review)
            L_dict[review_key] = L
            b = [B]
            l = []

            # make b vector: start and end offset of each phrase of this category
            # NOTE(review): str.find returns -1 when the cleaned phrase is not
            # found in the cleaned review; that case is not handled here.
            for i in range(1, n_highlights_max + 1):
                if row['Category' + str(i)] == category:
                    phrase = str(row['Phrase' + str(i)])
                    start = review.find(phrase)
                    b += [start, start + len(phrase)]

            # make l vector: distance between consecutive boundaries, then the tail
            for i in range(len(b) - 1):
                l.append(abs(b[i] - b[i + 1]))
            l.append(L - b[-1])

            # make v vector: gap, unit, gap, unit, ..., gap
            v = ([0, 1] * n_highlights) + [0]

            # no phrase of this category in this row: nothing to store
            if b == [B]:
                continue

            # adjust if there are multiple adjacent highlights for the same category:
            # a section of length 1 is folded, together with the section after
            # it, into the section before it.
            # NOTE(review): for i == 0 the index l[i-1] refers to the last
            # element, and v[i] is not checked - confirm this is intended.
            for j in range(0, l.count(1)):
                if len(b) <= 3:
                    continue
                for i in range(0, len(l)):

                    if i >= len(l)-2:
                        break
                    if l[i] == 1:
                        del b[i+1]
                        del b[i]
                        l[i-1] = l[i-1] + l[i+1] + l[i]
                        del l[i+1]
                        del l[i]
                        del v[i+2]
                        del v[i+1]

            # store data properties in dictionary (insertion order b, l, v matters)
            coder_key = 'coder' + str(n_coder)
            annotation_data[category][review_key][coder_key] = {}
            annotation_data[category][review_key][coder_key]['b'] = b
            annotation_data[category][review_key][coder_key]['l'] = l
            annotation_data[category][review_key][coder_key]['v'] = v

    print('annotation_data: ' + str(annotation_data))
    print('length_of_reviews: ' + str(L_dict))

    return annotation_data, n_categories, L_dict

In [3]:
# make doc and dec into dictionary
def calculate_obsereved_and_expected_disagreement(annotation_data, n_categories, L_dict):
    """Compute observed (Doc) and expected (Dec) disagreement per category and review.

    Parameters
    ----------
    annotation_data : sequence indexed by category; each item maps a review key
        to ``{coder: {...}}`` where every coder dict holds three lists. They are
        read positionally in iteration order as start offsets (b), lengths (l)
        and gap/unit flags (v), so the dict must have been filled in that order.
    n_categories : number of categories to iterate over (0 .. n_categories - 1).
    L_dict : maps a review key to the length L of that review.

    Returns
    -------
    tuple ``(Doc, Dec)``; both map ``'category<k>'`` to ``{review: value}``.
    Reviews with fewer than two coders in a category are left out of both.

    Notes
    -----
    Sections of two coders are compared position by position (same list
    index); when one coder has fewer sections, its last section is reused for
    the remaining positions of the other coder.
    NOTE(review): the Dec expression looks like the expected-disagreement term
    of Krippendorff's alpha for unitizing - confirm against the reference.
    """
    Doc = {}
    Dec = {}
    for category in range(0, n_categories):

        Doc['category' + str(category)] = {}
        Dec['category' + str(category)] = {}
        for review in annotation_data[category]:
            L = L_dict[review]
            # b, l, v pool the sections of all coders of this review (used for Dec);
            # d collects the pairwise difference terms (used for Doc).
            b = []
            l = []
            v = []
            d = []
            for coder in annotation_data[category][review]:
                # i tracks which of the three lists (0: b, 1: l, 2: v) is being read
                i = 0
                # extract annotation properties into lists
                for annotation in annotation_data[category][review][coder]:

                    if i == 0:
                        b0 = annotation_data[category][review][coder][annotation]
                        b += b0
                        i += 1
                        continue
                    if i == 1:
                        l0 = annotation_data[category][review][coder][annotation]
                        l += l0
                        i += 1
                        continue
                    if i == 2:
                        v0 = annotation_data[category][review][coder][annotation]
                        v += v0
                        i = 0

                        # loop through other annotations
                        # (the coder itself is included; identical vectors contribute 0)
                        for other_coder in annotation_data[category][review]:
                            for other_annotation in annotation_data[category][review][other_coder]:
                                if i == 0:
                                    b1 = annotation_data[category][review][other_coder][other_annotation]
                                    i += 1
                                    continue
                                if i == 1:
                                    l1 = annotation_data[category][review][other_coder][other_annotation]
                                    i += 1
                                    continue
                                if i == 2:
                                    v1 = annotation_data[category][review][other_coder][other_annotation]
                                    i = 0

                                    if b1 == b0 and l1 == l0 and v1 == v0:
                                        d.append(0)
                                        continue

                                    # calculate difference d between two annotations
                                    # *_limit: last valid index of each coder's section lists
                                    b0_limit = len(b0)-1
                                    b1_limit = len(b1)-1
                                    #d = []
                                    # NOTE(review): vec is built but not read afterwards in this function
                                    vec = []
                                    vec.append(b0)
                                    vec.append(b1)

                                    # iterate over the longer section list; once the shorter
                                    # list runs out, keep comparing against its last section
                                    if len(b0)>len(b1):
                                        for index, x in enumerate(b0):
                                            if index >= b1_limit:
                                                # if both are units
                                                if v1[b1_limit] == 1 and v0[index] == 1:
                                                    # if there is no overlap
                                                    if (b0[index] + l0[index]) < b1[b1_limit] or b1[b1_limit] + l1[b1_limit]<b0[index]:
                                                        d.append(l0[index]**2 + l1[b1_limit]**2)
                                                    # if there is overlap
                                                    if (b0[index] <= b1[b1_limit] + l1[b1_limit] <= b0[index] + l0[index]) or (b1[b1_limit] <= b0[index] + l0[index] <= b1[b1_limit] + l1[b1_limit]):
                                                        d.append((b0[index]-b1[b1_limit])**2+(b0[index]+l0[index]-b1[b1_limit]-l1[b1_limit])**2)
                                                # unit vs. gap: squared length of the unit
                                                elif v1[b1_limit] == 1 and v0[index] == 0:
                                                    d.append(l1[b1_limit]**2)
                                                elif v1[b1_limit] == 0 and v0[index] == 1:
                                                    d.append(l0[index]**2)
                                                # gap vs. gap: no disagreement
                                                else:
                                                    d.append(0)
                                            else:
                                                # if both are units
                                                if v1[index] == 1 and v0[index] == 1:
                                                    # if there is no overlap
                                                    if (b0[index] + l0[index]) < b1[index] or b1[index] + l1[index]<b0[index]:
                                                        d.append(l0[index]**2 + l1[index]**2)
                                                    # if there is overlap
                                                    if (b0[index] <= b1[index] + l1[index] <= b0[index] + l0[index]) or (b1[index] <= b0[index] + l0[index] <= b1[index] + l1[index]):
                                                        d.append((b0[index]-b1[index])**2+(b0[index]+l0[index]-b1[index]-l1[index])**2)
                                                elif v1[index] == 1 and v0[index] == 0:
                                                    d.append(l1[index]**2)
                                                elif v1[index] == 0 and v0[index] == 1:
                                                    d.append(l0[index]**2)
                                                else:
                                                    d.append(0)
                                    else:
                                        # mirror image of the branch above with the roles of the coders swapped
                                        for index, x in enumerate(b1):
                                            if index >= b0_limit:
                                                # if both are units
                                                if v1[index] == 1 and v0[b0_limit] == 1:
                                                    # if there is no overlap
                                                    if (b0[b0_limit] + l0[b0_limit]) < b1[index] or b1[index] + l1[index]<b0[b0_limit]:
                                                        d.append(l0[b0_limit]**2 + l1[index]**2)
                                                    # if there is overlap
                                                    if (b0[b0_limit] <= b1[index] + l1[index] <= b0[b0_limit] + l0[b0_limit]) or (b1[index] <= b0[b0_limit] + l0[b0_limit] <= b1[index] + l1[index]):
                                                        d.append((b0[b0_limit]-b1[index])**2+(b0[b0_limit]+l0[b0_limit]-b1[index]-l1[index])**2)
                                                elif v1[index] == 1 and v0[b0_limit] == 0:
                                                    d.append(l1[index]**2)
                                                elif v1[index] == 0 and v0[b0_limit] == 1:
                                                    d.append(l0[b0_limit]**2)
                                                else:
                                                    d.append(0)
                                            else:
                                                # if both are units
                                                if v1[index] == 1 and v0[index] == 1:
                                                    # if there is no overlap
                                                    if (b0[index] + l0[index]) < b1[index] or b1[index] + l1[index]  < b0[index]:
                                                        d.append(l0[index]**2 + l1[index]**2)
                                                    # if there is overlap
                                                    if (b0[index] <= b1[index] + l1[index] <= b0[index] + l0[index]) or (b1[index] <= b0[index] + l0[index] <= b1[index] + l1[index]):
                                                        d.append((b0[index]-b1[index])**2+(b0[index]+l0[index]-b1[index]-l1[index])**2)
                                                elif v1[index] == 1 and v0[index] == 0:
                                                    d.append(l1[index]**2)
                                                elif v1[index] == 0 and v0[index] == 1:
                                                    d.append(l0[index]**2)
                                                else:
                                                    d.append(0)



            # append doc for each review
            # m: number of coders that have sections stored for this review/category
            m = len(annotation_data[category][review])

            # agreement needs at least two coders
            if m == 1 or m == 0:
                continue

            # normalise the summed differences by the number of ordered coder pairs and L squared
            Doc['category' + str(category)][review] = (sum(d)/(m*(m-1)*L**2))

            # Calculate Dec
            # n: total number of units (v == 1) pooled over all coders
            n=v.count(1)
            numerator = 0
            second_term_d=0
            second_term_n = 0        
            for index, x in enumerate(b):
                if v[index] == 1:
                    second_term_d += l[index]*(l[index]-1)
                    first_term_n = ((n-1)/3)*((2*(l[index]**3))-(3*l[index]**2)+l[index])
                    # sum over every pooled gap that is at least as long as this unit
                    second_term_n = 0
                    for index_1, x_1 in enumerate(b):
                        if l[index_1] >= l[index] and v[index_1] ==0:
                            second_term_n += (l[index_1]-l[index]+1)
                    numerator += first_term_n + (l[index]**2)*second_term_n


            numerator = (2/L)*numerator
            first_term_d = m*L*(m*L-1)
            denominator = first_term_d - second_term_d

            # append dec for each review
            Dec['category' + str(category)][review] = (numerator / denominator)

    print('Doc_by_category: ' + str(Doc))
    print('Dec_by_category: ' + str(Dec))
    
    return Doc, Dec

In [4]:
def calculate_alpha(Doc, Dec):
    """Turn observed and expected disagreement into alpha agreement values.

    Parameters
    ----------
    Doc : dict mapping category -> {review: observed disagreement}.
    Dec : dict with the same keys as ``Doc`` holding the expected disagreement.

    Returns
    -------
    tuple ``(alpha_category, alpha_combined)``:

    * ``alpha_category[review][category] = 1 - Doc / Dec`` for that pair,
    * ``alpha_combined[review] = 1 - sum(Doc) / sum(Dec)`` with the sums taken
      over all categories in which the review appears.

    When the expected disagreement is 0 the ratio is undefined; the alpha is
    reported as NaN instead of raising ``ZeroDivisionError``.
    """
    def _alpha(observed, expected):
        # alpha is undefined without expected disagreement
        if expected == 0:
            return float('nan')
        return 1 - observed / expected

    ## alpha for each category
    alpha_category = {}
    for category in Doc:
        for review in Doc[category]:
            alpha_category.setdefault(review, {})[category] = _alpha(
                Doc[category][review], Dec[category][review]
            )
    print('alpha_by_category: ' + str(alpha_category))

    ## combined alpha: pool Doc and Dec over categories before taking the ratio
    sum_Doc = {}
    sum_Dec = {}
    for category in Doc:
        for review in Doc[category]:
            sum_Doc[review] = sum_Doc.get(review, 0) + Doc[category][review]
            sum_Dec[review] = sum_Dec.get(review, 0) + Dec[category][review]

    alpha_combined = {}
    for review in sum_Doc:
        alpha_combined[review] = _alpha(sum_Doc[review], sum_Dec[review])
    print('alpha_combined: ' + str(alpha_combined))

    return alpha_category, alpha_combined

In [None]:
from matplotlib import pyplot as plt
import statistics

# Sustainability dimension to analyse: Social, Environmental, or Economic
sustainability = 'Economic'

# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
workbook_path = "/Users/ndehaibi/Desktop/Research/Study 2/IRR/IRR Test (" + str(sustainability) + ").xlsx"
output = krippendorff_alpha_u(workbook_path, 'IRR')

# output[1] is the combined alpha per review; only the values are plotted
alpha_combined = list(output[1].values())

# histogram of the combined agreement values
bins = 10
arr = plt.hist(alpha_combined, bins)
plt.xlabel('Intercoder Agreement')
plt.ylabel('Review Count')
plt.title(str(sustainability) + ' Sustainability - Letters with NLP')

# write each bin's count slightly above its bar (arr[0]: counts, arr[1]: bin edges)
for i in range(bins):
    bar_count = arr[0][i]
    plt.text(arr[1][i], bar_count + 1.2, str(bar_count))

# summary statistics
print(len(alpha_combined))
print('mean: ' + str(statistics.mean(alpha_combined)))
print('std dev: ' + str(statistics.stdev(alpha_combined)))