In [1]:
import pandas as pd
import numpy as np
import math
import time
import copy
from tqdm import tqdm
from pathlib import Path
import matplotlib.pyplot as plt
import matplotlib.lines as lines
import networkx as nx
import networkx.algorithms.community as nxcom
import collections
import community as community_louvain
import krippendorff
import seaborn as sns
import statistics

import helper.visualization as viz
from helper.cluster import *

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### Parameters

In [3]:
min_size_group = 250

size_leave_out = 100000
ratio = 0.119

selected_number = 3
number_of_groups = 7

# for reproducability of teh paper's results
random_states = {
  "heuristic6": 39,
  "heuristic7": 24,
  "agreement6": 10,
  "agreement7": 21,
  "cohen6": 25,
  "cohen7": 16,
  "krippendorff6": 6,
  "krippendorff7": 41
}

# Paths
path_comments = "00_data/attack_annotated_comments.tsv"
path_annotations = "00_data/attack_annotations.tsv"
path_workers = "00_data/attack_worker_demographics.tsv"
path_graph = "./01_processed_data/200903_run_004/200903_00_graph_"
path_store_raw = "./01_processed_data/200903_run_004/datasplits/200903_louvian"
path_results = "./03_results/200903_run_004/200903_louvian_"

# Distance functions
types = ['agreement','cohen','krippendorff','heuristic']
#types = ['krippendorff']
types_print_names = ['Agreement Rate','Cohen\'s Kappa','Krippendorff\'s Alpha','Heuristic Function']

### Detect communities and split data data

In [4]:
table_latex = [
    ['Size of training set/test set'],
    ['Number of identified groups'],
    ['Number of selected groups'],
    ['AVG(annotators/groups)'],
    ['SD(annotators/groups)'],
]

for i in range(0,len(types)):
    selected_type  = types[i]
    selected_print_name = types_print_names[i]
    path_store = path_store_raw + "_" + selected_type
    random_state = random_states[selected_type+str(number_of_groups)] 
    
    print("#"*70)
    print("###",selected_print_name)
    print("#"*70)
    print("\n")    

    
    print("Random state:", random_state)
    
    G = nx.read_gexf(path_graph + selected_type + ".gexf")
    df_annotations = pd.read_csv(Path(path_annotations), sep="\t", encoding="utf-8")
    print("Number of annotations:\t", len(df_annotations))

    df_comments = pd.read_csv(Path(path_comments), sep="\t", encoding="utf-8")
    print("Number of comments:\t", len(df_comments))
    print("\n")
    
    
    print("-"*70)
    print("---","Community detection")
    print("-"*70)

    group_list = getCommunityGroups(G,method="louvian",random_state=random_state)

    # Number of identified groups
    table_latex[1].append('{:,.0f}'.format(len(group_list)))
    
    df_stats = getExtendedStatsOfGroups(group_list, df_annotations).sort_values(by=['#Annotators'],ascending=False)
    df_stats
    print("\n")


    print("-"*70)
    print("---","Remove small communities")
    print("-"*70)

    group_list = removeSmallGroup(group_list,min_size_group)

    group_list.insert(0,[])
    for i in range(0,len(group_list)):
        print("Group", i,"and size of",len(group_list[i]))
    print("\n")
    
    # avg and sd of group size
    sizes_groups = []
    for group in group_list:
        sizes_groups.append(len(group))
    table_latex[3].append('{:,.2f}'.format(statistics.mean(sizes_groups))) 
    table_latex[4].append('{:,.2f}'.format(np.std(sizes_groups)))    
    
    # Number of selected groups
    table_latex[2].append('{:,.0f}'.format(len(group_list)-1))


    print("-"*70)
    print("---","Create data sets for each community (leave-out and single only)")
    print("-"*70)

    group_dfs_each,group_dfs_leave_out = getGroupSpecificDataSlices(group_list,df_annotations,df_comments)
    print("Length group_dfs_each:",len(group_dfs_each))
    print("Length group_dfs_leave_out:",len(group_dfs_leave_out))
    print("\n")


    print("-"*70)
    print("---","Identify shared comments")
    print("-"*70)

    shared_comments = getSharedComments(group_dfs_each)
    print("Number of shared comments between all groups:\t\t\t",len(shared_comments))

    shared_comments_leave_out = getSharedComments(group_dfs_leave_out)
    print("Number of shared comments leave out between all groups:\t\t",len(shared_comments_leave_out))

    not_contained_comments= getNotContainedComments(group_dfs_each,df_comments)
    print("Number of comments that are not contained in all groups:\t",len(not_contained_comments))
    print("\n")


    print("-"*70)
    print("---","Size of selected groups")
    print("-"*70)

    for group in group_dfs_each:
        print(len(group))
    print("\n")

    # Number of selected groups
    table_latex[0].append('{:,.0f} / {:,.0f} '.format(len(shared_comments)*0.8,len(shared_comments)*0.2))                            
                            
    res1,res2,res3 = getInterraterReliability(group_dfs_each,shared_comments,group_list,df_annotations,selected_print_name,selected_type,path_results)

    storeFullySharedOnlyDataSlices(group_dfs_each,shared_comments,path_store,random_state)
    #storeFullySharedLeaveOutDataSlices(group_dfs_leave_out,shared_comments,path_store,random_state)    
    #storeTestSharedOnlyDataSlices(group_dfs_each,shared_comments,path_store,random_state)
    #storeTestSharedLeaveOutDataSlices(group_dfs_leave_out,shared_comments_leave_out,size_leave_out,path_store,random_state)

df_latex_table =  pd.DataFrame(table_latex, columns=['Weight function','Agreement Rate','Cohen\'s Kappa','Krippendorff\'s Alpha','Heuristic Function'])


######################################################################
### Agreement Rate
######################################################################


Random state: 21
Number of annotations:	 1365217
Number of comments:	 115864


----------------------------------------------------------------------
--- Community detection
----------------------------------------------------------------------


Unnamed: 0,#Annotators,#Annotations,#Comments
3,868,301189,106936
2,835,276759,104471
4,783,265077,103904
1,772,253033,102465
0,452,156239,85050
5,343,112920,71426




----------------------------------------------------------------------
--- Remove small communities
----------------------------------------------------------------------
Group 0 and size of 0
Group 1 and size of 452
Group 2 and size of 772
Group 3 and size of 835
Group 4 and size of 868
Group 5 and size of 783
Group 6 and size of 343


----------------------------------------------------------------------
--- Create data sets for each community (leave-out and single only)
----------------------------------------------------------------------
Length group_dfs_each: 7
Length group_dfs_leave_out: 7


----------------------------------------------------------------------
--- Identify shared comments
----------------------------------------------------------------------
Number of shared comments between all groups:			 32348
Number of shared comments leave out between all groups:		 115862
Number of comments that are not contained in all groups:	 0


---------------------------------------

Unnamed: 0,#Annotators,#Annotations,#Comments
12,536,180618,89862
1,471,155775,83859
11,426,141981,80570
8,412,142012,80494
6,318,102944,66677
4,301,95870,65087
5,278,88991,61535
0,274,88086,60393
7,271,90303,62811
3,256,93392,62229




----------------------------------------------------------------------
--- Remove small communities
----------------------------------------------------------------------
Group 0 and size of 0
Group 1 and size of 274
Group 2 and size of 471
Group 3 and size of 256
Group 4 and size of 301
Group 5 and size of 278
Group 6 and size of 318
Group 7 and size of 271
Group 8 and size of 412
Group 9 and size of 426
Group 10 and size of 536


----------------------------------------------------------------------
--- Create data sets for each community (leave-out and single only)
----------------------------------------------------------------------
Length group_dfs_each: 11
Length group_dfs_leave_out: 11


----------------------------------------------------------------------
--- Identify shared comments
----------------------------------------------------------------------
Number of shared comments between all groups:			 2868
Number of shared comments leave out between all groups:		 115864
Num

Unnamed: 0,#Annotators,#Annotations,#Comments
4,738,257878,102862
0,557,182588,90603
3,495,164109,87140
7,466,150878,82890
8,428,132231,77986
9,423,143420,80780
5,303,101096,66293
10,302,115166,72223
2,113,37468,31739
11,97,33235,29232




----------------------------------------------------------------------
--- Remove small communities
----------------------------------------------------------------------
Group 0 and size of 0
Group 1 and size of 557
Group 2 and size of 495
Group 3 and size of 738
Group 4 and size of 303
Group 5 and size of 466
Group 6 and size of 428
Group 7 and size of 423
Group 8 and size of 302


----------------------------------------------------------------------
--- Create data sets for each community (leave-out and single only)
----------------------------------------------------------------------
Length group_dfs_each: 9
Length group_dfs_leave_out: 9


----------------------------------------------------------------------
--- Identify shared comments
----------------------------------------------------------------------
Number of shared comments between all groups:			 9615
Number of shared comments leave out between all groups:		 115864
Number of comments that are not contained in all group

Unnamed: 0,#Annotators,#Annotations,#Comments
3,1099,371653,111297
2,1046,352544,110352
1,748,245672,101744
4,616,206322,96029
0,544,189026,92939




----------------------------------------------------------------------
--- Remove small communities
----------------------------------------------------------------------
Group 0 and size of 0
Group 1 and size of 544
Group 2 and size of 748
Group 3 and size of 1046
Group 4 and size of 1099
Group 5 and size of 616


----------------------------------------------------------------------
--- Create data sets for each community (leave-out and single only)
----------------------------------------------------------------------
Length group_dfs_each: 6
Length group_dfs_leave_out: 6


----------------------------------------------------------------------
--- Identify shared comments
----------------------------------------------------------------------
Number of shared comments between all groups:			 59432
Number of shared comments leave out between all groups:		 115864
Number of comments that are not contained in all groups:	 0


-------------------------------------------------------------

In [5]:
print(df_latex_table.to_latex(index=False,bold_rows=True))

\begin{tabular}{lllll}
\toprule
               Weight function &   Agreement Rate & Cohen's Kappa & Krippendorff's Alpha & Heuristic Function \\
\midrule
 Size of training set/test set &  25,878 / 6,470  &  2,294 / 574  &       7,692 / 1,923  &   47,546 / 11,886  \\
   Number of identified groups &                6 &            14 &                   12 &                  5 \\
     Number of selected groups &                6 &            10 &                    8 &                  5 \\
        AVG(annotators/groups) &           579.00 &        322.09 &               412.44 &             675.50 \\
         SD(annotators/groups) &           301.19 &        135.60 &               191.84 &             364.92 \\
\bottomrule
\end{tabular}

