In [3]:
import csv
import collections

In [4]:
def classify_editor(bias, param):
    """Map a numeric bias score onto a one-letter class label.

    Args:
        bias: Numeric bias score to classify.
        param: Threshold defining the band around zero that counts as 'n'.

    Returns:
        'i' when ``bias`` is strictly greater than ``param``, 'p' when it is
        strictly less than ``-param``, and 'n' otherwise (boundary values
        included).
    """
    # NOTE(review): 'i' / 'p' presumably name the two sides of the conflict
    # being studied and 'n' means neutral -- confirm with the author.
    if bias > param:
        return 'i'
    if bias < -param:
        return 'p'
    return 'n'

In [16]:
# Article editor bias: print columns 5, 9 and 12 of every row belonging to
# one editor (article title, timestamp and a bias score, judging by the
# recorded output of this cell).
csvpath = '/home/michael/school/cprose_research/wp/wp_articles/IPC_userBiases.csv'
target_user = 'Harlan wilkerson'

with open(csvpath, 'r') as csvfile:
    records = csv.reader(csvfile)
    next(records)  # the first row is skipped (treated as a header)
    for record in records:
        # Column 1 holds the username; ignore everybody else.
        if record[1] != target_user:
            continue
        print(record[5], record[9], record[12])

United Nations Partition Plan for Palestine 2006-09-16 08:37:26 0.8586607649920523
Deir Yassin massacre 2008-07-04 17:18:56 -1.124294587040307
Deir Yassin massacre 2008-07-04 17:33:55 -0.01599666110435649
United Nations Special Committee on Palestine 2008-07-08 20:07:22 0.7638555033080081
McMahon–Hussein Correspondence 2008-07-09 19:21:16 2.2514427188962998
McMahon–Hussein Correspondence 2008-07-09 19:26:09 -1.2574439747544517
McMahon–Hussein Correspondence 2008-07-09 19:27:23 0.027506050473441235
Balfour Declaration 2008-07-09 19:55:33 -11.571101012458257
Balfour Declaration 2008-07-09 19:57:55 -0.2724029758365134
Balfour Declaration 2008-07-09 19:59:48 0.06711130913496532
McMahon–Hussein Correspondence 2008-07-09 22:32:11 -10.173339685212255
McMahon–Hussein Correspondence 2008-07-09 22:33:49 -0.0468857959608755
McMahon–Hussein Correspondence 2008-07-09 22:44:22 -2.0223611937394423
McMahon–Hussein Correspondence 2008-07-09 22:46:43 0.007300058070917714
McMahon–Hussein Correspondence 2

In [6]:
# Accumulate, per username, the sum of the talk-page bias values found in
# the talk-page bias CSV (username in column 3, bias in column 5).
csvpath = '/home/michael/school/cprose_research/wp/wp-talk/bias_output/talkpages.csv_biases_50_10.csv'
talk_biases = {}

with open(csvpath, 'r') as csvfile:
    records = csv.reader(csvfile, delimiter=',')
    next(records)  # discard the first row

    for record in records:
        editor, score = record[3], float(record[5])
        if editor in talk_biases:
            talk_biases[editor] += score
        else:
            # First time this editor is seen: start the running total.
            talk_biases[editor] = score

print(len(talk_biases)) # Should be 6461

6461


In [3]:
# Fetch combined editor biases from big CSV.
# Resulting layout: name -> [sum_bias, avg_bias] or
# [sum_bias, avg_bias, talk_bias], depending on how many columns the row has.
combined_csvpath = '/home/michael/school/cprose_research/wp/ipc_editor_bias.csv'
combined_biases = {}

# newline='' is what the csv module documentation asks for on files handed
# to csv.reader, so quoted fields with embedded newlines parse correctly.
with open(combined_csvpath, 'r', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)  # skip the header row
    for row in reader:
        name = row[0]
        raw_values = [row[1], row[2]]
        if len(row) > 3:
            raw_values.append(row[3])
        # csv.reader yields strings. Convert non-empty cells to float so the
        # loaded values can be compared numerically (e.g. by classify_editor),
        # and keep "" as the marker for a missing value, which downstream
        # cells test for with `== ""`.
        combined_biases[name] = [
            float(value) if value != "" else "" for value in raw_values
        ]

len(combined_biases)

346817

In [5]:
# Build dictionary of avg article editor bias.
# For each username (column 1) the column-2 value of the FIRST row seen is
# stored; later rows for the same user are ignored.
# NOTE(review): presumably column 2 already holds a per-editor average that
# repeats on every row of that editor -- confirm against the CSV schema.
article_avg_biases = {}
csvpath = '/home/michael/school/cprose_research/wp/wp_articles/IPC_userBiases.csv'

with open(csvpath, 'r') as csvfile:
    records = csv.reader(csvfile)
    next(records)  # Skip first row
    for record in records:
        editor = record[1]
        if editor in article_avg_biases:
            continue
        article_avg_biases[editor] = float(record[2])

len(article_avg_biases)

345676

In [8]:
# Build dictionary of sum article editor bias.
# Adds up the column-2 value of every row belonging to each username
# (column 1).
# NOTE(review): this sums the same column (2) that the "avg" dictionary
# reads -- verify that this is the intended per-edit value.
article_sum_biases = {}
csvpath = '/home/michael/school/cprose_research/wp/wp_articles/IPC_userBiases.csv'

with open(csvpath, 'r') as csvfile:
    records = csv.reader(csvfile)
    next(records)  # Skip first row
    for record in records:
        editor, value = record[1], float(record[2])
        if editor in article_sum_biases:
            article_sum_biases[editor] += value
        else:
            # First row for this editor starts the running total.
            article_sum_biases[editor] = value

len(article_sum_biases)

345676

In [17]:
# Combine dictionaries for all editors seen
# editor: [sum, avg, talk_page]
#   - article-only editors get a two-element list [sum, avg]
#   - talk-only editors get ["", "", talk_page] ("" marks a missing value)
combined_biases = {}

# From article edits
for editor in article_sum_biases:
    combined_biases[editor] = [article_sum_biases[editor], article_avg_biases[editor]]

# From talk pages: extend existing entries, or create a padded one
for editor in talk_biases:
    if editor in combined_biases:
        combined_biases[editor].append(talk_biases[editor])
    else:
        combined_biases[editor] = ["", "", talk_biases[editor]]

# Remove anonymous editor (empty username). pop() with a default is used
# instead of `del` so the cell does not raise KeyError when the inputs
# contain no anonymous entry.
combined_biases.pop('', None)

len(combined_biases) # should be 346817

346817

In [27]:
# Create CSV with all editor biases.
# One row per editor: username followed by whatever values are stored for
# that editor in combined_biases (rows may therefore have 3 or 4 columns).
combined_csvpath = '/home/michael/school/cprose_research/wp/ipc_editor_bias.csv'

# newline='' is required by the csv module documentation for files passed to
# csv.writer; without it, platforms that translate '\n' emit '\r\r\n'.
with open(combined_csvpath, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerow(['username', 'article_sum_bias', 'article_avg_bias','talk_page_bias'])
    writer.writerows(
        [editor] + list(biases) for editor, biases in combined_biases.items()
    )

In [41]:
# Check consistency of talk page bias model application v article.
# Only editors that have a third (talk page) entry and a non-empty first
# (article sum) entry are compared.
consistent_editors = []
inconsistent_editors = []
param = 10

for editor, biases in combined_biases.items():
    if len(biases) <= 2 or biases[0] == "":
        continue

    article_class = classify_editor(biases[0], param)
    talk_class = classify_editor(biases[2], param)

    # Handling neutral classification: an 'n' on either side never counts
    # as a disagreement.
    agrees = 'n' in (article_class, talk_class) or article_class == talk_class
    if agrees:
        consistent_editors.append(editor)
    else:
        inconsistent_editors.append(editor)

print("Consistent:", len(consistent_editors))
print("Inconsistent:", len(inconsistent_editors))
print("Total:", len(consistent_editors) + len(inconsistent_editors))

Consistent: 4466
Inconsistent: 853
Total: 5319


In [51]:
# Create mappings from editor to editor ID.
# IDs are 1-based and follow the iteration order of editor_biases.
# NOTE(review): editor_biases is not defined in any cell visible here --
# presumably it comes from an earlier/deleted cell; confirm before re-running.

editor_ids = collections.OrderedDict(
    (name, position)
    for position, name in enumerate(editor_biases, start=1)
)

In [52]:
# Build test alignment matrix for Yu+ 2015.
# Each row is [editor_id, a_1, ..., a_n] for the first num_editors editors,
# where a_j is:
#    1  if this editor and editor j fall in the same non-'n' class,
#   -1  if they fall in opposite non-'n' classes,
#   ''  if either of them is classified 'n'.
alignment_matrix = []
threshold = 0
num_editors = 10

# Materialise the sample and its classifications once, instead of rebuilding
# list(editor_biases) and re-classifying inside the nested loops. Slicing
# also means fewer than num_editors available editors yields a smaller
# matrix rather than an IndexError.
sample_editors = list(editor_biases)[:num_editors]
sample_classes = [
    classify_editor(editor_biases[name], threshold) for name in sample_editors
]

for editor, editor_bias in zip(sample_editors, sample_classes):
    editor_vec = [editor_ids[editor]]  # first column: editor id
    for other_editor_bias in sample_classes:
        if editor_bias == 'n' or other_editor_bias == 'n':
            editor_vec.append('')
        elif editor_bias == other_editor_bias:
            editor_vec.append(1)
        else:
            editor_vec.append(-1)

    alignment_matrix.append(editor_vec)

alignment_matrix

[[1, 1, -1, 1, 1, 1, -1, -1, -1, 1, -1],
 [2, -1, 1, -1, -1, -1, 1, 1, 1, -1, 1],
 [3, 1, -1, 1, 1, 1, -1, -1, -1, 1, -1],
 [4, 1, -1, 1, 1, 1, -1, -1, -1, 1, -1],
 [5, 1, -1, 1, 1, 1, -1, -1, -1, 1, -1],
 [6, -1, 1, -1, -1, -1, 1, 1, 1, -1, 1],
 [7, -1, 1, -1, -1, -1, 1, 1, 1, -1, 1],
 [8, -1, 1, -1, -1, -1, 1, 1, 1, -1, 1],
 [9, 1, -1, 1, 1, 1, -1, -1, -1, 1, -1],
 [10, -1, 1, -1, -1, -1, 1, 1, 1, -1, 1]]

In [53]:
# Create input file for Yu+ 2015 cluster code:
# one tab-separated line per row of alignment_matrix.
out_filepath = '/home/michael/school/cprose_research/yu+_2015_killer/wp-talk_threshold0/wp-talk_threshold0.txt'
with open(out_filepath, 'w') as out:
    out.writelines(
        '\t'.join(map(str, matrix_row)) + '\n' for matrix_row in alignment_matrix
    )

In [54]:
# Create editor mappings file:
# one line per editor, "<editor id>\t<editor name>\t<editor bias>".
editor_ids_filepath = '/home/michael/school/cprose_research/yu+_2015_killer/wp-talk_editor_mappings.txt'
with open(editor_ids_filepath, 'w') as f:
    # Iterate editor_ids directly rather than indexing its key list with
    # range(len(editor_biases)); the loop no longer relies on the two
    # dictionaries having the same number of entries.
    for editor, editor_id in editor_ids.items():
        editor_bias = editor_biases[editor]
        f.write('\t'.join((str(editor_id), editor, str(editor_bias))) + '\n')