In [1]:
import pandas as pd
import sys, json, re, os, math
from collections import defaultdict, OrderedDict, Counter
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import cohen_kappa_score, make_scorer
from sklearn import cross_validation, svm
from sklearn.linear_model import LogisticRegression, LinearRegression
from scipy.sparse import hstack
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from IPython.core.debugger import Tracer; debug_here = Tracer()

kappa_scorer = make_scorer(cohen_kappa_score)



# Split up into CV folds

In [2]:
# Load features
feats = pd.read_csv('/home/michael/school/research/wp/wikipedia/data/revert_discussion_features.csv')
len(feats)

2073

In [None]:
# Quantiles

# Load, test all features:
* Unigrams
* 'Winning arguments' relationship features
* Extras from Yohan

In [34]:
# Load features
feats = pd.read_csv('/home/michael/school/research/wp/wikipedia/data/revert_discussion_features.csv')

# Get labels
# thresholds = np.arange(0.4, 1.0, 0.1)
# labels = {t: {} for t in thresholds}
# for t in thresholds:
#     labels[t] = feats['score_>{:0.1}'.format(t)].values

len(feats)

2073

In [35]:
feats.columns

Index(['article', 'thread_title', 'editor', 'editor_talk', 'other_talk',
       '#editor_turns', '#other_turns', 'words_in_common',
       'intersection/editor', 'intersection/other', 'jaccard', 'ART::DEF',
       'ART::INDEF', 'AUTH::CREDENTIALS', 'AUTH::EXPERIENTIAL',
       'AUTH::EXTERNAL', 'AUTH::FORUM', 'AUTH::SOCIAL_EXPECTATIONS', 'EXAMPLE',
       'FIRST_TURN', 'LAST_TURN', 'PERS_PRON::PLUR', 'PERS_PRON::SING',
       'QUESTION', 'URL', 'T0', 'T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', 'T8',
       'T9', 'T10', 'T11', 'T12', 'T13', 'T14', 'SENTI::POS', 'SENTI::NEG',
       'editor_score', 'nonbow_pred', 'pred', 'num_users'],
      dtype='object')

In [41]:
feats.columns[5:6] | feats.columns[11:-5]

Index(['#editor_turns', 'ART::DEF', 'ART::INDEF', 'AUTH::CREDENTIALS',
       'AUTH::EXPERIENTIAL', 'AUTH::EXTERNAL', 'AUTH::FORUM',
       'AUTH::SOCIAL_EXPECTATIONS', 'EXAMPLE', 'FIRST_TURN', 'LAST_TURN',
       'PERS_PRON::PLUR', 'PERS_PRON::SING', 'QUESTION', 'SENTI::NEG',
       'SENTI::POS', 'T0', 'T1', 'T10', 'T11', 'T12', 'T13', 'T14', 'T2', 'T3',
       'T4', 'T5', 'T6', 'T7', 'T8', 'T9', 'URL'],
      dtype='object')

In [45]:
# Vectorize input features
feats_v = {}

# Get unigram features
v = CountVectorizer(min_df=1, stop_words='english')
edbow = v.fit_transform(feats['editor_talk'])
print(edbow.shape)

v_other = CountVectorizer(min_df=1, stop_words='english')
other_bow = v_other.fit_transform(feats['other_talk'])
print(other_bow.shape)

# Get exclusive editor non-unigram features
ed_nonbow_d = {}
for col in feats.columns[5:6] | feats.columns[11:-5]:
    ed_nonbow_d[col] = np.array([feats[col]]).T
#     ed_nonbow_d[col] = np.array([(v - min(feats[col]))/max(feats[col]) for v in feats[col].values]).T
ed_nonbow = np.hstack(ed_nonbow_d.values())
print(ed_nonbow.shape)

# Get others' non-unigram features
nonbow_d = {}
for col in feats.columns[6:11]:
    nonbow_d[col] = np.array([feats[col]]).T
#     nonbow_d[col] = np.array([(v - min(feats[col]))/max(feats[col]) for v in feats[col].values]).T
nonbow = np.hstack(nonbow_d.values())
print(nonbow.shape)

# Assemble editor features
edfeats = hstack([edbow, ed_nonbow])

# Assemble non-unigram features
nonbow_f = np.hstack([ed_nonbow, nonbow])

# Assemble all features
feats_v = hstack([edbow, other_bow, ed_nonbow, nonbow])
feats_v.shape

(2073, 24190)
(2073, 26861)
(66336,)
(10365,)


ValueError: blocks[0,:] has incompatible row dimensions

## Train and test editor features

In [37]:
# Train and test linear regression classifier
clf = LinearRegression()

# edpred = cross_validation.cross_val_predict(clf, edfeats, feats['editor_score'], cv=10)
edpred = cross_validation.cross_val_predict(clf, ed_nonbow, feats['editor_score'], cv=10)
edpred = np.array([max(p, 0.0) for p in edpred])
edpred = np.array([min(p, 1.0) for p in edpred])

# The mean square error
print("mse: {:2f}".format(np.mean((edpred - feats['editor_score'].values) ** 2)))

# The root mean square error
print("rmse: {:2f}".format(math.sqrt(np.mean((edpred - feats['editor_score'].values) ** 2))))

mse: 0.131566
rmse: 0.362720


## Try 0.5 baseline

In [43]:
# The mean square error
guess = [0.5]*len(feats['editor_score'])
print("mse: {:2f}".format(np.mean((guess - feats['editor_score'].values) ** 2)))

# The root mean square error
print("rmse: {:2f}".format(math.sqrt(np.mean((guess - feats['editor_score'].values) ** 2))))

mse: 0.139060
rmse: 0.372908


## Train and test non-unigram features

In [38]:
# Train and test linear regression classifier on non-unigram features (continuous editor_score input)
clf = LinearRegression()

nonbow_pred = cross_validation.cross_val_predict(clf, nonbow, feats['editor_score'], cv=10)
nonbow_pred = np.array([max(p, 0.0) for p in nonbow_pred])
nonbow_pred = np.array([min(p, 1.0) for p in nonbow_pred])

# The mean square error
print("mse: {:2f}".format(np.mean((nonbow_pred - feats['editor_score'].values) ** 2)))

# The root mean square error
print("rmse: {:2f}".format(math.sqrt(np.mean((nonbow_pred - feats['editor_score'].values) ** 2))))

mse: 0.131227
rmse: 0.362253


## Train and test editor+other features (all features)

In [51]:
# Train and test linear regression classifier on all features (continuous editor_score input)
clf = LinearRegression()

pred = cross_validation.cross_val_predict(clf, feats_v, feats['editor_score'], cv=10)

bounded_pred = np.array([max(p, 0.0) for p in pred])
bounded_pred = np.array([min(p, 1.0) for p in bounded_pred])

# The mean square error
print("mse: {:2f}".format(np.mean((bounded_pred - feats['editor_score'].values) ** 2)))
print("rmse: {:2f}".format(math.sqrt(np.mean((bounded_pred - feats['editor_score'].values) ** 2))))

mse: 0.242041
rmse: 0.491977


In [39]:
# Save predictions
feats['nonbow_pred'] = nonbow_pred
# feats['pred'] = bounded_pred
feats['edpred'] = edpred

feats.to_csv('/home/michael/school/research/wp/wikipedia/data/revert_discussion_features.csv', index=False)

In [4]:
# Train and test SVM classifier on non-unigram features
nonbow_clf = {}
for t in thresholds:
    print(t)
#     clf = svm.SVC(kernel='linear')
    clf = svm.SVC()

    scores = cross_validation.cross_val_score(clf, nonbow, labels[t], scoring=kappa_scorer)
    print("Kappa: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

    scores = cross_validation.cross_val_score(clf, nonbow, labels[t])
    print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
    
    # Majority class guess
    true_portion = np.count_nonzero(labels[t])/len(labels[t])
    print("Majority class guess:\t", max(true_portion, 1-true_portion))
    
    # Random class guess
    print("Random class guess:\t", true_portion**2 + (1-true_portion)**2)
    print()
    
    nonbow_clf[t] = clf

0.4
Kappa: 0.003 (+/- 0.003)
Accuracy: 0.653 (+/- 0.005)
Majority class guess:	 0.6594370096908168
Random class guess:	 0.5508403201182992

0.5
Kappa: -0.002 (+/- 0.019)
Accuracy: 0.572 (+/- 0.002)
Majority class guess:	 0.5897554222427319
Random class guess:	 0.5161120716439422

0.6
Kappa: -0.020 (+/- 0.070)
Accuracy: 0.513 (+/- 0.033)
Majority class guess:	 0.5366866635902169
Random class guess:	 0.5026918225707635

0.7
Kappa: 0.024 (+/- 0.033)
Accuracy: 0.524 (+/- 0.017)
Majority class guess:	 0.5279187817258884
Random class guess:	 0.5015589167461156

0.8
Kappa: 0.032 (+/- 0.024)
Accuracy: 0.578 (+/- 0.008)
Majority class guess:	 0.5911398246423627
Random class guess:	 0.5166129352716813

0.9
Kappa: 0.038 (+/- 0.011)
Accuracy: 0.671 (+/- 0.005)
Majority class guess:	 0.6797415782187356
Random class guess:	 0.5646140698811237



In [5]:
# Train and test SVM classifier on all features
for t in thresholds:
    print(t)
    clf = svm.SVC()

    scores = cross_validation.cross_val_score(clf, feats_v, labels[t], scoring=kappa_scorer)
    print("Kappa: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

    scores = cross_validation.cross_val_score(clf, feats_v, labels[t])
    print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
    
    # Majority class guess
    true_portion = np.count_nonzero(labels[t])/len(labels[t])
    print("Majority class guess:\t", max(true_portion, 1-true_portion))
    
    # Random class guess
    print("Random class guess:\t", true_portion**2 + (1-true_portion)**2)
    print()

0.4
Kappa: 0.003 (+/- 0.004)
Accuracy: 0.660 (+/- 0.002)
Majority class guess:	 0.6594370096908168
Random class guess:	 0.5508403201182992

0.5
Kappa: 0.001 (+/- 0.015)
Accuracy: 0.587 (+/- 0.012)
Majority class guess:	 0.5897554222427319
Random class guess:	 0.5161120716439422

0.6
Kappa: 0.020 (+/- 0.054)
Accuracy: 0.541 (+/- 0.028)
Majority class guess:	 0.5366866635902169
Random class guess:	 0.5026918225707635

0.7
Kappa: 0.011 (+/- 0.031)
Accuracy: 0.522 (+/- 0.011)
Majority class guess:	 0.5279187817258884
Random class guess:	 0.5015589167461156

0.8
Kappa: 0.000 (+/- 0.000)
Accuracy: 0.591 (+/- 0.001)
Majority class guess:	 0.5911398246423627
Random class guess:	 0.5166129352716813

0.9
Kappa: 0.000 (+/- 0.000)
Accuracy: 0.680 (+/- 0.001)
Majority class guess:	 0.6797415782187356
Random class guess:	 0.5646140698811237



# Merge in new editor scores

In [9]:
# Load old features
feats = pd.read_csv('/home/michael/school/research/wp/wikipedia/data/revert_discussion_features.csv')
len(feats)

2167

In [3]:
# Get editor scores
scores = pd.read_csv('/home/michael/school/research/wp/wikipedia/data/editor_thread_scores.csv')

In [4]:
edthreads = zip(feats['article'], feats['thread_title'], feats['editor'])

In [None]:
edscore_rows = []
for art, t, ed in edthreads:
    score = scores[(scores['article']==art) & (scores['thread_title']==t) & (scores['editor']==ed)].iloc[0]['editor_thread_score']
    edscore_rows.append([art, t, ed, score])

In [7]:
newscores = pd.DataFrame(edscore_rows, columns=['article', 'thread_title', 'editor', 'editor_score'])

In [8]:
newscores

Unnamed: 0,article,thread_title,editor,editor_score
0,1929 Hebron massacre,Restatement,Bless sins,0.000000
1,1929 Hebron massacre,Restatement,Ceedjee,0.800000
2,1929 Hebron massacre,Restatement,GHcool,0.968750
3,1929 Hebron massacre,Restatement,Ian Pitchford,1.000000
4,1929 Hebron massacre,Restatement,Nishidani,0.860870
5,1929 Hebron massacre,Restatement,PalestineRemembered,0.710870
6,1929 Hebron massacre,Restatement,Yahel Guhan,0.500000
7,1929 Hebron massacre,ethnic cleansing cat,Nableezy,1.000000
8,1929 Hebron massacre,false rumors,Nableezy,0.000000
9,1929 Hebron massacre,false rumors,Nishidani,0.996139


In [9]:
newfeats = pd.merge(feats, newscores, on=['article', 'thread_title', 'editor'])
newfeats.columns

Index(['article', 'thread_title', 'editor', 'editor_talk', 'other_talk',
       '#editor_turns', '#other_turns', 'words_in_common',
       'intersection/editor', 'intersection/other', 'jaccard', 'ART::DEF',
       'ART::INDEF', 'AUTH::CREDENTIALS', 'AUTH::EXPERIENTIAL',
       'AUTH::EXTERNAL', 'AUTH::FORUM', 'AUTH::SOCIAL_EXPECTATIONS', 'EXAMPLE',
       'FIRST_TURN', 'LAST_TURN', 'PERS_PRON::PLUR', 'PERS_PRON::SING',
       'QUESTION', 'URL', 'T0', 'T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', 'T8',
       'T9', 'T10', 'T11', 'T12', 'T13', 'T14', 'SENTI::POS', 'SENTI::NEG',
       'editor_score_x', 'score_>0.4', 'score_>0.5', 'score_>0.6',
       'score_>0.7', 'score_>0.8', 'score_>0.9', 'nonbow_pred', 'all_pred',
       'editor_score_y'],
      dtype='object')

In [11]:
newfeats.drop(['editor_score_x'], axis=1, inplace=True)
newfeats.columns

Index(['article', 'thread_title', 'editor', 'editor_talk', 'other_talk',
       '#editor_turns', '#other_turns', 'words_in_common',
       'intersection/editor', 'intersection/other', 'jaccard', 'ART::DEF',
       'ART::INDEF', 'AUTH::CREDENTIALS', 'AUTH::EXPERIENTIAL',
       'AUTH::EXTERNAL', 'AUTH::FORUM', 'AUTH::SOCIAL_EXPECTATIONS', 'EXAMPLE',
       'FIRST_TURN', 'LAST_TURN', 'PERS_PRON::PLUR', 'PERS_PRON::SING',
       'QUESTION', 'URL', 'T0', 'T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', 'T8',
       'T9', 'T10', 'T11', 'T12', 'T13', 'T14', 'SENTI::POS', 'SENTI::NEG',
       'score_>0.4', 'score_>0.5', 'score_>0.6', 'score_>0.7', 'score_>0.8',
       'score_>0.9', 'nonbow_pred', 'all_pred', 'editor_score_y'],
      dtype='object')

In [13]:
newfeats.rename(columns={'editor_score_y': 'editor_score'}, inplace=True)

In [14]:
thresholds = np.arange(0.4, 1, 0.1)
thresholds

array([ 0.4,  0.5,  0.6,  0.7,  0.8,  0.9])

In [19]:
colnames = ['score_>{:0.1f}'.format(t) for t in thresholds]
colnames

['score_>0.4',
 'score_>0.5',
 'score_>0.6',
 'score_>0.7',
 'score_>0.8',
 'score_>0.9']

In [21]:
newfeats.drop(colnames, axis=1, inplace=True)
newfeats.columns

Index(['article', 'thread_title', 'editor', 'editor_talk', 'other_talk',
       '#editor_turns', '#other_turns', 'words_in_common',
       'intersection/editor', 'intersection/other', 'jaccard', 'ART::DEF',
       'ART::INDEF', 'AUTH::CREDENTIALS', 'AUTH::EXPERIENTIAL',
       'AUTH::EXTERNAL', 'AUTH::FORUM', 'AUTH::SOCIAL_EXPECTATIONS', 'EXAMPLE',
       'FIRST_TURN', 'LAST_TURN', 'PERS_PRON::PLUR', 'PERS_PRON::SING',
       'QUESTION', 'URL', 'T0', 'T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', 'T8',
       'T9', 'T10', 'T11', 'T12', 'T13', 'T14', 'SENTI::POS', 'SENTI::NEG',
       'nonbow_pred', 'all_pred', 'editor_score'],
      dtype='object')

In [22]:
newfeats.drop(['nonbow_pred', 'all_pred'], axis=1, inplace=True)
newfeats.columns

Index(['article', 'thread_title', 'editor', 'editor_talk', 'other_talk',
       '#editor_turns', '#other_turns', 'words_in_common',
       'intersection/editor', 'intersection/other', 'jaccard', 'ART::DEF',
       'ART::INDEF', 'AUTH::CREDENTIALS', 'AUTH::EXPERIENTIAL',
       'AUTH::EXTERNAL', 'AUTH::FORUM', 'AUTH::SOCIAL_EXPECTATIONS', 'EXAMPLE',
       'FIRST_TURN', 'LAST_TURN', 'PERS_PRON::PLUR', 'PERS_PRON::SING',
       'QUESTION', 'URL', 'T0', 'T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', 'T8',
       'T9', 'T10', 'T11', 'T12', 'T13', 'T14', 'SENTI::POS', 'SENTI::NEG',
       'editor_score'],
      dtype='object')

In [23]:
newfeats.to_csv('/home/michael/school/research/wp/wikipedia/data/revert_discussion_features.csv', index=False)
newfeats

Unnamed: 0,article,thread_title,editor,editor_talk,other_talk,#editor_turns,#other_turns,words_in_common,intersection/editor,intersection/other,...,T8,T9,T10,T11,T12,T13,T14,SENTI::POS,SENTI::NEG,editor_score
0,1929 Hebron massacre,Restatement,Bless sins,":""The massacure was preformed by muslims, ther...",Restatement: Sometime ago I asked for source o...,1,17,40,0.769231,0.017058,...,0.001481,0.001481,0.235235,0.001481,0.001481,0.001481,0.001481,0.0,3.0,0.000000
1,1929 Hebron massacre,Restatement,Ceedjee,Restatement: Sometime ago I asked for source o...,"The massacure was preformed by muslims, theref...",9,9,471,0.349407,0.448999,...,0.044202,0.000117,0.000117,0.000117,0.000117,0.000117,0.261022,32.0,12.0,0.800000
2,1929 Hebron massacre,Restatement,GHcool,:::::::Nishidani's argument for why the Hebron...,Restatement: Sometime ago I asked for source o...,1,17,54,0.729730,0.023246,...,0.000913,0.000913,0.000913,0.000913,0.000913,0.000913,0.401444,1.0,1.0,0.968750
3,1929 Hebron massacre,Restatement,Ian Pitchford,:And by that logic all killings by Jews would ...,Restatement: Sometime ago I asked for source o...,1,17,23,0.851852,0.009705,...,0.063419,0.002151,0.002151,0.002151,0.002151,0.002151,0.002151,0.0,1.0,1.000000
4,1929 Hebron massacre,Restatement,Nishidani,::::::I disagree with your my friend. Benny Mo...,Restatement: Sometime ago I asked for source o...,1,17,273,0.423256,0.155822,...,0.065707,0.003413,0.000153,0.000153,0.000153,0.000153,0.283555,34.0,5.0,0.860870
5,1929 Hebron massacre,Restatement,PalestineRemembered,":::I'm not sure how you can say ""Hebron massac...",Restatement: Sometime ago I asked for source o...,1,17,73,0.760417,0.031725,...,0.000660,0.086039,0.000660,0.000660,0.000660,0.000660,0.175090,4.0,1.0,0.710870
6,1929 Hebron massacre,Restatement,Yahel Guhan,"The massacure was preformed by muslims, theref...",Restatement: Sometime ago I asked for source o...,2,16,33,0.750000,0.014025,...,0.046235,0.001852,0.207450,0.001852,0.001852,0.001852,0.001852,1.0,1.0,0.500000
7,1929 Hebron massacre,ethnic cleansing cat,Nableezy,"Brewcrewer, could you please provide reliable ...",Here's a few just to start: [ ][ ][ ][ ]--brew...,2,2,31,0.258333,0.248000,...,0.000702,0.000702,0.000702,0.000702,0.000702,0.000702,0.300951,10.0,4.0,1.000000
8,1929 Hebron massacre,false rumors,Nableezy,"This is silly, but a. false rumors is not exac...",I'd like to point out that the days before the...,3,17,50,0.588235,0.028201,...,0.001026,0.031820,0.129341,0.001026,0.001026,0.001026,0.073913,3.0,0.0,0.000000
9,1929 Hebron massacre,false rumors,Nishidani,::(ec)It violates WP:NPOV but has some support...,"This is silly, but a. false rumors is not exac...",9,11,307,0.237800,0.541446,...,0.035691,0.007750,0.000121,0.000121,0.002035,0.000121,0.351416,57.0,20.0,0.996139


## End test all features

# Remove singletons (only 1 editor in a conversation)

In [15]:
feats = pd.read_csv('/home/michael/school/research/wp/wikipedia/data/revert_discussion_features.csv')
threads = list(zip(feats['article'], feats['thread_title']))
singles = [t for t in threads if threads.count(t) == 1]
len(singles)

94

In [None]:
single_arts = 

In [18]:
singles

[('1929 Hebron massacre', 'ethnic cleansing cat'),
 ('1982 Lebanon War', 'Result'),
 ('Abbasid Caliphate', 'Encyclopaedia Iranica link'),
 ('Abkhazia', 'Unexplained edits by Chipmunkdavis?'),
 ('Abu Nidal', 'Images'),
 ('Abu Nidal', 'My revert'),
 ('Afula', 'Category:Settlements established in 1925'),
 ('Anwar Sadat', 'Images'),
 ('Arab League', 'Map again'),
 ('Arab citizens of Israel', 'The Star of David is also an Islamic symbol'),
 ('Ariel University', 'Ariel University in "State of Palestine"'),
 ('Ayyubid dynasty', 'Arabic source'),
 ('Bar Kokhba revolt',
  'Israel My Inheritance - Persecuted Messianic Jews Cry out for Justice and Reform'),
 ('Battle of Jenin', 'Neutral POV'),
 ('Battle of Nablus', 'Damage caused to the city'),
 ('Bayt Jibrin', 'Beit Guvrin'),
 ('Beit Ummar', 'Violence section'),
 ('Boycotts of Israel', 'using ei for facts'),
 ('Brian Avery (activist)', 'Frontpage.com'),
 ('Child suicide bombers in the Israeli–Palestinian conflict',
  'Move to "Child suicide bomb

In [16]:
len(feats)

2167

In [21]:
mask = [True]*len(feats)
for i, line in feats.iterrows():
    if (line['article'], line['thread_title']) in singles:
        mask[i] = False
        
feats_nosingles = feats[mask]
len(feats_nosingles)

2073

In [22]:
feats_nosingles.to_csv('/home/michael/school/research/wp/wikipedia/data/revert_discussion_features.csv', index=False)

# Stats on conversations

In [29]:
feats = pd.read_csv('/home/michael/school/research/wp/wikipedia/data/revert_discussion_features.csv')
threads = set(zip(feats['article'], feats['thread_title']))

edscores = {}
for t in threads:
    rows = feats[(feats['article'] == t[0]) & (feats['thread_title']==t[1])]
    edscores[t] = rows['editor_score'].tolist()

len(edscores)

655

In [30]:
# Numbers of participants in threads--and print to csv
edcounts = {t: len(scores) for t, scores in edscores.items()}
len(edcounts)

edcountrows = [[t[0], t[1], count] for t, count in edcounts.items()]
edcountrows

edcounts_data = pd.DataFrame(edcountrows, columns=['article', 'thread_title', 'num_users'])
edcounts_data

Unnamed: 0,article,thread_title,num_users
0,Cave of the Patriarchs,Reverted edit,3
1,Jewish diaspora,Quotation marks,2
2,Mukataa,Recent edits,2
3,Shmuel Katz (politician),Nielswik,3
4,Julius Stone,Repeated deletion of any mention of Julius Sto...,2
5,Judea and Samaria Area,"""biblical geographical regions""",2
6,Likud,"""direct ideological descendant""",2
7,Israeli checkpoint,IDF officers,5
8,Child suicide bombers in the Israeli–Palestini...,"Deletion of entire ""indoctrination"" section",4
9,Tommy Lapid,Random Lengthy Quotation,3


In [31]:
feats_withcounts = pd.merge(feats, edcounts_data)
feats_withcounts

Unnamed: 0,article,thread_title,editor,editor_talk,other_talk,#editor_turns,#other_turns,words_in_common,intersection/editor,intersection/other,...,T11,T12,T13,T14,SENTI::POS,SENTI::NEG,editor_score,nonbow_pred,pred,num_users
0,1929 Hebron massacre,Restatement,Bless sins,":""The massacure was preformed by muslims, ther...",Restatement: Sometime ago I asked for source o...,1,17,40,0.769231,0.017058,...,0.001481,0.001481,0.001481,0.001481,0.0,3.0,0.000000,0.618966,0.463035,7
1,1929 Hebron massacre,Restatement,Ceedjee,Restatement: Sometime ago I asked for source o...,"The massacure was preformed by muslims, theref...",9,9,471,0.349407,0.448999,...,0.000117,0.000117,0.000117,0.261022,32.0,12.0,0.800000,0.589103,0.796302,7
2,1929 Hebron massacre,Restatement,GHcool,:::::::Nishidani's argument for why the Hebron...,Restatement: Sometime ago I asked for source o...,1,17,54,0.729730,0.023246,...,0.000913,0.000913,0.000913,0.401444,1.0,1.0,0.968750,0.617359,0.638528,7
3,1929 Hebron massacre,Restatement,Ian Pitchford,:And by that logic all killings by Jews would ...,Restatement: Sometime ago I asked for source o...,1,17,23,0.851852,0.009705,...,0.002151,0.002151,0.002151,0.002151,0.0,1.0,1.000000,0.621004,0.353466,7
4,1929 Hebron massacre,Restatement,Nishidani,::::::I disagree with your my friend. Benny Mo...,Restatement: Sometime ago I asked for source o...,1,17,273,0.423256,0.155822,...,0.000153,0.000153,0.000153,0.283555,34.0,5.0,0.860870,0.593648,1.000000,7
5,1929 Hebron massacre,Restatement,PalestineRemembered,":::I'm not sure how you can say ""Hebron massac...",Restatement: Sometime ago I asked for source o...,1,17,73,0.760417,0.031725,...,0.000660,0.000660,0.000660,0.175090,4.0,1.0,0.710870,0.615390,0.507072,7
6,1929 Hebron massacre,Restatement,Yahel Guhan,"The massacure was preformed by muslims, theref...",Restatement: Sometime ago I asked for source o...,2,16,33,0.750000,0.014025,...,0.001852,0.001852,0.001852,0.001852,1.0,1.0,0.500000,0.620445,0.631426,7
7,1929 Hebron massacre,false rumors,Nableezy,"This is silly, but a. false rumors is not exac...",I'd like to point out that the days before the...,3,17,50,0.588235,0.028201,...,0.001026,0.001026,0.001026,0.073913,3.0,0.0,0.000000,0.616426,1.000000,5
8,1929 Hebron massacre,false rumors,Nishidani,::(ec)It violates WP:NPOV but has some support...,"This is silly, but a. false rumors is not exac...",9,11,307,0.237800,0.541446,...,0.000121,0.002035,0.000121,0.351416,57.0,20.0,0.996139,0.613128,1.000000,5
9,1929 Hebron massacre,false rumors,Plot Spoiler,Disagree with your position. Whether the rumor...,"This is silly, but a. false rumors is not exac...",1,19,43,0.614286,0.024049,...,0.001111,0.001111,0.001111,0.001111,1.0,0.0,1.000000,0.615813,1.000000,5


In [32]:
feats_withcounts.columns

Index(['article', 'thread_title', 'editor', 'editor_talk', 'other_talk',
       '#editor_turns', '#other_turns', 'words_in_common',
       'intersection/editor', 'intersection/other', 'jaccard', 'ART::DEF',
       'ART::INDEF', 'AUTH::CREDENTIALS', 'AUTH::EXPERIENTIAL',
       'AUTH::EXTERNAL', 'AUTH::FORUM', 'AUTH::SOCIAL_EXPECTATIONS', 'EXAMPLE',
       'FIRST_TURN', 'LAST_TURN', 'PERS_PRON::PLUR', 'PERS_PRON::SING',
       'QUESTION', 'URL', 'T0', 'T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', 'T8',
       'T9', 'T10', 'T11', 'T12', 'T13', 'T14', 'SENTI::POS', 'SENTI::NEG',
       'editor_score', 'nonbow_pred', 'pred', 'num_users'],
      dtype='object')

In [33]:
# PRINT CSV HERE
feats_withcounts.to_csv('/home/michael/school/research/wp/wikipedia/data/revert_discussion_features.csv', index=False)

In [8]:
eds_inthreads = [len(scores) for t, scores in edscores.items() if not len(scores)==1]
part_counts = Counter(eds_inthreads)
print(np.mean(eds_inthreads))
print(part_counts)

3.16488549618
Counter({2: 283, 3: 186, 4: 91, 5: 53, 6: 18, 7: 8, 8: 7, 9: 4, 10: 2, 11: 1, 12: 1, 21: 1})


In [5]:
# Discard singletons
del part_counts[1]
part_counts

Counter({2: 283,
         3: 186,
         4: 91,
         5: 53,
         6: 18,
         7: 8,
         8: 7,
         9: 4,
         10: 2,
         11: 1,
         12: 1,
         21: 1})

In [20]:
scores_bycount = {c: [s for t, s in edscores.items() if len(s)==c] for c in part_counts}
scores_bycount

{1: [[0.94845360824742264],
  [0.99285714285714277],
  [1.0],
  [0.8571428571428571],
  [1.0],
  [1.0],
  [1.0],
  [1.0],
  [1.0],
  [1.0],
  [1.0],
  [0.36986301369863012],
  [0.65789473684210531],
  [1.0],
  [0.9739130434782608],
  [0.625],
  [1.0],
  [1.0],
  [1.0],
  [0.37025316455696211],
  [1.0],
  [1.0],
  [1.0],
  [1.0],
  [0.76950105411103298],
  [1.0],
  [1.0],
  [1.0],
  [0.28048780487804881],
  [1.0],
  [1.0],
  [1.0],
  [0.97222222222222221],
  [0.5],
  [1.0],
  [1.0],
  [1.0],
  [0.56470588235294117],
  [0.5],
  [1.0],
  [0.69903433476394849],
  [1.0],
  [0.92763157894736836],
  [0.96875],
  [0.88235294117647056],
  [0.95833333333333337],
  [0.78386167146974062],
  [0.0097560975609756097],
  [1.0],
  [1.0],
  [1.0],
  [1.0],
  [1.0],
  [1.0],
  [0.97087378640776723],
  [1.0],
  [1.0],
  [1.0],
  [0.9885057471264368],
  [1.0],
  [1.0],
  [1.0],
  [1.0],
  [0.73913043478260865],
  [1.0],
  [1.0],
  [1.0],
  [0.85185185185185186],
  [1.0],
  [1.0],
  [0.85517241379310349],
 

In [28]:
threshold = .5

# 1 participant
above = [s[0] for s in scores_bycount[1] if s[0] >= threshold] 
below = [s[0] for s in scores_bycount[1] if s[0] < threshold] 
print("Proportion of editors above threshold:", len(above)/len(scores_bycount[1]))
print("Proportion of editors below threshold:", len(below)/len(scores_bycount[1]))
print("Mean score of editors above threshold:", np.mean(above))
print("Mean score of editors below threshold:", np.mean(below))

Proportion of editors above threshold: 0.925531914893617
Proportion of editors above threshold: 0.07446808510638298
Mean score of editors above threshold: 0.946527638842
Mean score of editors below threshold: 0.228323045038


In [40]:
for n_parts in scores_bycount:
    print("{0} participants".format(n_parts))
    threshold_parts = Counter()
    for l in scores_bycount[n_parts]:
        n_above = 0
        for s in l:
            if s >= threshold:
                n_above += 1
        threshold_parts[n_above] += 1

    for a in threshold_parts:
        print("score sets with exactly {:d} editors above threshold: {:.1%} ({:d}/{:d})".format(
                a, threshold_parts[a]/len(scores_bycount[n_parts]), threshold_parts[a], len(scores_bycount[n_parts])))
    print()

1 participants
score sets with exactly 0 editors above threshold: 7.4% (7/94)
score sets with exactly 1 editors above threshold: 92.6% (87/94)

2 participants
score sets with exactly 0 editors above threshold: 2.5% (7/283)
score sets with exactly 1 editors above threshold: 72.4% (205/283)
score sets with exactly 2 editors above threshold: 25.1% (71/283)

3 participants
score sets with exactly 0 editors above threshold: 1.1% (2/186)
score sets with exactly 1 editors above threshold: 25.8% (48/186)
score sets with exactly 2 editors above threshold: 59.1% (110/186)
score sets with exactly 3 editors above threshold: 14.0% (26/186)

4 participants
score sets with exactly 0 editors above threshold: 2.2% (2/91)
score sets with exactly 1 editors above threshold: 11.0% (10/91)
score sets with exactly 2 editors above threshold: 42.9% (39/91)
score sets with exactly 3 editors above threshold: 39.6% (36/91)
score sets with exactly 4 editors above threshold: 4.4% (4/91)

5 participants
score sets w

## End stats on conversations

# Merge in features from Yohan

In [9]:
# Merge in Yohan's features
yohan = pd.read_csv('/home/michael/school/research/wp/feats_no_bow.csv').loc[:,['article_title', 'thread_title', 'username', 'SENTI::POS', 'SENTI::NEG']]

newfeats = pd.merge(feats, yohan, 
                 left_on=['article','thread_title','editor'], right_on=['article_title', 'thread_title', 'username'])

newfeats.drop('article_title', axis=1, inplace=True)
newfeats.drop('username', axis=1, inplace=True)
newfeats

Unnamed: 0,article,thread_title,editor,editor_talk,other_talk,#editor_turns,#other_turns,words_in_common,intersection/editor,intersection/other,...,score_>0.4,score_>0.5,score_>0.6,score_>0.7,score_>0.8,score_>0.9,nonbow_pred,all_pred,SENTI::POS,SENTI::NEG
0,1929 Hebron massacre,Restatement,Bless sins,":""The massacure was preformed by muslims, ther...",Restatement: Sometime ago I asked for source o...,1,17,40,0.769231,0.017058,...,0,0,0,0,0,0,0.402442,0.447666,0.0,3.0
1,1929 Hebron massacre,Restatement,Ceedjee,Restatement: Sometime ago I asked for source o...,"The massacure was preformed by muslims, theref...",9,9,471,0.349407,0.448999,...,1,1,1,1,1,0,0.505729,0.910782,32.0,12.0
2,1929 Hebron massacre,Restatement,GHcool,:::::::Nishidani's argument for why the Hebron...,Restatement: Sometime ago I asked for source o...,1,17,54,0.729730,0.023246,...,1,1,1,1,1,1,0.551015,0.572843,1.0,1.0
3,1929 Hebron massacre,Restatement,Ian Pitchford,:And by that logic all killings by Jews would ...,Restatement: Sometime ago I asked for source o...,1,17,23,0.851852,0.009705,...,1,1,1,1,1,1,0.624845,0.360184,0.0,1.0
4,1929 Hebron massacre,Restatement,Nishidani,::::::I disagree with your my friend. Benny Mo...,Restatement: Sometime ago I asked for source o...,1,17,273,0.423256,0.155822,...,1,1,1,1,1,0,0.537634,1.313421,34.0,5.0
5,1929 Hebron massacre,Restatement,PalestineRemembered,":::I'm not sure how you can say ""Hebron massac...",Restatement: Sometime ago I asked for source o...,1,17,73,0.760417,0.031725,...,1,1,1,1,0,0,0.586434,0.452802,4.0,1.0
6,1929 Hebron massacre,Restatement,Yahel Guhan,"The massacure was preformed by muslims, theref...",Restatement: Sometime ago I asked for source o...,2,16,33,0.750000,0.014025,...,1,0,0,0,0,0,0.464802,0.647315,1.0,1.0
7,1929 Hebron massacre,ethnic cleansing cat,Nableezy,"Brewcrewer, could you please provide reliable ...",Here's a few just to start: [ ][ ][ ][ ]--brew...,2,2,31,0.258333,0.248000,...,1,1,1,1,1,1,0.558677,0.804386,10.0,4.0
8,1929 Hebron massacre,false rumors,Nableezy,"This is silly, but a. false rumors is not exac...",I'd like to point out that the days before the...,3,17,50,0.588235,0.028201,...,0,0,0,0,0,0,0.393640,1.655727,3.0,0.0
9,1929 Hebron massacre,false rumors,Nishidani,::(ec)It violates WP:NPOV but has some support...,"This is silly, but a. false rumors is not exac...",9,11,307,0.237800,0.541446,...,1,1,1,1,1,1,0.713861,1.674665,57.0,20.0


In [15]:
cols = newfeats.columns.tolist()
cols = cols[:-11] + cols[-2:] + cols[-11:-2]
newfeats = newfeats[cols]
newfeats.columns

Index(['article', 'thread_title', 'editor', 'editor_talk', 'other_talk',
       '#editor_turns', '#other_turns', 'words_in_common',
       'intersection/editor', 'intersection/other', 'jaccard', 'ART::DEF',
       'ART::INDEF', 'AUTH::CREDENTIALS', 'AUTH::EXPERIENTIAL',
       'AUTH::EXTERNAL', 'AUTH::FORUM', 'AUTH::SOCIAL_EXPECTATIONS', 'EXAMPLE',
       'FIRST_TURN', 'LAST_TURN', 'PERS_PRON::PLUR', 'PERS_PRON::SING',
       'QUESTION', 'URL', 'T0', 'T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', 'T8',
       'T9', 'T10', 'T11', 'T12', 'T13', 'T14', 'SENTI::POS', 'SENTI::NEG',
       'editor_score', 'score_>0.4', 'score_>0.5', 'score_>0.6', 'score_>0.7',
       'score_>0.8', 'score_>0.9', 'nonbow_pred', 'all_pred'],
      dtype='object')

In [16]:
newfeats.to_csv('/home/michael/school/research/wp/wikipedia/data/revert_discussion_features.csv', index=False)

## End merge stats on conversations

In [17]:
# Load topic features
topic_data = pd.read_csv('/home/michael/school/research/wp/topic_feats.csv')
topic_data

Unnamed: 0,article_title,thread_title,username,T0,T1,T2,T3,T4,T5,T6,T7,T8,T9,T10,T11,T12,T13,T14
0,14 July Revolution,Biased,UNKNOWN_CONTRIBUTOR,0.605549,0.002151,0.002151,0.072648,0.002151,0.002151,0.002151,0.002151,0.002151,0.002151,0.002151,0.248025,0.002151,0.002151,0.050121
1,14 July Revolution,Flag,UNKNOWN_CONTRIBUTOR,0.575484,0.001667,0.001667,0.112244,0.001667,0.001667,0.292272,0.001667,0.001667,0.001667,0.001667,0.001667,0.001667,0.001667,0.001667
2,16th Summit of the Non-Aligned Movement,Agenda,Lihaas,0.828133,0.003333,0.003333,0.003333,0.003333,0.003333,0.003333,0.003333,0.003333,0.003333,0.003333,0.003333,0.003333,0.003333,0.128534
3,16th Summit of the Non-Aligned Movement,Agenda,Sa.vakilian,0.606987,0.003030,0.003030,0.003030,0.056374,0.003030,0.300276,0.003030,0.003030,0.003030,0.003030,0.003030,0.003030,0.003030,0.003030
4,16th Summit of the Non-Aligned Movement,Appraisal,Jethro B,0.671176,0.004444,0.004444,0.004444,0.004444,0.004444,0.271046,0.004444,0.004444,0.004444,0.004444,0.004444,0.004444,0.004444,0.004444
5,16th Summit of the Non-Aligned Movement,Appraisal,Lihaas,0.700472,0.002299,0.002299,0.002299,0.002299,0.002299,0.269642,0.002299,0.002299,0.002299,0.002299,0.002299,0.002299,0.002299,0.002299
6,16th Summit of the Non-Aligned Movement,Appraisal,Sa.vakilian,0.383068,0.000813,0.000813,0.000813,0.000813,0.000813,0.323127,0.000813,0.000813,0.000813,0.284049,0.000813,0.000813,0.000813,0.000813
7,16th Summit of the Non-Aligned Movement,Ban at the Summit,Jethro B,0.486034,0.001515,0.001515,0.001515,0.001515,0.001515,0.345154,0.001515,0.001515,0.001515,0.001515,0.001515,0.021499,0.052237,0.079925
8,16th Summit of the Non-Aligned Movement,Ban at the Summit,Lihaas,0.685514,0.001802,0.001802,0.118896,0.001802,0.001802,0.173968,0.001802,0.001802,0.001802,0.001802,0.001802,0.001802,0.001802,0.001802
9,16th Summit of the Non-Aligned Movement,Ban at the Summit,Sa.vakilian,0.117626,0.034601,0.000372,0.047322,0.000372,0.000372,0.629343,0.000372,0.000372,0.000372,0.000372,0.000372,0.064618,0.000372,0.103138


In [20]:
# Merge in topic features
feats = pd.merge(feats, topic_data, 
                 left_on=['article','thread_title','editor'], right_on=['article_title', 'thread_title', 'username'])

feats.drop('article_title', axis=1, inplace=True)
feats.drop('username', axis=1, inplace=True)
print(len(feats))
feats

2167


Unnamed: 0,article,thread_title,editor,editor_talk,other_talk,#editor_turns,#other_turns,words_in_common,intersection/editor,intersection/other,...,T5,T6,T7,T8,T9,T10,T11,T12,T13,T14
0,1929 Hebron massacre,Restatement,Bless sins,":""The massacure was preformed by muslims, ther...",Restatement: Sometime ago I asked for source o...,1,17,40,0.769231,0.017058,...,0.001481,0.001481,0.001481,0.001481,0.001481,0.235235,0.001481,0.001481,0.001481,0.001481
1,1929 Hebron massacre,Restatement,Ceedjee,Restatement: Sometime ago I asked for source o...,"The massacure was preformed by muslims, theref...",9,9,471,0.349407,0.448999,...,0.000117,0.369648,0.003815,0.044202,0.000117,0.000117,0.000117,0.000117,0.000117,0.261022
2,1929 Hebron massacre,Restatement,GHcool,:::::::Nishidani's argument for why the Hebron...,Restatement: Sometime ago I asked for source o...,1,17,54,0.729730,0.023246,...,0.000913,0.000913,0.000913,0.000913,0.000913,0.000913,0.000913,0.000913,0.000913,0.401444
3,1929 Hebron massacre,Restatement,Ian Pitchford,:And by that logic all killings by Jews would ...,Restatement: Sometime ago I asked for source o...,1,17,23,0.851852,0.009705,...,0.002151,0.002151,0.002151,0.063419,0.002151,0.002151,0.002151,0.002151,0.002151,0.002151
4,1929 Hebron massacre,Restatement,Nishidani,::::::I disagree with your my friend. Benny Mo...,Restatement: Sometime ago I asked for source o...,1,17,273,0.423256,0.155822,...,0.000153,0.207274,0.000153,0.065707,0.003413,0.000153,0.000153,0.000153,0.000153,0.283555
5,1929 Hebron massacre,Restatement,PalestineRemembered,":::I'm not sure how you can say ""Hebron massac...",Restatement: Sometime ago I asked for source o...,1,17,73,0.760417,0.031725,...,0.000660,0.461024,0.000660,0.000660,0.086039,0.000660,0.000660,0.000660,0.000660,0.175090
6,1929 Hebron massacre,Restatement,Yahel Guhan,"The massacure was preformed by muslims, theref...",Restatement: Sometime ago I asked for source o...,2,16,33,0.750000,0.014025,...,0.001852,0.001852,0.001852,0.046235,0.001852,0.207450,0.001852,0.001852,0.001852,0.001852
7,1929 Hebron massacre,ethnic cleansing cat,Nableezy,"Brewcrewer, could you please provide reliable ...",Here's a few just to start: [ ][ ][ ][ ]--brew...,2,2,31,0.258333,0.248000,...,0.000702,0.382642,0.000702,0.000702,0.000702,0.000702,0.000702,0.000702,0.000702,0.300951
8,1929 Hebron massacre,false rumors,Nableezy,"This is silly, but a. false rumors is not exac...",I'd like to point out that the days before the...,3,17,50,0.588235,0.028201,...,0.001026,0.371406,0.001026,0.001026,0.031820,0.129341,0.001026,0.001026,0.001026,0.073913
9,1929 Hebron massacre,false rumors,Nishidani,::(ec)It violates WP:NPOV but has some support...,"This is silly, but a. false rumors is not exac...",9,11,307,0.237800,0.541446,...,0.000121,0.434888,0.005072,0.035691,0.007750,0.000121,0.000121,0.002035,0.000121,0.351416


In [25]:
# Rearrange columns
cols = feats.columns.tolist()
new_cols = cols[:-22] + cols[-15:] + cols[-22:-15]
new_cols

['article',
 'thread_title',
 'editor',
 'editor_talk',
 'other_talk',
 '#editor_turns',
 '#other_turns',
 'words_in_common',
 'intersection/editor',
 'intersection/other',
 'jaccard',
 'ART::DEF',
 'ART::INDEF',
 'AUTH::CREDENTIALS',
 'AUTH::EXPERIENTIAL',
 'AUTH::EXTERNAL',
 'AUTH::FORUM',
 'AUTH::SOCIAL_EXPECTATIONS',
 'EXAMPLE',
 'FIRST_TURN',
 'LAST_TURN',
 'PERS_PRON::PLUR',
 'PERS_PRON::SING',
 'QUESTION',
 'URL',
 'T0',
 'T1',
 'T2',
 'T3',
 'T4',
 'T5',
 'T6',
 'T7',
 'T8',
 'T9',
 'T10',
 'T11',
 'T12',
 'T13',
 'T14',
 'editor_score',
 'score_>0.4',
 'score_>0.5',
 'score_>0.6',
 'score_>0.7',
 'score_>0.8',
 'score_>0.9']

In [26]:
feats = feats[new_cols]
feats.columns

Index(['article', 'thread_title', 'editor', 'editor_talk', 'other_talk',
       '#editor_turns', '#other_turns', 'words_in_common',
       'intersection/editor', 'intersection/other', 'jaccard', 'ART::DEF',
       'ART::INDEF', 'AUTH::CREDENTIALS', 'AUTH::EXPERIENTIAL',
       'AUTH::EXTERNAL', 'AUTH::FORUM', 'AUTH::SOCIAL_EXPECTATIONS', 'EXAMPLE',
       'FIRST_TURN', 'LAST_TURN', 'PERS_PRON::PLUR', 'PERS_PRON::SING',
       'QUESTION', 'URL', 'T0', 'T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', 'T8',
       'T9', 'T10', 'T11', 'T12', 'T13', 'T14', 'editor_score', 'score_>0.4',
       'score_>0.5', 'score_>0.6', 'score_>0.7', 'score_>0.8', 'score_>0.9'],
      dtype='object')

In [36]:
feats['all_pred'] = pred
feats

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0,article,thread_title,editor,editor_talk,other_talk,#editor_turns,#other_turns,words_in_common,intersection/editor,intersection/other,...,T14,editor_score,score_>0.4,score_>0.5,score_>0.6,score_>0.7,score_>0.8,score_>0.9,nonbow_pred,all_pred
0,1929 Hebron massacre,Restatement,Bless sins,":""The massacure was preformed by muslims, ther...",Restatement: Sometime ago I asked for source o...,1,17,40,0.769231,0.017058,...,0.001481,0.000000,0,0,0,0,0,0,0.402442,0.447666
1,1929 Hebron massacre,Restatement,Ceedjee,Restatement: Sometime ago I asked for source o...,"The massacure was preformed by muslims, theref...",9,9,471,0.349407,0.448999,...,0.261022,0.800000,1,1,1,1,1,0,0.505729,0.910782
2,1929 Hebron massacre,Restatement,GHcool,:::::::Nishidani's argument for why the Hebron...,Restatement: Sometime ago I asked for source o...,1,17,54,0.729730,0.023246,...,0.401444,0.968254,1,1,1,1,1,1,0.551015,0.572843
3,1929 Hebron massacre,Restatement,Ian Pitchford,:And by that logic all killings by Jews would ...,Restatement: Sometime ago I asked for source o...,1,17,23,0.851852,0.009705,...,0.002151,1.000000,1,1,1,1,1,1,0.624845,0.360184
4,1929 Hebron massacre,Restatement,Nishidani,::::::I disagree with your my friend. Benny Mo...,Restatement: Sometime ago I asked for source o...,1,17,273,0.423256,0.155822,...,0.283555,0.858407,1,1,1,1,1,0,0.537634,1.313421
5,1929 Hebron massacre,Restatement,PalestineRemembered,":::I'm not sure how you can say ""Hebron massac...",Restatement: Sometime ago I asked for source o...,1,17,73,0.760417,0.031725,...,0.175090,0.707048,1,1,1,1,0,0,0.586434,0.452802
6,1929 Hebron massacre,Restatement,Yahel Guhan,"The massacure was preformed by muslims, theref...",Restatement: Sometime ago I asked for source o...,2,16,33,0.750000,0.014025,...,0.001852,0.500000,1,0,0,0,0,0,0.464802,0.647315
7,1929 Hebron massacre,ethnic cleansing cat,Nableezy,"Brewcrewer, could you please provide reliable ...",Here's a few just to start: [ ][ ][ ][ ]--brew...,2,2,31,0.258333,0.248000,...,0.300951,1.000000,1,1,1,1,1,1,0.558677,0.804386
8,1929 Hebron massacre,false rumors,Nableezy,"This is silly, but a. false rumors is not exac...",I'd like to point out that the days before the...,3,17,50,0.588235,0.028201,...,0.073913,0.000000,0,0,0,0,0,0,0.393640,1.655727
9,1929 Hebron massacre,false rumors,Nishidani,::(ec)It violates WP:NPOV but has some support...,"This is silly, but a. false rumors is not exac...",9,11,307,0.237800,0.541446,...,0.351416,0.994444,1,1,1,1,1,1,0.713861,1.674665


In [37]:
feats.to_csv('/home/michael/school/research/wp/wikipedia/data/revert_discussion_features.csv', index=False)

In [28]:
feats.columns

Index(['article', 'thread_title', 'editor', 'editor_talk', 'other_talk',
       '#editor_turns', '#other_turns', 'words_in_common',
       'intersection/editor', 'intersection/other', 'jaccard', 'editor_score',
       'score_>0.4', 'score_>0.5', 'score_>0.6', 'score_>0.7', 'score_>0.8',
       'score_>0.9', 'ART::DEF', 'ART::INDEF', 'AUTH::CREDENTIALS',
       'AUTH::EXPERIENTIAL', 'AUTH::EXTERNAL', 'AUTH::FORUM',
       'AUTH::SOCIAL_EXPECTATIONS', 'EXAMPLE', 'FIRST_TURN', 'LAST_TURN',
       'PERS_PRON::PLUR', 'PERS_PRON::SING', 'QUESTION', 'URL'],
      dtype='object')

In [29]:
cols = feats.columns.tolist()
cols[12:18]

['score_>0.4',
 'score_>0.5',
 'score_>0.6',
 'score_>0.7',
 'score_>0.8',
 'score_>0.9']

In [30]:
cols = cols[:12] + cols[18:] + cols[12:18]
feats = feats[cols]
feats.columns

Index(['article', 'thread_title', 'editor', 'editor_talk', 'other_talk',
       '#editor_turns', '#other_turns', 'words_in_common',
       'intersection/editor', 'intersection/other', 'jaccard', 'editor_score',
       'ART::DEF', 'ART::INDEF', 'AUTH::CREDENTIALS', 'AUTH::EXPERIENTIAL',
       'AUTH::EXTERNAL', 'AUTH::FORUM', 'AUTH::SOCIAL_EXPECTATIONS', 'EXAMPLE',
       'FIRST_TURN', 'LAST_TURN', 'PERS_PRON::PLUR', 'PERS_PRON::SING',
       'QUESTION', 'URL', 'score_>0.4', 'score_>0.5', 'score_>0.6',
       'score_>0.7', 'score_>0.8', 'score_>0.9'],
      dtype='object')

In [32]:
cols = feats.columns.tolist()
new_cols = cols[:11] + cols[12:26] + cols[11:12] + cols[26:]
feats = feats[new_cols]
feats.columns

Index(['article', 'thread_title', 'editor', 'editor_talk', 'other_talk',
       '#editor_turns', '#other_turns', 'words_in_common',
       'intersection/editor', 'intersection/other', 'jaccard', 'ART::DEF',
       'ART::INDEF', 'AUTH::CREDENTIALS', 'AUTH::EXPERIENTIAL',
       'AUTH::EXTERNAL', 'AUTH::FORUM', 'AUTH::SOCIAL_EXPECTATIONS', 'EXAMPLE',
       'FIRST_TURN', 'LAST_TURN', 'PERS_PRON::PLUR', 'PERS_PRON::SING',
       'QUESTION', 'URL', 'editor_score', 'score_>0.4', 'score_>0.5',
       'score_>0.6', 'score_>0.7', 'score_>0.8', 'score_>0.9'],
      dtype='object')

In [27]:
feats.drop('NUM_TURNS', axis=1, inplace=True)

In [33]:
feats.to_csv('/home/michael/school/research/wp/wikipedia/data/revert_discussion_features.csv', index=False)

In [23]:
# Load, initialize data
score_data = pd.read_csv('/home/michael/school/research/wp/wikipedia/data/editor_thread_scores.csv', parse_dates=['edit_timestamp'])
talk_data = pd.read_csv('/home/michael/school/research/wp/wikipedia/data/talk/ipc_talkpages_byarticle.csv', parse_dates=['post_timestamp'])
crit = talk_data['post_text'].map(lambda x: not re.match(r':+$', str(x))) # take out entries of just colons
relfeatures = pd.read_csv('/home/michael/school/research/wp/wikipedia/data/relationship_features.csv', index_col=[0])
talk_data = talk_data[crit]

relfeatures

Unnamed: 0,article,thread_title,editor,editor_talk,other_talk,#editor_turns,#other_turns,words_in_common,intersection/editor,intersection/other,jaccard,editor_score,score_>0.4,score_>0.5,score_>0.6,score_>0.7,score_>0.8,score_>0.9
0,1929 Hebron massacre,Restatement,Bless sins,":""The massacure was preformed by muslims, ther...",Restatement: Sometime ago I asked for source o...,1,17,40,0.769231,0.017058,0.016971,0.000000,0,0,0,0,0,0
1,1929 Hebron massacre,Restatement,Ceedjee,Restatement: Sometime ago I asked for source o...,"The massacure was preformed by muslims, theref...",9,9,471,0.349407,0.448999,0.244548,0.800000,1,1,1,1,1,0
2,1929 Hebron massacre,Restatement,GHcool,:::::::Nishidani's argument for why the Hebron...,Restatement: Sometime ago I asked for source o...,1,17,54,0.729730,0.023246,0.023047,0.968254,1,1,1,1,1,1
3,1929 Hebron massacre,Restatement,Ian Pitchford,:And by that logic all killings by Jews would ...,Restatement: Sometime ago I asked for source o...,1,17,23,0.851852,0.009705,0.009688,1.000000,1,1,1,1,1,1
4,1929 Hebron massacre,Restatement,Nishidani,::::::I disagree with your my friend. Benny Mo...,Restatement: Sometime ago I asked for source o...,1,17,273,0.423256,0.155822,0.128531,0.858407,1,1,1,1,1,0
5,1929 Hebron massacre,Restatement,PalestineRemembered,":::I'm not sure how you can say ""Hebron massac...",Restatement: Sometime ago I asked for source o...,1,17,73,0.760417,0.031725,0.031411,0.707048,1,1,1,1,0,0
6,1929 Hebron massacre,Restatement,Yahel Guhan,"The massacure was preformed by muslims, theref...",Restatement: Sometime ago I asked for source o...,2,16,33,0.750000,0.014025,0.013959,0.500000,1,0,0,0,0,0
7,1929 Hebron massacre,ethnic cleansing cat,Nableezy,"Brewcrewer, could you please provide reliable ...",Here's a few just to start: [ ][ ][ ][ ]--brew...,2,2,31,0.258333,0.248000,0.144860,1.000000,1,1,1,1,1,1
8,1929 Hebron massacre,false rumors,Nableezy,"This is silly, but a. false rumors is not exac...",I'd like to point out that the days before the...,3,17,50,0.588235,0.028201,0.027655,0.000000,0,0,0,0,0,0
9,1929 Hebron massacre,false rumors,Nishidani,::(ec)It violates WP:NPOV but has some support...,"This is silly, but a. false rumors is not exac...",9,11,307,0.237800,0.541446,0.197937,0.994444,1,1,1,1,1,1


In [26]:
# Merge in Yohan's features
yohan = pd.read_csv('/home/michael/school/research/wp/feats_no_bow.csv')

feats = pd.merge(relfeatures, yohan, 
                 left_on=['article','thread_title','editor'], right_on=['article_title', 'thread_title', 'username'])

feats.drop('article_title', axis=1, inplace=True)
feats.drop('username', axis=1, inplace=True)
feats

In [32]:
# Train and test SVM classifier on all features
for t in thresholds:
    print(t)
    clf = svm.SVC()

    scores = cross_validation.cross_val_score(clf, bow, list(labels[t].values()), scoring=kappa_scorer)
    print("Kappa: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

    scores = cross_validation.cross_val_score(clf, bow, list(labels[t].values()))
    print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
    
    # Majority class guess
    true_portion = np.count_nonzero(np.array(list(labels[t].values())))/len(labels[t].values())
    print("Majority class guess:\t", max(true_portion, 1-true_portion))
    
    # Random class guess
    print("Random class guess:\t", true_portion**2 + (1-true_portion)**2)
    print()

0.4
Kappa: 0.000 (+/- 0.000)
Majority class guess:	 0.6594370096908168
Random class guess:	 0.5508403201182992

0.5
Kappa: -0.001 (+/- 0.001)
Majority class guess:	 0.5897554222427319
Random class guess:	 0.5161120716439422

0.6
Kappa: -0.002 (+/- 0.004)
Majority class guess:	 0.5366866635902169
Random class guess:	 0.5026918225707635

0.7
Kappa: 0.007 (+/- 0.017)
Majority class guess:	 0.5279187817258884
Random class guess:	 0.5015589167461156

0.8
Kappa: 0.004 (+/- 0.004)
Majority class guess:	 0.5911398246423627
Random class guess:	 0.5166129352716813

0.9
Kappa: 0.000 (+/- 0.000)
Majority class guess:	 0.6797415782187356
Random class guess:	 0.5646140698811237



In [3]:
# Extract 'Winning arguments' features and put with labels
thresholds = np.arange(0.4, 1.0, 0.1)
poss_edthreads = set(zip(talk_data['article_title'], talk_data['thread_title'], talk_data['username']))
edthreads = list()
labels = {t: {} for t in thresholds}
scores = {}

# Prune to just those conversations occurring in score data
for i, el in enumerate(sorted(poss_edthreads)):
    rows = score_data[(score_data['article']==el[0]) &
                    (score_data['thread_title']==el[1]) &
                    (score_data['editor']==el[2])]
    if not rows.empty:
        edthreads.append(el)
        score = rows['editor_thread_score'].iloc[0]
        scores[el] = score
        for t in thresholds:
            labels[t][el] = 1 if score > t else 0

In [15]:
# Assemble input text of just editors' text
edtalk = defaultdict(str)
n_edturns = {}
n_otherturns = {}
othertalk = defaultdict(str)

for i, el in enumerate(edthreads):
    rows = talk_data[(talk_data['article_title']==el[0]) &
                    (talk_data['thread_title']==el[1])]
    edrows = rows[rows['username']==el[2]]
    edtalk[el] += ' '.join([str(t) for t in edrows['post_text'].tolist()])
    n_edturns[el] = len(edrows)
    
    other_rows = rows[rows['username']!=el[2]]
    othertalk[el] += ' '.join([str(t) for t in other_rows['post_text'].tolist()])
    n_otherturns[el] = len(other_rows)

In [23]:
# Get 'Winning arguments features' from text
edcounter = {}
othercounter = {}

for i, el in enumerate(edthreads):
    if not isinstance(edtalk[el], str):
        debug_here()
    edcounter[el] = Counter([w for w in word_tokenize(edtalk[el]) if w not in stopwords.words('english')])
    if not isinstance(othertalk[el], str):
        debug_here()
    othercounter[el] = Counter([w for w in word_tokenize(othertalk[el]) if w not in stopwords.words('english')])
    
    if i%50==0:
        print(i)
    
len(edcounter)

0
20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460
480
500
520
540
560
580
600
620
640
660
680
700
720
740
760
780
800
820
840
860
880
900
920
940
960
980
1000
1020
1040
1060
1080
1100
1120
1140
1160
1180
1200
1220
1240
1260
1280
1300
1320
1340
1360
1380
1400
1420
1440
1460
1480
1500
1520
1540
1560
1580
1600
1620
1640
1660
1680
1700
1720
1740
1760
1780
1800
1820
1840
1860
1880
1900
1920
1940
1960
1980
2000
2020
2040
2060
2080
2100
2120
2140
2160


2167

In [25]:
# Calculate relationships
n_commonwords = {}
reply = {}
op = {}
jaccard = {}

for el in edthreads:
    intersection = edcounter[el] - (edcounter[el]-othercounter[el])
    n_commonwords[el] = sum(intersection.values())
    reply[el] = n_commonwords[el]/sum(edcounter[el].values())
    op[el] = n_commonwords[el]/sum(othercounter[el].values())
    jaccard[el] = n_commonwords[el]/(sum(edcounter[el].values()) + sum(othercounter[el].values()) - n_commonwords[el])

In [29]:
# Write relationship features
relfeatures = pd.DataFrame([[el[0], el[1], el[2], edtalk[el], othertalk[el], n_edturns[el], n_otherturns[el],
               n_commonwords[el], reply[el], op[el], jaccard[el], scores[el], 
               *[labels[t][el] for t in thresholds]] for el in edthreads],
            columns=['article', 'thread_title', 'editor', 'editor_talk', 'other_talk',
                    '#editor_turns', '#other_turns', 
                     'words_in_common', 'intersection/editor', 'intersection/other', 'jaccard',
                     'editor_score', *['score_>{:0.1f}'.format(t) for t in thresholds]])

relfeatures.to_csv('/home/michael/school/research/wp/wikipedia/data/relationship_features.csv', index=False)
relfeatures

Unnamed: 0,article,thread_title,editor,editor_talk,other_talk,#editor_turns,#other_turns,words_in_common,intersection/editor,intersection/other,jaccard,editor_score,score_>0.4,score_>0.5,score_>0.6,score_>0.7,score_>0.8,score_>0.9
0,1929 Hebron massacre,Restatement,Bless sins,":""The massacure was preformed by muslims, ther...",Restatement: Sometime ago I asked for source o...,1,17,40,0.769231,0.017058,0.016971,0.000000,0,0,0,0,0,0
1,1929 Hebron massacre,Restatement,Ceedjee,Restatement: Sometime ago I asked for source o...,"The massacure was preformed by muslims, theref...",9,9,471,0.349407,0.448999,0.244548,0.800000,1,1,1,1,1,0
2,1929 Hebron massacre,Restatement,GHcool,:::::::Nishidani's argument for why the Hebron...,Restatement: Sometime ago I asked for source o...,1,17,54,0.729730,0.023246,0.023047,0.968254,1,1,1,1,1,1
3,1929 Hebron massacre,Restatement,Ian Pitchford,:And by that logic all killings by Jews would ...,Restatement: Sometime ago I asked for source o...,1,17,23,0.851852,0.009705,0.009688,1.000000,1,1,1,1,1,1
4,1929 Hebron massacre,Restatement,Nishidani,::::::I disagree with your my friend. Benny Mo...,Restatement: Sometime ago I asked for source o...,1,17,273,0.423256,0.155822,0.128531,0.858407,1,1,1,1,1,0
5,1929 Hebron massacre,Restatement,PalestineRemembered,":::I'm not sure how you can say ""Hebron massac...",Restatement: Sometime ago I asked for source o...,1,17,73,0.760417,0.031725,0.031411,0.707048,1,1,1,1,0,0
6,1929 Hebron massacre,Restatement,Yahel Guhan,"The massacure was preformed by muslims, theref...",Restatement: Sometime ago I asked for source o...,2,16,33,0.750000,0.014025,0.013959,0.500000,1,0,0,0,0,0
7,1929 Hebron massacre,ethnic cleansing cat,Nableezy,"Brewcrewer, could you please provide reliable ...",Here's a few just to start: [ ][ ][ ][ ]--brew...,2,2,31,0.258333,0.248000,0.144860,1.000000,1,1,1,1,1,1
8,1929 Hebron massacre,false rumors,Nableezy,"This is silly, but a. false rumors is not exac...",I'd like to point out that the days before the...,3,17,50,0.588235,0.028201,0.027655,0.000000,0,0,0,0,0,0
9,1929 Hebron massacre,false rumors,Nishidani,::(ec)It violates WP:NPOV but has some support...,"This is silly, but a. false rumors is not exac...",9,11,307,0.237800,0.541446,0.197937,0.994444,1,1,1,1,1,1


In [3]:
# Build input corpora of editors' text + others' text, get labels
thresholds = np.arange(0.4, 1.0, 0.1)
poss_edthreads = set(zip(talk_data['article_title'], talk_data['thread_title'], talk_data['username']))
edthreads = list()
labels = {t: {} for t in thresholds}
scores = {}

# Prune to just those occurring in score data
for i, el in enumerate(sorted(poss_edthreads)):
    rows = score_data[(score_data['article']==el[0]) &
                    (score_data['thread_title']==el[1]) &
                    (score_data['editor']==el[2])]
    if not rows.empty:
        edthreads.append(el)
        score = rows['editor_thread_score'].iloc[0]
        scores[el] = score
        for t in thresholds:
            labels[t][el] = 1 if score > t else 0

# Assemble input text of just editors' text
edtalk = defaultdict(str)
n_edturns = {}
n_otherturns = {}
othertalk = defaultdict(str)

for i, el in enumerate(edthreads):
    rows = talk_data[(talk_data['article_title']==el[0]) &
                    (talk_data['thread_title']==el[1])]
    edrows = rows[rows['username']==el[2]]
    edtalk[el] += ' '.join([str(t) for t in edrows['post_text'].tolist()])
    n_edturns[el] = len(edrows)
    
    other_rows = rows[rows['username']!=el[2]]
    othertalk[el] += ' '.join([str(t) for t in other_rows['post_text'].tolist()])
    n_otherturns[el] = len(other_rows)

In [4]:
# Build one relevant dataframe
outrows = []
for i, el in enumerate(edthreads):
    outrows.append([el[0], el[1], el[2], edtalk[el], othertalk[el], n_edturns[el], n_otherturns[el],
                    scores[el], *[labels[t][el] for t in thresholds]])
    
talk_scores = pd.DataFrame(outrows, columns=['article', 'thread_title', 'editor', 'editor_talk', 'other_talk',
                              '#editor_turns', '#other_turns', 'editor_score', *['score_>{:0.1f}'.format(t) for t in thresholds]])
talk_scores.to_csv('/home/michael/school/research/wp/wikipedia/data/talk_scores.csv', index=False)
talk_scores

Unnamed: 0,article,thread_title,editor,editor_talk,other_talk,#editor_turns,#other_turns,editor_score,score_>0.4,score_>0.5,score_>0.6,score_>0.7,score_>0.8,score_>0.9
0,1929 Hebron massacre,Restatement,Bless sins,":""The massacure was preformed by muslims, ther...",Restatement: Sometime ago I asked for source o...,1,17,0.000000,0,0,0,0,0,0
1,1929 Hebron massacre,Restatement,Ceedjee,Restatement: Sometime ago I asked for source o...,"The massacure was preformed by muslims, theref...",9,9,0.800000,1,1,1,1,1,0
2,1929 Hebron massacre,Restatement,GHcool,:::::::Nishidani's argument for why the Hebron...,Restatement: Sometime ago I asked for source o...,1,17,0.968254,1,1,1,1,1,1
3,1929 Hebron massacre,Restatement,Ian Pitchford,:And by that logic all killings by Jews would ...,Restatement: Sometime ago I asked for source o...,1,17,1.000000,1,1,1,1,1,1
4,1929 Hebron massacre,Restatement,Nishidani,::::::I disagree with your my friend. Benny Mo...,Restatement: Sometime ago I asked for source o...,1,17,0.858407,1,1,1,1,1,0
5,1929 Hebron massacre,Restatement,PalestineRemembered,":::I'm not sure how you can say ""Hebron massac...",Restatement: Sometime ago I asked for source o...,1,17,0.707048,1,1,1,1,0,0
6,1929 Hebron massacre,Restatement,Yahel Guhan,"The massacure was preformed by muslims, theref...",Restatement: Sometime ago I asked for source o...,2,16,0.500000,1,0,0,0,0,0
7,1929 Hebron massacre,ethnic cleansing cat,Nableezy,"Brewcrewer, could you please provide reliable ...",Here's a few just to start: [ ][ ][ ][ ]--brew...,2,2,1.000000,1,1,1,1,1,1
8,1929 Hebron massacre,false rumors,Nableezy,"This is silly, but a. false rumors is not exac...",I'd like to point out that the days before the...,3,17,0.000000,0,0,0,0,0,0
9,1929 Hebron massacre,false rumors,Nishidani,::(ec)It violates WP:NPOV but has some support...,"This is silly, but a. false rumors is not exac...",9,11,0.994444,1,1,1,1,1,1


In [5]:
# Vectorize input features
edturns = np.array([[n_edturns[el] for el in edthreads]]).T
otherturns = np.array([[n_otherturns[el] for el in edthreads]]).T
 = np.array([[n_otherturns[el] for el in edthreads]]).T

v = CountVectorizer(min_df=1, stop_words='english')
edbow = v.fit_transform([edtalk[k] for k in edthreads])
print(edbow.shape)

v_other = CountVectorizer(min_df=1, stop_words='english')
other_bow = v_other.fit_transform([othertalk[k] for k in edthreads])
print(other_bow.shape)

bow = hstack([edturns, otherturns, edbow, other_bow])
print(bow.shape)

(2167, 1)
(2167, 1)
(2167, 24508)
(2167, 27456)
(2167, 51966)


In [11]:
# Train and test logistic regression classifier--editor and others' text, with turn information
for t in thresholds:
    print(t)
    clf = LogisticRegression()

    scores = cross_validation.cross_val_score(clf, bow, list(labels[t].values()), scoring=kappa_scorer)
    print("Kappa: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

    # Majority class guess
    true_portion = np.count_nonzero(np.array(list(labels[t].values())))/len(labels[t].values())
    print("Majority class guess:\t", max(true_portion, 1-true_portion))
    
    # Random class guess
    print("Random class guess:\t", true_portion**2 + (1-true_portion)**2)
    print()

0.4
Kappa: -0.032 (+/- 0.095)
Majority class guess:	 0.6594370096908168
Random class guess:	 0.5508403201182992

0.5
Kappa: -0.041 (+/- 0.066)
Majority class guess:	 0.5897554222427319
Random class guess:	 0.5161120716439422

0.6
Kappa: -0.058 (+/- 0.068)
Majority class guess:	 0.5366866635902169
Random class guess:	 0.5026918225707635

0.7
Kappa: -0.027 (+/- 0.069)
Majority class guess:	 0.5279187817258884
Random class guess:	 0.5015589167461156

0.8
Kappa: -0.015 (+/- 0.078)
Majority class guess:	 0.5911398246423627
Random class guess:	 0.5166129352716813

0.9
Kappa: -0.004 (+/- 0.056)
Majority class guess:	 0.6797415782187356
Random class guess:	 0.5646140698811237



In [9]:
# Train and test SVM classifier--editor and others' text, with turn information
for t in thresholds:
    print(t)
    clf = svm.SVC()

    scores = cross_validation.cross_val_score(clf, bow, list(labels[t].values()), scoring=kappa_scorer)
    print("Kappa: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

    # Majority class guess
    true_portion = np.count_nonzero(np.array(list(labels[t].values())))/len(labels[t].values())
    print("Majority class guess:\t", max(true_portion, 1-true_portion))
    
    # Random class guess
    print("Random class guess:\t", true_portion**2 + (1-true_portion)**2)
    print()

0.4
Kappa: 0.000 (+/- 0.000)
Majority class guess:	 0.6594370096908168
Random class guess:	 0.5508403201182992

0.5
Kappa: 0.001 (+/- 0.002)
Majority class guess:	 0.5897554222427319
Random class guess:	 0.5161120716439422

0.6
Kappa: -0.004 (+/- 0.006)
Majority class guess:	 0.5366866635902169
Random class guess:	 0.5026918225707635

0.7
Kappa: -0.007 (+/- 0.017)
Majority class guess:	 0.5279187817258884
Random class guess:	 0.5015589167461156

0.8
Kappa: 0.000 (+/- 0.000)
Majority class guess:	 0.5911398246423627
Random class guess:	 0.5166129352716813

0.9
Kappa: 0.000 (+/- 0.000)
Majority class guess:	 0.6797415782187356
Random class guess:	 0.5646140698811237



In [24]:
# Train and test SVM classifier--just editors' text
for t in thresholds:
    print(t)
    clf = svm.SVC()

    scores = cross_validation.cross_val_score(clf, edbow, list(labels[t].values()))
    print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

    # Majority class guess
    true_portion = np.count_nonzero(np.array(list(labels[t].values())))/len(labels[t].values())
    print("Majority class guess:\t", max(true_portion, 1-true_portion))
    
    # Random class guess
    print("Random class guess:\t", true_portion**2 + (1-true_portion)**2)
    print()

0.4
Accuracy: 0.652 (+/- 0.001)
Majority class guess:	 0.652053530226119
Random class guess:	 0.5462405521084506

0.5
Accuracy: 0.586 (+/- 0.001)
Majority class guess:	 0.5856022150438394
Random class guess:	 0.5146554784408235

0.6
Accuracy: 0.533 (+/- 0.001)
Majority class guess:	 0.5329949238578681
Random class guess:	 0.502177330000773

0.7
Accuracy: 0.531 (+/- 0.001)
Majority class guess:	 0.5306875865251499
Random class guess:	 0.501883455933477

0.8
Accuracy: 0.594 (+/- 0.001)
Majority class guess:	 0.5939086294416244
Random class guess:	 0.5176376613672087

0.9
Accuracy: 0.679 (+/- 0.000)
Majority class guess:	 0.6788186432856483
Random class guess:	 0.5639522143730399



In [19]:
# Train and test NB classifier--editor and others' text
for t in thresholds:
    print(t)
    clf = MultinomialNB()

    scores = cross_validation.cross_val_score(clf, bow, list(labels[t].values()))
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    # Majority class guess
    true_portion = np.count_nonzero(np.array(list(labels[t].values())))/len(labels[t].values())
    print("Majority class guess:\t", max(true_portion, 1-true_portion))
    
    # Random class guess
    print("Random class guess:\t", true_portion**2 + (1-true_portion)**2)
    print()

0.4
Accuracy: 0.51 (+/- 0.06)
Majority class guess:	 0.652053530226119
Random class guess:	 0.5462405521084506

0.5
Accuracy: 0.51 (+/- 0.01)
Majority class guess:	 0.5856022150438394
Random class guess:	 0.5146554784408235

0.6
Accuracy: 0.49 (+/- 0.03)
Majority class guess:	 0.5329949238578681
Random class guess:	 0.502177330000773

0.7
Accuracy: 0.51 (+/- 0.03)
Majority class guess:	 0.5306875865251499
Random class guess:	 0.501883455933477

0.8
Accuracy: 0.50 (+/- 0.05)
Majority class guess:	 0.5939086294416244
Random class guess:	 0.5176376613672087

0.9
Accuracy: 0.54 (+/- 0.04)
Majority class guess:	 0.6788186432856483
Random class guess:	 0.5639522143730399



In [26]:
# Build input corpora of just editors' text, get labels
poss_edthreads = set(zip(talk_data['article_title'], talk_data['thread_title'], talk_data['username']))
edthreads = list()
labels = {}

# Prune to just those occurring in score data
for i, el in enumerate(sorted(poss_edthreads)):
    rows = score_data[(score_data['article']==el[0]) &
                    (score_data['thread_title']==el[1]) &
                    (score_data['editor']==el[2])]
    if not rows.empty:
        edthreads.append(el)
        labels[el] = 1 if rows['editor_thread_score'].iloc[0] > 0.5 else 0
    
# print(len(edthreads))
# print(len(labels))

# Assemble input text of just editors' text
edtalk = defaultdict(str)

for i, el in enumerate(edthreads):
    rows = talk_data[(talk_data['article_title']==el[0]) &
                    (talk_data['thread_title']==el[1]) &
                    (talk_data['username']==el[2])]
    edtalk[el] += ' '.join([str(t) for t in rows['post_text'].tolist()])

# Vectorize input features
v = CountVectorizer(min_df=1)
bow = v.fit_transform([edtalk[k] for k in edthreads])
bow.shape