In [1]:
import pandas as pd
import sys, json, re, os
from collections import defaultdict, OrderedDict
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import cohen_kappa_score, make_scorer
from sklearn.grid_search import GridSearchCV
from sklearn import cross_validation, svm
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack

# Interactive debugging hook: calling debug_here() drops into the debugger.
# NOTE(review): debug_here is not called in any of the visible cells -- looks
# like leftover debugging; confirm before removing.
from IPython.core.debugger import Tracer; debug_here = Tracer()

# Scorer wrapping Cohen's kappa; passed as `scoring=` to cross_val_score in the
# logistic-regression and SVM cells below.
kappa_scorer = make_scorer(cohen_kappa_score)



In [2]:
# Load, initialize data
# The data directory defaults to the original location but can be redirected by
# setting the WP_DATA_DIR environment variable, so the input paths are defined
# in one place instead of being hard-coded per file.
DATA_DIR = os.environ.get('WP_DATA_DIR', '/home/michael/school/research/wp/wikipedia/data')

# Per-editor, per-thread scores (columns read later: article, thread_title,
# editor, editor_thread_score).
score_data = pd.read_csv(os.path.join(DATA_DIR, 'editor_thread_scores.csv'),
                         parse_dates=['edit_timestamp'])

# Talk-page posts (columns read later: article_title, thread_title, username,
# post_text).
talk_data = pd.read_csv(os.path.join(DATA_DIR, 'talk', 'ipc_talkpages_byarticle.csv'),
                        parse_dates=['post_timestamp'])

# Boolean mask that is False for posts whose text is nothing but colons.
# str() is applied first so non-string cells (e.g. missing values) do not make
# re.match raise.
crit = talk_data['post_text'].map(lambda x: not re.match(r':+$', str(x))) # take out entries of just colons
talk_data = talk_data[crit]

In [3]:
# Build input corpora of editors' text + others' text, get labels
#
# An "edthread" is an (article, thread title, username) triple taken from the
# talk data. Only edthreads that also appear in score_data are kept. For each
# kept edthread this cell produces:
#   scores[el]        -- the first matching editor_thread_score in score_data
#   labels[t][el]     -- 1 if that score is strictly greater than threshold t, else 0
#   edtalk[el]        -- the editor's own posts in the thread, space-joined
#   othertalk[el]     -- everyone else's posts in the thread, space-joined
#   n_edturns[el], n_otherturns[el] -- the corresponding post counts
thresholds = np.arange(0.4, 1.0, 0.1)
poss_edthreads = set(zip(talk_data['article_title'], talk_data['thread_title'], talk_data['username']))
edthreads = list()
labels = {t: {} for t in thresholds}
scores = {}

# Index score_data once instead of re-scanning the whole frame with a boolean
# mask for every candidate edthread. setdefault keeps the FIRST score seen per
# key, matching the previous `.iloc[0]` behaviour. Rows with a missing key
# field are dropped because an equality mask could never have matched them.
score_key_cols = ['article', 'thread_title', 'editor']
scored_rows = score_data.dropna(subset=score_key_cols)
first_score = {}
for art, thread, editor, value in zip(scored_rows['article'], scored_rows['thread_title'],
                                      scored_rows['editor'], scored_rows['editor_thread_score']):
    first_score.setdefault((art, thread, editor), value)

# Prune to just those occurring in score data
# (sorted so that edthreads -- and every matrix built from it -- has a stable order)
for el in sorted(poss_edthreads):
    if el not in first_score:
        continue
    edthreads.append(el)
    score = first_score[el]
    scores[el] = score
    for t in thresholds:
        labels[t][el] = 1 if score > t else 0

# Index the talk posts once by (article, thread), preserving row order, so each
# edthread below only looks at the posts of its own thread.
posts_by_thread = defaultdict(list)
for art, thread, user, text in zip(talk_data['article_title'], talk_data['thread_title'],
                                   talk_data['username'], talk_data['post_text']):
    posts_by_thread[(art, thread)].append((user, str(text)))

# Assemble input text: the editor's own posts vs. all other participants' posts
edtalk = defaultdict(str)
n_edturns = {}
n_otherturns = {}
othertalk = defaultdict(str)

for el in edthreads:
    thread_posts = posts_by_thread[(el[0], el[1])]

    ed_posts = [text for user, text in thread_posts if user == el[2]]
    edtalk[el] += ' '.join(ed_posts)
    n_edturns[el] = len(ed_posts)

    # `!=` also routes posts with a missing username to the "other" side,
    # exactly as the previous boolean mask did.
    other_posts = [text for user, text in thread_posts if user != el[2]]
    othertalk[el] += ' '.join(other_posts)
    n_otherturns[el] = len(other_posts)

In [4]:
# Build one relevant dataframe
# One row per edthread: identifying fields, the two text corpora, the turn
# counts, the raw score, then one binary label column per threshold.
label_columns = ['score_>{:0.1f}'.format(t) for t in thresholds]
base_columns = ['article', 'thread_title', 'editor', 'editor_talk', 'other_talk',
                '#editor_turns', '#other_turns', 'editor_score']

outrows = [
    [el[0], el[1], el[2], edtalk[el], othertalk[el], n_edturns[el], n_otherturns[el], scores[el]]
    + [labels[t][el] for t in thresholds]
    for el in edthreads
]

talk_scores = pd.DataFrame(outrows, columns=base_columns + label_columns)
# Persist the assembled table (without the positional index) next to the input data.
talk_scores.to_csv('/home/michael/school/research/wp/wikipedia/data/talk_scores.csv', index=False)
talk_scores

Unnamed: 0,article,thread_title,editor,editor_talk,other_talk,#editor_turns,#other_turns,editor_score,score_>0.4,score_>0.5,score_>0.6,score_>0.7,score_>0.8,score_>0.9
0,1929 Hebron massacre,Restatement,Bless sins,":""The massacure was preformed by muslims, ther...",Restatement: Sometime ago I asked for source o...,1,17,0.000000,0,0,0,0,0,0
1,1929 Hebron massacre,Restatement,Ceedjee,Restatement: Sometime ago I asked for source o...,"The massacure was preformed by muslims, theref...",9,9,0.800000,1,1,1,1,1,0
2,1929 Hebron massacre,Restatement,GHcool,:::::::Nishidani's argument for why the Hebron...,Restatement: Sometime ago I asked for source o...,1,17,0.968254,1,1,1,1,1,1
3,1929 Hebron massacre,Restatement,Ian Pitchford,:And by that logic all killings by Jews would ...,Restatement: Sometime ago I asked for source o...,1,17,1.000000,1,1,1,1,1,1
4,1929 Hebron massacre,Restatement,Nishidani,::::::I disagree with your my friend. Benny Mo...,Restatement: Sometime ago I asked for source o...,1,17,0.858407,1,1,1,1,1,0
5,1929 Hebron massacre,Restatement,PalestineRemembered,":::I'm not sure how you can say ""Hebron massac...",Restatement: Sometime ago I asked for source o...,1,17,0.707048,1,1,1,1,0,0
6,1929 Hebron massacre,Restatement,Yahel Guhan,"The massacure was preformed by muslims, theref...",Restatement: Sometime ago I asked for source o...,2,16,0.500000,1,0,0,0,0,0
7,1929 Hebron massacre,ethnic cleansing cat,Nableezy,"Brewcrewer, could you please provide reliable ...",Here's a few just to start: [ ][ ][ ][ ]--brew...,2,2,1.000000,1,1,1,1,1,1
8,1929 Hebron massacre,false rumors,Nableezy,"This is silly, but a. false rumors is not exac...",I'd like to point out that the days before the...,3,17,0.000000,0,0,0,0,0,0
9,1929 Hebron massacre,false rumors,Nishidani,::(ec)It violates WP:NPOV but has some support...,"This is silly, but a. false rumors is not exac...",9,11,0.994444,1,1,1,1,1,1


In [5]:
# Vectorize input features
# Every matrix below has one row per entry of `edthreads`, in that order.

# Turn counts as (n_edthreads, 1) column vectors so they can be stacked with
# the bag-of-words matrices.
edturns = np.array([n_edturns[el] for el in edthreads]).reshape(-1, 1)
otherturns = np.array([n_otherturns[el] for el in edthreads]).reshape(-1, 1)
for turn_counts in (edturns, otherturns):
    print(turn_counts.shape)

# Bag of words over each editor's own text (English stop words removed).
editor_docs = [edtalk[k] for k in edthreads]
v = CountVectorizer(min_df=1, stop_words='english')
edbow = v.fit_transform(editor_docs)
print(edbow.shape)

# Separate vocabulary for the other participants' text.
other_docs = [othertalk[k] for k in edthreads]
v_other = CountVectorizer(min_df=1, stop_words='english')
other_bow = v_other.fit_transform(other_docs)
print(other_bow.shape)

# Final feature matrix: [editor turns | other turns | editor BoW | others' BoW]
bow = hstack([edturns, otherturns, edbow, other_bow])
print(bow.shape)

(2167, 1)
(2167, 1)
(2167, 24508)
(2167, 27456)
(2167, 51966)


In [11]:
# Train and test logistic regression classifier--editor and others' text, with turn information
# For each score threshold: cross-validate with Cohen's kappa, then print two
# label-distribution baselines for reference.
for t in thresholds:
    print(t)
    clf = LogisticRegression()

    # Look the label up per edthread so that y[i] belongs to row i of `bow`
    # (whose rows are built in `edthreads` order). Taking labels[t].values()
    # instead depends on dict iteration order, which is not guaranteed to be
    # insertion order before Python 3.7 and can silently shuffle the labels.
    y = np.array([labels[t][el] for el in edthreads])

    # Named cv_scores so the `scores` dict of editor-thread scores (read when
    # building the talk_scores dataframe) is not overwritten.
    cv_scores = cross_validation.cross_val_score(clf, bow, y, scoring=kappa_scorer)
    print("Kappa: %0.3f (+/- %0.3f)" % (cv_scores.mean(), cv_scores.std() * 2))

    # Majority class guess
    # Fraction of positive labels; the baseline is the share of the larger class.
    true_portion = np.count_nonzero(y)/len(y)
    print("Majority class guess:\t", max(true_portion, 1-true_portion))
    
    # Random class guess
    # Expected agreement when guessing each class with its empirical frequency.
    print("Random class guess:\t", true_portion**2 + (1-true_portion)**2)
    print()

0.4
Kappa: -0.032 (+/- 0.095)
Majority class guess:	 0.6594370096908168
Random class guess:	 0.5508403201182992

0.5
Kappa: -0.041 (+/- 0.066)
Majority class guess:	 0.5897554222427319
Random class guess:	 0.5161120716439422

0.6
Kappa: -0.058 (+/- 0.068)
Majority class guess:	 0.5366866635902169
Random class guess:	 0.5026918225707635

0.7
Kappa: -0.027 (+/- 0.069)
Majority class guess:	 0.5279187817258884
Random class guess:	 0.5015589167461156

0.8
Kappa: -0.015 (+/- 0.078)
Majority class guess:	 0.5911398246423627
Random class guess:	 0.5166129352716813

0.9
Kappa: -0.004 (+/- 0.056)
Majority class guess:	 0.6797415782187356
Random class guess:	 0.5646140698811237



In [9]:
# Train and test SVM classifier--editor and others' text, with turn information
# For each score threshold: cross-validate with Cohen's kappa, then print two
# label-distribution baselines for reference.
for t in thresholds:
    print(t)
    clf = svm.SVC()

    # Look the label up per edthread so that y[i] belongs to row i of `bow`
    # (whose rows are built in `edthreads` order). Taking labels[t].values()
    # instead depends on dict iteration order, which is not guaranteed to be
    # insertion order before Python 3.7 and can silently shuffle the labels.
    y = np.array([labels[t][el] for el in edthreads])

    # Named cv_scores so the `scores` dict of editor-thread scores (read when
    # building the talk_scores dataframe) is not overwritten.
    cv_scores = cross_validation.cross_val_score(clf, bow, y, scoring=kappa_scorer)
    print("Kappa: %0.3f (+/- %0.3f)" % (cv_scores.mean(), cv_scores.std() * 2))

    # Majority class guess
    # Fraction of positive labels; the baseline is the share of the larger class.
    true_portion = np.count_nonzero(y)/len(y)
    print("Majority class guess:\t", max(true_portion, 1-true_portion))
    
    # Random class guess
    # Expected agreement when guessing each class with its empirical frequency.
    print("Random class guess:\t", true_portion**2 + (1-true_portion)**2)
    print()

0.4
Kappa: 0.000 (+/- 0.000)
Majority class guess:	 0.6594370096908168
Random class guess:	 0.5508403201182992

0.5
Kappa: 0.001 (+/- 0.002)
Majority class guess:	 0.5897554222427319
Random class guess:	 0.5161120716439422

0.6
Kappa: -0.004 (+/- 0.006)
Majority class guess:	 0.5366866635902169
Random class guess:	 0.5026918225707635

0.7
Kappa: -0.007 (+/- 0.017)
Majority class guess:	 0.5279187817258884
Random class guess:	 0.5015589167461156

0.8
Kappa: 0.000 (+/- 0.000)
Majority class guess:	 0.5911398246423627
Random class guess:	 0.5166129352716813

0.9
Kappa: 0.000 (+/- 0.000)
Majority class guess:	 0.6797415782187356
Random class guess:	 0.5646140698811237



In [24]:
# Train and test SVM classifier--just editors' text
# For each score threshold: cross-validate on the editor-only bag of words
# (`edbow`) with the estimator's default scoring, then print two
# label-distribution baselines for reference.
for t in thresholds:
    print(t)
    clf = svm.SVC()

    # Look the label up per edthread so that y[i] belongs to row i of `edbow`
    # (whose rows are built in `edthreads` order). Taking labels[t].values()
    # instead depends on dict iteration order, which is not guaranteed to be
    # insertion order before Python 3.7 and can silently shuffle the labels.
    y = np.array([labels[t][el] for el in edthreads])

    # Named cv_scores so the `scores` dict of editor-thread scores (read when
    # building the talk_scores dataframe) is not overwritten.
    cv_scores = cross_validation.cross_val_score(clf, edbow, y)
    print("Accuracy: %0.3f (+/- %0.3f)" % (cv_scores.mean(), cv_scores.std() * 2))

    # Majority class guess
    # Fraction of positive labels; the baseline is the share of the larger class.
    true_portion = np.count_nonzero(y)/len(y)
    print("Majority class guess:\t", max(true_portion, 1-true_portion))
    
    # Random class guess
    # Expected accuracy when guessing each class with its empirical frequency.
    print("Random class guess:\t", true_portion**2 + (1-true_portion)**2)
    print()

0.4
Accuracy: 0.652 (+/- 0.001)
Majority class guess:	 0.652053530226119
Random class guess:	 0.5462405521084506

0.5
Accuracy: 0.586 (+/- 0.001)
Majority class guess:	 0.5856022150438394
Random class guess:	 0.5146554784408235

0.6
Accuracy: 0.533 (+/- 0.001)
Majority class guess:	 0.5329949238578681
Random class guess:	 0.502177330000773

0.7
Accuracy: 0.531 (+/- 0.001)
Majority class guess:	 0.5306875865251499
Random class guess:	 0.501883455933477

0.8
Accuracy: 0.594 (+/- 0.001)
Majority class guess:	 0.5939086294416244
Random class guess:	 0.5176376613672087

0.9
Accuracy: 0.679 (+/- 0.000)
Majority class guess:	 0.6788186432856483
Random class guess:	 0.5639522143730399



In [19]:
# Train and test NB classifier--editor and others' text
# For each score threshold: cross-validate on `bow` with the estimator's
# default scoring, then print two label-distribution baselines for reference.
for t in thresholds:
    print(t)
    clf = MultinomialNB()

    # Look the label up per edthread so that y[i] belongs to row i of `bow`
    # (whose rows are built in `edthreads` order). Taking labels[t].values()
    # instead depends on dict iteration order, which is not guaranteed to be
    # insertion order before Python 3.7 and can silently shuffle the labels.
    y = np.array([labels[t][el] for el in edthreads])

    # Named cv_scores so the `scores` dict of editor-thread scores (read when
    # building the talk_scores dataframe) is not overwritten.
    cv_scores = cross_validation.cross_val_score(clf, bow, y)
    print("Accuracy: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std() * 2))

    # Majority class guess
    # Fraction of positive labels; the baseline is the share of the larger class.
    true_portion = np.count_nonzero(y)/len(y)
    print("Majority class guess:\t", max(true_portion, 1-true_portion))
    
    # Random class guess
    # Expected accuracy when guessing each class with its empirical frequency.
    print("Random class guess:\t", true_portion**2 + (1-true_portion)**2)
    print()

0.4
Accuracy: 0.51 (+/- 0.06)
Majority class guess:	 0.652053530226119
Random class guess:	 0.5462405521084506

0.5
Accuracy: 0.51 (+/- 0.01)
Majority class guess:	 0.5856022150438394
Random class guess:	 0.5146554784408235

0.6
Accuracy: 0.49 (+/- 0.03)
Majority class guess:	 0.5329949238578681
Random class guess:	 0.502177330000773

0.7
Accuracy: 0.51 (+/- 0.03)
Majority class guess:	 0.5306875865251499
Random class guess:	 0.501883455933477

0.8
Accuracy: 0.50 (+/- 0.05)
Majority class guess:	 0.5939086294416244
Random class guess:	 0.5176376613672087

0.9
Accuracy: 0.54 (+/- 0.04)
Majority class guess:	 0.6788186432856483
Random class guess:	 0.5639522143730399



In [26]:
# Build input corpora of just editors' text, get labels
#
# NOTE(review): this cell rebinds `edthreads`, `labels` (here a flat
# {edthread: 0/1} dict rather than one dict per threshold), `edtalk`, `v` and
# `bow`, all of which the cells above also use. Re-running an earlier cell
# after this one will therefore see different objects -- confirm whether this
# cell is still needed or should use its own names.
poss_edthreads = set(zip(talk_data['article_title'], talk_data['thread_title'], talk_data['username']))
edthreads = list()
labels = {}

# Index score_data once instead of re-scanning the whole frame with a boolean
# mask for every candidate edthread. setdefault keeps the FIRST score seen per
# key, matching the previous `.iloc[0]` behaviour. Rows with a missing key
# field are dropped because an equality mask could never have matched them.
score_key_cols = ['article', 'thread_title', 'editor']
scored_rows = score_data.dropna(subset=score_key_cols)
first_score = {}
for art, thread, editor, value in zip(scored_rows['article'], scored_rows['thread_title'],
                                      scored_rows['editor'], scored_rows['editor_thread_score']):
    first_score.setdefault((art, thread, editor), value)

# Prune to just those occurring in score data
for el in sorted(poss_edthreads):
    if el in first_score:
        edthreads.append(el)
        # Positive label when the editor's score is strictly above 0.5
        labels[el] = 1 if first_score[el] > 0.5 else 0
    
# print(len(edthreads))
# print(len(labels))

# Index each (article, thread, username)'s posts once, preserving row order.
posts_by_edthread = defaultdict(list)
for art, thread, user, text in zip(talk_data['article_title'], talk_data['thread_title'],
                                   talk_data['username'], talk_data['post_text']):
    posts_by_edthread[(art, thread, user)].append(str(text))

# Assemble input text of just editors' text
edtalk = defaultdict(str)

for el in edthreads:
    edtalk[el] += ' '.join(posts_by_edthread[el])

# Vectorize input features
# (no stop-word removal here, unlike the vectorizers built earlier)
v = CountVectorizer(min_df=1)
bow = v.fit_transform([edtalk[k] for k in edthreads])
bow.shape