In [1]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn import cross_validation
from sklearn.metrics import cohen_kappa_score, make_scorer
import pandas as pd
import numpy as np
import scipy
from random import random

from IPython.core.debugger import Tracer; debug_here = Tracer()

In [None]:
# CLASSIFYING 1 DOCUMENT/EDITOR, with cross-validation
# Prepare data
threshold = 0.2
data = pd.read_csv('/home/michael/school/research/wp/wikipedia/data/editortext_factors_{:1.1f}.csv'.format(threshold))
eds = {}

# Get neutral editors
rows = data[data['factor']==0]
eds[0] = set(rows['editor'].tolist())

edx = {}
edy = {}

for i in range(1, 11):
    rows = data[data['factor']==i]
    eds[i] = set(rows['editor'].tolist())
    
    # Split up editor corpus (including neutral editors)
    edx[i] = list(eds[0]) + list(eds[i])
    edy[i] = [0] * len(eds[0]) + [1] * len(eds[i])
    
# Extract features
# vectorizer = CountVectorizer(stop_words='english', min_df=1)
# vectorizer = CountVectorizer(min_df=1)
vectorizer = TfidfVectorizer(min_df=1)
corpus = {}
X = {}
y = {}

# Fit to factor words
for i in range(1, 11):
#     print(i)
    corpus[i] = []
    y[i] = []
    
    # Get text from editors (includes neutral editors)
    for j, ed in enumerate(edx[i]):
        text = data[data['editor']==ed]['additions'].values[0]
        if isinstance(text, str):
            corpus[i].append(text)
            y[i].append(edy[i][j])
        
    vectorizer.fit(corpus[i])
        
# Fit to neutral words
# vectorizer.fit([c for c in data[data['editor'].isin(eds[0])]['additions'].tolist() if not isinstance(c, float)])

# for i in range(1, 11):
    X[i] = vectorizer.transform(corpus[i])

In [3]:
# CLASSIFYING 1 DOCUMENT/EDITOR, with cross-validation--CITATION WINDOW
# Prepare data
threshold = 0.2
data = pd.read_csv('/home/michael/school/research/wp/wp_articles/editor_citewindow_factor{:1.1f}.csv'.format(threshold))
eds = {}

# Get neutral editors
rows = data[data['factor']==0]
eds[0] = set(rows['editor'].tolist())

edx = {}
edy = {}

for i in range(1, 11):
    rows = data[data['factor']==i]
    eds[i] = set(rows['editor'].tolist())
    
    # Split up editor corpus (including neutral editors)
    edx[i] = list(eds[0]) + list(eds[i])
    edy[i] = [0] * len(eds[0]) + [1] * len(eds[i])
    
# Extract features
# vectorizer = CountVectorizer(stop_words='english', min_df=1)
# vectorizer = CountVectorizer(min_df=1)
vectorizer = TfidfVectorizer(min_df=1)
corpus = {}
X = {}
y = {}

# Fit to factor words
for i in range(1, 11):
#     print(i)
    corpus[i] = []
    y[i] = []
    
    # Get text from editors (includes neutral editors)
    for j, ed in enumerate(edx[i]):
        text = data[data['editor']==ed]['text'].values[0]
        if isinstance(text, str):
            corpus[i].append(text)
            y[i].append(edy[i][j])
        
    vectorizer.fit(corpus[i])
        
# Fit to neutral words
# vectorizer.fit([c for c in data[data['editor'].isin(eds[0])]['additions'].tolist() if not isinstance(c, float)])

# for i in range(1, 11):
    X[i] = vectorizer.transform(corpus[i])

In [4]:
# Print number of editors
for i in range(11):
    print(i, len(eds[i]))

0 461
1 204
2 123
3 128
4 46
5 162
6 94
7 215
8 126
9 100
10 190


In [5]:
# Train classifier -- CITATION WINDOW
clf = MultinomialNB()
results = []
zeror = []
rands = []
kappa_scorer = make_scorer(cohen_kappa_score)
kappas = []

for i in range(1,11):
    print("{:d}: ".format(i), end='')

    # accuracy
    scores = cross_validation.cross_val_score(clf, X[i], y[i], cv=10)
    results.append(scores.mean())
    print("\taccuracy:\t%f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    
    # kappa
#     scores = cross_validation.cross_val_score(clf, X[i], y[i], cv=10, scoring=kappa_scorer)
#     kappa = scores.mean()
#     kappas.append(kappa)
#     print("\tkappa:\t\t".format(kappa))

    # Random baseline
    factor_perc = len(eds[i])/(len(eds[i]) + len(eds[0]))
    neutral_perc = 1-factor_perc
    zeror.append(neutral_perc)
    print('\tall neutral:\t{:f}'.format(neutral_perc))
    
    rand_baseline = factor_perc**2 + neutral_perc**2
    rands.append(rand_baseline)
    print('\trandom:\t\t{:f}'.format(rand_baseline))
    print()
    
print()
print('mean accuracy:\t\t', np.mean(results))
print('mean all neutral:\t', np.mean(zeror))
# print('mean kappa:\t', np.mean(kappas))
print('mean random:\t\t', np.mean(rands))

1: 	accuracy:	0.693270 (+/- 0.01)
	all neutral:	0.693233
	random:		0.574678

2: 	accuracy:	0.789438 (+/- 0.01)
	all neutral:	0.789384
	random:		0.667486

3: 	accuracy:	0.782717 (+/- 0.01)
	all neutral:	0.782683
	random:		0.659819

4: 	accuracy:	0.909365 (+/- 0.02)
	all neutral:	0.909270
	random:		0.835004

5: 	accuracy:	0.740002 (+/- 0.01)
	all neutral:	0.739968
	random:		0.615169

6: 	accuracy:	0.830703 (+/- 0.01)
	all neutral:	0.830631
	random:		0.718633

7: 	accuracy:	0.681988 (+/- 0.01)
	all neutral:	0.681953
	random:		0.566214

8: 	accuracy:	0.785405 (+/- 0.01)
	all neutral:	0.785349
	random:		0.662848

9: 	accuracy:	0.821742 (+/- 0.00)
	all neutral:	0.821747
	random:		0.707042

10: 	accuracy:	0.708135 (+/- 0.00)
	all neutral:	0.708141
	random:		0.586646


mean accuracy:		 0.774276399442
mean all neutral:	 0.774235799968
mean random:		 0.659353882057


In [50]:
# Train classifier
clf = MultinomialNB()
results = []
zeror = []
rands = []
kappa_scorer = make_scorer(cohen_kappa_score)
kappas = []

for i in range(1,11):
    print("{:d}: ".format(i), end='')

    # accuracy
    scores = cross_validation.cross_val_score(clf, X[i], y[i], cv=10)
    results.append(scores.mean())
    print("\taccuracy:\t%f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    
    # kappa
#     scores = cross_validation.cross_val_score(clf, X[i], y[i], cv=10, scoring=kappa_scorer)
#     kappa = scores.mean()
#     kappas.append(kappa)
#     print("\tkappa:\t\t".format(kappa))

    # Random baseline
    factor_perc = len(eds[i])/(len(eds[i]) + len(eds[0]))
    neutral_perc = 1-factor_perc
    zeror.append(neutral_perc)
    print('\tall neutral:\t{:f}'.format(neutral_perc))
    
    rand_baseline = factor_perc**2 + neutral_perc**2
    rands.append(rand_baseline)
    print('\trandom:\t\t{:f}'.format(rand_baseline))
    print()
    
print()
print('mean accuracy:\t\t', np.mean(results))
print('mean all neutral:\t', np.mean(zeror))
# print('mean kappa:\t', np.mean(kappas))
print('mean random:\t\t', np.mean(rands))

1: 	accuracy:	0.707421 (+/- 0.01)
	all neutral:	0.707801
	random:		0.586363

2: 	accuracy:	0.798137 (+/- 0.01)
	all neutral:	0.797125
	random:		0.676566

3: 	accuracy:	0.794281 (+/- 0.01)
	all neutral:	0.789557
	random:		0.667686

4: 	accuracy:	0.917229 (+/- 0.02)
	all neutral:	0.915596
	random:		0.845441

5: 	accuracy:	0.752307 (+/- 0.01)
	all neutral:	0.752640
	random:		0.627653

6: 	accuracy:	0.839869 (+/- 0.01)
	all neutral:	0.840067
	random:		0.731292

7: 	accuracy:	0.696534 (+/- 0.01)
	all neutral:	0.695955
	random:		0.576797

8: 	accuracy:	0.796856 (+/- 0.01)
	all neutral:	0.797125
	random:		0.676566

9: 	accuracy:	0.825938 (+/- 0.01)
	all neutral:	0.826159
	random:		0.712759

10: 	accuracy:	0.715546 (+/- 0.01)
	all neutral:	0.715925
	random:		0.593248


mean accuracy:		 0.784411807903
mean all neutral:	 0.783795047397
mean random:		 0.66943709778


In [40]:
# Train classifier
clf = MultinomialNB()
results = []
zeror = []
rands = []

for i in range(1,11):
    print("{:d}: ".format(i), end='')

    # Test
    scores = cross_validation.cross_val_score(clf, X[i], y[i], cv=10)
    results.append(scores.mean())
    print("\taccuracy:\t%f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    # Random baseline
    factor_perc = len(eds[i])/(len(eds[i]) + len(eds[0]))
    neutral_perc = 1-factor_perc
    zeror.append(neutral_perc)
    print('\tall neutral:\t{:f}'.format(neutral_perc))
    
    rand_baseline = factor_perc**2 + neutral_perc**2
    rands.append(rand_baseline)
    print('\trandom:\t\t{:f}'.format(rand_baseline))
    print()
    
print()
print('mean accuracy:\t\t', np.mean(results))
print('mean all neutral:\t', np.mean(zeror))
print('mean accuracy:\t\t', np.mean(rands))

1: 	accuracy:	0.522676 (+/- 0.12)
	all neutral:	0.707801
	random:		0.586363

2: 	accuracy:	0.543218 (+/- 0.15)
	all neutral:	0.797125
	random:		0.676566

3: 	accuracy:	0.537380 (+/- 0.13)
	all neutral:	0.789557
	random:		0.667686

4: 	accuracy:	0.778514 (+/- 0.14)
	all neutral:	0.915596
	random:		0.845441

5: 	accuracy:	0.572115 (+/- 0.15)
	all neutral:	0.752640
	random:		0.627653

6: 	accuracy:	0.585519 (+/- 0.12)
	all neutral:	0.840067
	random:		0.731292

7: 	accuracy:	0.510524 (+/- 0.13)
	all neutral:	0.695955
	random:		0.576797

8: 	accuracy:	0.538990 (+/- 0.11)
	all neutral:	0.797125
	random:		0.676566

9: 	accuracy:	0.580639 (+/- 0.15)
	all neutral:	0.826159
	random:		0.712759

10: 	accuracy:	0.525966 (+/- 0.13)
	all neutral:	0.715925
	random:		0.593248


mean accuracy:		 0.569554135712
mean all neutral:	 0.783795047397
mean accuracy:		 0.66943709778


In [18]:
# Prepare corpus -- editor-aware
data = pd.read_csv('/home/michael/school/research/wp/wikipedia/data/article_diffs_factors_0.2.csv')
# data = pd.read_csv('/home/michael/school/research/wp/wikipedia/data/article_diffs_factors_0.6.csv')
eds = {}

# Get neutral editors
rows = data[data['factor']==0]
eds[0] = set(rows['editor'].tolist())

edx_train = {}
edx_test = {}
edy_train = {}
edy_test = {}

for i in range(1, 11):
    rows = data[data['factor']==i]
    eds[i] = set(rows['editor'].tolist())
    
    # Split up editor corpus (including neutral editors)
    X = list(eds[0]) + list(eds[i])
    y = [0] * len(eds[0]) + [1] * len(eds[i])
    edx_train[i], edx_test[i], edy_train[i], edy_test[i] = train_test_split(X, y, test_size=0.2, random_state=42)
    
# Extract features
# vectorizer = CountVectorizer(stop_words='english', min_df=1)
vectorizer = CountVectorizer(min_df=1)
# vectorizer = TfidfVectorizer(min_df=1)
corpus_train = {}
corpus_test = {}
X_train = {}
X_test = {}
y_train = {}
y_test = {}

# Fit to factor words
for i in range(1, 11):
#     print(i)
    corpus_train[i] = []
    corpus_test[i] = []
    y_train[i] = []
    y_test[i] = []
    
    # Get text from editors (includes neutral editors)
    for j, ed in enumerate(edx_train[i]):
        tgt = edy_train[i][j]
        contributions = []
        for k, row in data[data['editor']==ed].iterrows():
            if not isinstance(row['additions'], float):
                contributions.append(str(row['additions']))
        corpus_train[i] += contributions
        y_train[i] += [tgt] * len(contributions)
        
    vectorizer.fit(corpus_train[i])
        
    for j, ed in enumerate(edx_test[i]):
        tgt = edy_test[i][j]
        contributions = []
        for k, row in data[data['editor']==ed].iterrows():
            if not isinstance(row['additions'], float):
                contributions.append(row['additions'])
        corpus_test[i] += contributions
        y_test[i] += [tgt] * len(contributions)
        
    vectorizer.fit(corpus_test[i])
    
# Fit to neutral words
vectorizer.fit([c for c in data[data['editor'].isin(eds[0])]['additions'].tolist() if not isinstance(c, float)])

for i in range(1, 11):
    X_train[i] = vectorizer.transform(corpus_train[i])
    X_test[i] = vectorizer.transform(corpus_test[i])
#     print(X_train[i].size)

# Train classifier
clf = MultinomialNB()
results = []

for i in range(1,11):
    print("{:d}: ".format(i), end='')
    clf.fit(X_train[i], y_train[i])

    # Test
    predicted = clf.predict(X_test[i])
    result = np.mean(predicted == y_test[i])
    print(result)
    results.append(result)

    # Random baseline
    baseline = np.array([round(random()) for _ in range(len(y_test[i]))])
    print('\trandom:\t\t', np.mean(baseline == y_test[i]))
    baseline = np.array([1 for _ in range(len(y_test[i]))]) # predict always the factor
    print('\tall factor:\t', np.mean(baseline == y_test[i]))
    baseline = np.array([0 for _ in range(len(y_test[i]))]) # predict always neutral
    print('\tall neutral:\t', np.mean(baseline == y_test[i]))
    
print()
print(np.mean(results))

1: 0.683547461644
	random:		 0.500311837346
	all factor:	 0.644006486217
	all neutral:	 0.355993513783
2: 0.702021653732
	random:		 0.49765258216
	all factor:	 0.750407205136
	all neutral:	 0.249592794864
3: 0.565072830906
	random:		 0.503008233059
	all factor:	 0.496200126662
	all neutral:	 0.503799873338
4: 0.557328853423
	random:		 0.4996100078
	all factor:	 0.843043139137
	all neutral:	 0.156956860863
5: 0.611668897813
	random:		 0.5
	all factor:	 0.697568049978
	all neutral:	 0.302431950022
6: 0.536787564767
	random:		 0.502849740933
	all factor:	 0.673056994819
	all neutral:	 0.326943005181
7: 0.712840888568
	random:		 0.498555174282
	all factor:	 0.720787430016
	all neutral:	 0.279212569984
8: 0.579863739591
	random:		 0.488872066616
	all factor:	 0.536866010598
	all neutral:	 0.463133989402
9: 0.578982009653
	random:		 0.485300570426
	all factor:	 0.568451075033
	all neutral:	 0.431548924967
10: 0.533207157605
	random:		 0.509463179628
	all factor:	 0.508602890571
	all neutral:

In [17]:
# ORIGINAL--faster/different than above
# Prepare corpus -- editor-aware (editors divided train/test)
data = pd.read_csv('/home/michael/school/research/wp/wikipedia/data/article_diffs_factors_0.2.csv')
# data = pd.read_csv('/home/michael/school/research/wp/wikipedia/data/article_diffs_factors_0.6.csv')
eds = {}

# Get neutral editors
rows = data[data['factor']==0]
eds[0] = set(rows['editor'].tolist())
print("Number of editors:")
print("\t0:", len(eds[0]))

edx_train = {}
edx_test = {}
edy_train = {}
edy_test = {}

for i in range(1, 11):
    rows = data[data['factor']==i]
    eds[i] = set(rows['editor'].tolist())
    
    # Split up editors
    X = list(eds[0]) + list(eds[i])
    print("\t{0}: {1}".format(i, len(eds[i])))
    y = [0] * len(eds[0]) + [1] * len(eds[i])
    edx_train[i], edx_test[i], edy_train[i], edy_test[i] = train_test_split(X, y, test_size=0.2, random_state=42)

# Extract features
# vectorizer = CountVectorizer(stop_words='english', min_df=1)
vectorizer = CountVectorizer(min_df=1)
# vectorizer = TfidfVectorizer(min_df=1)
corpus_train = {}
corpus_test = {}
X_train = {}
X_test = {}
y_train = {}
y_test = {}

# Fit to neutral words
vectorizer.fit([c for c in data[data['editor'].isin(eds[0])]['additions'].tolist() if not isinstance(c, float)])

for i in range(1, 11):
    corpus_train[i] = []
    corpus_test[i] = []
    y_train[i] = []
    y_test[i] = []
    
    # Get text from editors
    for j, ed in enumerate(edx_train[i]):
        contributions = data[data['editor']==ed]['additions'].tolist()
        contributions = [c for c in contributions if not isinstance(c, float)]
        corpus_train[i] += contributions
        y_train[i] += [edy_train[i][j]] * len(contributions)
        
    vectorizer.fit(corpus_train[i])
        
    for j, ed in enumerate(edx_test[i]):
        contributions = [c for c in data[data['editor']==ed]['additions'].tolist() if not isinstance(c, float)]
        corpus_test[i] += contributions
        y_test[i] += [edy_test[i][j]] * len(contributions)
        
    vectorizer.fit(corpus_test[i])
    
for i in range(1, 11):
    X_train[i] = vectorizer.transform(corpus_train[i])
    X_test[i] = vectorizer.transform(corpus_test[i])
#     print(X_train[i].size)

# Train classifier
clf = MultinomialNB()
results = []
print()

for i in range(1,11):
    print("{:d}: ".format(i), end='')
    clf.fit(X_train[i], y_train[i])

    # Test
    predicted = clf.predict(X_test[i])
    result = np.mean(predicted == y_test[i])
    print(result)
    results.append(result)

    # Random baseline
    baseline = np.array([round(random()) for _ in range(len(y_test[i]))])
    print('\trandom:\t\t', np.mean(baseline == y_test[i]))
    baseline = np.array([1 for _ in range(len(y_test[i]))]) # predict always the factor
    print('\tall factor:\t', np.mean(baseline == y_test[i]))
    baseline = np.array([0 for _ in range(len(y_test[i]))]) # predict always neutral
    print('\tall neutral:\t', np.mean(baseline == y_test[i]))
    
print()
print(np.mean(results))

Number of editors:
	0 499
	1: 206
	2: 126
	3: 131
	4: 46
	5: 164
	6: 95
	7: 217
	8: 127
	9: 105
	10: 198

1: 0.69240364226
	random:		 0.501559186728
	all factor:	 0.644006486217
	all neutral:	 0.355993513783
2: 0.704608604005
	random:		 0.49640701351
	all factor:	 0.750407205136
	all neutral:	 0.249592794864
3: 0.56380620646
	random:		 0.49778340722
	all factor:	 0.496200126662
	all neutral:	 0.503799873338
4: 0.551328973421
	random:		 0.507469850603
	all factor:	 0.843043139137
	all neutral:	 0.156956860863
5: 0.602855867916
	random:		 0.491633199465
	all factor:	 0.697568049978
	all neutral:	 0.302431950022
6: 0.54481865285
	random:		 0.501295336788
	all factor:	 0.673056994819
	all neutral:	 0.326943005181
7: 0.714646920715
	random:		 0.501625428933
	all factor:	 0.720787430016
	all neutral:	 0.279212569984
8: 0.580923542771
	random:		 0.506131718395
	all factor:	 0.536866010598
	all neutral:	 0.463133989402
9: 0.575252303642
	random:		 0.507459412023
	all factor:	 0.568451075033
	a