In [1]:
import pandas as pd
import sys, json, re, os, math
from collections import defaultdict, OrderedDict, Counter
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import cohen_kappa_score, make_scorer
from sklearn import cross_validation, svm
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge
from sklearn.model_selection import GroupKFold
from sklearn.neural_network import MLPRegressor
from scipy.sparse import hstack
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import random
import pickle
from tqdm import tqdm_notebook as tqdm
import itertools
from scipy.stats import pearsonr
import operator

from IPython.core.debugger import Tracer; debug_here = Tracer()

kappa_scorer = make_scorer(cohen_kappa_score)



# Conversational outcome measure

In [3]:
role_behavior_distros = pd.read_csv('/home/michael/school/research/wp/wikipedia/data/enwiki/role_weights/roles_K5-fold0.csv')
role_behavior_distros

Unnamed: 0,EXAMPLE,FIRST_TURN,LAST_TURN,ART::DEF,ART::INDEF,PERS_PRON::PLUR,PERS_PRON::SING,QUESTION,SENTI::NEG,SENTI::POS,URL
0,0.004419,0.071513,0.072779,0.037584,0.243713,0.239818,0.005043,0.325124,5e-06,1.623836e-06,8.710576e-07
1,0.00806,0.077198,0.074679,0.00487,0.115393,0.07046,0.179927,0.386751,0.082659,2.254544e-06,8.897492e-07
2,0.008381,0.01412,0.014641,0.178586,0.275576,0.040257,0.041066,0.427198,0.000172,1.581402e-06,1.027463e-06
3,0.004207,0.022339,0.021876,0.002305,0.507191,0.016835,0.035133,0.3899,0.000214,1.416467e-06,6.053579e-07
4,0.01185,0.021605,0.021954,0.021901,0.124207,0.012487,0.058343,0.727646,5e-06,7.88816e-07,9.154996e-07


# Regression experiments w role features, train on dev, test on test

## Make combination features

In [91]:
for n_roles in range(2,6):
# for n_roles in range(3,4):
    print('******** {} ROLES ***********'.format(n_roles))
    
    # Load features
    train_csvpath = '/home/michael/school/research/wp/wikipedia/data/talk/enwiki/role_weights/PRPM_K{}-dev_all.csv'.format(n_roles)
    test_csvpath = '/home/michael/school/research/wp/wikipedia/data/talk/enwiki/role_weights/PRPM_K{}-test_all.csv'.format(n_roles)
    feat_dfs = {'train': pd.read_csv(train_csvpath),
                'test': pd.read_csv(test_csvpath)
               }
    
    old_featnames = ['ART::DEF', 'ART::INDEF',
           'AUTH::CREDENTIALS', 'AUTH::EXPERIENTIAL', 'AUTH::EXTERNAL',
           'AUTH::FORUM', 'AUTH::SOCIAL_EXPECTATIONS', 'EXAMPLE', 'FIRST_TURN',
           'LAST_TURN', 'NUM_TURNS', 'PERS_PRON::PLUR', 'PERS_PRON::SING',
           'QUESTION', 'SENTI::NEG', 'SENTI::POS', 'URL'] 
    ftopic_featnames = ['ftopic{:d}'.format(x) for x in range(20)]
#     tgt_rolenames = ['TargetRole_{}'.format(n) for n in range(n_roles)]

    other_rolenames = ['NontargetRole_{}'.format(n) for n in range(n_roles)]
    
    tgt_behaviors = sorted(['#editor_turns'] + old_featnames + ftopic_featnames + other_rolenames)
    
    for split in ['train', 'test']:
        for tgt_role in tgt_rolenames:
            for other_role in other_rolenames:
                feat_dfs[split]['{}_{}'.format(tgt_role, other_role)] = feat_dfs[split][tgt_role] * feat_dfs[split][other_role]
                
    feat_dfs['train'].to_csv(train_csvpath)
    feat_dfs['test'].to_csv(test_csvpath)

******** 2 ROLES ***********
******** 3 ROLES ***********
******** 4 ROLES ***********
******** 5 ROLES ***********


In [128]:
for n_roles in range(2,6):
# for n_roles in range(3,4):
    print('******** {} ROLES ***********'.format(n_roles))
    
    # Load features
    feat_dfs = {'train': pd.read_csv('/home/michael/school/research/wp/wikipedia/data/talk/enwiki/role_weights/PRPM_K{}-dev_all.csv'.format(n_roles)),
                'test': pd.read_csv('/home/michael/school/research/wp/wikipedia/data/talk/enwiki/role_weights/PRPM_K{}-test_all.csv'.format(n_roles))
               }
#     print(len(feat_dfs['train']))
#     print(len(feat_dfs['test']))

    featset_names = ['roles', 'no_roles', 'ed', 'other', 'all', 'selected']
    featsets = {name: {'feats': {'train': None, 'test': None}, 'colnames': []} for name in featset_names}

    # Select features

    tgt_rolenames = ['TargetRole_{}'.format(n) for n in range(n_roles)]
    other_rolenames = ['NontargetRole_{}'.format(n) for n in range(n_roles)]
    featsets['roles']['colnames'] = sorted(
#                                     ['TargetRole_{}'.format(n) for n in range(n_roles)] + \
#                                     ['NontargetRole_{}'.format(n) for n in range(n_roles)] + \
                                        ['{}_{}'.format(i,j) for i,j in itertools.product(tgt_rolenames, other_rolenames)])
    # print(len(featsets['roles']['colnames']))

    old_featnames = ['ART::DEF', 'ART::INDEF',
           'AUTH::CREDENTIALS', 'AUTH::EXPERIENTIAL', 'AUTH::EXTERNAL',
           'AUTH::FORUM', 'AUTH::SOCIAL_EXPECTATIONS', 'EXAMPLE', 'FIRST_TURN',
           'LAST_TURN', 'NUM_TURNS', 'PERS_PRON::PLUR', 'PERS_PRON::SING',
           'QUESTION', 'SENTI::NEG', 'SENTI::POS', 'URL'] 
    ftopic_featnames = ['ftopic{:d}'.format(x) for x in range(20)]
    other_ftopic_featnames = ['other_ftopic{:d}'.format(x) for x in range(20)]

    featsets['no_roles']['colnames'] = sorted(old_featnames + ftopic_featnames + other_ftopic_featnames)
    # print(len(featsets['no_roles']['colnames']))

    featsets['ed']['colnames'] = sorted(['#editor_turns'] + ['TargetRole_{}'.format(n) for n in range(n_roles)] + \
                                old_featnames + ftopic_featnames)
    # print(len(featsets['ed']['colnames']))

    featsets['other']['colnames'] = sorted(['#other_turns'] + ['NontargetRole_{}'.format(n) for n in range(n_roles)] + \
                                    other_ftopic_featnames)
    # print(len(featsets['other']['colnames']))

    featsets['all']['colnames'] = sorted(set(itertools.chain.from_iterable(featsets[x]['colnames'] for x in featsets)))
    # print(len(featsets['all']['colnames']))
    
    featsets['selected']['colnames'] = sorted(['#editor_turns'] + \
                                old_featnames + ftopic_featnames + other_rolenames)

    # Vectorize features
    for name in featsets:
        for split in ['train', 'test']:
            featsets[name]['feats'][split] = np.hstack([np.array([feat_dfs[split][col]]).T for col in featsets[name]['colnames']])

    # Train and test regression classifier
    focus_idx = sorted(featsets.keys()).index('roles')
#     for name in featsets:
    for name in sorted(featsets.keys())[focus_idx:focus_idx+1]:
        print(name)
#         clf = LinearRegression()
        clf = Ridge()
#         clf = Lasso()
#         clf = svm.SVR('poly', degree=2)
#         clf = MLPRegressor(hidden_layer_sizes=(20,))

        #normalize
#         norm_mean = np.mean(featsets[name]['feats']['train'], axis=0)
#         norm_std = np.std(featsets[name]['feats']['train'], axis=0)
#         norm_train = np.apply_along_axis(minmax_train, 0, featsets[name]['feats']['train'])

        norm_train = []
        mins_maxs = []
        for i in range(featsets[name]['feats']['train'].shape[1]):
            newrow, rowmin, rowmax = minmax_train(featsets[name]['feats']['train'][:,i])
            norm_train.append(newrow)
            mins_maxs.append((rowmin, rowmax))
        norm_train = np.array(norm_train).T

        clf.fit(norm_train, feat_dfs['train']['editor_score'].values)

#         norm_test = np.apply_along_axis(minmax_test, 0, featsets[name]['feats']['test'])
        norm_test = []
        for i in range(featsets[name]['feats']['test'].shape[1]):
            newrow = minmax_test(featsets[name]['feats']['test'][:,i], mins_maxs[i])
            norm_test.append(newrow)
        norm_test = np.array(norm_test).T
            
        pred = clf.predict(norm_test)
        pred = np.array([max(p, 0.0) for p in pred])
        pred = np.array([min(p, 1.0) for p in pred])

        # Align pred
        feat_dfs['test']['{}_pred'.format(name)] = pred

        # rmse
        print("OVERALL:", end='\t')
        print(math.sqrt(np.mean((pred - feat_dfs['test']['editor_score'].values) ** 2)))

        # split by conversation size
        for i in range(2,5):
            print(i, end=':\t\t')
            selected = feat_dfs['test'][feat_dfs['test']['#editors']==i]
            print(math.sqrt(np.mean((selected['{}_pred'.format(name)] - selected['editor_score'].values) ** 2)))

        print(5, end=':\t\t')
        selected = feat_dfs['test'][feat_dfs['test']['#editors']>=5]
        print(math.sqrt(np.mean((selected['{}_pred'.format(name)] - selected['editor_score'].values) ** 2)))
        print()
    print()

******** 2 ROLES ***********
roles
OVERALL:	0.29471377032933943
2:		0.2900469358271706
3:		0.3010004859580724
4:		0.28844396300142205
5:		0.308114185777801


******** 3 ROLES ***********
roles
OVERALL:	0.29447786610095134
2:		0.2898229651769129
3:		0.3010675928052809
4:		0.28778238938970296
5:		0.30759326621650734


******** 4 ROLES ***********
roles
OVERALL:	0.29431461177590146
2:		0.28962347161903124
3:		0.30078292453270267
4:		0.2874768637695508
5:		0.3079922386276985


******** 5 ROLES ***********
roles
OVERALL:	0.29451250123369754
2:		0.28971171279270924
3:		0.30087895853990004
4:		0.28884440184648813
5:		0.30771309849115036




In [76]:
def minmax_train(arr):
#     saved.append((min(arr), max(arr)))
    return (arr - min(arr))/(max(arr) - min(arr)), min(arr), max(arr)

In [77]:
def minmax_test(arr, minmax):
    rowmin = minmax[0]
    rowmax = minmax[1]
    return (arr - rowmin)/(rowmax - rowmin)

In [43]:
for i in range(69):
    print(featsets['all']['colnames'][i])
    print(min(featsets['all']['feats']['train'][:,i]))
    print(np.mean(featsets['all']['feats']['train'][:,i]))
    print(max(featsets['all']['feats']['train'][:,i]))
    print()

#editor_turns
1.0
1.96815883124
64.0

#other_turns
1.0
5.25491665106
226.0

ART::DEF
0.0
8.51198726353
380.0

ART::INDEF
0.0
3.72326278329
109.0

AUTH::CREDENTIALS
-9.57054229958e-05
0.0121531114321
0.170392537909

AUTH::EXPERIENTIAL
-0.00147272557404
0.00235784845926
0.0288983138768

AUTH::EXTERNAL
-0.00406866371692
-0.000267266742752
0.0246155361845

AUTH::FORUM
-0.00285357595625
0.000829920750704
0.0322154493126

AUTH::SOCIAL_EXPECTATIONS
-0.000890650489802
0.00213081055632
0.0248010315427

EXAMPLE
0.0
0.0909346319535
6.0

FIRST_TURN
0.0
0.390709870762
1.0

LAST_TURN
0.0
0.391740026222
1.0

NUM_TURNS
1.0
1.00018730099
2.0

NontargetRole_0
0.00301409675679
0.181325840978
0.922400164401

NontargetRole_1
0.00223798528313
0.206979440964
0.929954300693

NontargetRole_2
0.00315344371871
0.219520297783
0.965876203196

NontargetRole_3
0.00111788435328
0.128976971705
0.951945706582

NontargetRole_4
0.00398952212392
0.263197448571
0.972847933641

PERS_PRON::PLUR
0.0
0.701442217644
62.0

PERS_

In [84]:
for i in range(69):
    print(featsets['all']['colnames'][i])
    print(min(norm_test[:,i]))
    print(np.mean(norm_test[:,i]))
    print(max(norm_test[:,i]))
    print()

#editor_turns
0.0
0.0153632908562
0.84126984127

#other_turns
0.0
0.0194291079812
0.64

ART::DEF
0.0
0.0229078329627
1.03157894737

ART::INDEF
0.0
0.0348132833699
1.89908256881

AUTH::CREDENTIALS
0.000236495697121
0.0699736652432
0.760838948056

AUTH::EXPERIENTIAL
0.0206790280085
0.127128612472
0.814885898877

AUTH::EXTERNAL
0.0257860015245
0.13426709511
1.48423577463

AUTH::FORUM
0.0111153624159
0.104638725732
0.725708346715

AUTH::SOCIAL_EXPECTATIONS
-0.0492528289576
0.118122664902
1.17567315239

EXAMPLE
0.0
0.0150078247261
1.66666666667

FIRST_TURN
0.0
0.390234741784
1.0

LAST_TURN
0.0
0.391737089202
1.0

NUM_TURNS
0.0
0.000281690140845
1.0

NontargetRole_0
0.000374238839914
0.195753113458
1.05525423394

NontargetRole_1
-2.13802879355e-05
0.216811649533
1.02932862

NontargetRole_2
0.00145934828743
0.239391830435
1.01210793731

NontargetRole_3
0.000585168736026
0.121885407925
0.981292639876

NontargetRole_4
0.00107714180633
0.267340071841
1.00139924541

PERS_PRON::PLUR
0.0
0.01103589

# Analyze feature weights

In [123]:
clf.coef_

array([-0.08139736, -0.1024373 ,  0.19236847,  0.0866874 , -0.11279869,
        0.12750898, -0.01756795, -0.02214359, -0.10362837,  0.06868167,
        0.05315139,  0.04762064,  0.18103833, -0.02718142, -0.00164675,
       -0.02949516,  0.03814008,  0.01924838,  0.01844059,  0.03320372,
       -0.10520798,  0.02026939, -0.27463695, -0.01179504, -0.07546589,
       -0.04974203, -0.05967502,  0.16507036, -0.00145428, -0.00259315,
        0.0059954 ,  0.00966079,  0.01495335,  0.00765434, -0.04078292,
       -0.00958326, -0.05085789, -0.03570327,  0.03979838,  0.0145677 ,
        0.00163883,  0.02449775,  0.06860158,  0.03571654, -0.0404914 ,
       -0.07006968,  0.04124309, -0.00057668,  0.01581135,  0.03053576,
       -0.00929926, -0.05963634,  0.02043026,  0.02279527, -0.130653  ,
       -0.15464228, -0.16645498, -0.13120807,  0.06038261, -0.04768613,
        0.1127487 ,  0.38197838, -0.00262085,  0.23779662,  0.19622946,
       -0.07183442,  0.13029938, -0.02831862, -0.0423878 , -0.05

In [129]:
name = 'roles'
print_top_features(clf, featsets[name]['colnames'], n=100)
print()
print_bottom_features(clf, featsets[name]['colnames'], n=100)

TargetRole_0_NontargetRole_3:	0.25057404064644145
TargetRole_3_NontargetRole_0:	0.1256267179413726
TargetRole_3_NontargetRole_4:	0.07121173316470249
TargetRole_3_NontargetRole_1:	0.04809625572645841
TargetRole_1_NontargetRole_3:	0.03516686650477483
TargetRole_0_NontargetRole_4:	0.018296028927662158
TargetRole_2_NontargetRole_3:	0.015286382330101482
TargetRole_1_NontargetRole_2:	0.014534350087288029
TargetRole_4_NontargetRole_4:	0.012673667986057752
TargetRole_1_NontargetRole_1:	0.011326095800843636
TargetRole_2_NontargetRole_4:	0.009790428377077656
TargetRole_4_NontargetRole_1:	0.00830239777881537
TargetRole_4_NontargetRole_0:	0.0005499338552577368
TargetRole_1_NontargetRole_0:	-0.0006641282704409174
TargetRole_2_NontargetRole_2:	-0.008618397088786905
TargetRole_1_NontargetRole_4:	-0.021584473338562202
TargetRole_3_NontargetRole_3:	-0.035437460667778836
TargetRole_3_NontargetRole_2:	-0.04635785017050464
TargetRole_4_NontargetRole_3:	-0.04804085990133351
TargetRole_2_NontargetRole_1:	-0

In [33]:
feat_dfs['test'].loc[:,['all_pred', 'editor_score']]

Unnamed: 0,all_pred,editor_score
0,0.599525,0.850000
1,0.602031,1.000000
2,0.581688,0.595376
3,0.568855,0.852941
4,0.579757,0.270586
5,0.572944,0.098113
6,0.543945,0.096154
7,0.552642,0.536288
8,0.549125,0.008219
9,0.609993,0.407689


In [14]:
def print_top_features(clf, feature_names, n=20):
    """Prints features with the highest coefficient values"""
    top = np.argsort(clf.coef_)[-1*n:]
    for j in reversed(top):
        print("{}:\t{}".format(feature_names[j], clf.coef_[j]))
#         print(" ".join(reversed(feature_names[j] for j in top)))

In [24]:
def print_bottom_features(clf, feature_names, n=20):
    """Prints features with the highest coefficient values"""
    top = np.argsort(clf.coef_)[:n]
    for j in top:
        print("{}:\t{}".format(feature_names[j], clf.coef_[j]))
#         print(" ".join(reversed(feature_names[j] for j in top)))

In [130]:
# Try straight correlations
corrs = {}
for name in featsets['all']['colnames']:
#     print(name, end=': ')
    corrs[name] = pearsonr(feat_dfs['train'][name], feat_dfs['train']['editor_score'])
#     print(corrs[name])

sorted(corrs.items(), key=operator.itemgetter(1), reverse=True)

[('FIRST_TURN', (0.046636517427545945, 1.4268578805408675e-06)),
 ('TargetRole_3_NontargetRole_4',
  (0.04134621052659241, 1.9215779036468186e-05)),
 ('TargetRole_0_NontargetRole_3',
  (0.037538756312221686, 0.00010447428559671602)),
 ('NontargetRole_3', (0.037377658368216195, 0.00011186733846686165)),
 ('TargetRole_3', (0.035767599557268839, 0.00021835669814289612)),
 ('TargetRole_1_NontargetRole_3',
  (0.034813653852502235, 0.00032052844910589125)),
 ('NontargetRole_4', (0.034225258357551379, 0.00040428763628233311)),
 ('LAST_TURN', (0.033089305358587999, 0.00062667213418058092)),
 ('ftopic15', (0.032016495072388886, 0.00093674500533435015)),
 ('TargetRole_3_NontargetRole_1',
  (0.025631488593423488, 0.0080792516494579032)),
 ('TargetRole_3_NontargetRole_0',
  (0.024933247792562004, 0.0099788507973880593)),
 ('TargetRole_4_NontargetRole_4',
  (0.024384575231964981, 0.011740641478177702)),
 ('TargetRole_3_NontargetRole_3',
  (0.022633768064993183, 0.019342016516592984)),
 ('ftopic14',

In [112]:
# Try straight correlations
corrs = {}
for name in featsets['roles']['colnames']:
#     print(name, end=': ')
    corrs[name] = pearsonr(feat_dfs['train'][name], feat_dfs['train']['editor_score'])
#     print(corrs[name])

sorted(corrs.items(), key=operator.itemgetter(1), reverse=True)

[('TargetRole_3_NontargetRole_4',
  (0.04134621052659241, 1.9215779036468186e-05)),
 ('TargetRole_0_NontargetRole_3',
  (0.037538756312221686, 0.00010447428559671602)),
 ('TargetRole_1_NontargetRole_3',
  (0.034813653852502235, 0.00032052844910589125)),
 ('TargetRole_3_NontargetRole_1',
  (0.025631488593423488, 0.0080792516494579032)),
 ('TargetRole_3_NontargetRole_0',
  (0.024933247792562004, 0.0099788507973880593)),
 ('TargetRole_4_NontargetRole_4',
  (0.024384575231964981, 0.011740641478177702)),
 ('TargetRole_3_NontargetRole_3',
  (0.022633768064993183, 0.019342016516592984)),
 ('TargetRole_1_NontargetRole_4',
  (0.018368738411418836, 0.057688095412875832)),
 ('TargetRole_4_NontargetRole_3', (0.014893254385991263, 0.12383095582826362)),
 ('TargetRole_2_NontargetRole_3', (0.014775593001917896, 0.12682772794391212)),
 ('TargetRole_2_NontargetRole_4',
  (0.0079465929989436905, 0.41160450700265372)),
 ('TargetRole_3_NontargetRole_2', (0.007633103389672183, 0.43029825756867945)),
 ('Tar

In [114]:
# Quantiles
nq = 50
feat_dfs['train']['{}q'.format(nq)] = pd.qcut(feat_dfs['train']['editor_score'], nq, range(100))
feat_dfs['train'][feat_dfs['train']['{}q'.format(nq)]==0]

ValueError: Bin edges must be unique: array([ 0.        ,  0.        ,  0.01615884,  0.07077717,  0.11521247,
        0.14898748,  0.18625731,  0.21678322,  0.24363636,  0.26515152,
        0.29223454,  0.3208736 ,  0.34470643,  0.36697248,  0.38596491,
        0.40838847,  0.42619506,  0.44589067,  0.46604078,  0.48521739,
        0.5       ,  0.51069034,  0.53277281,  0.54805504,  0.56758748,
        0.58570618,  0.60251526,  0.62101278,  0.63979213,  0.66077739,
        0.67409846,  0.69289827,  0.71280774,  0.73084677,  0.75      ,
        0.76630435,  0.78947368,  0.80596532,  0.82747604,  0.8449106 ,
        0.86476729,  0.88594071,  0.90789474,  0.93206166,  0.953125  ,
        0.97641026,  1.        ,  1.        ,  1.        ,  1.        ,  1.        ])

# Linear regression baseline with train+dev, test split

In [5]:
# Load features
feats = pd.read_csv('/home/michael/school/research/wp/wikipedia/data/talk/enwiki/enwiki_talk_scores_allfeats.csv')

# Split into folds
train_folds_nums = [0,1]
test_folds_nums = range(2, 10)

test_folds = feats[feats['fold'].isin(train_folds_nums)]
train_folds = feats[feats['fold'].isin(test_folds_nums)]

print(len(feats))
print(feats.columns)

53175
Index(['article', 'thread', 'editor', 'editor_talk', 'other_talk',
       '#editor_turns', '#other_turns', 'editor_score', 'fold', 'ftopic0',
       'ftopic1', 'ftopic2', 'ftopic3', 'ftopic4', 'ftopic5', 'ftopic6',
       'ftopic7', 'ftopic8', 'ftopic9', 'ftopic10', 'ftopic11', 'ftopic12',
       'ftopic13', 'ftopic14', 'ftopic15', 'ftopic16', 'ftopic17', 'ftopic18',
       'ftopic19', 'other_ftopic0', 'other_ftopic1', 'other_ftopic2',
       'other_ftopic3', 'other_ftopic4', 'other_ftopic5', 'other_ftopic6',
       'other_ftopic7', 'other_ftopic8', 'other_ftopic9', 'other_ftopic10',
       'other_ftopic11', 'other_ftopic12', 'other_ftopic13', 'other_ftopic14',
       'other_ftopic15', 'other_ftopic16', 'other_ftopic17', 'other_ftopic18',
       'other_ftopic19', '#editors', 'ed_dialog_act_pred', 'ed_pred',
       'all_dialog_act_pred', 'all_pred', 'ART::DEF', 'ART::INDEF',
       'AUTH::CREDENTIALS', 'AUTH::EXPERIENTIAL', 'AUTH::EXTERNAL',
       'AUTH::FORUM', 'AUTH::SOCIAL_EXP

In [3]:
old_featnames = ['ART::DEF', 'ART::INDEF',
       'AUTH::CREDENTIALS', 'AUTH::EXPERIENTIAL', 'AUTH::EXTERNAL',
       'AUTH::FORUM', 'AUTH::SOCIAL_EXPECTATIONS', 'EXAMPLE', 'FIRST_TURN',
       'LAST_TURN', 'NUM_TURNS', 'PERS_PRON::PLUR', 'PERS_PRON::SING',
       'QUESTION', 'SENTI::NEG', 'SENTI::POS', 'URL'] 

ftopic_featnames = ['ftopic{:d}'.format(x) for x in range(20)]

ed_featnames = ['#editor_turns'] + old_featnames + ftopic_featnames
print(len(ed_featnames))

other_featnames = ['#other_turns'] + ['other_ftopic{:d}'.format(x) for x in range(20)]
print(len(other_featnames))

38
21


## Vectorize features

In [6]:
allfeats = {'train': {}, 'test': {}}
oldfeats = {'train': {}, 'test': {}} # just from editor (for now)
ftopic_feats = {'train': {}, 'test': {}} # just from editor (for now)
edbow = {'train': {}, 'test': {}}
other_bow = {'train': {}, 'test': {}}
ed_nonbow = {'train': {}, 'test': {}}
other_nonbow = {'train': {}, 'test': {}}
edfeats = {'train': {}, 'test': {}}
nonbow = {'train': {}, 'test': {}}

# Training folds
# Get unigram features
# v = CountVectorizer(min_df=1, stop_words='english')
v = TfidfVectorizer(min_df=1, stop_words='english')
v.fit(train_folds['editor_talk'])
v.fit(test_folds['editor_talk'])
edbow['train'] = v.transform(train_folds['editor_talk'])
edbow['test'] = v.transform(test_folds['editor_talk'])

# v_other = CountVectorizer(min_df=1, stop_words='english')
v_other = TfidfVectorizer(min_df=1, stop_words='english')
v_other.fit(train_folds['other_talk'])
v_other.fit(test_folds['other_talk'])
other_bow['train'] = v_other.transform(train_folds['other_talk'])
other_bow['test'] = v_other.transform(test_folds['other_talk'])

# Old features
oldfeats_d = {}
for col in old_featnames:
    oldfeats_d[col] = np.array([train_folds[col]]).T
oldfeats['train'] = np.hstack(oldfeats_d.values())

oldfeats_d = {}
for col in old_featnames:
    oldfeats_d[col] = np.array([test_folds[col]]).T
oldfeats['test'] = np.hstack(oldfeats_d.values())

# ftopic features
ftopic_d = {}
for col in ftopic_featnames:
    ftopic_d[col] = np.array([train_folds[col]]).T
ftopic_feats['train'] = np.hstack(ftopic_d.values())

ftopic_d = {}
for col in ftopic_featnames:
    ftopic_d[col] = np.array([test_folds[col]]).T
ftopic_feats['test'] = np.hstack(ftopic_d.values())

# Get exclusive editor non-unigram features
ed_nonbow_d = {}
for col in ed_featnames:
    ed_nonbow_d[col] = np.array([train_folds[col]]).T
#     ed_nonbow_d[col] = np.array([(v - min(feats[col]))/max(feats[col]) for v in feats[col].values]).T
ed_nonbow['train'] = np.hstack(ed_nonbow_d.values())

ed_nonbow_d = {}
for col in ed_featnames:
    ed_nonbow_d[col] = np.array([test_folds[col]]).T
ed_nonbow['test'] = np.hstack(ed_nonbow_d.values())

# Get others' non-unigram features
nonbow_d = {}
for col in other_featnames:
    nonbow_d[col] = np.array([train_folds[col]]).T
#     nonbow_d[col] = np.array([(v - min(feats[col]))/max(feats[col]) for v in feats[col].values]).T
other_nonbow['train'] = np.hstack(nonbow_d.values())

nonbow_d = {}
for col in other_featnames:
    nonbow_d[col] = np.array([test_folds[col]]).T
other_nonbow['test'] = np.hstack(nonbow_d.values())

# Assemble editor features
edfeats['train'] = hstack([edbow['train'], ed_nonbow['train']])
edfeats['test'] = hstack([edbow['test'], ed_nonbow['test']])

# Assemble non-unigram features
nonbow['train'] = np.hstack([ed_nonbow['train'], other_nonbow['train']])
nonbow['test'] = np.hstack([ed_nonbow['test'], other_nonbow['test']])

# Assemble all features
allfeats['train'] = hstack([edbow['train'], other_bow['train'], ed_nonbow['train'], other_nonbow['train']])
allfeats['test'] = hstack([edbow['test'], other_bow['test'], ed_nonbow['test'], other_nonbow['test']])

## Train and test editor features

### Old nonbow

In [14]:
# OLD NONBOW

# Train and test linear regression classifier
clf = LinearRegression()

clf.fit(oldfeats['train'], train_folds['editor_score'].values)

pred = clf.predict(oldfeats['test'])
pred = np.array([max(p, 0.0) for p in pred])
pred = np.array([min(p, 1.0) for p in pred])

# Add pred
test_folds['ed_old_pred'] = pred
test_folds

# rmse
print("OVERALL:", end='\t')
print(math.sqrt(np.mean((pred - test_folds['editor_score'].values) ** 2)))

# split by conversation size
for i in range(2,5):
    print(i, end=':\t\t')
    selected = test_folds[test_folds['#editors']==i]
    print(math.sqrt(np.mean((selected['ed_old_pred'] - selected['editor_score'].values) ** 2)))
    
print(5, end=':\t\t')
selected = test_folds[test_folds['#editors']>=5]
print(math.sqrt(np.mean((selected['ed_old_pred'] - selected['editor_score'].values) ** 2)))

OVERALL:	0.29240963274119897
2:		0.28642636584419745
3:		0.30235107497051
4:		0.28725132223274413
5:		0.3023390288665307


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


### Ftopic nonbow

In [15]:
# Train and test linear regression classifier
clf = LinearRegression()
colname = 'ed_ftopic_pred'

clf.fit(ftopic_feats['train'], train_folds['editor_score'].values)

pred = clf.predict(ftopic_feats['test'])
pred = np.array([max(p, 0.0) for p in pred])
pred = np.array([min(p, 1.0) for p in pred])

# Add pred
test_folds['ed_ftopic_pred'] = pred
test_folds

# rmse
print("OVERALL:", end='\t')
print(math.sqrt(np.mean((pred - test_folds['editor_score'].values) ** 2)))

# split by conversation size
for i in range(2,5):
    print(i, end=':\t\t')
    selected = test_folds[test_folds['#editors']==i]
    print(math.sqrt(np.mean((selected['ed_ftopic_pred'] - selected['editor_score'].values) ** 2)))
    
print(5, end=':\t\t')
selected = test_folds[test_folds['#editors']>=5]
print(math.sqrt(np.mean((selected['ed_ftopic_pred'] - selected['editor_score'].values) ** 2)))

OVERALL:	0.2940426888221045
2:		0.2880618563290608
3:		0.3013914902186158
4:		0.2883675870252445
5:		0.31017228264581537


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


### Editor nonbow

In [7]:
rmse_ed_nonbow = []
rmse_ed = []

# EDITOR NONBOW

# Train and test linear regression classifier
clf = LinearRegression()

clf.fit(ed_nonbow['train'], train_folds['editor_score'].values)

#     edpred = cross_validation.cross_val_predict(clf, ed_nonbow, feats['editor_score'], cv=10)
edpred = clf.predict(ed_nonbow['test'])
edpred = np.array([max(p, 0.0) for p in edpred])
edpred = np.array([min(p, 1.0) for p in edpred])

# Add edpred
test_folds['ed_dialog_act_pred'] = edpred
test_folds

# rmse
print("OVERALL:", end='\t')
print(math.sqrt(np.mean((edpred - test_folds['editor_score'].values) ** 2)))

# split by conversation size
for i in range(2,5):
    print(i, end=':\t\t')
    selected = test_folds[test_folds['#editors']==i]
    print(math.sqrt(np.mean((selected['ed_dialog_act_pred'] - selected['editor_score'].values) ** 2)))
    
print(5, end=':\t\t')
selected = test_folds[test_folds['#editors']>=5]
print(math.sqrt(np.mean((selected['ed_dialog_act_pred'] - selected['editor_score'].values) ** 2)))

OVERALL:	0.2920909761568702
2:		0.2862686478790591
3:		0.3017700285565271
4:		0.28732722748540657
5:		0.3015027680831825


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


### Editor all

In [8]:
# EDITOR ALL

# Train and test linear regression classifier
clf = LinearRegression()

clf.fit(edfeats['train'], train_folds['editor_score'].values)

edpred = clf.predict(edfeats['test'])
edpred = np.array([max(p, 0.0) for p in edpred])
edpred = np.array([min(p, 1.0) for p in edpred])

# Add edpred
colname = 'ed_pred'
test_folds[colname] = edpred
test_folds

# rmse
print("OVERALL:", end='\t')
print(math.sqrt(np.mean((edpred - test_folds['editor_score'].values) ** 2)))

# split by conversation size
for i in range(2,5):
    print(i, end=':\t\t')
    selected = test_folds[test_folds['#editors']==i]
    print(math.sqrt(np.mean((selected[colname] - selected['editor_score'].values) ** 2)))
    
print(5, end=':\t\t')
selected = test_folds[test_folds['#editors']>=5]
print(math.sqrt(np.mean((selected[colname] - selected['editor_score'].values) ** 2)))

OVERALL:	0.4842115217398755
2:		0.4793375786326374
3:		0.49781870497517683
4:		0.4857913213622261
5:		0.4739178907901679


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [16]:
# Merge in test_rows predictions to feats, save
merged = pd.merge(feats, test_folds, how='left')
print(len(merged))
print(len(set(zip(merged['article'], merged['thread'], merged['editor']))))
merged.columns

53175
53175


Index(['article', 'thread', 'editor', 'editor_talk', 'other_talk',
       '#editor_turns', '#other_turns', 'editor_score', 'fold', 'ftopic0',
       'ftopic1', 'ftopic2', 'ftopic3', 'ftopic4', 'ftopic5', 'ftopic6',
       'ftopic7', 'ftopic8', 'ftopic9', 'ftopic10', 'ftopic11', 'ftopic12',
       'ftopic13', 'ftopic14', 'ftopic15', 'ftopic16', 'ftopic17', 'ftopic18',
       'ftopic19', 'other_ftopic0', 'other_ftopic1', 'other_ftopic2',
       'other_ftopic3', 'other_ftopic4', 'other_ftopic5', 'other_ftopic6',
       'other_ftopic7', 'other_ftopic8', 'other_ftopic9', 'other_ftopic10',
       'other_ftopic11', 'other_ftopic12', 'other_ftopic13', 'other_ftopic14',
       'other_ftopic15', 'other_ftopic16', 'other_ftopic17', 'other_ftopic18',
       'other_ftopic19', '#editors', 'ed_dialog_act_pred', 'ed_pred',
       'all_dialog_act_pred', 'all_pred', 'ART::DEF', 'ART::INDEF',
       'AUTH::CREDENTIALS', 'AUTH::EXPERIENTIAL', 'AUTH::EXTERNAL',
       'AUTH::FORUM', 'AUTH::SOCIAL_EXPECTATI

In [17]:
merged.to_csv('/home/michael/school/research/wp/wikipedia/data/talk/enwiki/enwiki_talk_scores_allfeats.csv', index=False)

## Train and test all features

In [18]:
rmse_nonbow = []
rmse = []

# ALL NONBOW

# Train and test linear regression classifier
clf = LinearRegression()

clf.fit(nonbow['train'], train_folds['editor_score'].values)

pred = clf.predict(nonbow['test'])
pred = np.array([max(p, 0.0) for p in pred])
pred = np.array([min(p, 1.0) for p in pred])

# Add pred
predname = 'all_nonbow_pred'
test_folds[predname] = pred

# rmse
print("OVERALL:", end='\t')
print(math.sqrt(np.mean((pred - test_folds['editor_score'].values) ** 2)))

# split by conversation size
for i in range(2,5):
    print(i, end=':\t\t')
    selected = test_folds[test_folds['#editors']==i]
    print(math.sqrt(np.mean((selected[predname] - selected['editor_score'].values) ** 2)))
    
print(5, end=':\t\t')
selected = test_folds[test_folds['#editors']>=5]
print(math.sqrt(np.mean((selected[predname] - selected['editor_score'].values) ** 2)))

OVERALL:	0.2913544386648021
2:		0.28571541401071665
3:		0.3018019972860507
4:		0.2877228260460119
5:		0.29707410620226016


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [19]:
# ALL

# Train and test linear regression classifier
clf = LinearRegression()

clf.fit(allfeats['train'], train_folds['editor_score'].values)

#     pred = cross_validation.cross_val_predict(clf, ed_nonbow, feats['editor_score'], cv=10)
pred = clf.predict(allfeats['test'])
pred = np.array([max(p, 0.0) for p in pred])
pred = np.array([min(p, 1.0) for p in pred])

# Add pred
predname = 'all_pred'
test_folds[predname] = pred

# rmse
print("OVERALL:", end='\t')
print(math.sqrt(np.mean((pred - test_folds['editor_score'].values) ** 2)))

# split by conversation size
for i in range(2,5):
    print(i, end=':\t\t')
    selected = test_folds[test_folds['#editors']==i]
    print(math.sqrt(np.mean((selected[predname] - selected['editor_score'].values) ** 2)))
    
print(5, end=':\t\t')
selected = test_folds[test_folds['#editors']>=5]
print(math.sqrt(np.mean((selected[predname] - selected['editor_score'].values) ** 2)))

OVERALL:	0.4749839079168484
2:		0.472639431567687
3:		0.47806389038203384
4:		0.47476755221162326
5:		0.47903410960198356


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [20]:
test_folds.columns

Index(['article', 'thread', 'editor', 'editor_talk', 'other_talk',
       '#editor_turns', '#other_turns', 'editor_score', 'fold', 'ftopic0',
       'ftopic1', 'ftopic2', 'ftopic3', 'ftopic4', 'ftopic5', 'ftopic6',
       'ftopic7', 'ftopic8', 'ftopic9', 'ftopic10', 'ftopic11', 'ftopic12',
       'ftopic13', 'ftopic14', 'ftopic15', 'ftopic16', 'ftopic17', 'ftopic18',
       'ftopic19', 'other_ftopic0', 'other_ftopic1', 'other_ftopic2',
       'other_ftopic3', 'other_ftopic4', 'other_ftopic5', 'other_ftopic6',
       'other_ftopic7', 'other_ftopic8', 'other_ftopic9', 'other_ftopic10',
       'other_ftopic11', 'other_ftopic12', 'other_ftopic13', 'other_ftopic14',
       'other_ftopic15', 'other_ftopic16', 'other_ftopic17', 'other_ftopic18',
       'other_ftopic19', '#editors', 'ed_dialog_act_pred', 'ed_pred',
       'all_dialog_act_pred', 'all_pred', 'ART::DEF', 'ART::INDEF',
       'AUTH::CREDENTIALS', 'AUTH::EXPERIENTIAL', 'AUTH::EXTERNAL',
       'AUTH::FORUM', 'AUTH::SOCIAL_EXPECTATI

In [21]:
# Merge in test_rows predictions to feats, save
merged = pd.merge(feats, test_folds, how='left')
print(len(merged))
merged.columns

53175


Index(['article', 'thread', 'editor', 'editor_talk', 'other_talk',
       '#editor_turns', '#other_turns', 'editor_score', 'fold', 'ftopic0',
       'ftopic1', 'ftopic2', 'ftopic3', 'ftopic4', 'ftopic5', 'ftopic6',
       'ftopic7', 'ftopic8', 'ftopic9', 'ftopic10', 'ftopic11', 'ftopic12',
       'ftopic13', 'ftopic14', 'ftopic15', 'ftopic16', 'ftopic17', 'ftopic18',
       'ftopic19', 'other_ftopic0', 'other_ftopic1', 'other_ftopic2',
       'other_ftopic3', 'other_ftopic4', 'other_ftopic5', 'other_ftopic6',
       'other_ftopic7', 'other_ftopic8', 'other_ftopic9', 'other_ftopic10',
       'other_ftopic11', 'other_ftopic12', 'other_ftopic13', 'other_ftopic14',
       'other_ftopic15', 'other_ftopic16', 'other_ftopic17', 'other_ftopic18',
       'other_ftopic19', '#editors', 'ed_dialog_act_pred', 'ed_pred',
       'all_dialog_act_pred', 'all_pred', 'ART::DEF', 'ART::INDEF',
       'AUTH::CREDENTIALS', 'AUTH::EXPERIENTIAL', 'AUTH::EXTERNAL',
       'AUTH::FORUM', 'AUTH::SOCIAL_EXPECTATI

In [22]:
merged.to_csv('/home/michael/school/research/wp/wikipedia/data/talk/enwiki/enwiki_talk_scores_allfeats.csv', index=False)

## Try 0.5 baseline

In [6]:
guess = [0.5]*len(test_folds['editor_score'])
# The root mean square error
print("rmse: {:2f}".format(math.sqrt(np.mean((guess - test_folds['editor_score'].values) ** 2))))

# split by conversation size
for i in range(2,5):
    print(i, end=':\t')
    selected = test_folds[test_folds['#editors']==i]
    print(math.sqrt(np.mean(([0.5]*len(selected) - selected['editor_score'].values) ** 2)))
    
print('5+', end=':\t')
selected = test_folds[test_folds['#editors']>=5]
print(math.sqrt(np.mean(([0.5]*len(selected) - selected['editor_score'].values) ** 2)))

rmse: 0.305019
2:		0.3116779719768905
3:		0.3061059701283725
4:		0.28946164196719393
5:		0.28748616375582053


## Split by conversation size

In [40]:
test_split = {}
for i in range(2,5):
    print(i, end=': ')
    test_split[i] = test_folds[test_folds['#editors']==i]
    print(len(test_split[i]))

# 5 or more participants
test_split[5] = test_folds[test_folds['#editors']>=5]
print('5+: {:d}'.format(len(test_split[5])))

2: 5861
3: 2802
4: 1314
5+: 1244


# Prepare baseline feature data

## Folds

In [2]:
# Load folds
with open('/home/michael/school/research/wp/wikipedia/data/talk/enwiki/ijcnlp-rawfolds.pickle', 'rb') as f:
    pagefolds = pickle.load(f, encoding='latin1')
    
print(len(pagefolds))
pagefolds

7211


{'Ocean Park Cable Car': 0,
 'Ipswich, Queensland': 1,
 'West End, Vancouver': 2,
 'List of FLCL characters': 8,
 'Karkadann': 3,
 'List of Red Garden characters': 4,
 'List of religious leaders in 1847': 5,
 'List of whisky brands': 7,
 'John the Fearless': 6,
 'Grow a Pear': 0,
 'Architectural structure': 0,
 'Sacd': 7,
 'Flavio Costantini': 7,
 'Fear Itself (The 4400)': 8,
 'Polizei-Bataillon 33': 2,
 'Merlin': 6,
 'Holy Wars (film)': 9,
 'Rhodes University': 0,
 'Deerfoot Trail': 1,
 'List of actors who have played the Doctor': 8,
 "Multnomah County Sheriff's Office Search and Rescue": 8,
 'Mandela, Massachusetts': 9,
 'Moray Bridge': 8,
 'Attalla, Alabama': 3,
 'List of Will & Grace episodes': 3,
 'List of IARC Group 2B carcinogens': 0,
 'Peter Edwards (artist)': 5,
 'Help (movie)': 1,
 'Goodie Mob': 7,
 'Trombone Shorty': 8,
 'Halo ce': 2,
 'Insert key': 1,
 'American Idol': 3,
 '???? (Nintendo character)': 0,
 'Jimmie Angel': 0,
 'Richard Rodgers': 0,
 'Mighty Mike McGee': 1,
 '

In [3]:
# Load feature file
feats = pd.read_csv('/home/michael/school/research/wp/wikipedia/data/talk/enwiki/enwiki_talk_scores_allfeats.csv')
feats

Unnamed: 0,article,thread_title,editor,editor_talk,other_talk,#editor_turns,#other_turns,editor_score,article_name,thread,...,ftopic10,ftopic11,ftopic12,ftopic13,ftopic14,ftopic15,ftopic16,ftopic17,ftopic18,ftopic19
0,"""Tiger"" William Dunlop",Current image,Deconstructhis,I've noticed that the current image that's acc...,I removed the deletion tag because the image i...,2,3,0.033058,"""Tiger"" William Dunlop",Current image,...,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0
1,"""Tiger"" William Dunlop",Current image,Skinsmoke,::Good move ukexpat. That was really helpful (...,I've noticed that the current image that's acc...,1,4,0.482759,"""Tiger"" William Dunlop",Current image,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,"""Tiger"" William Dunlop",Current image,Ukexpat,I removed the deletion tag because the image i...,I've noticed that the current image that's acc...,1,4,0.392157,"""Tiger"" William Dunlop",Current image,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,"""Tiger"" William Dunlop",Requested move,CJ3370,"William ""Tiger"" Dunlop → William ""Tiger"" Dunlo...",The following discussion is an archived discus...,2,6,0.826415,"""Tiger"" William Dunlop",Requested move,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,"""Tiger"" William Dunlop",Requested move,CJ3370,"William ""Tiger"" Dunlop → William ""Tiger"" Dunlo...",The following discussion is an archived discus...,2,6,0.826415,"""Tiger"" William Dunlop",Requested move,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
5,"""Tiger"" William Dunlop",Requested move,Deconstructhis,Not a problem to merge if that is the most app...,The following discussion is an archived discus...,1,7,0.033058,"""Tiger"" William Dunlop",Requested move,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
6,"""Tiger"" William Dunlop",Requested move,Deconstructhis,Not a problem to merge if that is the most app...,The following discussion is an archived discus...,1,7,0.033058,"""Tiger"" William Dunlop",Requested move,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
7,"""Tiger"" William Dunlop",Requested move,Skinsmoke,User:CJ3370/William_Tiger_Dunlop → William Tig...,The following discussion is an archived discus...,3,5,0.456897,"""Tiger"" William Dunlop",Requested move,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
8,"""Tiger"" William Dunlop",Requested move,Skinsmoke,User:CJ3370/William_Tiger_Dunlop → William Tig...,The following discussion is an archived discus...,3,5,0.456897,"""Tiger"" William Dunlop",Requested move,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
9,"""V"" Is for Vagina","Why is the ""i"" capitalized?",Gunmetal Angel,"""I"" is supposed to be capatalized in ""is"" in t...",It doesn't make sense and it doesn't represent...,1,9,1.000000,"""V"" Is for Vagina","Why is the ""i"" capitalized?",...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [7]:
len(feats['article'].unique())

10374

In [8]:
feats['fold'] = [pagefolds.get(art, np.nan) for art in feats['article'].tolist()]
feats

Unnamed: 0,article,thread_title,editor,editor_talk,other_talk,#editor_turns,#other_turns,editor_score,article_name,thread,...,ftopic11,ftopic12,ftopic13,ftopic14,ftopic15,ftopic16,ftopic17,ftopic18,ftopic19,fold
0,"""Tiger"" William Dunlop",Current image,Deconstructhis,I've noticed that the current image that's acc...,I removed the deletion tag because the image i...,2,3,0.033058,"""Tiger"" William Dunlop",Current image,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0,7.0
1,"""Tiger"" William Dunlop",Current image,Skinsmoke,::Good move ukexpat. That was really helpful (...,I've noticed that the current image that's acc...,1,4,0.482759,"""Tiger"" William Dunlop",Current image,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,7.0
2,"""Tiger"" William Dunlop",Current image,Ukexpat,I removed the deletion tag because the image i...,I've noticed that the current image that's acc...,1,4,0.392157,"""Tiger"" William Dunlop",Current image,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,7.0
3,"""Tiger"" William Dunlop",Requested move,CJ3370,"William ""Tiger"" Dunlop → William ""Tiger"" Dunlo...",The following discussion is an archived discus...,2,6,0.826415,"""Tiger"" William Dunlop",Requested move,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,7.0
4,"""Tiger"" William Dunlop",Requested move,CJ3370,"William ""Tiger"" Dunlop → William ""Tiger"" Dunlo...",The following discussion is an archived discus...,2,6,0.826415,"""Tiger"" William Dunlop",Requested move,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,7.0
5,"""Tiger"" William Dunlop",Requested move,Deconstructhis,Not a problem to merge if that is the most app...,The following discussion is an archived discus...,1,7,0.033058,"""Tiger"" William Dunlop",Requested move,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,7.0
6,"""Tiger"" William Dunlop",Requested move,Deconstructhis,Not a problem to merge if that is the most app...,The following discussion is an archived discus...,1,7,0.033058,"""Tiger"" William Dunlop",Requested move,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,7.0
7,"""Tiger"" William Dunlop",Requested move,Skinsmoke,User:CJ3370/William_Tiger_Dunlop → William Tig...,The following discussion is an archived discus...,3,5,0.456897,"""Tiger"" William Dunlop",Requested move,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,7.0
8,"""Tiger"" William Dunlop",Requested move,Skinsmoke,User:CJ3370/William_Tiger_Dunlop → William Tig...,The following discussion is an archived discus...,3,5,0.456897,"""Tiger"" William Dunlop",Requested move,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,7.0
9,"""V"" Is for Vagina","Why is the ""i"" capitalized?",Gunmetal Angel,"""I"" is supposed to be capatalized in ""is"" in t...",It doesn't make sense and it doesn't represent...,1,9,1.000000,"""V"" Is for Vagina","Why is the ""i"" capitalized?",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,8.0


In [9]:
len(feats)

69215

In [10]:
feats_folds = feats[np.isfinite(feats['fold'])]
len(feats_folds)

62593

In [13]:
feats_folds.to_csv('/home/michael/school/research/wp/wikipedia/data/talk/enwiki/enwiki_talk_scores_folds.csv', index=False)

## DAs from Yohan

In [2]:
thread_user_topics = pd.read_csv('/home/michael/school/research/wp/wikipedia/data/talk/enwiki/enwiki_dialog_acts.csv')

### Others' DAs

In [9]:
# Load sentence-level DAs
sent_das = pd.read_csv('/home/michael/school/research/wp/wikipedia/data/talk/enwiki/HSTTM8W-talk0-S5-FT20-BT100-FA0.1-B0.001-FG1.0-K0.1-E0.75-N0.75-I10000-InstSentAssign.csv')
sent_das

Unnamed: 0,SeqId,InstNo,User,Sentence,TaggedText,State,FTopic
0,"""Tiger"" William Dunlop#Current image",0,Deconstructhis,0.0,F16:'ve B41:noticed F16:current F16:image F16:...,3.0,16.0
1,"""Tiger"" William Dunlop#Current image",0,Deconstructhis,1.0,F9:'s F9:too F9:bad F9:really F9:always F9:thi...,3.0,9.0
2,"""Tiger"" William Dunlop#Current image",0,Deconstructhis,2.0,F10:trust F10:me F10:'m F10:last F10:person B6...,3.0,10.0
3,"""Tiger"" William Dunlop#Current image",0,Deconstructhis,3.0,F10:if F10:have F10:some B30:time F10:over F10...,3.0,10.0
4,"""Tiger"" William Dunlop#Current image",0,Deconstructhis,4.0,F0:wonder F0:how F0:difficult F0:would F0:try ...,3.0,0.0
5,"""Tiger"" William Dunlop#Current image",0,Deconstructhis,5.0,F5:what F5:we F5:need F5:copy B41:late B14:19t...,3.0,5.0
6,"""Tiger"" William Dunlop#Current image",0,Deconstructhis,6.0,F18:cheers F18:deconstructhis F18:02:19 F18:july,3.0,18.0
7,"""Tiger"" William Dunlop#Current image",1,Ukexpat,0.0,F9:removed F9:deletion F9:tag F9:because F9:im...,3.0,9.0
8,"""Tiger"" William Dunlop#Current image",1,Ukexpat,1.0,F10:any F10:copyright F10:has F10:long F10:sin...,3.0,10.0
9,"""Tiger"" William Dunlop#Current image",1,Ukexpat,2.0,F18:ukexpat F18:02:30 F18:july,3.0,18.0


In [10]:
# Group things by seqid and user
user_grouped = sent_das.groupby(['SeqId', 'User'])
user_grouped.groups

{('"Tiger" William Dunlop#Current image',
  'Deconstructhis'): Int64Index([0, 1, 2, 3, 4, 5, 6, 10, 11, 12], dtype='int64'),
 ('"Tiger" William Dunlop#Current image',
  'Ukexpat'): Int64Index([7, 8, 9], dtype='int64'),
 ('"Tiger" William Dunlop#Current image',
  'Skinsmoke'): Int64Index([13, 14, 15], dtype='int64'),
 ('"Tiger" William Dunlop#Current image',
  'Canglesea'): Int64Index([16, 17], dtype='int64'),
 ('"Tiger" William Dunlop#Requested move',
  'Billinghurst'): Int64Index([18, 19, 20, 21, 22], dtype='int64'),
 ('"Tiger" William Dunlop#Requested move',
  'CJ3370'): Int64Index([23, 24], dtype='int64'),
 ('"Tiger" William Dunlop#Requested move',
  'Skinsmoke'): Int64Index([25, 26, 27, 28, 29], dtype='int64'),
 ('"Tiger" William Dunlop#Requested move',
  'Deconstructhis'): Int64Index([30, 31, 32, 33, 34, 35, 36], dtype='int64'),
 ('"V" Is for Vagina#Why is the "i" capitalized?',
  'Jennavecia'): Int64Index([37, 38, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 69, 70, 71, 72, 73,
      

In [11]:
# Group things by just seqid
grouped = sent_das.groupby(['SeqId'])
grouped.groups

{'"Tiger" William Dunlop#Current image': Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], dtype='int64'),
 '"Tiger" William Dunlop#Requested move': Int64Index([18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
             35, 36],
            dtype='int64'),
 '"V" Is for Vagina#Why is the "i" capitalized?': Int64Index([37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
             54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
             71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86],
            dtype='int64'),
 '&#"O" as a word': Int64Index([87, 88, 89, 90, 91, 92, 93, 94, 95, 96], dtype='int64'),
 '&#And per se and as a letter': Int64Index([ 97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
             110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120],
            dtype='int64'),
 '&#Etymology': Int64Index([121, 122, 123, 124, 125], dtype='int64'),
 '&#Evolution

In [12]:
# Build up other ftopic counts
ftopic_other_counts = {}

for (thread, user), inds in tqdm(user_grouped.groups.items()):
    thread_inds = grouped.groups[thread]
    other_inds = list(set(thread_inds) - set(inds))
    ftopics_other = Counter(sent_das.iloc[other_inds]['FTopic'].tolist())
    ftopic_other_counts[(thread,user)] = ftopics_other
    
len(ftopic_other_counts)




105653

In [13]:
# Make FTopic columns
# thread_other_topics = pd.DataFrame(ftopic_other_counts).transpose().fillna(0.0, inplace=True)
thread_other_topics = pd.DataFrame(ftopic_other_counts).transpose()
thread_other_topics = thread_other_topics[[float(x) for x in range(20)]]
thread_other_topics.fillna(0.0, inplace=True)
thread_other_topics

Unnamed: 0,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,14.0,15.0,16.0,17.0,18.0,19.0
"(""Tiger"" William Dunlop#Current image, Deconstructhis)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0
"(""Tiger"" William Dunlop#Current image, Ukexpat)",2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,3.0,0.0
"(""Tiger"" William Dunlop#Current image, Skinsmoke)",2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,3.0,0.0
"(""Tiger"" William Dunlop#Current image, Canglesea)",2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,5.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,4.0,0.0
"(""Tiger"" William Dunlop#Requested move, Billinghurst)",2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,4.0,0.0
"(""Tiger"" William Dunlop#Requested move, CJ3370)",3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,1.0,0.0,0.0,0.0,2.0,3.0,0.0,0.0,3.0,0.0
"(""Tiger"" William Dunlop#Requested move, Skinsmoke)",3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,2.0,3.0,0.0,0.0,3.0,0.0
"(""Tiger"" William Dunlop#Requested move, Deconstructhis)",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,2.0,4.0,0.0,0.0,2.0,0.0
"(""V"" Is for Vagina#Why is the ""i"" capitalized?, Jennavecia)",0.0,0.0,0.0,0.0,1.0,0.0,18.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5.0,1.0
"(""V"" Is for Vagina#Why is the ""i"" capitalized?, Skomorokh)",0.0,0.0,1.0,3.0,3.0,0.0,16.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,2.0


In [14]:
# Manipulate columns
thread_other_topics['editor'] = [item[1] for item in thread_other_topics.index]
thread_other_topics['article_name'] = [item[0].split('#')[0] for item in thread_other_topics.index]
thread_other_topics['thread'] = [item[0].split('#')[1] for item in thread_other_topics.index]
thread_other_topics

Unnamed: 0,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,13.0,14.0,15.0,16.0,17.0,18.0,19.0,editor,article_name,thread
"(""Tiger"" William Dunlop#Current image, Deconstructhis)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,2.0,2.0,0.0,Deconstructhis,"""Tiger"" William Dunlop",Current image
"(""Tiger"" William Dunlop#Current image, Ukexpat)",2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,3.0,3.0,0.0,Ukexpat,"""Tiger"" William Dunlop",Current image
"(""Tiger"" William Dunlop#Current image, Skinsmoke)",2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,1.0,3.0,3.0,0.0,Skinsmoke,"""Tiger"" William Dunlop",Current image
"(""Tiger"" William Dunlop#Current image, Canglesea)",2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,1.0,1.0,4.0,0.0,Canglesea,"""Tiger"" William Dunlop",Current image
"(""Tiger"" William Dunlop#Requested move, Billinghurst)",2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,0.0,2.0,0.0,0.0,4.0,0.0,Billinghurst,"""Tiger"" William Dunlop",Requested move
"(""Tiger"" William Dunlop#Requested move, CJ3370)",3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,2.0,3.0,0.0,0.0,3.0,0.0,CJ3370,"""Tiger"" William Dunlop",Requested move
"(""Tiger"" William Dunlop#Requested move, Skinsmoke)",3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,2.0,3.0,0.0,0.0,3.0,0.0,Skinsmoke,"""Tiger"" William Dunlop",Requested move
"(""Tiger"" William Dunlop#Requested move, Deconstructhis)",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,2.0,4.0,0.0,0.0,2.0,0.0,Deconstructhis,"""Tiger"" William Dunlop",Requested move
"(""V"" Is for Vagina#Why is the ""i"" capitalized?, Jennavecia)",0.0,0.0,0.0,0.0,1.0,0.0,18.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,5.0,1.0,Jennavecia,"""V"" Is for Vagina","Why is the ""i"" capitalized?"
"(""V"" Is for Vagina#Why is the ""i"" capitalized?, Skomorokh)",0.0,0.0,1.0,3.0,3.0,0.0,16.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,7.0,2.0,Skomorokh,"""V"" Is for Vagina","Why is the ""i"" capitalized?"


In [15]:
thread_other_topics.rename(columns={n: 'other_ftopic{:.0f}'.format(n) for n in range(20)}, inplace=True)
thread_other_topics

Unnamed: 0,other_ftopic0,other_ftopic1,other_ftopic2,other_ftopic3,other_ftopic4,other_ftopic5,other_ftopic6,other_ftopic7,other_ftopic8,other_ftopic9,...,other_ftopic13,other_ftopic14,other_ftopic15,other_ftopic16,other_ftopic17,other_ftopic18,other_ftopic19,editor,article_name,thread
"(""Tiger"" William Dunlop#Current image, Deconstructhis)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,2.0,2.0,0.0,Deconstructhis,"""Tiger"" William Dunlop",Current image
"(""Tiger"" William Dunlop#Current image, Ukexpat)",2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,3.0,3.0,0.0,Ukexpat,"""Tiger"" William Dunlop",Current image
"(""Tiger"" William Dunlop#Current image, Skinsmoke)",2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,1.0,3.0,3.0,0.0,Skinsmoke,"""Tiger"" William Dunlop",Current image
"(""Tiger"" William Dunlop#Current image, Canglesea)",2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,1.0,1.0,4.0,0.0,Canglesea,"""Tiger"" William Dunlop",Current image
"(""Tiger"" William Dunlop#Requested move, Billinghurst)",2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,0.0,2.0,0.0,0.0,4.0,0.0,Billinghurst,"""Tiger"" William Dunlop",Requested move
"(""Tiger"" William Dunlop#Requested move, CJ3370)",3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,2.0,3.0,0.0,0.0,3.0,0.0,CJ3370,"""Tiger"" William Dunlop",Requested move
"(""Tiger"" William Dunlop#Requested move, Skinsmoke)",3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,2.0,3.0,0.0,0.0,3.0,0.0,Skinsmoke,"""Tiger"" William Dunlop",Requested move
"(""Tiger"" William Dunlop#Requested move, Deconstructhis)",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,2.0,4.0,0.0,0.0,2.0,0.0,Deconstructhis,"""Tiger"" William Dunlop",Requested move
"(""V"" Is for Vagina#Why is the ""i"" capitalized?, Jennavecia)",0.0,0.0,0.0,0.0,1.0,0.0,18.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,5.0,1.0,Jennavecia,"""V"" Is for Vagina","Why is the ""i"" capitalized?"
"(""V"" Is for Vagina#Why is the ""i"" capitalized?, Skomorokh)",0.0,0.0,1.0,3.0,3.0,0.0,16.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,7.0,2.0,Skomorokh,"""V"" Is for Vagina","Why is the ""i"" capitalized?"


In [16]:
ed_ftopics = pd.read_csv('/home/michael/school/research/wp/wikipedia/data/talk/enwiki/enwiki_talk_scores_ftopics.csv')
ed_ftopics

Unnamed: 0,article,thread_title,editor,editor_talk,other_talk,#editor_turns,#other_turns,editor_score,article_name,thread,...,ftopic10,ftopic11,ftopic12,ftopic13,ftopic14,ftopic15,ftopic16,ftopic17,ftopic18,ftopic19
0,"""Tiger"" William Dunlop",Current image,Deconstructhis,I've noticed that the current image that's acc...,I removed the deletion tag because the image i...,2,3,0.033058,"""Tiger"" William Dunlop",Current image,...,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0
1,"""Tiger"" William Dunlop",Current image,Skinsmoke,::Good move ukexpat. That was really helpful (...,I've noticed that the current image that's acc...,1,4,0.482759,"""Tiger"" William Dunlop",Current image,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,"""Tiger"" William Dunlop",Current image,Ukexpat,I removed the deletion tag because the image i...,I've noticed that the current image that's acc...,1,4,0.392157,"""Tiger"" William Dunlop",Current image,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,"""Tiger"" William Dunlop",Requested move,CJ3370,"William ""Tiger"" Dunlop → William ""Tiger"" Dunlo...",The following discussion is an archived discus...,2,6,0.826415,"""Tiger"" William Dunlop",Requested move,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,"""Tiger"" William Dunlop",Requested move,CJ3370,"William ""Tiger"" Dunlop → William ""Tiger"" Dunlo...",The following discussion is an archived discus...,2,6,0.826415,"""Tiger"" William Dunlop",Requested move,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
5,"""Tiger"" William Dunlop",Requested move,Deconstructhis,Not a problem to merge if that is the most app...,The following discussion is an archived discus...,1,7,0.033058,"""Tiger"" William Dunlop",Requested move,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
6,"""Tiger"" William Dunlop",Requested move,Deconstructhis,Not a problem to merge if that is the most app...,The following discussion is an archived discus...,1,7,0.033058,"""Tiger"" William Dunlop",Requested move,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
7,"""Tiger"" William Dunlop",Requested move,Skinsmoke,User:CJ3370/William_Tiger_Dunlop → William Tig...,The following discussion is an archived discus...,3,5,0.456897,"""Tiger"" William Dunlop",Requested move,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
8,"""Tiger"" William Dunlop",Requested move,Skinsmoke,User:CJ3370/William_Tiger_Dunlop → William Tig...,The following discussion is an archived discus...,3,5,0.456897,"""Tiger"" William Dunlop",Requested move,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
9,"""V"" Is for Vagina","Why is the ""i"" capitalized?",Gunmetal Angel,"""I"" is supposed to be capatalized in ""is"" in t...",It doesn't make sense and it doesn't represent...,1,9,1.000000,"""V"" Is for Vagina","Why is the ""i"" capitalized?",...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [18]:
# Merge in other topics
merged = pd.merge(ed_ftopics, thread_other_topics, left_on=['article', 'thread_title', 'editor'], right_on=['article_name', 'thread', 'editor'])
print(len(merged))
merged.columns

69217


Index(['article', 'thread_title', 'editor', 'editor_talk', 'other_talk',
       '#editor_turns', '#other_turns', 'editor_score', 'article_name_x',
       'thread_x', 'ftopic0', 'ftopic1', 'ftopic2', 'ftopic3', 'ftopic4',
       'ftopic5', 'ftopic6', 'ftopic7', 'ftopic8', 'ftopic9', 'ftopic10',
       'ftopic11', 'ftopic12', 'ftopic13', 'ftopic14', 'ftopic15', 'ftopic16',
       'ftopic17', 'ftopic18', 'ftopic19', 'other_ftopic0', 'other_ftopic1',
       'other_ftopic2', 'other_ftopic3', 'other_ftopic4', 'other_ftopic5',
       'other_ftopic6', 'other_ftopic7', 'other_ftopic8', 'other_ftopic9',
       'other_ftopic10', 'other_ftopic11', 'other_ftopic12', 'other_ftopic13',
       'other_ftopic14', 'other_ftopic15', 'other_ftopic16', 'other_ftopic17',
       'other_ftopic18', 'other_ftopic19', 'article_name_y', 'thread_y'],
      dtype='object')

In [19]:
merged.drop(['article_name_x', 'thread_x', 'article_name_y', 'thread_y'], axis=1, inplace=True)
merged.columns

Index(['article', 'thread_title', 'editor', 'editor_talk', 'other_talk',
       '#editor_turns', '#other_turns', 'editor_score', 'ftopic0', 'ftopic1',
       'ftopic2', 'ftopic3', 'ftopic4', 'ftopic5', 'ftopic6', 'ftopic7',
       'ftopic8', 'ftopic9', 'ftopic10', 'ftopic11', 'ftopic12', 'ftopic13',
       'ftopic14', 'ftopic15', 'ftopic16', 'ftopic17', 'ftopic18', 'ftopic19',
       'other_ftopic0', 'other_ftopic1', 'other_ftopic2', 'other_ftopic3',
       'other_ftopic4', 'other_ftopic5', 'other_ftopic6', 'other_ftopic7',
       'other_ftopic8', 'other_ftopic9', 'other_ftopic10', 'other_ftopic11',
       'other_ftopic12', 'other_ftopic13', 'other_ftopic14', 'other_ftopic15',
       'other_ftopic16', 'other_ftopic17', 'other_ftopic18', 'other_ftopic19'],
      dtype='object')

In [22]:
merged.to_csv('/home/michael/school/research/wp/wikipedia/data/talk/enwiki/enwiki_talk_scores_ftopics.csv', index=False)

In [25]:
merged.rename(columns={'thread_title': 'thread'}, inplace=True)
merged.columns

Index(['article', 'thread', 'editor', 'editor_talk', 'other_talk',
       '#editor_turns', '#other_turns', 'editor_score', 'ftopic0', 'ftopic1',
       'ftopic2', 'ftopic3', 'ftopic4', 'ftopic5', 'ftopic6', 'ftopic7',
       'ftopic8', 'ftopic9', 'ftopic10', 'ftopic11', 'ftopic12', 'ftopic13',
       'ftopic14', 'ftopic15', 'ftopic16', 'ftopic17', 'ftopic18', 'ftopic19',
       'other_ftopic0', 'other_ftopic1', 'other_ftopic2', 'other_ftopic3',
       'other_ftopic4', 'other_ftopic5', 'other_ftopic6', 'other_ftopic7',
       'other_ftopic8', 'other_ftopic9', 'other_ftopic10', 'other_ftopic11',
       'other_ftopic12', 'other_ftopic13', 'other_ftopic14', 'other_ftopic15',
       'other_ftopic16', 'other_ftopic17', 'other_ftopic18', 'other_ftopic19'],
      dtype='object')

In [23]:
# Merge into allfeats
feats = pd.read_csv('/home/michael/school/research/wp/wikipedia/data/talk/enwiki/enwiki_talk_scores_allfeats.csv')
feats.columns

Index(['fold', 'article', 'thread', 'editor', 'editor_talk', 'other_talk',
       '#editor_turns', '#other_turns', 'editor_score', 'ftopic0', 'ftopic1',
       'ftopic2', 'ftopic3', 'ftopic4', 'ftopic5', 'ftopic6', 'ftopic7',
       'ftopic8', 'ftopic9', 'ftopic10', 'ftopic11', 'ftopic12', 'ftopic13',
       'ftopic14', 'ftopic15', 'ftopic16', 'ftopic17', 'ftopic18', 'ftopic19',
       '#editors'],
      dtype='object')

In [26]:
allfeats = pd.merge(feats, merged)
print(len(allfeats))
allfeats.columns

56055


Index(['fold', 'article', 'thread', 'editor', 'editor_talk', 'other_talk',
       '#editor_turns', '#other_turns', 'editor_score', 'ftopic0', 'ftopic1',
       'ftopic2', 'ftopic3', 'ftopic4', 'ftopic5', 'ftopic6', 'ftopic7',
       'ftopic8', 'ftopic9', 'ftopic10', 'ftopic11', 'ftopic12', 'ftopic13',
       'ftopic14', 'ftopic15', 'ftopic16', 'ftopic17', 'ftopic18', 'ftopic19',
       '#editors', 'other_ftopic0', 'other_ftopic1', 'other_ftopic2',
       'other_ftopic3', 'other_ftopic4', 'other_ftopic5', 'other_ftopic6',
       'other_ftopic7', 'other_ftopic8', 'other_ftopic9', 'other_ftopic10',
       'other_ftopic11', 'other_ftopic12', 'other_ftopic13', 'other_ftopic14',
       'other_ftopic15', 'other_ftopic16', 'other_ftopic17', 'other_ftopic18',
       'other_ftopic19'],
      dtype='object')

In [27]:
cols = allfeats.columns.tolist()
ed_idx = cols.index('#editors')
new_cols = cols[:ed_idx] + cols[ed_idx+1:] + cols[ed_idx:ed_idx+1]
new_cols

['fold',
 'article',
 'thread',
 'editor',
 'editor_talk',
 'other_talk',
 '#editor_turns',
 '#other_turns',
 'editor_score',
 'ftopic0',
 'ftopic1',
 'ftopic2',
 'ftopic3',
 'ftopic4',
 'ftopic5',
 'ftopic6',
 'ftopic7',
 'ftopic8',
 'ftopic9',
 'ftopic10',
 'ftopic11',
 'ftopic12',
 'ftopic13',
 'ftopic14',
 'ftopic15',
 'ftopic16',
 'ftopic17',
 'ftopic18',
 'ftopic19',
 'other_ftopic0',
 'other_ftopic1',
 'other_ftopic2',
 'other_ftopic3',
 'other_ftopic4',
 'other_ftopic5',
 'other_ftopic6',
 'other_ftopic7',
 'other_ftopic8',
 'other_ftopic9',
 'other_ftopic10',
 'other_ftopic11',
 'other_ftopic12',
 'other_ftopic13',
 'other_ftopic14',
 'other_ftopic15',
 'other_ftopic16',
 'other_ftopic17',
 'other_ftopic18',
 'other_ftopic19',
 '#editors']

In [28]:
rearranged = allfeats[new_cols]
print(len(rearranged))
rearranged.columns

56055


Index(['fold', 'article', 'thread', 'editor', 'editor_talk', 'other_talk',
       '#editor_turns', '#other_turns', 'editor_score', 'ftopic0', 'ftopic1',
       'ftopic2', 'ftopic3', 'ftopic4', 'ftopic5', 'ftopic6', 'ftopic7',
       'ftopic8', 'ftopic9', 'ftopic10', 'ftopic11', 'ftopic12', 'ftopic13',
       'ftopic14', 'ftopic15', 'ftopic16', 'ftopic17', 'ftopic18', 'ftopic19',
       'other_ftopic0', 'other_ftopic1', 'other_ftopic2', 'other_ftopic3',
       'other_ftopic4', 'other_ftopic5', 'other_ftopic6', 'other_ftopic7',
       'other_ftopic8', 'other_ftopic9', 'other_ftopic10', 'other_ftopic11',
       'other_ftopic12', 'other_ftopic13', 'other_ftopic14', 'other_ftopic15',
       'other_ftopic16', 'other_ftopic17', 'other_ftopic18', 'other_ftopic19',
       '#editors'],
      dtype='object')

In [29]:
rearranged.to_csv('/home/michael/school/research/wp/wikipedia/data/talk/enwiki/enwiki_talk_scores_allfeats.csv', index=False)

### Editor DAs

In [9]:
# Load editor talk scores
scores = pd.read_csv('/home/michael/school/research/wp/wikipedia/data/talk/enwiki/enwiki_talk_scores.csv')
scores

Unnamed: 0,article,thread_title,editor,editor_talk,other_talk,#editor_turns,#other_turns,editor_score
0,"""Tiger"" William Dunlop",Current image,Deconstructhis,I've noticed that the current image that's acc...,I removed the deletion tag because the image i...,2,3,0.033058
1,"""Tiger"" William Dunlop",Current image,Skinsmoke,::Good move ukexpat. That was really helpful (...,I've noticed that the current image that's acc...,1,4,0.482759
2,"""Tiger"" William Dunlop",Current image,Ukexpat,I removed the deletion tag because the image i...,I've noticed that the current image that's acc...,1,4,0.392157
3,"""Tiger"" William Dunlop",Requested move,CJ3370,"William ""Tiger"" Dunlop → William ""Tiger"" Dunlo...",The following discussion is an archived discus...,2,6,0.826415
4,"""Tiger"" William Dunlop",Requested move,CJ3370,"William ""Tiger"" Dunlop → William ""Tiger"" Dunlo...",The following discussion is an archived discus...,2,6,0.826415
5,"""Tiger"" William Dunlop",Requested move,Deconstructhis,Not a problem to merge if that is the most app...,The following discussion is an archived discus...,1,7,0.033058
6,"""Tiger"" William Dunlop",Requested move,Deconstructhis,Not a problem to merge if that is the most app...,The following discussion is an archived discus...,1,7,0.033058
7,"""Tiger"" William Dunlop",Requested move,Skinsmoke,User:CJ3370/William_Tiger_Dunlop → William Tig...,The following discussion is an archived discus...,3,5,0.456897
8,"""Tiger"" William Dunlop",Requested move,Skinsmoke,User:CJ3370/William_Tiger_Dunlop → William Tig...,The following discussion is an archived discus...,3,5,0.456897
9,"""V"" Is for Vagina","Why is the ""i"" capitalized?",Gunmetal Angel,"""I"" is supposed to be capatalized in ""is"" in t...",It doesn't make sense and it doesn't represent...,1,9,1.000000


In [5]:
thread_user_topics

Unnamed: 0,article_name,thread,editor,ftopic0,ftopic1,ftopic2,ftopic3,ftopic4,ftopic5,ftopic6,...,ftopic10,ftopic11,ftopic12,ftopic13,ftopic14,ftopic15,ftopic16,ftopic17,ftopic18,ftopic19
0,"""Tiger"" William Dunlop",Current image,Deconstructhis,2.0,0.0,0.0,0.0,0.0,1.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0
1,"""Tiger"" William Dunlop",Current image,Ukexpat,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,"""Tiger"" William Dunlop",Current image,Skinsmoke,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,"""Tiger"" William Dunlop",Current image,Canglesea,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,"""Tiger"" William Dunlop",Requested move,Billinghurst,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0
5,"""Tiger"" William Dunlop",Requested move,CJ3370,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
6,"""Tiger"" William Dunlop",Requested move,Skinsmoke,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
7,"""Tiger"" William Dunlop",Requested move,Deconstructhis,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
8,"""V"" Is for Vagina","Why is the ""i"" capitalized?",Jennavecia,0.0,0.0,1.0,3.0,2.0,0.0,8.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0
9,"""V"" Is for Vagina","Why is the ""i"" capitalized?",Skomorokh,0.0,0.0,0.0,0.0,0.0,0.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0


In [11]:
print(len(thread_user_topics))
print(len(scores))

105653
69330


In [10]:
# Merge scores and topics
merged = pd.merge(scores, thread_user_topics, left_on=['article', 'thread_title', 'editor'], right_on=['article_name', 'thread', 'editor'])
print(len(merged))
merged

69215


Unnamed: 0,article,thread_title,editor,editor_talk,other_talk,#editor_turns,#other_turns,editor_score,article_name,thread,...,ftopic10,ftopic11,ftopic12,ftopic13,ftopic14,ftopic15,ftopic16,ftopic17,ftopic18,ftopic19
0,"""Tiger"" William Dunlop",Current image,Deconstructhis,I've noticed that the current image that's acc...,I removed the deletion tag because the image i...,2,3,0.033058,"""Tiger"" William Dunlop",Current image,...,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0
1,"""Tiger"" William Dunlop",Current image,Skinsmoke,::Good move ukexpat. That was really helpful (...,I've noticed that the current image that's acc...,1,4,0.482759,"""Tiger"" William Dunlop",Current image,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,"""Tiger"" William Dunlop",Current image,Ukexpat,I removed the deletion tag because the image i...,I've noticed that the current image that's acc...,1,4,0.392157,"""Tiger"" William Dunlop",Current image,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,"""Tiger"" William Dunlop",Requested move,CJ3370,"William ""Tiger"" Dunlop → William ""Tiger"" Dunlo...",The following discussion is an archived discus...,2,6,0.826415,"""Tiger"" William Dunlop",Requested move,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,"""Tiger"" William Dunlop",Requested move,CJ3370,"William ""Tiger"" Dunlop → William ""Tiger"" Dunlo...",The following discussion is an archived discus...,2,6,0.826415,"""Tiger"" William Dunlop",Requested move,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
5,"""Tiger"" William Dunlop",Requested move,Deconstructhis,Not a problem to merge if that is the most app...,The following discussion is an archived discus...,1,7,0.033058,"""Tiger"" William Dunlop",Requested move,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
6,"""Tiger"" William Dunlop",Requested move,Deconstructhis,Not a problem to merge if that is the most app...,The following discussion is an archived discus...,1,7,0.033058,"""Tiger"" William Dunlop",Requested move,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
7,"""Tiger"" William Dunlop",Requested move,Skinsmoke,User:CJ3370/William_Tiger_Dunlop → William Tig...,The following discussion is an archived discus...,3,5,0.456897,"""Tiger"" William Dunlop",Requested move,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
8,"""Tiger"" William Dunlop",Requested move,Skinsmoke,User:CJ3370/William_Tiger_Dunlop → William Tig...,The following discussion is an archived discus...,3,5,0.456897,"""Tiger"" William Dunlop",Requested move,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
9,"""V"" Is for Vagina","Why is the ""i"" capitalized?",Gunmetal Angel,"""I"" is supposed to be capatalized in ""is"" in t...",It doesn't make sense and it doesn't represent...,1,9,1.000000,"""V"" Is for Vagina","Why is the ""i"" capitalized?",...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [12]:
merged.to_csv('/home/michael/school/research/wp/wikipedia/data/talk/enwiki/enwiki_talk_scores_allfeats.csv', index=False)

In [2]:
# Load sentence-level DAs
sent_das = pd.read_csv('/home/michael/school/research/wp/wikipedia/data/talk/enwiki/HSTTM8W-talk0-S5-FT20-BT100-FA0.1-B0.001-FG1.0-K0.1-E0.75-N0.75-I10000-InstSentAssign.csv')
sent_das

Unnamed: 0,SeqId,InstNo,User,Sentence,TaggedText,State,FTopic
0,"""Tiger"" William Dunlop#Current image",0,Deconstructhis,0.0,F16:'ve B41:noticed F16:current F16:image F16:...,3.0,16.0
1,"""Tiger"" William Dunlop#Current image",0,Deconstructhis,1.0,F9:'s F9:too F9:bad F9:really F9:always F9:thi...,3.0,9.0
2,"""Tiger"" William Dunlop#Current image",0,Deconstructhis,2.0,F10:trust F10:me F10:'m F10:last F10:person B6...,3.0,10.0
3,"""Tiger"" William Dunlop#Current image",0,Deconstructhis,3.0,F10:if F10:have F10:some B30:time F10:over F10...,3.0,10.0
4,"""Tiger"" William Dunlop#Current image",0,Deconstructhis,4.0,F0:wonder F0:how F0:difficult F0:would F0:try ...,3.0,0.0
5,"""Tiger"" William Dunlop#Current image",0,Deconstructhis,5.0,F5:what F5:we F5:need F5:copy B41:late B14:19t...,3.0,5.0
6,"""Tiger"" William Dunlop#Current image",0,Deconstructhis,6.0,F18:cheers F18:deconstructhis F18:02:19 F18:july,3.0,18.0
7,"""Tiger"" William Dunlop#Current image",1,Ukexpat,0.0,F9:removed F9:deletion F9:tag F9:because F9:im...,3.0,9.0
8,"""Tiger"" William Dunlop#Current image",1,Ukexpat,1.0,F10:any F10:copyright F10:has F10:long F10:sin...,3.0,10.0
9,"""Tiger"" William Dunlop#Current image",1,Ukexpat,2.0,F18:ukexpat F18:02:30 F18:july,3.0,18.0


In [5]:
# Empty string for nans in text
feats = feats[feats['editor_talk'].map(lambda x: isinstance(x, str))]
feats = feats[feats['other_talk'].map(lambda x: isinstance(x, str))]
len(feats)

53175

In [3]:
sent_das['State'].unique()

array([  3.,   0.,   2.,   4.,   1.,  nan])

In [4]:
sent_das['FTopic'].unique()

array([ 16.,   9.,  10.,   0.,   5.,  18.,  17.,  15.,  14.,   6.,   7.,
        19.,   3.,   2.,   4.,   8.,  13.,  11.,   1.,  12.,  nan])

In [3]:
# Group things by seqid
grouped = sent_das.groupby(['SeqId', 'User'])
grouped

<pandas.core.groupby.DataFrameGroupBy object at 0x7fa53ac00828>

In [10]:
grouped.groups

{('"Tiger" William Dunlop#Current image',
  'Deconstructhis'): Int64Index([0, 1, 2, 3, 4, 5, 6, 10, 11, 12], dtype='int64'),
 ('"Tiger" William Dunlop#Current image',
  'Ukexpat'): Int64Index([7, 8, 9], dtype='int64'),
 ('"Tiger" William Dunlop#Current image',
  'Skinsmoke'): Int64Index([13, 14, 15], dtype='int64'),
 ('"Tiger" William Dunlop#Current image',
  'Canglesea'): Int64Index([16, 17], dtype='int64'),
 ('"Tiger" William Dunlop#Requested move',
  'Billinghurst'): Int64Index([18, 19, 20, 21, 22], dtype='int64'),
 ('"Tiger" William Dunlop#Requested move',
  'CJ3370'): Int64Index([23, 24], dtype='int64'),
 ('"Tiger" William Dunlop#Requested move',
  'Skinsmoke'): Int64Index([25, 26, 27, 28, 29], dtype='int64'),
 ('"Tiger" William Dunlop#Requested move',
  'Deconstructhis'): Int64Index([30, 31, 32, 33, 34, 35, 36], dtype='int64'),
 ('"V" Is for Vagina#Why is the "i" capitalized?',
  'Jennavecia'): Int64Index([37, 38, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 69, 70, 71, 72, 73,
      

In [4]:
ftopic_counts = {}

for (thread, user), inds in tqdm(grouped.groups.items()):
    ftopics = Counter(sent_das.iloc[inds]['FTopic'].tolist())
    ftopic_counts[(thread, user)] = ftopics
    
len(ftopic_counts)




105653

In [18]:
ftopic_counts[list(ftopic_counts.keys())[0]]

Counter({0.0: 2, 5.0: 1, 9.0: 1, 10.0: 2, 16.0: 1, 17.0: 1, 18.0: 2})

In [5]:
# Make FTopic columns
thread_user_topics = pd.DataFrame(ftopic_counts).transpose()
thread_user_topics

Unnamed: 0,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,nan,nan.1,nan.2,nan.3,nan.4,nan.5,nan.6,nan.7,nan.8,nan.9
"(""Tiger"" William Dunlop#Current image, Deconstructhis)",2.0,,,,,1.0,,,,1.0,...,,,,,,,,,,
"(""Tiger"" William Dunlop#Current image, Ukexpat)",,,,,,,,,,1.0,...,,,,,,,,,,
"(""Tiger"" William Dunlop#Current image, Skinsmoke)",,,,,,,,,,,...,,,,,,,,,,
"(""Tiger"" William Dunlop#Current image, Canglesea)",,,,,,,,,,,...,,,,,,,,,,
"(""Tiger"" William Dunlop#Requested move, Billinghurst)",1.0,,,,,,,,,,...,,,,,,,,,,
"(""Tiger"" William Dunlop#Requested move, CJ3370)",,,,,,,,,,,...,,,,,,,,,,
"(""Tiger"" William Dunlop#Requested move, Skinsmoke)",,,,,,,,,,3.0,...,,,,,,,,,,
"(""Tiger"" William Dunlop#Requested move, Deconstructhis)",2.0,,,,,,,,,2.0,...,,,,,,,,,,
"(""V"" Is for Vagina#Why is the ""i"" capitalized?, Jennavecia)",,,1.0,3.0,2.0,,8.0,,1.0,,...,,,,,,,,,,
"(""V"" Is for Vagina#Why is the ""i"" capitalized?, Skomorokh)",,,,,,,10.0,1.0,,,...,,,,,,,,,,


In [6]:
thread_user_topics.fillna(0.0, inplace=True)
thread_user_topics

Unnamed: 0,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,nan,nan.1,nan.2,nan.3,nan.4,nan.5,nan.6,nan.7,nan.8,nan.9
"(""Tiger"" William Dunlop#Current image, Deconstructhis)",2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(""Tiger"" William Dunlop#Current image, Ukexpat)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(""Tiger"" William Dunlop#Current image, Skinsmoke)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(""Tiger"" William Dunlop#Current image, Canglesea)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(""Tiger"" William Dunlop#Requested move, Billinghurst)",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(""Tiger"" William Dunlop#Requested move, CJ3370)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(""Tiger"" William Dunlop#Requested move, Skinsmoke)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(""Tiger"" William Dunlop#Requested move, Deconstructhis)",2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(""V"" Is for Vagina#Why is the ""i"" capitalized?, Jennavecia)",0.0,0.0,1.0,3.0,2.0,0.0,8.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(""V"" Is for Vagina#Why is the ""i"" capitalized?, Skomorokh)",0.0,0.0,0.0,0.0,0.0,0.0,10.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
thread_user_topics.columns.tolist()

[0.0,
 1.0,
 2.0,
 3.0,
 4.0,
 5.0,
 6.0,
 7.0,
 8.0,
 9.0,
 10.0,
 11.0,
 12.0,
 13.0,
 14.0,
 15.0,
 16.0,
 17.0,
 18.0,
 19.0,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan]

In [7]:
thread_user_topics = thread_user_topics[[float(x) for x in range(20)]]
thread_user_topics.columns.tolist()

[0.0,
 1.0,
 2.0,
 3.0,
 4.0,
 5.0,
 6.0,
 7.0,
 8.0,
 9.0,
 10.0,
 11.0,
 12.0,
 13.0,
 14.0,
 15.0,
 16.0,
 17.0,
 18.0,
 19.0]

In [8]:
# Manipulate columns
thread_user_topics['editor'] = [item[1] for item in thread_user_topics.index]
thread_user_topics['article_name'] = [item[0].split('#')[0] for item in thread_user_topics.index]
thread_user_topics['thread'] = [item[0].split('#')[1] for item in thread_user_topics.index]
thread_user_topics

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,13.0,14.0,15.0,16.0,17.0,18.0,19.0,editor,article_name,thread
"(""Tiger"" William Dunlop#Current image, Deconstructhis)",2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,2.0,0.0,Deconstructhis,"""Tiger"" William Dunlop",Current image
"(""Tiger"" William Dunlop#Current image, Ukexpat)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Ukexpat,"""Tiger"" William Dunlop",Current image
"(""Tiger"" William Dunlop#Current image, Skinsmoke)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Skinsmoke,"""Tiger"" William Dunlop",Current image
"(""Tiger"" William Dunlop#Current image, Canglesea)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,Canglesea,"""Tiger"" William Dunlop",Current image
"(""Tiger"" William Dunlop#Requested move, Billinghurst)",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,2.0,0.0,0.0,0.0,0.0,Billinghurst,"""Tiger"" William Dunlop",Requested move
"(""Tiger"" William Dunlop#Requested move, CJ3370)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,CJ3370,"""Tiger"" William Dunlop",Requested move
"(""Tiger"" William Dunlop#Requested move, Skinsmoke)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,Skinsmoke,"""Tiger"" William Dunlop",Requested move
"(""Tiger"" William Dunlop#Requested move, Deconstructhis)",2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,Deconstructhis,"""Tiger"" William Dunlop",Requested move
"(""V"" Is for Vagina#Why is the ""i"" capitalized?, Jennavecia)",0.0,0.0,1.0,3.0,2.0,0.0,8.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,1.0,Jennavecia,"""V"" Is for Vagina","Why is the ""i"" capitalized?"
"(""V"" Is for Vagina#Why is the ""i"" capitalized?, Skomorokh)",0.0,0.0,0.0,0.0,0.0,0.0,10.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,2.0,0.0,Skomorokh,"""V"" Is for Vagina","Why is the ""i"" capitalized?"


In [31]:
len(thread_user_topics)

105653

In [9]:
thread_user_topics.rename(columns={n: 'ftopic{:.0f}'.format(n) for n in range(20)}, inplace=True)
thread_user_topics

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)


Unnamed: 0,ftopic0,ftopic1,ftopic2,ftopic3,ftopic4,ftopic5,ftopic6,ftopic7,ftopic8,ftopic9,...,ftopic13,ftopic14,ftopic15,ftopic16,ftopic17,ftopic18,ftopic19,editor,article_name,thread
"(""Tiger"" William Dunlop#Current image, Deconstructhis)",2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,2.0,0.0,Deconstructhis,"""Tiger"" William Dunlop",Current image
"(""Tiger"" William Dunlop#Current image, Ukexpat)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Ukexpat,"""Tiger"" William Dunlop",Current image
"(""Tiger"" William Dunlop#Current image, Skinsmoke)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Skinsmoke,"""Tiger"" William Dunlop",Current image
"(""Tiger"" William Dunlop#Current image, Canglesea)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,Canglesea,"""Tiger"" William Dunlop",Current image
"(""Tiger"" William Dunlop#Requested move, Billinghurst)",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,2.0,0.0,0.0,0.0,0.0,Billinghurst,"""Tiger"" William Dunlop",Requested move
"(""Tiger"" William Dunlop#Requested move, CJ3370)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,CJ3370,"""Tiger"" William Dunlop",Requested move
"(""Tiger"" William Dunlop#Requested move, Skinsmoke)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,Skinsmoke,"""Tiger"" William Dunlop",Requested move
"(""Tiger"" William Dunlop#Requested move, Deconstructhis)",2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,Deconstructhis,"""Tiger"" William Dunlop",Requested move
"(""V"" Is for Vagina#Why is the ""i"" capitalized?, Jennavecia)",0.0,0.0,1.0,3.0,2.0,0.0,8.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,1.0,Jennavecia,"""V"" Is for Vagina","Why is the ""i"" capitalized?"
"(""V"" Is for Vagina#Why is the ""i"" capitalized?, Skomorokh)",0.0,0.0,0.0,0.0,0.0,0.0,10.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,2.0,0.0,Skomorokh,"""V"" Is for Vagina","Why is the ""i"" capitalized?"


In [11]:
cols = thread_user_topics.columns.tolist()
new_cols = cols[-3:] + cols[:-3]
thread_user_topics = thread_user_topics[new_cols]
thread_user_topics

Unnamed: 0,editor,article_name,thread,ftopic0,ftopic1,ftopic2,ftopic3,ftopic4,ftopic5,ftopic6,...,ftopic10,ftopic11,ftopic12,ftopic13,ftopic14,ftopic15,ftopic16,ftopic17,ftopic18,ftopic19
"(""Tiger"" William Dunlop#Current image, Deconstructhis)",Deconstructhis,"""Tiger"" William Dunlop",Current image,2.0,0.0,0.0,0.0,0.0,1.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0
"(""Tiger"" William Dunlop#Current image, Ukexpat)",Ukexpat,"""Tiger"" William Dunlop",Current image,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
"(""Tiger"" William Dunlop#Current image, Skinsmoke)",Skinsmoke,"""Tiger"" William Dunlop",Current image,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
"(""Tiger"" William Dunlop#Current image, Canglesea)",Canglesea,"""Tiger"" William Dunlop",Current image,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
"(""Tiger"" William Dunlop#Requested move, Billinghurst)",Billinghurst,"""Tiger"" William Dunlop",Requested move,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0
"(""Tiger"" William Dunlop#Requested move, CJ3370)",CJ3370,"""Tiger"" William Dunlop",Requested move,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
"(""Tiger"" William Dunlop#Requested move, Skinsmoke)",Skinsmoke,"""Tiger"" William Dunlop",Requested move,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
"(""Tiger"" William Dunlop#Requested move, Deconstructhis)",Deconstructhis,"""Tiger"" William Dunlop",Requested move,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
"(""V"" Is for Vagina#Why is the ""i"" capitalized?, Jennavecia)",Jennavecia,"""V"" Is for Vagina","Why is the ""i"" capitalized?",0.0,0.0,1.0,3.0,2.0,0.0,8.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0
"(""V"" Is for Vagina#Why is the ""i"" capitalized?, Skomorokh)",Skomorokh,"""V"" Is for Vagina","Why is the ""i"" capitalized?",0.0,0.0,0.0,0.0,0.0,0.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0


In [14]:
new_cols = cols[-2:] + cols[-3:-2] + cols[:-3]
new_cols

['article_name',
 'thread',
 'editor',
 'ftopic0',
 'ftopic1',
 'ftopic2',
 'ftopic3',
 'ftopic4',
 'ftopic5',
 'ftopic6',
 'ftopic7',
 'ftopic8',
 'ftopic9',
 'ftopic10',
 'ftopic11',
 'ftopic12',
 'ftopic13',
 'ftopic14',
 'ftopic15',
 'ftopic16',
 'ftopic17',
 'ftopic18',
 'ftopic19']

In [15]:
thread_user_topics = thread_user_topics[new_cols]
thread_user_topics

Unnamed: 0,article_name,thread,editor,ftopic0,ftopic1,ftopic2,ftopic3,ftopic4,ftopic5,ftopic6,...,ftopic10,ftopic11,ftopic12,ftopic13,ftopic14,ftopic15,ftopic16,ftopic17,ftopic18,ftopic19
"(""Tiger"" William Dunlop#Current image, Deconstructhis)","""Tiger"" William Dunlop",Current image,Deconstructhis,2.0,0.0,0.0,0.0,0.0,1.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0
"(""Tiger"" William Dunlop#Current image, Ukexpat)","""Tiger"" William Dunlop",Current image,Ukexpat,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
"(""Tiger"" William Dunlop#Current image, Skinsmoke)","""Tiger"" William Dunlop",Current image,Skinsmoke,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
"(""Tiger"" William Dunlop#Current image, Canglesea)","""Tiger"" William Dunlop",Current image,Canglesea,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
"(""Tiger"" William Dunlop#Requested move, Billinghurst)","""Tiger"" William Dunlop",Requested move,Billinghurst,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0
"(""Tiger"" William Dunlop#Requested move, CJ3370)","""Tiger"" William Dunlop",Requested move,CJ3370,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
"(""Tiger"" William Dunlop#Requested move, Skinsmoke)","""Tiger"" William Dunlop",Requested move,Skinsmoke,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
"(""Tiger"" William Dunlop#Requested move, Deconstructhis)","""Tiger"" William Dunlop",Requested move,Deconstructhis,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
"(""V"" Is for Vagina#Why is the ""i"" capitalized?, Jennavecia)","""V"" Is for Vagina","Why is the ""i"" capitalized?",Jennavecia,0.0,0.0,1.0,3.0,2.0,0.0,8.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0
"(""V"" Is for Vagina#Why is the ""i"" capitalized?, Skomorokh)","""V"" Is for Vagina","Why is the ""i"" capitalized?",Skomorokh,0.0,0.0,0.0,0.0,0.0,0.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0


In [17]:
thread_user_topics.to_csv('/home/michael/school/research/wp/wikipedia/data/talk/enwiki/enwiki_dialog_acts.csv', index=False)

In [None]:
# Load editor talk scores
scores = pd.read_csv('/home/michael/school/research/wp/wikipedia/data/')

# Linear regression baseline on Keith's data

In [7]:
# Load data
with open('/home/michael/school/research/wp/ijcnlp_data/ijcnlp_fold1-traintest.pickle', 'rb') as f:
    fold1 = pickle.load(f, encoding='latin1')
    
fold1

AttributeError: 'tuple' object has no attribute 'shape'

In [11]:
(train_X, train_y), (test_X, test_y) = fold1
train_X

array([[ 0.,  0.,  0., ...,  1.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  1.,  0.],
       [ 0.,  0.,  0., ...,  2.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  1.,  2.,  2.],
       [ 0.,  0.,  0., ...,  0.,  1.,  0.],
       [ 0.,  0.,  0., ...,  0.,  1.,  0.]])

In [12]:
train_X.shape

(47951, 20)

In [13]:
train_y.shape

(47951,)

In [14]:
test_X.shape

(5224, 20)

In [15]:
# Train and test linear regression classifier
clf = LinearRegression()

clf.fit(train_X, train_y)

pred = clf.predict(test_X)
pred = np.array([max(p, 0.0) for p in pred])
pred = np.array([min(p, 1.0) for p in pred])

# The root mean square error
math.sqrt(np.mean((pred - test_y) ** 2))

0.29211660839349146

## 0.5 baseline

In [16]:
guess = [0.5]*len(test_X)

# The root mean square error
print("rmse: {:2f}".format(math.sqrt(np.mean((guess - test_y) ** 2))))

rmse: 0.305373


# Split into folds based on conversation

In [14]:
# Load features
feats = pd.read_csv('/home/michael/school/research/wp/wikipedia/data/revert_discussion_features.csv')
len(feats)

2073

In [15]:
# Generate groups by conversation
convos = sorted(set(zip(feats['article'], feats['thread_title'])))

groups = []

for row in feats.itertuples():
    groups.append(convos.index((row.article, row.thread_title)))
    
gkf = GroupKFold(n_splits=10)
group_train_folds = []
group_test_folds = []

for train, test in gkf.split(feats, feats['editor_score'], groups=groups):
    group_train_folds.append(train)
    group_test_folds.append(test)

group_fold_inds = []
for i in range(len(feats)):
    for j in range(10):
        if i in group_test_folds[j]:
            fold = j
            break
    group_fold_inds.append(fold)

feats['group_fold'] = group_fold_inds

## OLD CV: Train and test non-unigram features

In [38]:
# Train and test linear regression classifier on non-unigram features (continuous editor_score input)
clf = LinearRegression()

nonbow_pred = cross_validation.cross_val_predict(clf, nonbow, feats['editor_score'], cv=10)
nonbow_pred = np.array([max(p, 0.0) for p in nonbow_pred])
nonbow_pred = np.array([min(p, 1.0) for p in nonbow_pred])

# The mean square error
print("mse: {:2f}".format(np.mean((nonbow_pred - feats['editor_score'].values) ** 2)))

# The root mean square error
print("rmse: {:2f}".format(math.sqrt(np.mean((nonbow_pred - feats['editor_score'].values) ** 2))))

mse: 0.131227
rmse: 0.362253


## Train and test editor+other features (all features)

In [51]:
# Train and test linear regression classifier on all features (continuous editor_score input)
clf = LinearRegression()

pred = cross_validation.cross_val_predict(clf, feats_v, feats['editor_score'], cv=10)

bounded_pred = np.array([max(p, 0.0) for p in pred])
bounded_pred = np.array([min(p, 1.0) for p in bounded_pred])

# The mean square error
print("mse: {:2f}".format(np.mean((bounded_pred - feats['editor_score'].values) ** 2)))
print("rmse: {:2f}".format(math.sqrt(np.mean((bounded_pred - feats['editor_score'].values) ** 2))))

mse: 0.242041
rmse: 0.491977


In [39]:
# Save predictions
feats['nonbow_pred'] = nonbow_pred
# feats['pred'] = bounded_pred
feats['edpred'] = edpred

feats.to_csv('/home/michael/school/research/wp/wikipedia/data/revert_discussion_features.csv', index=False)

In [4]:
# Train and test SVM classifier on non-unigram features
nonbow_clf = {}
for t in thresholds:
    print(t)
#     clf = svm.SVC(kernel='linear')
    clf = svm.SVC()

    scores = cross_validation.cross_val_score(clf, nonbow, labels[t], scoring=kappa_scorer)
    print("Kappa: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

    scores = cross_validation.cross_val_score(clf, nonbow, labels[t])
    print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
    
    # Majority class guess
    true_portion = np.count_nonzero(labels[t])/len(labels[t])
    print("Majority class guess:\t", max(true_portion, 1-true_portion))
    
    # Random class guess
    print("Random class guess:\t", true_portion**2 + (1-true_portion)**2)
    print()
    
    nonbow_clf[t] = clf

0.4
Kappa: 0.003 (+/- 0.003)
Accuracy: 0.653 (+/- 0.005)
Majority class guess:	 0.6594370096908168
Random class guess:	 0.5508403201182992

0.5
Kappa: -0.002 (+/- 0.019)
Accuracy: 0.572 (+/- 0.002)
Majority class guess:	 0.5897554222427319
Random class guess:	 0.5161120716439422

0.6
Kappa: -0.020 (+/- 0.070)
Accuracy: 0.513 (+/- 0.033)
Majority class guess:	 0.5366866635902169
Random class guess:	 0.5026918225707635

0.7
Kappa: 0.024 (+/- 0.033)
Accuracy: 0.524 (+/- 0.017)
Majority class guess:	 0.5279187817258884
Random class guess:	 0.5015589167461156

0.8
Kappa: 0.032 (+/- 0.024)
Accuracy: 0.578 (+/- 0.008)
Majority class guess:	 0.5911398246423627
Random class guess:	 0.5166129352716813

0.9
Kappa: 0.038 (+/- 0.011)
Accuracy: 0.671 (+/- 0.005)
Majority class guess:	 0.6797415782187356
Random class guess:	 0.5646140698811237



In [5]:
# Train and test SVM classifier on all features
for t in thresholds:
    print(t)
    clf = svm.SVC()

    scores = cross_validation.cross_val_score(clf, feats_v, labels[t], scoring=kappa_scorer)
    print("Kappa: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

    scores = cross_validation.cross_val_score(clf, feats_v, labels[t])
    print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
    
    # Majority class guess
    true_portion = np.count_nonzero(labels[t])/len(labels[t])
    print("Majority class guess:\t", max(true_portion, 1-true_portion))
    
    # Random class guess
    print("Random class guess:\t", true_portion**2 + (1-true_portion)**2)
    print()

0.4
Kappa: 0.003 (+/- 0.004)
Accuracy: 0.660 (+/- 0.002)
Majority class guess:	 0.6594370096908168
Random class guess:	 0.5508403201182992

0.5
Kappa: 0.001 (+/- 0.015)
Accuracy: 0.587 (+/- 0.012)
Majority class guess:	 0.5897554222427319
Random class guess:	 0.5161120716439422

0.6
Kappa: 0.020 (+/- 0.054)
Accuracy: 0.541 (+/- 0.028)
Majority class guess:	 0.5366866635902169
Random class guess:	 0.5026918225707635

0.7
Kappa: 0.011 (+/- 0.031)
Accuracy: 0.522 (+/- 0.011)
Majority class guess:	 0.5279187817258884
Random class guess:	 0.5015589167461156

0.8
Kappa: 0.000 (+/- 0.000)
Accuracy: 0.591 (+/- 0.001)
Majority class guess:	 0.5911398246423627
Random class guess:	 0.5166129352716813

0.9
Kappa: 0.000 (+/- 0.000)
Accuracy: 0.680 (+/- 0.001)
Majority class guess:	 0.6797415782187356
Random class guess:	 0.5646140698811237

