# Comparing Automated to Hand-Coding

This notebook demonstrates how we calculated the metrics to compare the three automated methods (k-means, structural topic modeling, and latent Dirichlet allocation) to the hand-coding.

In [1]:
import pandas
import numpy as np

## Clean and Merge Data

In [2]:
# Load the document-topic weights produced by the structural topic model (STM).
# As the display below shows, rows are keyed by ID and the topic columns are named X1 ... X60.
df_stm_weights = pandas.read_csv("../data/stm60_theta.csv")
df_stm_weights

Unnamed: 0,ID,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X51,X52,X53,X54,X55,X56,X57,X58,X59,X60
0,1,2.341945e-05,0.001378,0.000941,0.000007,0.000602,4.831264e-07,1.204299e-07,0.000676,0.000083,...,1.856308e-05,3.221664e-06,0.002125,0.000089,0.001433,0.044679,0.005791,0.000289,5.271752e-05,3.738774e-03
1,2,6.829184e-05,0.013254,0.000616,0.028261,0.001775,5.486996e-04,2.567636e-06,0.000106,0.001920,...,2.474175e-03,2.003198e-04,0.087323,0.000146,0.024863,0.003353,0.050871,0.000179,1.137347e-04,2.147234e-05
2,3,2.342587e-05,0.007105,0.000258,0.000089,0.001851,1.690230e-04,1.111551e-05,0.000595,0.004280,...,6.797081e-03,2.570716e-03,0.027197,0.000177,0.013944,0.003116,0.121088,0.002493,5.230997e-02,2.059647e-05
3,4,2.424631e-06,0.006878,0.000389,0.006655,0.000061,6.615226e-01,5.990955e-04,0.000024,0.050346,...,8.961000e-06,2.169935e-04,0.019494,0.000148,0.045764,0.004581,0.015037,0.000027,1.346899e-04,6.148988e-05
4,5,8.383395e-04,0.006929,0.000318,0.000632,0.000799,1.474560e-07,3.034300e-05,0.763867,0.000575,...,4.868603e-06,2.349363e-05,0.001645,0.000296,0.026244,0.000651,0.004754,0.000059,2.462613e-05,1.418729e-03
5,6,2.680471e-04,0.053765,0.000428,0.270446,0.000287,4.071572e-02,8.726789e-04,0.000025,0.000915,...,7.035714e-04,9.291804e-04,0.069713,0.000070,0.014133,0.000053,0.010638,0.000160,4.643222e-04,2.574566e-06
6,7,3.407387e-05,0.013620,0.000295,0.034157,0.000201,2.656494e-03,1.413918e-04,0.000054,0.057354,...,2.945997e-04,9.087999e-04,0.014534,0.000767,0.025149,0.000147,0.019970,0.000053,1.195428e-04,1.505522e-05
7,8,3.391536e-06,0.024237,0.000373,0.000480,0.002109,2.428715e-04,4.360878e-04,0.000064,0.125402,...,7.178140e-06,6.414210e-04,0.009749,0.000168,0.017996,0.000216,0.022493,0.000137,3.780026e-02,1.608806e-05
8,9,5.278743e-06,0.004208,0.000070,0.000272,0.004436,3.409456e-05,4.469850e-02,0.000096,0.001175,...,1.282440e-05,9.069087e-04,0.003004,0.000018,0.006226,0.000078,0.024995,0.000218,2.222631e-05,6.735110e-07
9,10,4.546685e-05,0.037961,0.000177,0.000968,0.001289,3.291870e-04,5.282983e-04,0.000294,0.001024,...,6.630799e-04,7.050510e-04,0.082330,0.000199,0.029315,0.000290,0.022278,0.000070,1.157906e-05,3.533475e-05


In [3]:
# Load the hand-coded labels.
# `code` is the numeric hand code and `code_label` its text description
# (in the rows shown below, 1 = "relinequality, explicit" and 5 = "irrelevant").
df_handcode = pandas.read_csv("../data/final_dataset.csv")
df_handcode

Unnamed: 0,ID,year,code,weight,code_label,id,binary1,binary2,three_code1,three_code2,four_code,five_code,binary3,three_code3,four_code2
0,1,1981,5,6.67,irrelevant,1,irrelevant,relchanges/releconomy/irrelevant,irrelevant,irrelevant,irrelevant,irrelevant,releconomy/irrelevant,releconomy/irrelevant,irrelevant
1,2,1982,3,10.00,relchanges (income/wages/earnings/etc.),2,explicit/implicit/relchanges/releconomy,relchanges/releconomy/irrelevant,implicit/relchanges/releconomy,relchanges/releconomy,relchanges/releconomy,relchanges,explicit/implicit/relchanges,relchanges,relchanges
2,3,1984,1,6.67,"relinequality, explicit",3,explicit/implicit/relchanges/releconomy,explicit/implicit,explicit,explicit/implicit,explicit,explicit,explicit/implicit/relchanges,explicit/implicit,explicit/implicit
3,4,1995,5,6.67,irrelevant,4,irrelevant,relchanges/releconomy/irrelevant,irrelevant,irrelevant,irrelevant,irrelevant,releconomy/irrelevant,releconomy/irrelevant,irrelevant
4,5,1991,5,6.67,irrelevant,5,irrelevant,relchanges/releconomy/irrelevant,irrelevant,irrelevant,irrelevant,irrelevant,releconomy/irrelevant,releconomy/irrelevant,irrelevant
5,6,2004,5,6.67,irrelevant,6,irrelevant,relchanges/releconomy/irrelevant,irrelevant,irrelevant,irrelevant,irrelevant,releconomy/irrelevant,releconomy/irrelevant,irrelevant
6,7,2012,1,6.67,"relinequality, explicit",7,explicit/implicit/relchanges/releconomy,explicit/implicit,explicit,explicit/implicit,explicit,explicit,explicit/implicit/relchanges,explicit/implicit,explicit/implicit
7,8,2010,3,6.67,relchanges (income/wages/earnings/etc.),8,explicit/implicit/relchanges/releconomy,relchanges/releconomy/irrelevant,implicit/relchanges/releconomy,relchanges/releconomy,relchanges/releconomy,relchanges,explicit/implicit/relchanges,relchanges,relchanges
8,9,2000,5,6.67,irrelevant,9,irrelevant,relchanges/releconomy/irrelevant,irrelevant,irrelevant,irrelevant,irrelevant,releconomy/irrelevant,releconomy/irrelevant,irrelevant
9,10,2002,5,6.67,irrelevant,10,irrelevant,relchanges/releconomy/irrelevant,irrelevant,irrelevant,irrelevant,irrelevant,releconomy/irrelevant,releconomy/irrelevant,irrelevant


In [4]:
# Join the hand-coded labels to the STM topic weights on the shared ID column.
# DataFrame.merge defaults to an inner join, so only IDs present in both frames are kept.
df_stm = df_handcode.merge(df_stm_weights, on = 'ID')
df_stm

Unnamed: 0,ID,year,code,weight,code_label,id,binary1,binary2,three_code1,three_code2,...,X51,X52,X53,X54,X55,X56,X57,X58,X59,X60
0,1,1981,5,6.67,irrelevant,1,irrelevant,relchanges/releconomy/irrelevant,irrelevant,irrelevant,...,1.856308e-05,3.221664e-06,0.002125,0.000089,0.001433,0.044679,0.005791,0.000289,5.271752e-05,3.738774e-03
1,2,1982,3,10.00,relchanges (income/wages/earnings/etc.),2,explicit/implicit/relchanges/releconomy,relchanges/releconomy/irrelevant,implicit/relchanges/releconomy,relchanges/releconomy,...,2.474175e-03,2.003198e-04,0.087323,0.000146,0.024863,0.003353,0.050871,0.000179,1.137347e-04,2.147234e-05
2,3,1984,1,6.67,"relinequality, explicit",3,explicit/implicit/relchanges/releconomy,explicit/implicit,explicit,explicit/implicit,...,6.797081e-03,2.570716e-03,0.027197,0.000177,0.013944,0.003116,0.121088,0.002493,5.230997e-02,2.059647e-05
3,4,1995,5,6.67,irrelevant,4,irrelevant,relchanges/releconomy/irrelevant,irrelevant,irrelevant,...,8.961000e-06,2.169935e-04,0.019494,0.000148,0.045764,0.004581,0.015037,0.000027,1.346899e-04,6.148988e-05
4,5,1991,5,6.67,irrelevant,5,irrelevant,relchanges/releconomy/irrelevant,irrelevant,irrelevant,...,4.868603e-06,2.349363e-05,0.001645,0.000296,0.026244,0.000651,0.004754,0.000059,2.462613e-05,1.418729e-03
5,6,2004,5,6.67,irrelevant,6,irrelevant,relchanges/releconomy/irrelevant,irrelevant,irrelevant,...,7.035714e-04,9.291804e-04,0.069713,0.000070,0.014133,0.000053,0.010638,0.000160,4.643222e-04,2.574566e-06
6,7,2012,1,6.67,"relinequality, explicit",7,explicit/implicit/relchanges/releconomy,explicit/implicit,explicit,explicit/implicit,...,2.945997e-04,9.087999e-04,0.014534,0.000767,0.025149,0.000147,0.019970,0.000053,1.195428e-04,1.505522e-05
7,8,2010,3,6.67,relchanges (income/wages/earnings/etc.),8,explicit/implicit/relchanges/releconomy,relchanges/releconomy/irrelevant,implicit/relchanges/releconomy,relchanges/releconomy,...,7.178140e-06,6.414210e-04,0.009749,0.000168,0.017996,0.000216,0.022493,0.000137,3.780026e-02,1.608806e-05
8,9,2000,5,6.67,irrelevant,9,irrelevant,relchanges/releconomy/irrelevant,irrelevant,irrelevant,...,1.282440e-05,9.069087e-04,0.003004,0.000018,0.006226,0.000078,0.024995,0.000218,2.222631e-05,6.735110e-07
9,10,2002,5,6.67,irrelevant,10,irrelevant,relchanges/releconomy/irrelevant,irrelevant,irrelevant,...,6.630799e-04,7.050510e-04,0.082330,0.000199,0.029315,0.000290,0.022278,0.000070,1.157906e-05,3.533475e-05


In [5]:
# Topic (cluster) 47 is the inequality topic in the STM solution.
# See D-stm60_terms.csv for the top weighted words in topic 47.

# Restrict to the articles hand-coded as irrelevant (code == 5) and show how
# their topic-47 weight is distributed.
df_stm_irrel = df_stm.loc[df_stm['code'] == 5]
print(np.percentile(df_stm_irrel['X47'], [5, 25, 50, 75, 95, 99]))

# The cutoff is the 95th percentile of the topic-47 weight among irrelevant articles.
min_weight = np.percentile(df_stm_irrel['X47'], 95)
min_weight

[  4.25618742e-06   4.15189174e-05   1.66601500e-04   6.14198450e-04
   5.13488638e-03   9.71488439e-02]


0.0051348863799999976

In [6]:
# Flag each article: 1 if its topic-47 weight is greater than or equal to the
# cutoff computed above, otherwise 0.
# NOTE(review): this comparison is inclusive (>=) while the LDA flag uses a strict >;
# confirm which of the two is intended.
df_stm['inequal_stm'] = df_stm['X47'].ge(min_weight).astype(int)
df_stm['inequal_stm']

0       0
1       0
2       0
3       0
4       0
5       0
6       0
7       0
8       0
9       0
10      0
11      0
12      0
13      0
14      1
15      0
16      0
17      0
18      0
19      0
20      0
21      0
22      0
23      0
24      0
25      0
26      1
27      0
28      0
29      0
       ..
1223    0
1224    0
1225    0
1226    1
1227    0
1228    0
1229    0
1230    0
1231    0
1232    0
1233    0
1234    0
1235    1
1236    0
1237    0
1238    0
1239    0
1240    0
1241    0
1242    0
1243    0
1244    0
1245    0
1246    1
1247    0
1248    1
1249    0
1250    0
1251    0
1252    0
Name: inequal_stm, Length: 1253, dtype: int64

In [7]:
# Drop the topic-weight columns (and the alternative coding-scheme columns) now that
# the STM flag has been derived; keep only the identifiers, the hand-coding fields
# and the new inequal_stm indicator.
col_keep_stm = ['ID', 'code', 'weight', 'id', 'year', 'code_label', 'inequal_stm']
df_stm = df_stm[col_keep_stm]
df_stm

Unnamed: 0,ID,code,weight,id,year,code_label,inequal_stm
0,1,5,6.67,1,1981,irrelevant,0
1,2,3,10.00,2,1982,relchanges (income/wages/earnings/etc.),0
2,3,1,6.67,3,1984,"relinequality, explicit",0
3,4,5,6.67,4,1995,irrelevant,0
4,5,5,6.67,5,1991,irrelevant,0
5,6,5,6.67,6,2004,irrelevant,0
6,7,1,6.67,7,2012,"relinequality, explicit",0
7,8,3,6.67,8,2010,relchanges (income/wages/earnings/etc.),0
8,9,5,6.67,9,2000,irrelevant,0
9,10,5,6.67,10,2002,irrelevant,0


In [8]:
# Number of articles flagged by STM: summing the 0/1 indicator counts the 1s.
df_stm['inequal_stm'].sum()

190

In [9]:
# Repeat the procedure for LDA: load the LDA document-topic weights.
# index_col=0 makes the first CSV column the row index instead of a data column.
df_lda_weights = pandas.read_csv("../data/lda60_docdist.csv", index_col=0)
df_lda_weights

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X52,X53,X54,X55,X56,X57,X58,X59,X60,ID
0,0.000073,0.000073,0.173562,0.000073,0.000073,0.000073,0.000073,0.000073,0.000073,0.000073,...,0.000073,0.000073,0.000073,0.000073,0.000073,0.000073,0.110753,0.000073,0.000073,1
1,0.000090,0.000090,0.000090,0.026736,0.255625,0.000090,0.000090,0.000090,0.000090,0.000090,...,0.000090,0.000090,0.000090,0.000090,0.000090,0.000090,0.000090,0.000090,0.000090,2
2,0.081265,0.000068,0.155727,0.000068,0.152677,0.079250,0.000068,0.000068,0.000068,0.000068,...,0.000068,0.114182,0.000068,0.000068,0.000068,0.000068,0.000068,0.000068,0.000068,3
3,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.000086,0.265232,0.150820,...,0.027925,0.000086,0.000086,0.000086,0.000086,0.000086,0.251545,0.000086,0.000086,4
4,0.000071,0.000071,0.000071,0.000071,0.000071,0.000071,0.000071,0.000071,0.000071,0.164997,...,0.000071,0.000071,0.000071,0.000071,0.000071,0.000071,0.000071,0.000071,0.000071,5
5,0.000141,0.000141,0.000141,0.000141,0.000141,0.000141,0.271891,0.000141,0.000141,0.000141,...,0.000141,0.000141,0.000141,0.000141,0.000141,0.000141,0.000141,0.000141,0.000141,6
6,0.103381,0.000029,0.000029,0.146883,0.000029,0.043144,0.000029,0.011976,0.000029,0.059434,...,0.000029,0.156911,0.000029,0.000029,0.000029,0.062325,0.163703,0.000029,0.000029,7
7,0.000139,0.000139,0.000139,0.000139,0.000139,0.000139,0.000139,0.000139,0.000139,0.053403,...,0.000139,0.000139,0.000139,0.000139,0.000139,0.000139,0.000139,0.000139,0.000139,8
8,0.000080,0.000080,0.000080,0.000080,0.000080,0.600472,0.000080,0.000080,0.000080,0.000080,...,0.058009,0.000080,0.000080,0.000080,0.000080,0.000080,0.000080,0.000080,0.037688,9
9,0.009922,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039,0.136594,...,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039,0.036091,10


In [10]:
# Attach the LDA topic weights to the frame that already holds the hand codes and
# the STM flag, matching rows on ID (inner join by default).
df_lda = df_stm.merge(df_lda_weights, on = 'ID')
df_lda

Unnamed: 0,ID,code,weight,id,year,code_label,inequal_stm,X1,X2,X3,...,X51,X52,X53,X54,X55,X56,X57,X58,X59,X60
0,1,5,6.67,1,1981,irrelevant,0,0.000073,0.000073,0.173562,...,0.000073,0.000073,0.000073,0.000073,0.000073,0.000073,0.000073,0.110753,0.000073,0.000073
1,2,3,10.00,2,1982,relchanges (income/wages/earnings/etc.),0,0.000090,0.000090,0.000090,...,0.000090,0.000090,0.000090,0.000090,0.000090,0.000090,0.000090,0.000090,0.000090,0.000090
2,3,1,6.67,3,1984,"relinequality, explicit",0,0.081265,0.000068,0.155727,...,0.000068,0.000068,0.114182,0.000068,0.000068,0.000068,0.000068,0.000068,0.000068,0.000068
3,4,5,6.67,4,1995,irrelevant,0,0.000086,0.000086,0.000086,...,0.000086,0.027925,0.000086,0.000086,0.000086,0.000086,0.000086,0.251545,0.000086,0.000086
4,5,5,6.67,5,1991,irrelevant,0,0.000071,0.000071,0.000071,...,0.000071,0.000071,0.000071,0.000071,0.000071,0.000071,0.000071,0.000071,0.000071,0.000071
5,6,5,6.67,6,2004,irrelevant,0,0.000141,0.000141,0.000141,...,0.000141,0.000141,0.000141,0.000141,0.000141,0.000141,0.000141,0.000141,0.000141,0.000141
6,7,1,6.67,7,2012,"relinequality, explicit",0,0.103381,0.000029,0.000029,...,0.000029,0.000029,0.156911,0.000029,0.000029,0.000029,0.062325,0.163703,0.000029,0.000029
7,8,3,6.67,8,2010,relchanges (income/wages/earnings/etc.),0,0.000139,0.000139,0.000139,...,0.000139,0.000139,0.000139,0.000139,0.000139,0.000139,0.000139,0.000139,0.000139,0.000139
8,9,5,6.67,9,2000,irrelevant,0,0.000080,0.000080,0.000080,...,0.000080,0.058009,0.000080,0.000080,0.000080,0.000080,0.000080,0.000080,0.000080,0.037688
9,10,5,6.67,10,2002,irrelevant,0,0.009922,0.000039,0.000039,...,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039,0.000039,0.036091


In [11]:
# Topic (cluster) 40 is the inequality topic in the LDA solution.
# See E-lda60_terms.csv for the top weighted words in topic 40.

# Restrict to the articles hand-coded as irrelevant (code == 5) and show how
# their topic-40 weight is distributed.
df_lda_irrel = df_lda.loc[df_lda['code'] == 5]
print(np.percentile(df_lda_irrel['X40'], [5, 25, 50, 75, 95, 99]))

# The cutoff is the 95th percentile of the topic-40 weight among irrelevant articles.
min_weight_lda = np.percentile(df_lda_irrel['X40'], 95)
min_weight_lda

[  2.81621354e-05   6.21715399e-05   8.75819607e-05   1.31028669e-04
   2.07051357e-02   1.51161621e-01]


0.020705135743183667

In [12]:
# Flag each article: 1 if its topic-40 weight is strictly greater than the
# LDA cutoff computed above, otherwise 0.
df_lda['inequal_lda'] = df_lda['X40'].gt(min_weight_lda).astype(int)
df_lda['inequal_lda']

0       0
1       0
2       0
3       0
4       0
5       0
6       0
7       0
8       0
9       0
10      0
11      0
12      1
13      0
14      0
15      0
16      0
17      1
18      0
19      0
20      0
21      0
22      0
23      0
24      0
25      0
26      1
27      0
28      0
29      0
       ..
1223    0
1224    0
1225    0
1226    1
1227    0
1228    0
1229    0
1230    0
1231    0
1232    0
1233    0
1234    0
1235    0
1236    0
1237    0
1238    0
1239    0
1240    0
1241    0
1242    0
1243    0
1244    0
1245    0
1246    1
1247    0
1248    1
1249    1
1250    0
1251    0
1252    0
Name: inequal_lda, Length: 1253, dtype: int64

In [13]:
# Drop the LDA topic-weight columns, keeping the identifiers, the hand-coding fields
# and both method flags; then count the articles flagged by LDA.
col_keep_lda = ['ID', 'code', 'weight', 'id', 'year', 'code_label', 'inequal_stm', 'inequal_lda']
df_lda = df_lda[col_keep_lda]
df_lda['inequal_lda'].sum()

150

In [14]:
# Repeat for k-means. Here the flag is based on the cluster label assigned to each
# article rather than on a weight threshold.

# The k-means file is read as tab-separated (sep = '\t').
df_kmeans = pandas.read_csv("../data/kmeans30.csv", sep = '\t')
df_kmeans

Unnamed: 0,ID,clusters
0,1,1
1,2,23
2,3,15
3,4,7
4,5,7
5,6,24
6,7,2
7,8,8
8,9,7
9,10,27


In [15]:
# Cluster 4 is the inequality cluster.
# See C-kmeans30_terms.csv for a list of the most frequent words per cluster.

# 1 when the article was assigned to cluster 4, otherwise 0.
df_kmeans['inequal_kmeans'] = df_kmeans['clusters'].eq(4).astype(int)
df_kmeans

Unnamed: 0,ID,clusters,inequal_kmeans
0,1,1,0
1,2,23,0
2,3,15,0
3,4,7,0
4,5,7,0
5,6,24,0
6,7,2,0
7,8,8,0
8,9,7,0
9,10,27,0


In [16]:
# Drop the raw cluster label, keeping only the ID and the 0/1 k-means flag;
# then count the articles flagged by k-means.
col_keep_kmeans = ['ID', 'inequal_kmeans']
df_kmeans = df_kmeans[col_keep_kmeans]
df_kmeans['inequal_kmeans'].sum()

42

In [17]:
# Build the master frame: join the k-means flag onto the frame holding the hand codes
# plus the STM and LDA flags, matching on ID (inner join by default).
df = df_kmeans.merge(df_lda, on="ID")
df

Unnamed: 0,ID,inequal_kmeans,code,weight,id,year,code_label,inequal_stm,inequal_lda
0,1,0,5,6.67,1,1981,irrelevant,0,0
1,2,0,3,10.00,2,1982,relchanges (income/wages/earnings/etc.),0,0
2,3,0,1,6.67,3,1984,"relinequality, explicit",0,0
3,4,0,5,6.67,4,1995,irrelevant,0,0
4,5,0,5,6.67,5,1991,irrelevant,0,0
5,6,0,5,6.67,6,2004,irrelevant,0,0
6,7,0,1,6.67,7,2012,"relinequality, explicit",0,0
7,8,0,3,6.67,8,2010,relchanges (income/wages/earnings/etc.),0,0
8,9,0,5,6.67,9,2000,irrelevant,0,0
9,10,0,5,6.67,10,2002,irrelevant,0,0


## Compute Metrics, replicating Table 1

### STM

In [18]:
# Coding Scheme D, Inequality/Relevant category, STM.
# The positive class is "hand-coded as explicit inequality" (code == 1).
coded_inequal = df['code'] == 1
coded_other = df['code'] != 1
stm_yes = df['inequal_stm'] == 1
stm_no = df['inequal_stm'] == 0

# Confusion-matrix cells, kept as sub-frames of the master frame.
tp_rel_stm = df[coded_inequal & stm_yes]
tn_rel_stm = df[coded_other & stm_no]
fp_rel_stm = df[coded_other & stm_yes]
fn_rel_stm = df[coded_inequal & stm_no]

# Cell sizes as floats so the ratios below are floating-point.
trupos_stm_rel = float(tp_rel_stm.shape[0])
truneg_stm_rel = float(tn_rel_stm.shape[0])
falpos_stm_rel = float(fp_rel_stm.shape[0])
falneg_stm_rel = float(fn_rel_stm.shape[0])

# precision = TP / (TP + FP); recall = TP / (TP + FN)
precision_stm_inequal = trupos_stm_rel/(trupos_stm_rel + falpos_stm_rel)
recall_stm_inequal = trupos_stm_rel/(trupos_stm_rel + falneg_stm_rel)

# F1 is the harmonic mean of precision and recall.
f1_stm_inequal = 2*((precision_stm_inequal * recall_stm_inequal)/(precision_stm_inequal + recall_stm_inequal))

print("Number of articles hand-classified as inquality: %d" % coded_inequal.sum())
print("Number of articles classified as inquality via STM: %d" % stm_yes.sum())
print("Precision Inequality: %f" % precision_stm_inequal)
print("Recall Inequality: %f" % recall_stm_inequal)
print("F1 Inequality: %f" % f1_stm_inequal)

Number of articles hand-classified as inquality: 264
Number of articles classified as inquality via STM: 190
Precision Inequality: 0.631579
Recall Inequality: 0.454545
F1 Inequality: 0.528634


In [19]:
# Coding Scheme D, Not Inequality/Irrelevant category, STM.
# The positive class is now "not hand-coded as explicit inequality" (code != 1),
# so the roles of the confusion-matrix cells are swapped relative to the inequality class.
coded_inequal = df['code'] == 1
coded_other = df['code'] != 1
stm_yes = df['inequal_stm'] == 1
stm_no = df['inequal_stm'] == 0

# Confusion-matrix cells, kept as sub-frames of the master frame.
tp_irrel_stm = df[coded_other & stm_no]
tn_irrel_stm = df[coded_inequal & stm_yes]
fp_irrel_stm = df[coded_inequal & stm_no]
fn_irrel_stm = df[coded_other & stm_yes]

# Cell sizes as floats so the ratios below are floating-point.
trupos_stm_irrel = float(tp_irrel_stm.shape[0])
truneg_stm_irrel = float(tn_irrel_stm.shape[0])
falpos_stm_irrel = float(fp_irrel_stm.shape[0])
falneg_stm_irrel = float(fn_irrel_stm.shape[0])

# precision = TP / (TP + FP); recall = TP / (TP + FN)
precision_stm_irrel = trupos_stm_irrel/(trupos_stm_irrel + falpos_stm_irrel)
recall_stm_irrel = trupos_stm_irrel/(trupos_stm_irrel + falneg_stm_irrel)

# F1 is the harmonic mean of precision and recall.
f1_stm_irrel = 2*((precision_stm_irrel * recall_stm_irrel)/(precision_stm_irrel + recall_stm_irrel))

print("Number of articles hand-classified as irrelevant: %d" % coded_other.sum())
print("Number of articles classified as irrelevant via STM: %d" % (df['inequal_stm'] != 1).sum())
print("Precision Irrelevant: %f" % precision_stm_irrel)
print("Recall Irrelevant: %f" % recall_stm_irrel)
print("F1 Irrelevant: %f" % f1_stm_irrel)

Number of articles hand-classified as irrelevant: 989
Number of articles classified as irrelevant via STM: 1063
Precision Irrelevant: 0.864534
Recall Irrelevant: 0.929221
F1 Irrelevant: 0.895712


In [20]:
# Overall metrics for Coding Scheme D, STM.
# Each per-class metric is weighted by that class's support (its share of the
# hand-coded articles) and the two weighted values are added.
# A support-weighted F1 is not guaranteed to lie between the weighted precision and recall.

# Support shares of the two hand-coded classes.
n_articles = len(df.index)
share_inequal = len(df[df['code']==1].index)/n_articles
share_irrel = len(df[df['code']!=1].index)/n_articles

precision_stm_total = (precision_stm_inequal * share_inequal) + (precision_stm_irrel * share_irrel)

recall_stm_total = (recall_stm_inequal * share_inequal) + (recall_stm_irrel * share_irrel)

f1_stm_total = (f1_stm_inequal * share_inequal) + (f1_stm_irrel * share_irrel)

print("Total number of articles: %d" % n_articles)
print("Precision Total: %f" % precision_stm_total)
print("Recall Total: %f" % recall_stm_total)
print("F1 Total: %f" % f1_stm_total)

Total number of articles: 1253
Precision Total: 0.815452
Recall Total: 0.829210
F1 Total: 0.818370


### LDA (not included in Table 1)

In [29]:
# Coding Scheme D, Inequality/Relevant category, LDA.
# The positive class is "hand-coded as explicit inequality" (code == 1).
coded_inequal = df['code'] == 1
coded_other = df['code'] != 1
lda_yes = df['inequal_lda'] == 1
lda_no = df['inequal_lda'] == 0

# Confusion-matrix cells, kept as sub-frames of the master frame.
tp_lda_rel = df[coded_inequal & lda_yes]
tn_lda_rel = df[coded_other & lda_no]
fp_lda_rel = df[coded_other & lda_yes]
fn_lda_rel = df[coded_inequal & lda_no]

# Cell sizes as floats so the ratios below are floating-point.
trupos_lda_rel = float(tp_lda_rel.shape[0])
truneg_lda_rel = float(tn_lda_rel.shape[0])
falpos_lda_rel = float(fp_lda_rel.shape[0])
falsneg_lda_rel = float(fn_lda_rel.shape[0])

# precision = TP / (TP + FP); recall = TP / (TP + FN)
precision_lda_inequal = trupos_lda_rel/(trupos_lda_rel + falpos_lda_rel)
recall_lda_inequal = trupos_lda_rel/(trupos_lda_rel + falsneg_lda_rel)

# F1 is the harmonic mean of precision and recall.
f1_lda_inequal = 2*((precision_lda_inequal * recall_lda_inequal)/(precision_lda_inequal + recall_lda_inequal))

print("Number of articles hand-classified as inquality: %d" % coded_inequal.sum())
print("Number of articles classified as inquality via LDA: %d" % lda_yes.sum())
print("Precision Inequality: %f" % precision_lda_inequal)
print("Recall Inequality: %f" % recall_lda_inequal)
print("F1 Inequality: %f" % f1_lda_inequal)

Number of articles hand-classified as inquality: 264
Number of articles classified as inquality via LDA: 150
Precision Inequality: 0.713333
Recall Inequality: 0.405303
F1 Inequality: 0.516908


In [28]:
# Coding Scheme D, Not Inequality/Irrelevant category, LDA.
# The positive class here is "not hand-coded as explicit inequality" (code != 1),
# so the roles of the confusion-matrix cells are swapped relative to the inequality class.
tp_lda_irrel = df[(df['code']!=1) & (df['inequal_lda']==0)]
tn_lda_irrel = df[(df['code']==1) & (df['inequal_lda']==1)]
fp_lda_irrel = df[(df['code']==1) & (df['inequal_lda']==0)]
fn_lda_irrel = df[(df['code']!=1) & (df['inequal_lda']==1)]

# Cell sizes as floats so the ratios below are floating-point.
trupos_lda_irrel = float(len(tp_lda_irrel.index))
truneg_lda_irrel = float(len(tn_lda_irrel.index))
falpos_lda_irrel = float(len(fp_lda_irrel.index))
falsneg_lda_irrel = float(len(fn_lda_irrel.index))

# precision = TP / (TP + FP); recall = TP / (TP + FN)
precision_lda_irrel = trupos_lda_irrel/(trupos_lda_irrel + falpos_lda_irrel)
recall_lda_irrel = trupos_lda_irrel/(trupos_lda_irrel + falsneg_lda_irrel)

# F1 is the harmonic mean of precision and recall.
f1_lda_irrel = 2*((precision_lda_irrel * recall_lda_irrel)/(precision_lda_irrel + recall_lda_irrel))

print("Number of articles hand-classified as not inquality: %d" % len(df[df['code']!=1].index))
print("Number of articles classified as not inquality via LDA: %d" % len(df[df['inequal_lda']!=1].index))
# These are the metrics of the not-inequality class, so they are labelled "Irrelevant"
# (as in the STM and k-means cells) rather than "Inequality".
print("Precision Irrelevant: %f" % precision_lda_irrel)
print("Recall Irrelevant: %f" % recall_lda_irrel)
print("F1 Irrelevant: %f" % f1_lda_irrel)

Number of articles hand-classified as not inquality: 989
Number of articles classified as not inquality via LDA: 1103
Precision Inequality: 0.857661
Recall Inequality: 0.956522
F1 Inequality: 0.904398


### K-Means

Reproduces k-means row in Table 1

In [26]:
# Coding Scheme D, Inequality/Relevant category, k-means.
# The positive class is "hand-coded as explicit inequality" (code == 1).
coded_inequal = df['code'] == 1
coded_other = df['code'] != 1
kmeans_yes = df['inequal_kmeans'] == 1
kmeans_no = df['inequal_kmeans'] == 0

# Confusion-matrix cells, kept as sub-frames of the master frame.
tp_kmeans_rel = df[coded_inequal & kmeans_yes]
tn_kmeans_rel = df[coded_other & kmeans_no]
fp_kmeans_rel = df[coded_other & kmeans_yes]
fn_kmeans_rel = df[coded_inequal & kmeans_no]

# Cell sizes as floats so the ratios below are floating-point.
trupos_kmeans_rel = float(tp_kmeans_rel.shape[0])
truneg_kmeans_rel = float(tn_kmeans_rel.shape[0])
falpos_kmeans_rel = float(fp_kmeans_rel.shape[0])
falneg_kmeans_rel = float(fn_kmeans_rel.shape[0])

# precision = TP / (TP + FP); recall = TP / (TP + FN)
precision_kmeans_inequal = trupos_kmeans_rel/(trupos_kmeans_rel + falpos_kmeans_rel)
recall_kmeans_inequal = trupos_kmeans_rel/(trupos_kmeans_rel + falneg_kmeans_rel)

# F1 is the harmonic mean of precision and recall.
f1_kmeans_inequal = 2*((precision_kmeans_inequal * recall_kmeans_inequal)/(precision_kmeans_inequal + recall_kmeans_inequal))

print("Number of articles hand-classified as inquality: %d" % coded_inequal.sum())
print("Number of articles classified as inquality via k-means: %d" % df['inequal_kmeans'].sum())
print("Precision Inequality: %f" % precision_kmeans_inequal)
print("Recall Inequality: %f" % recall_kmeans_inequal)
print("F1 Inequality: %f" % f1_kmeans_inequal)

Number of articles hand-classified as inquality: 264
Number of articles classified as inquality via k-means: 42
Precision Inequality: 0.880952
Recall Inequality: 0.140152
F1 Inequality: 0.241830


In [27]:
# Coding Scheme D, Not Inequality/Irrelevant category, k-means.
# The positive class is now "not hand-coded as explicit inequality" (code != 1),
# so the roles of the confusion-matrix cells are swapped relative to the inequality class.
coded_inequal = df['code'] == 1
coded_other = df['code'] != 1
kmeans_yes = df['inequal_kmeans'] == 1
kmeans_no = df['inequal_kmeans'] == 0

# Confusion-matrix cells, kept as sub-frames of the master frame.
tp_kmeans_irrel = df[coded_other & kmeans_no]
tn_kmeans_irrel = df[coded_inequal & kmeans_yes]
fp_kmeans_irrel = df[coded_inequal & kmeans_no]
fn_kmeans_irrel = df[coded_other & kmeans_yes]

# Cell sizes as floats so the ratios below are floating-point.
trupos_kmeans_irrel = float(tp_kmeans_irrel.shape[0])
truneg_kmeans_irrel = float(tn_kmeans_irrel.shape[0])
falpos_kmeans_irrel = float(fp_kmeans_irrel.shape[0])
falneg_kmeans_irrel = float(fn_kmeans_irrel.shape[0])

# precision = TP / (TP + FP); recall = TP / (TP + FN)
precision_kmeans_irrel = trupos_kmeans_irrel/(trupos_kmeans_irrel + falpos_kmeans_irrel)
recall_kmeans_irrel = trupos_kmeans_irrel/(trupos_kmeans_irrel + falneg_kmeans_irrel)

# F1 is the harmonic mean of precision and recall.
f1_kmeans_irrel = 2*((precision_kmeans_irrel * recall_kmeans_irrel)/(precision_kmeans_irrel + recall_kmeans_irrel))

print("Number of articles hand-classified as not inquality: %d" % coded_other.sum())
print("Number of articles classified as not inquality via k-means: %d" % (df['inequal_kmeans'] != 1).sum())
print("Precision Irrelevant: %f" % precision_kmeans_irrel)
print("Recall Irrelevant: %f" % recall_kmeans_irrel)
print("F1 Irrelevant: %f" % f1_kmeans_irrel)

Number of articles hand-classified as not inquality: 989
Number of articles classified as not inquality via k-means: 1211
Precision Irrelevant: 0.812552
Recall Irrelevant: 0.994944
F1 Irrelevant: 0.894545


In [25]:
# Overall metrics for Coding Scheme D, k-means.
# Each per-class metric is weighted by that class's support (its share of the
# hand-coded articles) and the two weighted values are added.
# A support-weighted F1 is not guaranteed to lie between the weighted precision and recall.

# Support shares of the two hand-coded classes.
n_articles = len(df.index)
share_inequal = len(df[df['code']==1].index)/n_articles
share_irrel = len(df[df['code']!=1].index)/n_articles

precision_kmeans_total = (precision_kmeans_inequal * share_inequal) + (precision_kmeans_irrel * share_irrel)

recall_kmeans_total = (recall_kmeans_inequal * share_inequal) + (recall_kmeans_irrel * share_irrel)

f1_kmeans_total = (f1_kmeans_inequal * share_inequal) + (f1_kmeans_irrel * share_irrel)

print("Total number of articles: %d" % n_articles)
print("Precision Total: %f" % precision_kmeans_total)
print("Recall Total: %f" % recall_kmeans_total)
print("F1 Total: %f" % f1_kmeans_total)

Total number of articles: 1253
Precision Total: 0.826963
Recall Total: 0.814844
F1 Total: 0.757022
