## Counting files and words

In [4]:
# Preview the `*gold_conll` files found one directory level below
# ../data/conll-2012-flat (`head` prints the first ten lines by default).
!ls ../data/conll-2012-flat/*/*gold_conll | head

../data/conll-2012-flat/dev/bc_cctv_0000.v4_gold_conll
../data/conll-2012-flat/dev/bc_cnn_0000.v4_gold_conll
../data/conll-2012-flat/dev/bc_msnbc_0000.v4_gold_conll
../data/conll-2012-flat/dev/bc_phoenix_0000.v4_gold_conll
../data/conll-2012-flat/dev/bn_abc_0010.v4_gold_conll
../data/conll-2012-flat/dev/bn_abc_0020.v4_gold_conll
../data/conll-2012-flat/dev/bn_abc_0030.v4_gold_conll
../data/conll-2012-flat/dev/bn_abc_0040.v4_gold_conll
../data/conll-2012-flat/dev/bn_abc_0050.v4_gold_conll
../data/conll-2012-flat/dev/bn_abc_0060.v4_gold_conll


Number of documents:

In [3]:
# Count the matching files; this treats each `*gold_conll` file as one document.
!ls ../data/conll-2012-flat/*/*gold_conll | wc -l

    2385


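Number of lines (an upper bound on the number of tokens, since `wc -l` also counts non-token lines such as `#begin document` markers and blank sentence separators):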

In [1]:
# NOTE(review): `wc -l` counts every line of the concatenated files, not only
# token rows. Presumably the total also includes `#begin document` /
# `#end document` markers and blank sentence separators; confirm before
# quoting this figure as an exact token count.
!cat ../data/conll-2012-flat/*/*gold_conll | wc -l

 1913004


## Discover the genres

In [47]:
import os 
from glob import glob
import re

def extract_genres(fnames):
    """Return the set of genre codes found in CoNLL file names.

    The genre code is the run of lowercase letters that precedes the first
    underscore at the start of the name (``bc`` in
    ``bc_cctv_0000.v4_gold_conll``).

    Args:
        fnames: iterable of file base names (no directory part).

    Returns:
        A set of genre code strings. Names that do not start with
        ``<lowercase letters>_`` are skipped rather than raising
        ``AttributeError`` on a failed match.
    """
    matches = (re.match(r'([a-z]+)_', fname) for fname in fnames)
    return {m.group(1) for m in matches if m}


paths = glob('../data/conll-2012-flat/test/*_conll')
fnames = [os.path.basename(path) for path in paths]
genres = extract_genres(fnames)
# Sort before printing: iteration order of a set of strings can differ from
# one kernel session to the next, which made the printed list unstable.
print('Found %d genres: %s' %(len(genres), ' '.join(sorted(genres))))

Found 7 genres: bc tc wb bn nw mz pt


## Measure performance of Stanford Sieve on different genres

In [40]:
%%sh
# Start from an empty results directory so that files left over from an
# earlier run cannot be mistaken for fresh output.
rm -rf ../output/stats_conll_2012
# -p also creates ../output when it does not exist yet; plain mkdir would fail.
mkdir -p ../output/stats_conll_2012

In [45]:
%%time
%%sh

# Every relative path used below (jars, properties file, ../output, ../data)
# is resolved from the CoreNLP checkout, so stop here if it is missing
# instead of running java from the wrong directory.
cd ../CoreNLP || exit 1

# Classpath: all jars in the checkout root, target/ and lib/, joined with ':'.
JARS=$(echo *.jar target/*.jar lib/*.jar | tr ' ' ':')

# Run the Stanford sieve (dcoref) system on one genre's test file and have it
# scored with the CoNLL scorer script.
#   $1 - genre code (e.g. bc); selects the input under
#        ../output/conll-2012-by_genre/ and names the output file under
#        ../output/stats_conll_2012/.
# Expansions are quoted so the classpath and the paths reach java as single
# arguments, with no word splitting or glob expansion.
run_sieve() {
    java -Xmx6g -cp "$JARS" edu.stanford.nlp.dcoref.SieveCoreferenceSystem \
            -props sieve-english-conll.properties \
            -dcoref.use_conll_auto false \
            -dcoref.conll2011 "../output/conll-2012-by_genre/$1/orig/test.m_gold_conll" \
            -dcoref.conll.output "../output/stats_conll_2012/$1-test.m_gold_conll" \
            -dcoref.conll.scorer ../data/conll-2012/scorer/v8.01/scorer.pl
}

# Evaluate each genre in turn, keeping only the final averaged score line
# from the (verbose) java log.
for genre in bc tc wb bn nw mz pt
do
    echo "=== Genre: $genre ==="
    # grep exits non-zero when no score line appears (for example because
    # java failed); report that instead of silently printing nothing.
    run_sieve "$genre" 2>&1 | grep "Final conll score" \
        || echo "WARNING: no final score reported for $genre" >&2
done

=== Genre: bc ===
INFO: Final conll score ((muc+bcub+ceafe)/3) = 55
=== Genre: tc ===
INFO: Final conll score ((muc+bcub+ceafe)/3) = 61.96
=== Genre: wb ===
INFO: Final conll score ((muc+bcub+ceafe)/3) = 57.64
=== Genre: bn ===
INFO: Final conll score ((muc+bcub+ceafe)/3) = 56.37
=== Genre: nw ===
INFO: Final conll score ((muc+bcub+ceafe)/3) = 54.62
=== Genre: mz ===
INFO: Final conll score ((muc+bcub+ceafe)/3) = 57.69
=== Genre: pt ===
INFO: Final conll score ((muc+bcub+ceafe)/3) = 66.65
CPU times: user 13 ms, sys: 10.4 ms, total: 23.4 ms
Wall time: 6min 36s


## Measure performance of the Stanford neural coreference system on different genres

In [46]:
%%time
%%sh

# Every relative path used below is resolved from the CoreNLP checkout, so
# stop here if it is missing instead of running java from the wrong directory.
cd ../CoreNLP || exit 1

# Classpath: the 3.7.0 build, the two models jars, then every jar in lib/,
# joined with ':'.
JARS=target/stanford-corenlp-3.7.0.jar:stanford-corenlp-models-current.jar:stanford-english-corenlp-models-current.jar:$(echo lib/*.jar | tr ' ' ':')

# Run the CoreNLP coreference system with the neural algorithm on one genre's
# test file and have it scored with the CoNLL scorer script.
#   $1 - genre code (e.g. bc); selects the input under
#        ../output/conll-2012-by_genre/ and names the output directory.
# Note: OUT_DIR is set as a global shell variable, as in the original.
run_deep_coref() {
    OUT_DIR="../output/stats_conll_2012/$1-test-deep-coref"
    # -p: do not fail when the directory is left over from an earlier run
    # (or when its parent has not been created yet).
    mkdir -p "$OUT_DIR"
    java -Xmx6g -cp "$JARS" edu.stanford.nlp.coref.CorefSystem \
        -coref.algorithm neural \
        -coref.conll true -coref.suffix _gold_conll \
        -coref.inputPath "../output/conll-2012-by_genre/$1/orig/test.m_gold_conll" \
        -coref.conllOutputPath "$OUT_DIR" \
        -coref.scorer ../reference-coreference-scorers/v8.01/scorer.pl
}

# Evaluate each genre in turn, keeping only the final averaged score line
# from the (verbose) java log.
for genre in bc tc wb bn nw mz pt
do
    echo "=== Genre: $genre ==="
    # grep exits non-zero when no score line appears (for example because
    # java failed); report that instead of silently printing nothing.
    run_deep_coref "$genre" 2>&1 | grep "Final conll score" \
        || echo "WARNING: no final score reported for $genre" >&2
done

=== Genre: bc ===
[main] INFO CoreNLP - Final conll score ((muc+bcub+ceafe)/3) = 64.02
=== Genre: tc ===
[main] INFO CoreNLP - Final conll score ((muc+bcub+ceafe)/3) = 71.36
=== Genre: wb ===
[main] INFO CoreNLP - Final conll score ((muc+bcub+ceafe)/3) = 66.11
=== Genre: bn ===
[main] INFO CoreNLP - Final conll score ((muc+bcub+ceafe)/3) = 66.78
=== Genre: nw ===
[main] INFO CoreNLP - Final conll score ((muc+bcub+ceafe)/3) = 65.05
=== Genre: mz ===
[main] INFO CoreNLP - Final conll score ((muc+bcub+ceafe)/3) = 75.19
=== Genre: pt ===
[main] INFO CoreNLP - Final conll score ((muc+bcub+ceafe)/3) = 78.02
CPU times: user 306 ms, sys: 152 ms, total: 458 ms
Wall time: 2h 43s
