This notebook shows an initial version of book relationship classification, which used various extracted features. This process has be phased out, because manually described derivative features aren't as performant as deep neural network analysis of raw data, like similarity matrix-based classification.

In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import glob
import dask.dataframe as dd
import seaborn as sns
import matplotlib.pyplot as plt
from dask.diagnostics import ProgressBar
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import classification_report

import numpy as np
from sklearn import preprocessing
from compare_tools.configuration import config
from compare_tools.hathimeta import HathiMeta, meta_compare
from compare_tools.train_utils import print_most_important_for_forest, processStats
from IPython.core.display import display, HTML
%matplotlib inline

  import pandas.util.testing as tm


## Import metadata-based ground truth and training data

In [2]:
truth = pd.read_parquet('../../sampling/ground_truth_meta_judgments.parquet')
# Append truth for generated files
fake_truth = pd.read_parquet('/data/saddl/fakebooks/fakebook_gt.parquet') 
truth = pd.concat([truth,fake_truth])
truth = truth.sample(frac=1)
truth

Unnamed: 0,left,right,judgment,notes
57656,hvd.32044010723161,hvd.32044050798669,WP_DV,oclc+desc
431491,mdp.49015000245317,mdp.49015000866542,AUTHOR,diff:fuzztitle
36928,coo1.ark:/13960/t49p3md6f,uc1.b2929771,SWSM,oclc+desc
685062,uc1.b4331157,osu.32435057575045,DIFF,diff:author
259286,mdp.39015010742404,ien.35556021268065,DIFF,diff:author
...,...,...,...,...
96054,hvd.32044105629224,coo1.ark:/13960/t0000px62,AUTHOR,diff:fuzztitle
555308,uc1.$b16308,uc1.b4247138,AUTHOR,diff:fuzztitle
729523,uc2.ark:/13960/t45q51q3v,hvd.32044021234695,AUTHOR,diff:fuzztitle
437496,mdp.49015002912534,njp.32101034101186,DIFF,diff:author


## Load Left/Right Content Comparison Stats

In [7]:
# Using Stats
metastats = dd.read_parquet('/data/saddl/stats/03-20-stats/*')
fakebook_stats = dd.read_parquet('/data/saddl/fakebooks/fakebook-stats.parquet')
metastats = dd.concat([metastats, fakebook_stats])
metastats2, metaextra = processStats(metastats)
with ProgressBar():
    sample = dd.merge(metastats2, truth).compute()

39220

In [10]:
# Using similarities
metastats = dd.read_parquet('/data/saddl/stats/04-11-sims/stats*')
fakebook_stats = dd.read_parquet('/data/saddl/fakebooks/combined-sims/part.0.parquet').head()
metastats = dd.concat([metastats, fakebook_stats])

In [14]:
#metasample = sample.sample(frac=.2) #.groupby('judgment', as_index=False) #.apply(lambda x: x.sample(7000))
metasample = sample[sample.judgment.isin(['SWSM', 'SWDE', 'WP_DV', 'AUTHOR'])].sample(frac=1.)
sample.shape, metasample.shape

((897909, 69), (453668, 69))

In [20]:
sample.sample(frac=1.).to_parquet('/tmp/metasample-with-fake.parquet')

## Preprocessing

Trying to scale to a normal-ish distribution.

In [None]:
b = metaextra.sample(frac=.1).gloveLMeanMinSim.compute()
#b = a.sample(20000)

In [None]:
sns.distplot(metasample.gloveLMeanMinSimTransform)

# Feature Importance

In [170]:
train = metasample.iloc[:-2000].fillna(0)
test = metasample.iloc[-2000:].fillna(0)
models = {}

In [164]:
handY_true = handcoded_sample['judgment']
handcoded_sample.judgment.value_counts()

AUTHOR    1166
WP_DV      458
SWDE       305
SWSM       274
Name: judgment, dtype: int64

In [41]:
%%time
from compare_tools.train_utils import parse_comparison_records
import tensorflow as tf
import glob
batch_size = 2048
ds = tf.data.TFRecordDataset(glob.glob('/tmp/handcoded-stats.tfrecord'), compression_type='GZIP')
#ds = tf.data.TFRecordDataset(glob.glob('/data/saddl/stats/04-27-sim-tfrecords/handcoded-stats.tfrecord'))

#ds = ds.shuffle(buffer_size=10000, reshuffle_each_iteration=False)
#partial_ds = ds.map(lambda x: (x["X"], x['y'])) # Only keep X and y
#partial_ds = partial_ds.prefetch(batch_size)

#train_size, val_size, test_size = 235000, 50000, 50000 # 70/15/15 split on one instance of the data
#train_ds = partial_ds.take(train_size).batch(batch_size)
#val_ds = partial_ds.skip(train_size).take(val_size).batch(batch_size)
#test_ds = partial_ds.skip(train_size+val_size).take(test_size).batch(batch_size)

def parse_comparison_records2(example_proto):
  features = {
    'X': tf.io.FixedLenFeature((50,50,1), tf.float32),
    'left': tf.io.FixedLenFeature([], tf.string),
    'right': tf.io.FixedLenFeature([], tf.string),
    'judgment': tf.io.FixedLenFeature([], tf.string),
    'y': tf.io.FixedLenFeature([7], tf.int64),
    'notes': tf.io.FixedLenFeature([], tf.string)
  }
  parsed_features = tf.io.parse_example(example_proto, features)
  return parsed_features

ds = ds.batch(1)
ds = ds.map(parse_comparison_records2)

i = 0
for record in ds:
    i += 1
print(i)
#print(record)
#print(record['left'], record['right'], record['judgment'], record['notes'])
#print("Example of input data", record['X'])
#print("Example of truth label", record['y'])

3333
CPU times: user 869 ms, sys: 44.9 ms, total: 914 ms
Wall time: 707 ms


In [None]:
featcols = train.columns[2:-5]
runs = []
runs.append(('allFeatures', featcols))
runs.append(('noRight', [col for col in featcols if 'L' not in col]))
runs.append(('noLeft', [col for col in featcols if 'R' not in col]))
runs.append(('onlySW', [col for col in featcols if 'SW' in col]))
runs.append(('onlyQuantiles', [col for col in featcols if 'Quantile' in col]))
runs.append(('onlyPropDist', [col for col in featcols if 'PropDist' in col]))
runs.append(('onlyGlove', [col for col in featcols if 'glove' in col]))
runs.append(('onlySRP', [col for col in featcols if 'srp' in col]))

y = train['judgment']
testy_true = test['judgment']

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]
classifiers = [
    KNeighborsClassifier(3),
    #SVC(kernel="linear", C=0.025),
    #SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

for name, cols in runs[:1]:
    print(name.center(50, '='))
    if name not in models:
        models[name] = {}
    for cname, classifier in zip(names, classifiers):
        print(cname.center(50, '-'))
        if cname not in models[name]:
            models[name][cname] = classifier
            #models[name] = ExtraTreesClassifier(n_estimators=250, random_state=0)
            X = train[cols].values
            models[name][cname].fit(X, y)

        print('Internal'.center(50))
        testX = test[cols].values
        y_pred = models[name][cname].predict(testX)
        print(classification_report(testy_true, y_pred))

        print('Handcoded'.center(50))
        y_pred = models[name][cname].predict(handcoded_sample[cols])
        print(classification_report(handY_true, y_pred))
        #print_most_important_for_forest(forest, cols)

----------------Nearest Neighbors-----------------
                     Internal                     
              precision    recall  f1-score   support

      AUTHOR       0.89      0.88      0.88      1201
        SWDE       0.67      0.58      0.62        50
        SWSM       0.75      0.72      0.74        97
       WP_DV       0.79      0.82      0.80       652

    accuracy                           0.84      2000
   macro avg       0.78      0.75      0.76      2000
weighted avg       0.85      0.84      0.84      2000

                    Handcoded                     
              precision    recall  f1-score   support

      AUTHOR       0.79      0.80      0.80      1166
        SWDE       0.58      0.21      0.31       305
        SWSM       0.58      0.76      0.66       274
       WP_DV       0.54      0.65      0.59       458

    accuracy                           0.68      2203
   macro avg       0.62      0.61      0.59      2203
weighted avg       0.68      0.6

In [151]:
for name, cols in runs:
    print(name.center(50, '-'))
    print_most_important_for_forest(models[name], cols)

-------------------allFeatures--------------------
most important
Feature ranking:
1. srpRSimQuantile0.9Transform (0.032290)
2. srpLSimQuantile0.9Transform (0.030800)
3. srpRSimQuantile0.8Transform (0.027136)
4. srpLSimQuantile0.8Transform (0.026679)
5. srpLSimQuantile0.7Transform (0.026309)
6. gloveMeanSimTransform (0.025223)
7. gloveRPropDist0002 (0.023878)
8. srpRSimQuantile0.7Transform (0.023793)
9. gloveLPropDist0002 (0.023206)
10. LSW0001Prop (0.022283)
11. RSW0001Prop (0.020890)
12. srpRSimQuantile0.6Transform (0.020494)
13. gloveLPropDist0005 (0.020172)
14. gloveRPropDist0005 (0.018767)
15. srpLSimQuantile0.6Transform (0.018296)
16. srpRSimQuantile0.5Transform (0.018000)
17. srpLSimQuantile0.1Transform (0.017245)
18. gloveLSimQuantile0.5Transform (0.016913)
19. srpLSimQuantile0.5Transform (0.016888)
20. gloveLMeanMinSimTransform (0.016463)
---------------------noRight----------------------
most important
Feature ranking:
1. srpRSimQuantile0.8Transform (0.055039)
2. srpRSimQuant

IndexError: index 8 is out of bounds for axis 0 with size 8

# Investigating outliers

In [640]:
quantiles = stats.quantile([.1, 0.2, .25, .3, .4, .5, .6, .7, .75, .8, .9]).compute()
quantiles

Unnamed: 0,LSize,RSize,SW0001Len,SW0004Len,SW0005Len,SW0010Len,gloveLMeanMinSim,gloveLPropDist0002,gloveLPropDist0005,gloveLPropDist0010,...,LeftSW0005Prop,LeftSW0010Prop,LSW0001Prop,RSW0001Prop,LSW0004Prop,RSW0004Prop,LSW0005Prop,RSW0005Prop,LSW0010Prop,RSW0010Prop
0.1,4.0,10.0,0.0,0.0,0.0,0.0,0.001208,0.0,0.0,0.0,...,0.0,0.0,-0.333333,-0.083333,-0.333333,-0.058824,-0.333333,-0.058824,-0.333333,-0.041667
0.2,7.0,15.0,0.0,0.0,0.0,0.0,0.002918,0.0,0.0,0.0,...,0.0,0.0,-0.166667,-0.052632,-0.142857,-0.04,-0.142857,-0.037037,-0.111111,-0.025
0.25,9.0,17.0,0.0,0.0,0.0,0.0,0.003748,0.0,0.0,0.125,...,0.0,0.0,-0.111111,-0.047619,-0.1,-0.032258,-0.090909,-0.029412,-0.066667,-0.016129
0.3,11.0,20.0,0.0,0.0,0.0,0.0,0.005131,0.0,0.0,0.333333,...,0.0,0.0,-0.083333,-0.037037,-0.066667,-0.02381,-0.0625,-0.021277,-0.03125,-0.008264
0.4,16.0,25.0,0.0,0.0,0.0,6.0,0.008485,0.0,0.153846,0.631579,...,0.0,0.318182,-0.05,-0.027027,-0.030303,-0.013158,-0.02439,-0.009434,0.321429,0.294118
0.5,21.0,31.0,0.0,4.0,6.0,12.0,0.012394,0.102564,0.470775,0.825,...,0.235294,0.695357,-0.033333,-0.018519,0.109394,0.108893,0.222222,0.210526,0.708333,0.677419
0.6,26.0,38.0,0.0,9.0,11.0,18.0,0.018003,0.289357,0.709677,0.913043,...,0.571429,0.895395,-0.019608,-0.009901,0.416667,0.410256,0.555556,0.551583,0.9,0.888889
0.7,32.0,45.0,4.0,14.0,16.0,23.0,0.027388,0.510638,0.857143,0.964286,...,0.818182,0.975786,0.090909,0.090909,0.705882,0.705447,0.807692,0.8,1.0,0.975489
0.75,36.0,51.0,6.0,17.0,19.0,25.0,0.033935,0.689655,0.916667,1.0,...,0.9,1.0,0.25,0.25,0.84,0.833333,0.896552,0.894737,1.0,1.0
0.8,43.0,67.0,11.0,22.0,24.0,32.0,0.049966,0.866667,0.97619,1.0,...,0.979167,1.0,0.5,0.5,0.943396,0.944444,0.974359,0.978723,1.0,1.0


In [642]:
with ProgressBar():
    match1 = (stats.gloveLTruncSim <=  quantiles.loc[0.2, 'gloveLTruncSim'])
    match2 = (stats.gloveLTruncSim <=  quantiles.loc[0.2, 'gloveRTruncSim'])
    match3 = (stats.gloveLMeanMinSim >=  quantiles.loc[0.8, 'gloveLMeanMinSim'])
    match4 = (stats.LSize > 4) & (stats.RSize > 4)
    to_investigate = stats[match1 & match2 & match3 & match4].compute()
to_investigate.shape

[########################################] | 100% Completed |  4.5s


(907, 36)

In [643]:
to_investigate[['left', 'right']].sample(2)

Unnamed: 0,left,right
275,nyp.33433075660088,nyp.33433075011365
3701,hvd.32044069771079,wu.89094600145


In [649]:
meta = HathiMeta(config['metadb_path'])
meta

<compare_tools.hathimeta.HathiMeta at 0x7f910cd20e48>

In [650]:
meta_compare("mdp.39015010954140","hvd.hxjvjq", meta)

Unnamed: 0,title,author,oclc_num,page_count,description
left,The atomic theory of Lucretius contrasted with modern doctrines of atoms and evolution. By John Masson.,"Masson, John.",2722924,288,
right,The atomic theory of Lucretius contrasted with modern doctrines of atoms and evolution. By John Masson.,"Masson, John.",2722924,272,


In [646]:
pd.set_option('display.max_colwidth', 2000)
html = ""
for i, row in to_investigate[['left', 'right']].sample(20).iterrows():
    a = meta_compare(row.left, row.right).to_html()
    html += "<hr/><a href='http://hdl.handle.net/2027/{}'>Left</a>&nbsp;<a href='http://hdl.handle.net/2027/{}'>Right</a><br/>".format(row.left, row.right)
    html += a
display(HTML(html))

Unnamed: 0,title,author,oclc_num,page_count,description
left,Charles Kingsley and the Christian social movement / by Charles William Stubbs.,"Stubbs, Charles William, 1845-1912.",1707237,242,
right,"Charles Kingsley and the Christian social movement, by Charles William Stubbs.","Stubbs, Charles William, 1845-1912.",5842441,212,

Unnamed: 0,title,author,oclc_num,page_count,description
left,Mary Lincoln; biography of a marriage.,"Randall, Ruth Painter.",964413,608,
right,Mary Lincoln; biography of a marriage.,"Randall, Ruth Painter.",964413,484,

Unnamed: 0,title,author,oclc_num,page_count,description
left,"Sucker's progress; an informal history of gambling in America from the colonies to Canfield, by Herbert Asbury.","Asbury, Herbert, 1891-1963.",565296,580,
right,"Gambling on the western rivers / prepared by the staff of the Public Library, Fort Wayne and Allen County.","Asbury, Herbert, 1891-1963.",3025776,90,

Unnamed: 0,title,author,oclc_num,page_count,description
left,"Sports & anecdotes of bygone days in England, Scotland, Ireland, Italy and the Sunny South. By C. T. S. Birch Reynardson.","Reynardson, C. T. S. Birch 1810-1889.",8289671,368,
right,"Sports & anecdotes of bygone days in England, Scotland, Ireland, Italy and the Sunny South. By C. T. S. Birch Reynardson.","Reynardson, C. T. S. Birch 1810-1889.",8289671,336,

Unnamed: 0,title,author,oclc_num,page_count,description
left,"Old picture books; with other essays on bookish subjects, by Alfred W. Pollard.","Pollard, Alfred W. 1859-1944.",187474313,340,
right,Old picture books; with other essays on bookish subjects.,"Pollard, Alfred W. 1859-1944.",122426,298,

Unnamed: 0,title,author,oclc_num,page_count,description
left,Theism : being the Baird lecture for 1876 / by Robert Flint.,"Flint, Robert, 1838-1910.",23893352,490,
right,Theism : being the Baird lecture for 1876 /,"Flint, Robert, 1838-1910.",3172565,472,

Unnamed: 0,title,author,oclc_num,page_count,description
left,Anti-theistic theories : being the Baird lecture for 1877 / by Robert Flint.,"Flint, Robert, 1838-1910.",3158456,608,
right,Anti-theistic theories : being the Baird lecture for 1877 / by Robert Flint.,"Flint, Robert, 1838-1910.",3158456,578,

Unnamed: 0,title,author,oclc_num,page_count,description
left,Truth (Vérité) Tr. by Ernest Alfred Vizetelly.,"Zola, Emile, 1840-1902.",6587653,646,
right,Truth / Emile Zola ; translated by Ernest A. Vizetelly.,"Zola, Emile, 1840-1902.",47625453,624,

Unnamed: 0,title,author,oclc_num,page_count,description
left,The scientific basis of morals : and other essays : viz.: Right or wrong ; The ethics of belief ; The ethics of religion / by William Kingdon Clifford.,"Clifford, William Kingdon, 1845-1879.",9322697,72,
right,"The scientific basis of morals : and other essays, viz. : right and wrong, the ethics of belief, the ethics of religion.","Clifford, William Kingdon, 1845-1879.",3682878,60,

Unnamed: 0,title,author,oclc_num,page_count,description
left,"Near and far; an angler's sketches of home sport and colonial life, by William Senior (""Red Spinner"") ...","Senior, William, 1839?-1920.",19994144,368,
right,"Near and far; an angler's sketches of home sport and colonial life, by William Senior (""Red Spinner"") ...","Senior, William, 1839?-1920.",19994144,338,

Unnamed: 0,title,author,oclc_num,page_count,description
left,"Elementary sketches of moral philosophy. Delivered at the Royal Institution ... 1804, 1805 and 1806.","Smith, Sydney, 1771-1845.",24369214,530,
right,"Elementary sketches of moral philosophy : delivered at the Royal institution, in the years 1804, 1805, and 1806 / by Sydney Smith.","Smith, Sydney, 1771-1845.",734562,420,

Unnamed: 0,title,author,oclc_num,page_count,description
left,"The poetical works of Thomas Chatterton, with a memoir ...","Chatterton, Thomas, 1752-1770.",6547935,892,
right,"The poetical works of Thomas Chatterton : with notices of his life, a history of the Rowley controversy, a selection of his letters, notes critical and explanatory, and a glossary.","Chatterton, Thomas, 1752-1770.",3136812,492,v.1

Unnamed: 0,title,author,oclc_num,page_count,description
left,Hymns of faith and hope / by Horatius Bonar.,"Bonar, Horatius, 1808-1889.",817840,296,
right,Hymns of faith and hope [First series],"Bonar, Horatius, 1808-1889.",817845,288,

Unnamed: 0,title,author,oclc_num,page_count,description
left,An echo of passion / by George Parsons Lathrop.,"Lathrop, George Parsons, 1851-1898.",13487946,258,
right,An echo of passion / by George Parsons Lathrop.,"Lathrop, George Parsons, 1851-1898.",2473897,222,

Unnamed: 0,title,author,oclc_num,page_count,description
left,Letters on the philosophy of the human mind. First-[second] series.,"Bailey, Samuel, 1791-1870.",6124184106310320,330,
right,Letters on the philosophy of the human mind.,"Bailey, Samuel, 1791-1870.",378823945,308,ser. 2

Unnamed: 0,title,author,oclc_num,page_count,description
left,Select works of William Kingdon Clifford.,"Clifford, William Kingdon, 1845-1879.",8239705,274,
right,Seeing and thinking / By the late William Kingdon Clifford.,"Clifford, William Kingdon, 1845-1879.",5168318,170,

Unnamed: 0,title,author,oclc_num,page_count,description
left,Knowledge and life / by Rudolf Eucken...tr. by W. Tudor Jones.,"Eucken, Rudolf, 1846-1926.",1899333,392,
right,"Knowledge and life, by Rudolf Eucken...tr. by W. Tudor Jones...","Eucken, Rudolf, 1846-1926.",2413139,332,

Unnamed: 0,title,author,oclc_num,page_count,description
left,Hymns of faith and hope / by Horatius Bonar.,"Bonar, Horatius, 1808-1889.",817840,296,
right,Hymns of faith and hope. By Horatius Bonar ... [1st]-3d series.,"Bonar, Horatius, 1808-1889.",14720256,280,v.1

Unnamed: 0,title,author,oclc_num,page_count,description
left,"History of the United States of America during the administration of James Madison,","Adams, Henry, 1838-1918.",1724293,1260,v.2
right,History of the United States of America / Henry Adams.,"Adams, Henry, 1838-1918.",3525431,450,v.7

Unnamed: 0,title,author,oclc_num,page_count,description
left,"An introduction to entomology: or, Elements of the natural history of insects: comprising an account of noxious and useful insects, of their metamorphoses, food, stratagems, habitations, societies, motions, noises, hybernation, instinct, etc., etc. By William Kirby ... and William Spence ...","Kirby, William, 1759-1850.",5813958,680,
right,An introduction to entomology: or Elements of the natural history of insects: with plates. By William Kirby ... and William Spence.,"Kirby, William, 1759-1850.",2784095,540,v. 2


## Hand-coded Ground Truth

In [17]:
handcoded = pd.read_csv('http://35.239.220.133/download')
stats = pd.read_parquet('/data/saddl/handcoded.parquet')
handcoded.head()

Unnamed: 0.1,Unnamed: 0,rater,target,candidate,judgment,notes,timestamp
0,0,Peter,hvd.32044024501652,pst.000059706786,SWDE,"Mostly the same, but rebranded?",1580929000.0
1,1,Peter,hvd.32044024501652,uc2.ark:/13960/fk0tq5rc3v,SWDE,,1580929000.0
2,2,Peter,hvd.32044024501652,uiuo.ark:/13960/t0cv4jb3w,SWSM,,1580929000.0
3,3,Peter,hvd.32044024501652,loc.ark:/13960/t5s75px7c,AUTHOR,,1580929000.0
4,4,Peter,hvd.32044024501652,uva.x001053494,SWSM,,1580929000.0


In [18]:
stats2, extra = processStats(stats)

In [21]:
handcoded = handcoded.rename(columns={'target':'left', 'candidate':'right'})
handcoded_sample = dd.merge(stats2, handcoded)
handcoded_sample = handcoded_sample[handcoded_sample.judgment.isin(['SWDE', 'SWSM', 'AUTHOR', 'WP_DV'])]
bsample = handcoded_sample.sample(frac=1)
bsample.shape

(2307, 71)

In [22]:
bsample.to_parquet('/tmp/handcoded_feats.parquet')

In [255]:
train = bsample.iloc[:300]#bsample.iloc[:25000]
test = bsample.iloc[300:]#bsample.iloc[25000:]

## Measure value of different *types* of features

In [257]:
featcols = train.columns[2:-5]
runs = []
runs.append(('allFeatures', featcols))
runs.append(('noRight', [col for col in featcols if 'L' not in col]))
runs.append(('noLeft', [col for col in featcols if 'R' not in col]))
runs.append(('onlySW', [col for col in featcols if 'SW' in col]))
runs.append(('onlyQuantiles', [col for col in featcols if 'Quantile' in col]))
runs.append(('onlyPropDist', [col for col in featcols if 'PropDist' in col]))
runs.append(('onlyGlove', [col for col in featcols if 'glove' in col]))
runs.append(('onlySRP', [col for col in featcols if 'srp' in col]))

In [259]:
y = train['meta_code']
testy_true = test['meta_code']

for name, cols in runs:
    print(name.center(50, '-'))
    X = train[cols].values
    forest = ExtraTreesClassifier(n_estimators=250,
                                  random_state=0)
    forest.fit(X, y)

    testX = test[cols].values
    y_pred = forest.predict(testX)
    print(classification_report(testy_true, y_pred))
    print_most_important_for_forest(forest, cols)

-------------------allFeatures--------------------
              precision    recall  f1-score   support

      AUTHOR       0.86      0.87      0.87      1019
    CONTAINS       0.50      0.12      0.20         8
        SWDE       0.58      0.74      0.65       253
        SWSM       0.79      0.58      0.67       238
       WP_DV       0.72      0.69      0.71       395

    accuracy                           0.78      1913
   macro avg       0.69      0.60      0.62      1913
weighted avg       0.78      0.78      0.78      1913

most important
Feature ranking:
1. srpRSimQuantile0.6Transform (0.026547)
2. srpRSimQuantile0.0Transform (0.026482)
3. srpLSimQuantile0.0Transform (0.024655)
4. gloveLPropDist0005 (0.024419)
5. srpRSimQuantile0.7Transform (0.024098)
6. srpRSimQuantile0.8Transform (0.023602)
7. gloveRSimQuantile0.0Transform (0.022927)
8. srpLSimQuantile0.1Transform (0.022723)
9. gloveRPropDist0005 (0.022230)
10. gloveRPropDist0020 (0.022153)
11. srpLSimQuantile0.4Transform 

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

      AUTHOR       0.85      0.85      0.85      1019
    CONTAINS       0.00      0.00      0.00         8
        SWDE       0.54      0.67      0.60       253
        SWSM       0.74      0.50      0.59       238
       WP_DV       0.62      0.67      0.65       395

    accuracy                           0.74      1913
   macro avg       0.55      0.54      0.54      1913
weighted avg       0.75      0.74      0.74      1913

most important
Feature ranking:
1. gloveLPropDist0005 (0.050511)
2. gloveRPropDist0005 (0.042162)
3. gloveRSimQuantile0.0Transform (0.040500)
4. gloveLSimQuantile0.0Transform (0.036705)
5. gloveMeanSimTransform (0.033870)
6. gloveLSimQuantile0.1Transform (0.033107)
7. gloveLPropDist0010 (0.032652)
8. gloveLSimQuantile0.2Transform (0.032198)
9. gloveRPropDist0020 (0.031783)
10. gloveRPropDist0002 (0.031562)
11. gloveLTruncSimTransform (0.030525)
12. gloveRSimQuantile0.2Transform (0.030404)
13. gloveLSimQuan