# FastText Classification on Metadata

In [4]:
import fasttext
from SRP import Vector_file
import pandas as pd
import os
import numpy as np
import re
import random
from scipy.spatial.distance import cdist, pdist
import tempfile

# Load the sampled metadata table, index it by the `htid` column and keep the
# rows sorted by that index so that row order is deterministic.
meta = (
    pd.read_csv('../../sampling/test_dataset.csv.gz', low_memory=False)
    .set_index('htid')
    .sort_index()
)

# Training the models

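Several fastText models are trained, each on a different metadata field or variant of a field (title plus description, cleaned title, truncated title, author, description). Their output vectors are later concatenated into a single vector per record.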

In [10]:
fn, fname = tempfile.mkstemp()
# mkstemp() returns an already-open OS-level file descriptor together with the
# path. Only the path is used below, so close the descriptor immediately
# instead of leaking it for the lifetime of the kernel.
os.close(fn)

print("Training title string model")
title_enum = meta.title + ' / ' + meta.description.fillna('')
# fasttext trains from a file on disk: write one record per line to the temp
# file, then train from that file. The encoding is explicit because the
# metadata contains non-ASCII characters and fastText expects UTF-8 input.
with open(fname, mode='w', encoding='utf-8') as f:
    for txt in title_enum:
        f.write(txt + '\n')
titlemodel = fasttext.train_unsupervised(fname, model='skipgram', dim=20)

print("Training clean title string model")
def clean_title(title):
    """Strip trailing responsibility/contributor statements from a title string.

    Steps, applied in order:
      1. Drop up to two trailing '/'-separated sections and surrounding
         whitespace.
      2. Drop everything from the first '; by' onwards.
      3. Drop a trailing statement that starts with punctuation, whitespace,
         an optional opening bracket and a role stem (assembl, photo, arrang,
         select, compil, record, collect, edit, translat) followed within 100
         characters by 'by' (case-insensitive), through the end of the line.
      4. Drop a trailing '. by ...' / ', By ...' statement that continues with
         an optional 'the ...' and then either a capitalised word or the word
         'author', through the end of the line.

    title: str. Returns the cleaned str (unchanged if nothing matches).
    """
    # Same result as reversing, splitting twice from the left and reversing
    # back: keep only what precedes the last two '/' separators.
    title = title.rsplit('/', 2)[0].strip()
    title = title.split('; by')[0]
    # Raw strings keep the regex escapes intact without relying on Python
    # passing unknown string escapes through (which triggers a warning).
    title = re.sub(r'[\.,;\]\)\[] +?[\(\[]?(assembl|photo|arrang|select|compil|record|collect|edit|translat).{0,100}by.*\.?', '', title, flags=re.IGNORECASE)
    # NOTE(review): the character class after the space matches a single
    # character (A-Z, a word character or '+'); it may have been meant as a
    # second capitalised word. Left as-is to keep matching behaviour unchanged.
    title = re.sub(r'[\.\,] [bB]y (the .{0,30})?([A-Z]\w+ [A-Z\w+]|author).*\.?', '', title)
    return title
def _train_from_lines(lines, dim):
    """Write `lines` to the shared temp file and train a skipgram model on it.

    lines: iterable of str; each item becomes one line of the training file.
    dim: vector dimensionality passed to fasttext.train_unsupervised.
    Returns the trained fasttext model.
    """
    # The temp file at `fname` is overwritten on every call. UTF-8 is explicit
    # because the metadata contains non-ASCII characters and fastText expects
    # UTF-8 input.
    with open(fname, mode='w', encoding='utf-8') as f:
        for txt in lines:
            f.write(txt + '\n')
    return fasttext.train_unsupervised(fname, model='skipgram', dim=dim)


try:
    titlemodel2 = _train_from_lines((clean_title(txt) for txt in meta.title), dim=40)

    print("Training truncated title string model")
    # Only the first 40 characters of each title are used for this model.
    titlemodel3 = _train_from_lines((txt[:40] for txt in meta.title), dim=20)

    print("Training author string model")
    authormodel = _train_from_lines(meta.author, dim=10)

    print("Training description string model")
    # Missing descriptions are written as a single space.
    descmodel = _train_from_lines(meta.description.fillna(' '), dim=10)
finally:
    # Remove the temp file even if one of the training runs fails.
    os.remove(fname)

Training title string model
Training clean title string model
Training truncated title string model
Training author string model
Training description string model


## Converting metadata to vectors.

In [15]:
# Width of the concatenated vector: the sum of the five models' dimensions.
dims = titlemodel.dim + titlemodel2.dim + titlemodel3.dim + authormodel.dim + descmodel.dim

with Vector_file('fastTextVecs.bin', mode='w', dims=dims) as vfile:
    for htid, row in meta.iterrows():
        raw_title = row['title']
        author = row['author']
        has_description = pd.notna(row['description'])
        # Build each model's input the same way its training text was built:
        # the title model was trained on "title / description" (' / ' as the
        # separator, missing description -> ''), and the description model on
        # the description with missing values replaced by a single space.
        title = raw_title + ' / ' + (row['description'] if has_description else '')
        description = row['description'] if has_description else ' '
        # NOTE(review): indexing a fasttext model looks up one "word" vector
        # for the whole string; confirm this is intended rather than a
        # sentence-level vector.
        fullvec = np.concatenate((titlemodel[title],
                                  titlemodel2[clean_title(raw_title)],
                                  # Same 40-character truncation as used when
                                  # training the truncated-title model.
                                  titlemodel3[raw_title[:40]],
                                  authormodel[author],
                                  descmodel[description]))
        vfile.add_row(htid, fullvec)

# Test Vectors

In [17]:
# Read the vector file written above back into memory in one go; the result
# is later indexed with the 'matrix' key.
with Vector_file('fastTextVecs.bin', mode='r', dims=100) as reader:
    allvecs = reader.to_matrix()

In [19]:
# Draw 20 row positions at random (with replacement) to spot-check.
random_targets = np.random.randint(meta.shape[0], size=20)
# One display string per metadata row: "title | author | description".
txts = (meta['title'] + ' | ' + meta['author'] + ' | ' + meta['description'].fillna('')).tolist()

# Assumes the rows of allvecs['matrix'] are in the same order as `meta`
# (TODO confirm against how the vector file was written).
for target_i in random_targets:
    print('TARGET:' + txts[target_i] + '\n' + '=' * 20)
    query = allvecs['matrix'][target_i:target_i+1]
    # Cosine distance from the target row to every row, as a 1-D array.
    distances = cdist(query, allvecs['matrix'], metric='cosine')[0]
    nearest_first = pd.Series(distances).sort_values()

    for i, result in nearest_first.items():
        # Distances are ascending, so the first one above the cutoff ends
        # the listing for this target.
        if result > 0.03:
            break
        print("%.5f\t%s" % (result, txts[i]))
    print('\n')

TARGET:The wider outlook beyond the world war, by Charles E. Hooper ... | Hooper, Charles E. | 
0.00000	The wider outlook beyond the world war, by Charles E. Hooper ... | Hooper, Charles E. | 
0.00000	The wider outlook beyond the world war, by Charles E. Hooper ... | Hooper, Charles E. | 
0.00589	The wider outlook beyond the World war, by Charles E. Hooper. | Hooper, Charles E. | 
0.00589	The wider outlook beyond the World war, by Charles E. Hooper. | Hooper, Charles E. | 


TARGET:Essentials of nematodology. Edited by K.I. Skrjabin. [Translated by Marc Paenson and Z.S. Cole] | Gelʹmintologicheskai︠a︡ laboratorii︠a︡ (Akademii︠a︡ nauk SSSR) | 5
0.00000	Essentials of nematodology. Edited by K.I. Skrjabin. [Translated by Marc Paenson and Z.S. Cole] | Gelʹmintologicheskai︠a︡ laboratorii︠a︡ (Akademii︠a︡ nauk SSSR) | 5
0.00140	Essentials of nematodology. Edited by K.I. Skrjabin. [Translated by Marc Paenson and Z.S. Cole] | Gelʹmintologicheskai︠a︡ laboratorii︠a︡ (Akademii︠a︡ nauk SSSR) | 4
0.