In [1]:
import bz2
from pathlib import Path

import numpy as np
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from spacy.tokens import DocBin

In [2]:
def _split_label_and_text(lines):
    """Split decoded lines of the form "<label> <text>" into two parallel lists.

    Args:
        lines (list): decoded text lines; everything before the first space is
            treated as the label and everything after it as the text.

    Returns:
        tuple: (labels, texts), two lists of the same length as `lines`.
    """
    labels, texts = [], []
    for line in lines:
        # partition() cuts at the first space only, which is exactly what
        # split(" ")[0] plus re-joining split(" ")[1:] produced, in one pass.
        label, _, text = line.partition(" ")
        labels.append(label)
        texts.append(text)
    return labels, texts


# The context managers guarantee the compressed files are closed once read;
# `train` / `test` remain bound (to the closed handles) after the block.
with bz2.BZ2File('../data/train.ft.txt.bz2') as train:
    train_lines = [raw.decode('utf-8') for raw in train]
train_labels, train_texts = _split_label_and_text(train_lines)
# NOTE: each text keeps the trailing newline that was present in the file line.
df_train = pd.DataFrame({"Labels": train_labels, "Features": train_texts})

with bz2.BZ2File('../data/test.ft.txt.bz2') as test:
    test_lines = [raw.decode('utf-8') for raw in test]
test_labels, test_texts = _split_label_and_text(test_lines)
df_test = pd.DataFrame({"Labels": test_labels, "Features": test_texts})

In [14]:
# Sampling parameters, named so a reader can tune them in one place.
MAX_WORDS = 100              # keep rows with fewer than this many whitespace-separated tokens
SAMPLES_PER_LABEL = 100000   # rows drawn for every distinct value of 'Labels'
SAMPLE_SEED = 42             # fixed seed so a re-run draws the same rows


def _sample_short_rows_per_label(frame, n_per_label=SAMPLES_PER_LABEL,
                                 max_words=MAX_WORDS, seed=SAMPLE_SEED):
    """Draw an equally sized random sample of short texts for every label.

    Args:
        frame (pd.DataFrame): frame with a 'Features' text column and a
            'Labels' column.
        n_per_label (int): number of rows to draw per distinct label.
        max_words (int): exclusive upper bound on the whitespace token count
            of 'Features'.
        seed (int): random_state passed to the sampler for reproducibility.

    Returns:
        pd.DataFrame: the sampled rows, keeping their original index values.

    Raises:
        ValueError: if a label has fewer than `n_per_label` short rows
            (sampling is done without replacement).
    """
    word_counts = frame['Features'].str.split().str.len()
    short_rows = frame[word_counts < max_words]
    return short_rows.groupby('Labels').sample(n=n_per_label, random_state=seed)


df_train_spacy = _sample_short_rows_per_label(df_train)
df_test_spacy = _sample_short_rows_per_label(df_test)

In [16]:
def get_cats(labels: list, labels_dict: dict) -> dict:
    """Build the category dictionary to assign to a spaCy ``Doc.cats``.

    A copy of ``labels_dict`` is made and every entry named in ``labels`` is
    set to 1.0; ``labels_dict`` itself is never modified. A label that is not
    already a key of ``labels_dict`` is added to the result as a new key.

    Args:
        labels (list): labels that apply to the current text
        labels_dict (dict): default labels dict (label -> default score) used
            as the template

    Returns:
        dict: copy of ``labels_dict`` with the given labels set to 1.0
    """
    cats = labels_dict.copy()
    cats.update((label, 1.0) for label in labels)
    return cats

# Create Training data

In [17]:
# Distinct label values found in the training frame.
labels = df_train['Labels'].drop_duplicates().tolist()
# Template passed to get_cats: every known label starts at 0.0.
labels_dict = dict.fromkeys(labels, 0.0)

# Save Data

In [19]:
nlp = spacy.blank("en")
column = "Features"


def _write_docbin(frame, output_path):
    """Tokenize every row of `frame`, attach its category dict and save a DocBin.

    Args:
        frame (pd.DataFrame): rows to serialize; the text is read from the
            `column` column and the label from the 'Labels' column.
        output_path (str or Path): destination ``.spacy`` file.

    Returns:
        DocBin: the DocBin that was written to `output_path`.
    """
    doc_bin = DocBin()
    # zip over the two columns instead of iterrows(): same values, without
    # building a Series object for every row.
    for text, label in zip(frame[column], frame['Labels']):
        doc = nlp.make_doc(text)
        doc.cats = get_cats(labels=[label], labels_dict=labels_dict)
        doc_bin.add(doc)
    output_path = Path(output_path)
    # Make sure the corpus directory exists so the save also works on a
    # fresh checkout.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    doc_bin.to_disk(output_path)
    return doc_bin


db = _write_docbin(df_train_spacy, "../textcat_multilabel/corpus/train.spacy")

# As before, `db` ends up holding the dev-set DocBin.
db = _write_docbin(df_test_spacy, "../textcat_multilabel/corpus/dev.spacy")