In [None]:
# Root directory that every dataset, intermediate file and saved model in this notebook is read from / written to.
path = '/data'

Preparing the data

In [None]:
import pandas as pd

In [None]:
def data_classification(dataset, threshold):
    """Turn the numeric score column into a binary ``task_a`` label.

    The score is read from the third column (position 2) of ``dataset``.
    Rows whose score is strictly greater than ``threshold`` are labelled
    ``'OFF'``; every other row (including rows with a NaN score) is
    labelled ``'NOT'``.

    Note: ``dataset`` is modified in place -- the ``task_a`` column is
    inserted at position 2, the ``average`` and ``std`` columns are dropped
    and ``id`` becomes the index.

    :param dataset: DataFrame containing the columns ``id``, ``average`` and
        ``std``, with the score located at column position 2
    :param threshold: numeric cut-off; scores above it are labelled ``'OFF'``
    :return: the same (mutated) DataFrame
    """
    # Select the score column by position explicitly: plain ``row[2]`` on a
    # label-indexed row relies on a deprecated positional fallback in pandas.
    scores = dataset.iloc[:, 2].astype(float)
    # Vectorised comparison instead of a Python-level loop over the rows.
    # NaN compares False and therefore maps to 'NOT', just like the scalar
    # ``float(x) > threshold`` test does.
    labels = scores.gt(threshold).map({True: 'OFF', False: 'NOT'})

    # Hand over a plain array so that no index alignment happens on insert.
    dataset.insert(2, column='task_a', value=labels.to_numpy())
    dataset.drop(columns=['average', 'std'], inplace=True)
    dataset.set_index('id', inplace=True)

    return dataset

In [None]:
# Label the raw task-A file with a 0.5 cut-off and store the result next to it.
distant_file = r'{0}/task_a_distant.tsv'.format(path)
labelled_file = r'{0}/task_a_0.5.tsv'.format(path)

df = data_classification(pd.read_csv(distant_file, sep='\t', header=0), 0.5)
df.to_csv(labelled_file, sep='\t')

In [None]:
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Balance the classes by randomly under-sampling, then save the balanced set.
df = pd.read_csv(r'{0}/task_a_0.5.tsv'.format(path), sep='\t', header=0)
print(df.head())

# Class distribution before resampling.
print(df['task_a'].value_counts())

# Fixed seed so the selected subset is reproducible.
rus = RandomUnderSampler(random_state=0)

# The sampler expects a 2-D feature matrix, hence the reshape of the single
# text column; the target is passed as a 1-D array.
X_resampled, y_resampled = rus.fit_resample(df['text'].values.reshape(-1, 1),
                                            df['task_a'].values)

df = pd.DataFrame(data=X_resampled, columns=['text'])
df['task_a'] = y_resampled
print(df.head())
# Class distribution after resampling.
print(df['task_a'].value_counts())

df.to_csv(r'{0}/task_a_0.5_resampled.tsv'.format(path), sep='\t', index=False)

Text Classification using spaCy TextCategorizer

In [None]:
import pandas as pd
import spacy

In [None]:
def df2list(text_df, label_df):
    """Pair each text with its label row as a ``(text, annotation)`` tuple.

    :param text_df: Series of texts
    :param label_df: DataFrame with one column per category; row ``i``
        belongs to the ``i``-th text
    :return: list of ``(text, {'cats': {category: value, ...}})`` tuples,
        one per entry of ``text_df``
    """
    pairs = []
    for pos in range(len(text_df)):
        text = text_df.iloc[pos]
        cats = label_df.iloc[pos].to_dict()
        pairs.append((text, {'cats': cats}))
    return pairs

In [None]:
print('reading dataset')
train = pd.read_csv(r'{0}/task_a_0.5.tsv'.format(path), sep='\t', header=0)
validation = pd.read_csv(r'{0}/testset_2020.tsv'.format(path), sep='\t', header=0)

print('splitting train test data')
X_train = train['text'].copy()
X_val = validation['text'].copy()

# One-hot encode the labels: one indicator column per class value.
y_train = pd.get_dummies(train['task_a'])
# The validation targets must come from the validation frame itself;
# `y_test` is not defined yet when this cell runs on a fresh kernel.
y_val = pd.get_dummies(validation['task_a'])

In [None]:
# Test tweets (tab-separated, with a header row).
testset = pd.read_csv(r'{0}/test_a_tweets.tsv'.format(path), sep='\t', header=0)

# Matching labels; this CSV has no header row, so the columns are named here.
test_labels = pd.read_csv(r'{0}/test_a_baseline.csv'.format(path), header=None)
test_labels.columns = ['id', 'task_a']

X_test = testset['tweet']
# One-hot encode the labels: one indicator column per class value.
y_test = pd.get_dummies(test_labels['task_a'])

In [None]:
# Convert each split into a list of (text, {'cats': {...}}) tuples.
train_ls = df2list(X_train, y_train)
val_ls = df2list(X_val, y_val)
test_ls = df2list(X_test, y_test)

In [None]:
# Distinct label values found in the training data.
col_vals = list(train['task_a'].unique())

nlp = spacy.load('en_core_web_lg')

# Reuse the pipeline's text categorizer when it has one; otherwise create a
# new component and append it at the end of the pipeline.
if 'textcat' in nlp.pipe_names:
    textcat = nlp.get_pipe('textcat')
else:
    textcat = nlp.create_pipe('textcat')
    nlp.add_pipe(textcat, last=True)

# Register every label value with the text classifier.
for col_val in col_vals:
    textcat.add_label(col_val)

In [None]:
from spacy.util import minibatch, compounding
from sklearn.metrics import f1_score, accuracy_score, classification_report
import random

In [None]:
# Split the (text, annotation) pairs into parallel tuples of texts and labels.
val_text, val_label = zip(*val_ls)
test_text, test_label = zip(*test_ls)

In [None]:
# Directory the trained spaCy pipeline is saved to and later loaded from.
output_dir = '{0}/spacy_models/'.format(path)

In [None]:
n_iter = 20       # maximum number of passes over the training data
print_every = 1   # print the metrics every `print_every` epochs
not_improve = 5   # stop after this many consecutive epochs without a better F1

# The reference labels of the validation set do not change between epochs,
# so derive them once: the label is the category with the highest value.
true_labels = [pd.Series(annotation['cats']).idxmax() for annotation in val_label]

# Train model: every pipe except the text categorizer is disabled meanwhile.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()

    score_f1_best = 0
    early_stop = 0

    for i in range(n_iter):
        losses = {}

        # One epoch: shuffled mini-batches whose size grows from 4 to 32.
        random.shuffle(train_ls)
        batches = minibatch(train_ls, size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                       losses=losses)

        # Score the epoch on the validation set with the averaged parameters.
        with textcat.model.use_params(optimizer.averages):
            docs = [nlp.tokenizer(text) for text in val_text]
            # Predicted label = category with the highest score.
            pdt_labels = [pd.Series(doc.cats).idxmax() for doc in textcat.pipe(docs)]

            score_f1 = f1_score(true_labels, pdt_labels, average='macro')
            score_ac = accuracy_score(true_labels, pdt_labels)

            if i % print_every == 0:
                print('textcat loss: {:.4f}\tf1-score: {:.3f}\taccuracy: {:.3f}'.format(
                    losses['textcat'], score_f1, score_ac))

            if score_f1 > score_f1_best:
                # New best model: reset the patience counter and checkpoint it.
                early_stop = 0
                score_f1_best = score_f1
                with nlp.use_params(optimizer.averages):
                    nlp.to_disk(output_dir)  # save the model
            else:
                early_stop += 1

            if early_stop >= not_improve:
                print('Finished training...')
                break
    else:
        # Reached only when all `n_iter` epochs ran without an early stop.
        print('Finished training...')

In [None]:
# Load the pipeline that was saved to `output_dir`, replacing the in-memory `nlp` object.
nlp = spacy.load(output_dir)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.utils import shuffle
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
import math
import seaborn as sns

# Evaluate the model.
def evaluate(nlp, texts, labels, label_names):
    """Score the text categorizer on labelled texts and save its predictions.

    Prints the macro F1 score, the accuracy and a classification report,
    then writes the predicted labels to ``labels_testset_last.csv`` in the
    directory named by the module-level ``path`` variable.

    :param nlp: spacy nlp object that contains a 'textcat' pipe
    :param texts: list of sentences
    :param labels: sequence of ``{'cats': {label: value}}`` dictionaries,
        one per sentence and in the same order as ``texts``
    :param label_names: list of label names shown in the classification report
    """
    tokenized = [nlp.tokenizer(sentence) for sentence in texts]
    categorizer = nlp.get_pipe('textcat')

    # Predicted label: the category with the highest score on each document.
    predicted = [pd.Series(doc.cats).idxmax() for doc in categorizer.pipe(tokenized)]
    # Reference label: the category with the highest value in each annotation.
    expected = [pd.Series(labels[k]['cats']).idxmax() for k in range(len(predicted))]

    macro_f1 = f1_score(expected, predicted, average='macro')
    accuracy = accuracy_score(expected, predicted)
    print('f1 score: {:.3f}\taccuracy: {:.3f}'.format(
        macro_f1, accuracy))

    print('\nclassification report...')
    print(classification_report(expected, predicted, target_names=label_names))

    # Only the predictions are stored: the row number is written as the
    # first column and no header row is emitted.
    results = pd.DataFrame({'LABEL': predicted})
    results.to_csv(r'{0}/labels_testset_last.csv'.format(path), header=None)
    print('saved labels to csv file')

In [None]:
# Score the reloaded model on the test texts; the predictions are also written to a CSV file by `evaluate`.
evaluate(nlp, test_text, test_label, label_names=['NOT', 'OFF'])