In [60]:
import pandas as pd
import csv
import re

### Import data

In [61]:
# import the event titles from the text file into a single 'event_names' column
df = pd.read_csv("../data/event_titles.txt", sep='\t', dtype=str, names=['event_names'])

# import the artist names (one stripped line per entry); the context manager
# closes the file handle instead of leaving it open for the kernel's lifetime
with open('../data/artists.txt', encoding='UTF-8') as artists_file:
    artists = set(line.strip() for line in artists_file)

# import the labels that were hand generated
labels = pd.read_csv("../data/binary_labels.txt", sep='\t', dtype=str, names=['labels'])

### Filter the data

The raw event titles are not in a form suitable for training. We'll apply a filter to clean up the strings.

In [62]:
# preview the first rows of the raw (uncleaned) event titles
df.head()

Unnamed: 0,event_names
0,Jamey Johnson
1,Alex the Astronaut & Stella Donnelly - Adelaid...
2,Bad Bunny - La Nueva Religion Tour
3,Julien Baker at The Burl
4,SWING pres. Sam Paganini & Zøe


In [63]:
# since I haven't manually labeled all of the data I can't use all the data for training
# NOTE: despite the name this is len(labels) - 1, i.e. the position of the last labeled
# row rather than a count; it is used below as the end of a `.loc` slice, which
# includes its end label
num_of_labels = len(labels) - 1

def text_clean(x):
    """Normalize an event title so it can be split into tokens on spaces.

    The title is lowercased, selected punctuation is separated from the
    adjoining text with spaces, and whitespace is collapsed and trimmed.

    Args:
        x: Raw event title string.

    Returns:
        The cleaned title string.
    """
    # lowercase the text
    x = x.lower()
    # add a space before punctuation that directly follows a word character
    # and is not itself followed by one (i.e. it ends the word).
    # The replacement is a raw string: '\g' is not a valid Python string
    # escape, so a non-raw literal triggers an invalid-escape warning.
    x = re.sub(r'\b[!,.~:]\B', r' \g<0>', x)
    # put spaces around brackets, parentheses, slashes, angle brackets,
    # tildes and hyphens
    x = re.sub(r'[\[\]~()/<>-]', r' \g<0> ', x)
    # collapse every run of whitespace into a single space
    x = re.sub(r'\s+', ' ', x)
    # trim whitespace from both ends of the string
    return x.strip()


# Clean every event name once and reuse the result for both outputs below
event_names_clean = df['event_names'].apply(text_clean)

# Keep only the rows up to the last labeled one (`.loc` slicing includes the end label)
text = event_names_clean.loc[0:num_of_labels]

# Save all of the cleaned event names to a new file
event_names_clean.to_csv('../data/event_titles_clean.csv', sep='\t', quoting=csv.QUOTE_NONE, index=False)

### Combine to create the training data

In [64]:
# Put each cleaned event name next to its hand-made label string (joined on the row index)
labeled_names = text.to_frame()
train_data = labeled_names.join(labels)
train_data.head()

Unnamed: 0,event_names,labels
0,jamey johnson,B-per I-per
1,alex the astronaut & stella donnelly - adelaid...,B-per I-per I-per O B-per I-per O O O O
2,bad bunny - la nueva religion tour,B-per I-per O O O O O
3,julien baker at the burl,B-per I-per O O O
4,swing pres . sam paganini & zøe,O O O B-per I-per O B-per


### Sanity check on the labeled data

Since the labels are hand produced we'll run some sanity checks to verify the labels are correct.

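Note: the labels were generated from the cleaned event names, one label per space-separated token. Changing the cleaning changes the tokens, so the existing labels would no longer line up with the text and would have to be regenerated.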

In [65]:
# Rows whose label string and event name contain a different number of spaces
# are mislabeled; any row displayed here needs to be identified and fixed.
space_count_differs = train_data.apply(
    lambda row: row.event_names.count(' ') != row.labels.count(' '),
    axis=1,
)
train_data[space_count_differs]

Unnamed: 0,event_names,labels


In [66]:
# Verify the labels look correct

def verify_labels(row):
    """Returns a list of artists based on the labels.

    The label string and the event name are both split on whitespace and
    walked in parallel (stopping at the shorter of the two). Tokens tagged
    'B-per' start a name, tokens tagged 'I-per' extend the current name and
    'O' ends it. Any other tag is ignored.

    Args:
        row: Object exposing `labels` and `event_names` string attributes,
            e.g. a row of the training DataFrame.

    Returns:
        List of artist names (tokens joined by single spaces) in the order
        they appear in the event name.
    """
    # split the strings into lists of tags and words
    tags = row.labels.split()
    tokens = row.event_names.split()

    # completed artist names, and the tokens of the name currently being built
    found = []
    name = []

    for tag, token in zip(tags, tokens):
        if tag == 'B-per':
            # a new name begins: flush the one in progress first, otherwise
            # back-to-back artists (B-per right after B-per/I-per) lose the
            # earlier name
            if name:
                found.append(' '.join(name))
            name = [token]
        elif tag == 'I-per':
            name.append(token)
        elif tag == 'O':
            # a non-person token ends the name in progress, if any
            if name:
                found.append(' '.join(name))
                name = []
    # the event name may end while a name is still being built
    if name:
        found.append(' '.join(name))

    return found
# Extract the artist names implied by each row's labels and show them next to the data
extracted_artists = train_data.apply(verify_labels, axis=1)
train_data_temp = pd.DataFrame(extracted_artists, columns=['artists'])
train_data.join(train_data_temp)

Unnamed: 0,event_names,labels,artists
0,jamey johnson,B-per I-per,[jamey johnson]
1,alex the astronaut & stella donnelly - adelaid...,B-per I-per I-per O B-per I-per O O O O,"[alex the astronaut, stella donnelly]"
2,bad bunny - la nueva religion tour,B-per I-per O O O O O,[bad bunny]
3,julien baker at the burl,B-per I-per O O O,[julien baker]
4,swing pres . sam paganini & zøe,O O O B-per I-per O B-per,"[sam paganini, zøe]"
...,...,...,...
495,monster truck winter nationals,B-per I-per O O,[monster truck]
496,"satyricon w / inquisition , mictlantecuhtl & h...",B-per O O B-per O B-per O B-per O O O,"[satyricon, inquisition, mictlantecuhtl, highl..."
497,jason isbell & the 400 unit w / james mcmurty ...,B-per I-per I-per I-per I-per I-per O O B-per ...,"[jason isbell & the 400 unit, james mcmurty]"
498,funtcase at 45 east,B-per O O O,[funtcase]


In [67]:
# Collect every distinct tag used in the hand-made labels to spot unwanted ones.
# Only these tags are expected:
# {'B-per', 'I-per', 'O'}

unique_labels = {
    tag
    for label_string in train_data.labels.to_list()
    for tag in label_string.split()
}

unique_labels

{'B-per', 'I-per', 'O'}

### Save the data

In [68]:
# write the labeled training data as a tab-separated file, without quoting or the index column
train_data.to_csv(
    '../data/train_data.csv',
    sep='\t',
    quoting=csv.QUOTE_NONE,
    index=False,
)