In [1]:
%matplotlib inline
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from collections import namedtuple, defaultdict

In [2]:
sns.set_context("poster")
sns.set_style("ticks")

In [3]:
DATA_DIR="data/data/"
CLEANED_DIR="data/cleaned/"

Tag = namedtuple("Tag", ["token", "tag"])

def load_sequences(filename, sep="\t", notypes=False):
    tag_count = defaultdict(int)
    sequences = []
    with open(filename) as fp:
        seq = []
        for line in fp:
            line = line.strip()
            if line:
                line = line.split(sep)
                if notypes:
                    line[1] = line[1][0]
                tag_count[line[1]] += 1
                #print line
                seq.append(Tag(*line))
            else:
                sequences.append(seq)
                seq = []
        if seq:
            sequences.append(seq)
    return sequences, tag_count

def write_sequences(sequences, filename, sep="\t", to_bieou=True):
    with open(filename, "wb+") as fp:
        for seq in sequences:
            if to_bieou:
                seq = to_BIEOU(seq)
            for tag in seq:
                print >> fp, sep.join(tag)
            print >> fp, ""                
                
def count_phrases(ptype="movie"):
    phrase_counts = defaultdict(int)
    check_tag = ptype
    for seq in sequences:
        phrase = ""
        for tag in seq:
            if not phrase and tag.tag == "B-%s" % check_tag:
                phrase = tag.token
                continue
            if tag.tag == "I-%s" % check_tag:
                phrase += " %s" % tag.token
                continue
            if phrase:
                phrase_counts[phrase] += 1
                phrase = ""
    return phrase_counts


def phrase_to_BIEOU(phrase):
    l = len(phrase)
    new_phrase = []
    for j, t in enumerate(phrase):
        new_tag = t.tag
        if l == 1:
            new_tag = "U%s" % t.tag[1:]
        elif j == l-1:
            new_tag = "E%s" % t.tag[1:]
        new_phrase.append(Tag(t.token, new_tag))
    return new_phrase

def to_BIEOU(seq, verbose=False):
    # TAGS B I E U O
    phrase = []
    new_seq = []
    for i, tag in enumerate(seq):
        if not phrase and tag.tag[0] == "B":
            phrase.append(tag)
            continue
        if tag.tag[0] == "I":
            phrase.append(tag)
            continue
        if phrase:
            if verbose:
                print "Editing phrase", phrase
            new_phrase = phrase_to_BIEOU(phrase)
            new_seq.extend(new_phrase)
            phrase = []
        new_seq.append(tag)
    if phrase:
        if verbose:
            print "Editing phrase", phrase
        new_phrase = phrase_to_BIEOU(phrase)
        new_seq.extend(new_phrase)
        phrase = []
    return new_seq

# Clean data

In [4]:
sequences, tag_count = load_sequences("data/data/train", sep="\t")
write_sequences(sequences, "data/cleaned/train.tsv", to_bieou=False)

In [5]:
sequences, tag_count = load_sequences("data/data/dev", sep="\t")
write_sequences(sequences, "data/cleaned/dev.tsv", to_bieou=False)

In [6]:
sequences, tag_count = load_sequences("data/data/dev_2015", sep="\t")
write_sequences(sequences, "data/cleaned/dev_2015.tsv", to_bieou=False)

In [7]:
sequences, tag_count = load_sequences("data/data/test.txt", sep="\t")
write_sequences(sequences, "data/cleaned/test.tsv", to_bieou=False)

In [8]:
sequences, tag_count = load_sequences("data/cleaned/train.tsv", sep="\t")

In [9]:
len(sequences)

2394

In [10]:
sum(len(seq) for seq in sequences)

46469

In [11]:
sequences[-1]

[Tag(token='good', tag='O'),
 Tag(token='friday', tag='O'),
 Tag(token='whatchu', tag='O'),
 Tag(token='got', tag='O'),
 Tag(token='for', tag='O'),
 Tag(token='me', tag='O'),
 Tag(token='@kanyewest', tag='O')]

In [12]:
tag_count

defaultdict(int,
            {'B-company': 171,
             'B-facility': 104,
             'B-geo-loc': 276,
             'B-movie': 34,
             'B-musicartist': 55,
             'B-other': 225,
             'B-person': 449,
             'B-product': 97,
             'B-sportsteam': 51,
             'B-tvshow': 34,
             'I-company': 36,
             'I-facility': 105,
             'I-geo-loc': 49,
             'I-movie': 46,
             'I-musicartist': 61,
             'I-other': 320,
             'I-person': 215,
             'I-product': 80,
             'I-sportsteam': 23,
             'I-tvshow': 31,
             'O': 44007})

In [13]:
phrase_counts = count_phrases(ptype="movie")
phrase_counts

defaultdict(int,
            {'Agora': 1,
             'Alpha &amp; Omega': 1,
             'Breaking Dawn': 1,
             'Dazed &amp; Confused': 1,
             'Devil': 1,
             'Easy A': 1,
             'Gigli': 2,
             'Half baked': 1,
             'Iron Man 2': 1,
             'JENNIFERS BODY': 1,
             'Kick-Ass': 1,
             'Knight and Day': 1,
             'Les Miserables': 1,
             'Mar Adentro': 1,
             'Paranormal Activity 2': 1,
             'Piranha 3D': 1,
             'Princess Lover OVA 1': 1,
             'Sex In The City': 1,
             'The Ride': 1,
             'The Room': 1,
             'The Sea Inside': 1,
             'The Town': 3,
             'Toy story 3': 1,
             "Winter 's Bone": 2,
             'camp rock 2': 1,
             'grandma boys': 1,
             'how high': 1,
             'nightmare before christmas': 1,
             'puff puff pass': 1,
             'rocky horror show': 1})

In [14]:
phrase_counts = count_phrases(ptype="facility")
phrase_counts

defaultdict(int,
            {' Mason': 1,
             '@Cromwell Field': 1,
             'ASU Step Gallery': 1,
             'BAR': 1,
             'Band hall': 1,
             'Botanic Gdns': 1,
             'Bowl Long Island': 1,
             'Burning Bush Grille': 1,
             'CAFE NINE': 1,
             'CALABASH LOUNGE': 1,
             'CANWEST Center': 1,
             'CLUB BLU': 1,
             'Casitas': 1,
             'Cathedral of Learning G24': 1,
             'Champlain campus': 1,
             'Costa Lounge': 2,
             "Dick 's Carpet": 1,
             'Dicks': 1,
             'Dillion High School': 1,
             'Dim Mak': 1,
             'Disney World': 1,
             'Disney world': 1,
             'ESB': 2,
             'Elements': 1,
             'Elliot Miner': 1,
             'Empire State Building': 2,
             'Fight Club': 1,
             'First Baptist': 1,
             'Forgotten Door': 1,
             'Futoosh': 1,
             'Gardens': 

In [15]:
phrase_counts = count_phrases(ptype="company")
phrase_counts

defaultdict(int,
            {'#Vh1': 1,
             'AEG': 1,
             'AMerican': 1,
             'AT&amp;T': 1,
             'Accel': 2,
             'Amazon U.K.': 1,
             'Ball Metal Container': 1,
             'Blackberry': 1,
             'Brando': 1,
             'Business Alliance': 1,
             'CORT': 1,
             'Camelbak': 1,
             'Chevron': 1,
             'Cisco': 1,
             'Costco': 1,
             'Crocs': 1,
             'Cyber-Ark': 1,
             'Daily Mail': 1,
             'Deep Sea Intervention': 2,
             'Delphi': 1,
             'Disney': 1,
             'Electric Lady Studios': 1,
             'Engadget': 1,
             'EuroVPS': 1,
             'Evergreen Subaru': 1,
             'FACEBOOK': 1,
             'FB': 1,
             'Facebook': 6,
             'FedEx': 1,
             'Forex': 1,
             'Gabriel Resources': 1,
             'Game Informer': 1,
             'Guinness': 1,
             'High House F

In [16]:
phrase_counts = count_phrases(ptype="musicartist")
phrase_counts

defaultdict(int,
            {'30stm': 2,
             'BORDER LINEA': 1,
             'Baka Boyz': 1,
             'Big Time Rush': 1,
             'Blonde Redhead': 1,
             'Breaking &amp; Entering': 1,
             "Cap'n Jazz": 1,
             'CocoFunka': 2,
             'Cowboy Mouth': 1,
             'Crooked': 1,
             'DJ Chris L': 1,
             'DJ Elements': 1,
             'DJ STRATEGY': 1,
             'DOES IT OFFEND YOU , YEAH ?': 1,
             'DT': 1,
             'Delirious': 1,
             'Fall Out Boy': 1,
             'Ghostland Observatory': 1,
             'Green Day': 1,
             'INDIGENOUS': 1,
             'Jonas Bros': 1,
             'KC and The Sunshine Band': 1,
             'KISS': 1,
             'Lucid Dementia': 1,
             'Maroon 5': 1,
             'Maunalua': 1,
             'Metallica': 1,
             'Mystery Jets': 1,
             'N.E.R.D.': 1,
             'Natives': 1,
             'Primus': 1,
             'SHA

In [17]:
phrase_counts = count_phrases(ptype="other")
phrase_counts

defaultdict(int,
            {'#Vh1': 1,
             '11 Bantam Draft': 1,
             '1st Presbyterian': 1,
             '30+ Beautiful Ladies': 1,
             '6th Biannual 24 Hour Prayer Focus': 1,
             'A Different Kind Of Ache': 1,
             'A&amp;E': 2,
             'ACF Friday Large Group': 1,
             'ADHD': 1,
             'AHFA': 1,
             'AP': 1,
             'APO': 1,
             'Above&amp;Beyond': 1,
             'Alejandro': 1,
             'American Professional Football Association': 1,
             'Army Run': 1,
             'AstronomersWithoutBorders': 1,
             'Atlas Shrugs': 1,
             'Awakening Festival': 1,
             'BBL': 1,
             'BCAT': 1,
             "Badass Babes Savin ' The Day": 1,
             'Battle of Britain': 1,
             'Blakehurst': 1,
             'Body Heat': 1,
             'CCU': 1,
             'COLLEGE REPUBLICAN TAILGATE': 1,
             'COP': 1,
             'CampaignMonitor': 1,


In [18]:
to_BIEOU(sequences[1])

[Tag(token='Made', tag='O'),
 Tag(token='it', tag='O'),
 Tag(token='back', tag='O'),
 Tag(token='home', tag='O'),
 Tag(token='to', tag='O'),
 Tag(token='GA', tag='U-geo-loc'),
 Tag(token='.', tag='O'),
 Tag(token='It', tag='O'),
 Tag(token='sucks', tag='O'),
 Tag(token='not', tag='O'),
 Tag(token='to', tag='O'),
 Tag(token='be', tag='O'),
 Tag(token='at', tag='O'),
 Tag(token='Disney', tag='B-facility'),
 Tag(token='world', tag='E-facility'),
 Tag(token=',', tag='O'),
 Tag(token='but', tag='O'),
 Tag(token='its', tag='O'),
 Tag(token='good', tag='O'),
 Tag(token='to', tag='O'),
 Tag(token='be', tag='O'),
 Tag(token='home', tag='O'),
 Tag(token='.', tag='O'),
 Tag(token='Time', tag='O'),
 Tag(token='to', tag='O'),
 Tag(token='start', tag='O'),
 Tag(token='planning', tag='O'),
 Tag(token='the', tag='O'),
 Tag(token='next', tag='O'),
 Tag(token='Disney', tag='B-facility'),
 Tag(token='World', tag='E-facility'),
 Tag(token='trip', tag='O'),
 Tag(token='.', tag='O')]

In [19]:
write_sequences(sequences, "data/cleaned/train.BIEOU.tsv", to_bieou=True)

## Validation set

In [20]:
sequences, tag_count = load_sequences("data/cleaned/dev.tsv", sep="\t")
write_sequences(sequences, "data/cleaned/dev.BIEOU.tsv")

In [21]:
sequences, tag_count = load_sequences("data/cleaned/dev.tsv", sep="\t")

In [22]:
for seq in sequences:
    if seq[0].token == "Happy":
        print seq
        break

[Tag(token='Happy', tag='O'), Tag(token='Good', tag='B-other'), Tag(token='Friday', tag='I-other')]


In [23]:
to_BIEOU(seq)

[Tag(token='Happy', tag='O'),
 Tag(token='Good', tag='B-other'),
 Tag(token='Friday', tag='E-other')]

## Validation 2015

In [24]:
sequences, tag_count = load_sequences("data/cleaned/dev_2015.tsv", sep="\t")
write_sequences(sequences, "data/cleaned/dev_2015.BIEOU.tsv")

## Test data

In [25]:
sequences, tag_count = load_sequences("data/cleaned/test.tsv", sep="\t")
write_sequences(sequences, "data/cleaned/test.BIEOU.tsv")

# No Types

## Training

In [26]:
sequences, tag_count = load_sequences("data/cleaned/train.tsv", sep="\t", notypes=True)
write_sequences(sequences, "data/cleaned/train_notypes.tsv", to_bieou=False)

sequences, tag_count = load_sequences("data/cleaned/train.BIEOU.tsv", sep="\t", notypes=True)
write_sequences(sequences, "data/cleaned/train_notypes.BIEOU.tsv")

## Validation

In [27]:
sequences, tag_count = load_sequences("data/cleaned/dev.tsv", sep="\t", notypes=True)
write_sequences(sequences, "data/cleaned/dev_notypes.tsv", to_bieou=False)

sequences, tag_count = load_sequences("data/cleaned/dev.BIEOU.tsv", sep="\t", notypes=True)
write_sequences(sequences, "data/cleaned/dev_notypes.BIEOU.tsv")

## Validation 2015

In [28]:
sequences, tag_count = load_sequences("data/cleaned/dev_2015.tsv", sep="\t", notypes=True)
write_sequences(sequences, "data/cleaned/dev_2015_notypes.tsv", to_bieou=False)

sequences, tag_count = load_sequences("data/cleaned/dev_2015.BIEOU.tsv", sep="\t", notypes=True)
write_sequences(sequences, "data/cleaned/dev_2015_notypes.BIEOU.tsv")

## Test data

In [29]:
sequences, tag_count = load_sequences("data/cleaned/test.tsv", sep="\t", notypes=True)
write_sequences(sequences, "data/cleaned/test_notypes.tsv", to_bieou=False)

sequences, tag_count = load_sequences("data/cleaned/test.BIEOU.tsv", sep="\t", notypes=True)
write_sequences(sequences, "data/cleaned/test_notypes.BIEOU.tsv")