In [0]:
from pandas.io.json import json_normalize
import psycopg2
import pandas as pd
import numpy as np
import datetime
from pandas.io.json import json_normalize
import gensim, logging
from sklearn.cluster import DBSCAN
from sklearn import preprocessing, metrics
from collections import defaultdict

### Step 0: Read RSS to dataframe, transform dates, create 24hr buckets and 4-day buckets

In [1]:
from pandas.io.json import json_normalize
import psycopg2
import pandas as pd
import datetime

### Step 0: read RSS to dataframe, transform dates, create 24hr buckets and 4-day buckets
def read_rss():
    '''
    Run "SELECT * FROM rss_entities_with_pub" against the "rssfeed" Postgres
    database and return all fetched rows.

    The cursor and the connection are always closed before returning,
    including when connecting succeeds but the query raises.
    '''
    database = "rssfeed"
    hostname = "rssfeed.cjgj2uy1bapa.us-east-1.rds.amazonaws.com"
    port = "5432"
    userid = "postgres"
    passwrd = ""
    conn_string = "host="+hostname+" port="+port+" dbname="+database+" user="+userid+" password="+passwrd
    conn = psycopg2.connect(conn_string)
    try:
        conn.autocommit = True
        cursor = conn.cursor()
        try:
            cursor.execute("SELECT * FROM rss_entities_with_pub")
            return cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # psycopg2's `with conn:` only ends the transaction, so close explicitly
        conn.close()

def read_publishers():
    '''
    Read data/rss_feed.json as a pandas Series and return it as an inverted
    dictionary (each value of the JSON object becomes a key, each key a value).
    '''
    # presumably the file maps publisher name -> publisher code; verify against the data
    forward = pd.read_json('data/rss_feed.json', typ='series').to_dict()
    return dict((value, key) for key, value in forward.items())

def make_rssdf():
    '''
    Build the RSS dataframe from the database rows.

    Columns are 'publisher_code', 'entity' and 'publish_time', plus a
    'publisher' column obtained by mapping 'publisher_code' through the
    dictionary returned by read_publishers().
    An empty query result yields an empty dataframe with the same columns
    instead of raising on the column assignment.
    '''
    rows = read_rss()
    pub = read_publishers()
    # naming the columns in the constructor also works when `rows` is empty
    rssdf = pd.DataFrame(rows, columns=['publisher_code', 'entity', 'publish_time'])
    rssdf["publisher"] = rssdf["publisher_code"].map(pub)
    return rssdf

def add_24hr_bucket():
    '''
    Build the RSS dataframe and add a '24hour_bucket' column holding
    'publish_time' parsed with the format '%Y-%m-%dT%H:%M:%S+00:00'.

    Values that cannot be parsed become NaT (errors='coerce').
    '''
    rssdf = make_rssdf()
    # infer_datetime_format is omitted: it is only consulted when no explicit
    # format is given, and newer pandas releases deprecate it.
    rssdf['24hour_bucket'] = pd.to_datetime(
        rssdf['publish_time'], errors='coerce',
        format='%Y-%m-%dT%H:%M:%S+00:00', cache=True)
    return rssdf

def group_entities_by_bucket():
    '''
    Group entities by calendar day.

    Returns a dataframe with one row per day ('dates', as pandas datetimes,
    sorted ascending) and an 'entity' column holding that day's entities
    joined with ", ". Only days strictly before 2020-03-19 are kept.
    '''
    rssdf = add_24hr_bucket()
    rssdf['dates'] = rssdf['24hour_bucket'].dt.date
    newdf = (rssdf.groupby(['dates'])['entity']
                  .apply(", ".join)
                  .reset_index()
                  .sort_values(by=['dates']))
    newdf['dates'] = pd.to_datetime(newdf['dates'])
    # compare against a Timestamp: recent pandas refuses to compare a
    # datetime64 column with a plain datetime.date
    cutoff = pd.Timestamp(datetime.date(2020, 3, 19))
    newdf = newdf[newdf['dates'] < cutoff]
    return newdf

def CountFrequency(my_list):
    '''
    Count how many times each item occurs in my_list.
    Returns a plain dictionary (key = item, val = number of occurrences),
    with keys in order of first appearance.
    '''
    tally = {}
    for entry in my_list:
        tally[entry] = tally.get(entry, 0) + 1
    return tally

def aggregate_entities_4_days():
    '''
    Keep the daily rows dated 2007-01-01 or later and, for each row i,
    combine the entities of rows i..i+3 (sliding window over 4 consecutive
    rows) into a dictionary (key = entity, val = frequency).

    The dictionaries are stored in the column "entities_4_days"; rows without
    a dictionary hold NaN. Returns (dataframe, list of dictionaries).
    '''
    mydf = group_entities_by_bucket()
    mydf_2007_plus = mydf[(mydf['dates'] >= pd.Timestamp(datetime.date(2007,1,1)))].reset_index()
    # read the column by name rather than by position
    daily_entities = mydf_2007_plus['entity'].tolist()
    lst = []
    # NOTE(review): as before, the loop stops at len - 4, so the last 4 rows get no window
    for start in range(len(daily_entities) - 4):
        combined = ", ".join(daily_entities[start:start + 4])
        lst.append(CountFrequency(combined.split(", ")))

    # pad up to the frame length; this is 4 NaNs unless there are fewer than 4 rows
    padding = [np.nan] * (len(mydf_2007_plus) - len(lst))
    mydf_2007_plus['entities_4_days'] = lst + padding
    return mydf_2007_plus, lst

def refine_entity_list():
    '''
    Create a separate column "refined_entities_4_days" that keeps, for each
    4-day dictionary, only the entities with frequency > 1.

    Rows without a dictionary hold NaN. The helper 'index' column is dropped.
    Returns (dataframe, list of refined dictionaries).
    '''
    df_window, lst = aggregate_entities_4_days()
    lst2 = [{key: count for key, count in d.items() if count > 1} for d in lst]
    # pad up to the frame length instead of assuming exactly 4 trailing rows
    padding = [np.nan] * (len(df_window) - len(lst2))
    df_window['refined_entities_4_days'] = lst2 + padding
    del df_window['index']
    return df_window, lst2

# Run the Step 0 pipeline and save the resulting dataframe to data/df_window.csv.
df_window, lst2 = refine_entity_list()
df_window.to_csv('data/df_window.csv', index = False)
# Show the first rows of the result.
df_window.head()

ModuleNotFoundError: No module named 'psycopg2'

In [0]:
### Subset the first 50 rows for quick demonstration
df_sub = df_window.iloc[0:50,:]
df_sub

Unnamed: 0,dates,entity,entities_4_days,refined_entities_4_days
0,2007-01-01,"Episode, iTunes, New Year, Radio Detective Sto...","{'Episode': 1, 'iTunes': 1, 'New Year': 2, 'Ra...","{'New Year': 2, 'German': 2, 'New Year Resolut..."
1,2007-01-02,"Prager, Who Really Cares: The suprising, Denni...","{'Prager': 1, 'Who Really Cares: The suprising...","{'Italian': 2, 'IndieFeed Alternative Modern R..."
2,2007-01-03,"Idiotchild - Mouth Watering, IndieFeed Alterna...","{'Idiotchild - Mouth Watering': 1, 'IndieFeed ...","{'IndieFeed Alternative Modern Rock': 2, 'Indi..."
3,2007-01-04,"Speeding Bullets, DC, Raging Bullets of, Jim, ...","{'Speeding Bullets': 1, 'DC': 1, 'Raging Bulle...","{'Sarah Dinan': 2, 'Coyote Run': 2, 'Marc Gunn..."
4,2007-01-05,"Irish & Celtic Music Podcast, Celtic, Sarah Di...","{'Irish & Celtic Music Podcast': 1, 'Celtic': ...","{'Sarah Dinan': 2, 'Coyote Run': 2, 'Marc Gunn..."
5,2007-01-06,"Q & A, Pixie, Ann Druyan, Carl Sagan, Ann Druy...","{'Q & A': 1, 'Pixie': 1, 'Ann Druyan': 2, 'Car...","{'Ann Druyan': 2, 'Carl Sagan': 4, 'Italian': ..."
6,2007-01-07,"Italian, Zencast 86 - Right Livelihood, Gil Fr...","{'Italian': 6, 'Zencast 86 - Right Livelihood'...","{'Italian': 6, 'Gil Fronsdal': 2, 'Matt White ..."
7,2007-01-08,"Photoshop CS3 Way, Photoshop CS3 Public Beta, ...","{'Photoshop CS3 Way': 1, 'Photoshop CS3 Public...","{'North': 2, 'IndieFeed Alternative Modern Roc..."
8,2007-01-09,"Italian, Italian, Apple, Steve Jobs, Macworld ...","{'Italian': 8, 'Apple': 1, 'Steve Jobs': 1, 'M...","{'Italian': 8, 'Ben': 3, 'Spanish': 3, 'Guava ..."
9,2007-01-10,"English, Unnateral Helpers - Gettin', IndieFee...","{'English': 1, 'Unnateral Helpers - Gettin'': ...","{'Guava Duff': 2, 'Italian': 7, 'Spanish': 2, ..."


### Step 1: Detect Events

In [2]:
### Step 1: Detect Events
def create_design_matrix(row, sentences, model):
    '''
    Look up the embedding vector of every entity in sentences[row].

    Entities the model has no vector for (KeyError) are printed and skipped.
    Returns (designX, a): the list of vectors and the list of entities that
    were actually embedded, so that designX[k] is always the vector of a[k].
    '''
    # Word2Vec models expose their vectors as `.wv`; fall back to indexing
    # the object itself for anything that has no such attribute.
    vectors = getattr(model, 'wv', model)
    designX = []
    a = []
    for entity in sentences[row]:
        try:
            designX.append(vectors[entity])
        except KeyError:
            print(entity)
        else:
            # keep the label only when its vector was added, so rows and labels stay aligned
            a.append(entity)
    return designX, a

def standardize_design_matrix(designX):
    '''
    Fit a StandardScaler on designX and return the transformed matrix.
    '''
    scaler = preprocessing.StandardScaler()
    return scaler.fit_transform(designX)

def argmax(s):
    '''
    Return the index of the largest element of s (the first one on ties).
    '''
    return max(range(len(s)), key=s.__getitem__)

def best_eps(eps, standardX):
    '''
    Try every candidate epsilon in eps with DBSCAN (cosine metric,
    min_samples=2) on standardX and return the candidate with the highest
    silhouette score. A candidate yielding fewer than 2 distinct labels
    scores 0.
    '''
    scores = []
    for radius in eps:
        labels = DBSCAN(metric='cosine', eps=radius, min_samples=2).fit_predict(standardX)
        if len(set(labels)) >= 2:
            scores.append(metrics.silhouette_score(standardX, labels))
        else:
            scores.append(0)
    return eps[argmax(scores)]

def get_sentences_2(df_window):
    '''
    Return, for every non-missing entry of the 'refined_entities_4_days'
    column, the list of its dictionary keys (one list per row, in row order).
    '''
    refined = df_window['refined_entities_4_days'].dropna()
    # iterate over the values: after dropna() the index labels are no longer
    # guaranteed to be 0..n-1, so label lookups by position can fail
    return [list(counts.keys()) for counts in refined]

def get_sentences(df_window):
    '''
    Split every value of the 'entity' column on ", " and return the
    resulting lists (one list of entities per row, in row order).
    '''
    # iterate over the values so the result does not depend on the index labels
    return [item.split(", ") for item in df_window['entity']]

def train_word2vec_model(sentences):
    '''
    Configure INFO-level logging, build a gensim Word2Vec vocabulary from
    sentences (min_count=1, 4 workers) and train the model on the same
    sentences. Returns the trained model; it is not saved to disk.
    '''
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s : %(levelname)s : %(message)s')
    w2v = gensim.models.Word2Vec(workers=4, min_count=1)
    w2v.build_vocab(sentences, progress_per=200000)
    w2v.train(sentences, total_examples=w2v.corpus_count, epochs=w2v.epochs)
    return w2v

def dbscan_cluster(standardX, a):
    '''
    Cluster a standardized design matrix with DBSCAN (cosine metric,
    min_samples=2), using the epsilon picked by best_eps from a fixed list
    of candidates. The chosen epsilon is printed.

    a is the list of entities matching the rows of standardX.
    Returns (groups, silhouette): groups maps each cluster label to the
    alphabetically sorted entities carrying it; silhouette is the silhouette
    score, or 0 when there is only one distinct label.
    '''
    candidate_eps = [0.01,0.05,0.1,0.2,0.3,0.5,0.7,1,1.5,2,3,5,7,10]
    best = best_eps(candidate_eps, standardX)
    print(best)
    # refit with the selected epsilon
    labels = DBSCAN(metric='cosine', eps=best, min_samples=2).fit_predict(standardX)
    # entity -> label; a repeated entity keeps the label of its last occurrence
    label_of = dict(zip(a, labels))
    res = defaultdict(list)
    for entity in sorted(label_of):
        res[label_of[entity]].append(entity)
    # the silhouette score needs at least two distinct labels
    score = metrics.silhouette_score(standardX, labels) if len(set(labels)) > 1 else 0
    return dict(res), score
    
def aggregate(df_window, nrows):
    '''
    Takes in a dataframe with grouped 24-hour buckets and 4-day buckets.
    Trains the word2vec model once on the entities of all rows.
    For each of the first nrows rows, builds the design matrix of that row's
    entities, standardizes it, clusters it with DBSCAN and records the
    silhouette score.

    Returns a copy of the first nrows rows with two added columns:
    groups (a one-element list holding the label -> entities dictionary)
    and silhouettes. df_window itself is not modified. nrows is capped at
    the number of rows available.
    '''
    nrows = min(nrows, len(df_window))
    # work on a copy so the new columns are not written onto a slice of df_window
    df_temp = df_window.iloc[0:nrows, :].copy()
    sentences = get_sentences(df_window)
    print("sentences done")
    model = train_word2vec_model(sentences) # No need to train for each iteration!
    print("model done")
    groups = []
    silhouettes = []
    for row in range(nrows):
        designX, a = create_design_matrix(row, sentences, model)
        standardX = standardize_design_matrix(designX)
        group, silhouette = dbscan_cluster(standardX, a)
        # wrapped in a list so each cell holds exactly one dictionary
        groups.append([group])
        silhouettes.append(silhouette)
    df_temp['groups'] = groups
    df_temp['silhouettes'] = silhouettes
    return df_temp

# Cluster the first 50 rows of df_window (requires the Step 0 cell to have run).
df_sub_clustered = aggregate(df_window, 50)
df_sub_clustered 

NameError: name 'df_window' is not defined

### Step 2: Determine event duration

In [0]:
def percent_overlap(a,b):
    '''
    Takes in two lists of strings, return three floating numbers
    in terms of percentage overlaps:
      result1 = shared items as a percentage of the distinct items of a
      result2 = shared items as a percentage of the distinct items of b
      result3 = shared items as a percentage of the distinct items of a and b combined
    A percentage whose denominator is empty is reported as 0.0 instead of
    raising ZeroDivisionError.
    '''
    setA = set(a)
    setB = set(b)
    shared = len(setA & setB)
    universe = setA | setB

    def _pct(denominator):
        # an empty denominator means there is nothing to overlap with
        return shared / denominator * 100 if denominator else 0.0

    return _pct(len(setA)), _pct(len(setB)), _pct(len(universe))
    
def expand_groups_to_event_columns(df, col='groups'):
    '''
    Takes the dataframe from the output of "aggregate",
    Expands the "groups" column into separate columns, one per dictionary key
    (cluster label), and returns the original columns followed by the
    expanded ones.

    Each cell of df[col] must be a one-element list holding a dictionary.
    Rows are matched by position, so df may carry any index.
    '''
    records = [cell[0] for cell in df[col]]
    expanded = json_normalize(records)
    # concat(axis=1) aligns on index labels; reuse df's index so row k of the
    # expansion lands on row k of df
    expanded.index = df.index
    wide_df = pd.concat([df, expanded], axis=1)
    return wide_df

# Expand the clustered sample into one column per cluster label.
test = expand_groups_to_event_columns(df_sub_clustered)
test

Unnamed: 0,dates,entity,entities_4_days,refined_entities_4_days,groups,silhouettes,0,-1,2,1,3,6,4,5
0,2007-01-01,"Episode, iTunes, New Year, Radio Detective Sto...","{'Episode': 1, 'iTunes': 1, 'New Year': 2, 'Ra...","{'New Year': 2, 'German': 2, 'New Year Resolut...","[{0: ['- Murder Clinic', 'Anne Farnsworth', 'A...",0.630648,"[- Murder Clinic, Anne Farnsworth, Atari, Auld...","[CJ, Review, WindowtotheMagic, iTunes]","[Dan, Greg, Microsoft, Nintendo, Phil, Sony]","[German, New Year]",,,,
1,2007-01-02,"Prager, Who Really Cares: The suprising, Denni...","{'Prager': 1, 'Who Really Cares: The suprising...","{'Italian': 2, 'IndieFeed Alternative Modern R...","[{0: ['8MB', 'Arthur Brooks', 'Dennis Pragers'...",0.795786,"[8MB, Arthur Brooks, Dennis Pragers, New Year ...",[Dennis Prager],,[Italian],,,,
2,2007-01-03,"Idiotchild - Mouth Watering, IndieFeed Alterna...","{'Idiotchild - Mouth Watering': 1, 'IndieFeed ...","{'IndieFeed Alternative Modern Rock': 2, 'Indi...","[{0: ['Caltech', 'Caltech Lecture Series', 'Ch...",0.726953,"[Caltech, Caltech Lecture Series, Chloe Day, C...","[Christian, New York Times]",[Sam Harris],"[IndieFeed Alternative Modern Rock, IndieFeed ...",[Spanish],,,
3,2007-01-04,"Speeding Bullets, DC, Raging Bullets of, Jim, ...","{'Speeding Bullets': 1, 'DC': 1, 'Raging Bulle...","{'Sarah Dinan': 2, 'Coyote Run': 2, 'Marc Gunn...","[{0: ['0:00 Opening', '1:17:47 Manhunter Chall...",0.791652,"[0:00 Opening, 1:17:47 Manhunter Challenge, Br...",,,"[DC, Jim, Sean]",,,,
4,2007-01-05,"Irish & Celtic Music Podcast, Celtic, Sarah Di...","{'Irish & Celtic Music Podcast': 1, 'Celtic': ...","{'Sarah Dinan': 2, 'Coyote Run': 2, 'Marc Gunn...","[{1: ['1:00:32 Peggy', '43:04 Weave', '8:04 Ma...",0.659777,"[AP, AZ, American, Ben, Celtfather, Celtic, Ce...",,,"[1:00:32 Peggy, 43:04 Weave, 8:04 Mairi Mac, A...",,,,
5,2007-01-06,"Q & A, Pixie, Ann Druyan, Carl Sagan, Ann Druy...","{'Q & A': 1, 'Pixie': 1, 'Ann Druyan': 2, 'Car...","{'Ann Druyan': 2, 'Carl Sagan': 4, 'Italian': ...","[{1: ['A Famous Broken Heart', 'Ann Druyan', '...",0.640159,"[D.J. Grothe, NASA, New York Times, Q & A]",,,"[A Famous Broken Heart, Ann Druyan, Carl Sagan...",,,,
6,2007-01-07,"Italian, Zencast 86 - Right Livelihood, Gil Fr...","{'Italian': 6, 'Zencast 86 - Right Livelihood'...","{'Italian': 6, 'Gil Fronsdal': 2, 'Matt White ...","[{0: ['American', 'Dave', 'European Union', 'I...",0.624332,"[American, Dave, European Union, Italian, Rick...",,[Gil Fronsdal],"[Bristol Universitys Peter Rogers, Drs Chris, ...",,,,
7,2007-01-08,"Photoshop CS3 Way, Photoshop CS3 Public Beta, ...","{'Photoshop CS3 Way': 1, 'Photoshop CS3 Public...","{'North': 2, 'IndieFeed Alternative Modern Roc...","[{0: ['1:13:52', '1:31:17', '2:01:30 Closing',...",0.657359,"[1:13:52, 1:31:17, 2:01:30 Closing, 37:34 Revi...",,,"[Core Security, IndieFeed Alternative Modern R...",,,,
8,2007-01-09,"Italian, Italian, Apple, Steve Jobs, Macworld ...","{'Italian': 8, 'Apple': 1, 'Steve Jobs': 1, 'M...","{'Italian': 8, 'Ben': 3, 'Spanish': 3, 'Guava ...","[{-1: ['Andrew', 'Eric', 'San Francisco', 'iTu...",0.620065,"[Italian, Spain, Spanish]","[Andrew, Eric, San Francisco, iTunes]","[Apple TV, Chicken Soup, Deathly Hallows, Hall...","[Apple, Mac, iPhone]",[Ben],,,
9,2007-01-10,"English, Unnateral Helpers - Gettin', IndieFee...","{'English': 1, 'Unnateral Helpers - Gettin'': ...","{'Guava Duff': 2, 'Italian': 7, 'Spanish': 2, ...","[{1: ['Chris Farmer', 'El Fuego', 'Fed', 'Fede...",0.588905,"[English, Italian, Spanish]",[Patreon],,"[Chris Farmer, El Fuego, Fed, Federal Reserve,...",,,,


In [0]:
# reorder the expanded columns
# Lift pandas' display limits so every row, column and full cell content is shown.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# NOTE(review): -1 is deprecated in newer pandas in favour of None — confirm against the installed version
pd.set_option('display.max_colwidth', -1)

# NOTE(review): the cluster-label columns -1..6 are hard-coded; a label missing from this run raises KeyError
test = test[['dates','entity','entities_4_days','refined_entities_4_days','groups','silhouettes',-1,0,1,2,3,4,5,6]]
test

Unnamed: 0,dates,entity,entities_4_days,refined_entities_4_days,groups,silhouettes,-1,0,1,2,3,4,5,6
0,2007-01-01,"Episode, iTunes, New Year, Radio Detective Story Hour, - Murder Clinic, New Year, Murder Clinic, Saturday Morning, Anne Farnsworth, Podsafe Music Network, German, German, New Year Resolutions, The Disney Year, Review, Ricky, Paul Back, Ricky Brigante, Paul Barrie, Disney Year, WindowtotheMagic, Auld Lang Synes, Phil, Dan, regulars, Greg, CJ, Microsoft, Nintendo, Sony, Atari, NESes, Super NESes, Gears of War, Viva Pinata, Gran Turismo, PS3, Zelda:, Pokemon Ruby","{'Episode': 1, 'iTunes': 1, 'New Year': 2, 'Radio Detective Story Hour': 1, '- Murder Clinic': 1, 'Murder Clinic': 1, 'Saturday Morning': 1, 'Anne Farnsworth': 1, 'Podsafe Music Network': 1, 'German': 2, 'New Year Resolutions': 2, 'The Disney Year': 1, 'Review': 1, 'Ricky': 1, 'Paul Back': 1, 'Ricky Brigante': 1, 'Paul Barrie': 1, 'Disney Year': 1, 'WindowtotheMagic': 1, 'Auld Lang Synes': 1, 'Phil': 1, 'Dan': 1, 'regulars': 1, 'Greg': 1, 'CJ': 1, 'Microsoft': 1, 'Nintendo': 1, 'Sony': 1, 'Atari': 1, 'NESes': 1, 'Super NESes': 1, 'Gears of War': 1, 'Viva Pinata': 1, 'Gran Turismo': 1, 'PS3': 1, 'Zelda:': 1, 'Pokemon Ruby': 1, 'Prager': 1, 'Who Really Cares: The suprising': 1, 'Dennis Prager': 1, 'Arthur Brooks': 1, 'Syracuse University': 1, '8MB': 1, 'Dennis Pragers': 1, 'julienne': 1, 'Italian': 2, 'Idiotchild - Mouth Watering': 1, 'IndieFeed Alternative Modern Rock': 1, 'Chloe Day -': 1, 'Chloe Day': 1, 'IndieFeed Indie Pop': 1, 'Caltech Lecture Series': 1, 'Sam Harris': 2, 'Skepticality': 1, 'Symposium on Science': 1, 'Religion & Politics': 1, 'Caltech': 1, 'New York Times': 1, 'The End of Faith': 1, 'Christian': 1, 'Molly Ringwald': 1, 'Sweet Charity': 1, 'Spanish': 2, 'Speeding Bullets': 1, 'DC': 1, 'Raging Bullets of': 1, 'Jim': 1, 'Sean': 1, 'Manhunter Challenge': 1, 'Justice Society of America': 1, 'Brent AKA Knightwingbk': 1, 'Comic Timing Time': 1, '0:00 Opening': 1, '1:17:47 Manhunter Challenge': 1, 'Comic Timings Ian Levenstein': 1, 'John Mayo': 1}","{'New Year': 2, 'German': 2, 
'New Year Resolutions': 2, 'Italian': 2, 'Sam Harris': 2, 'Spanish': 2}","[{0: ['- Murder Clinic', 'Anne Farnsworth', 'Atari', 'Auld Lang Synes', 'Disney Year', 'Episode', 'Gears of War', 'Gran Turismo', 'Murder Clinic', 'NESes', 'New Year Resolutions', 'PS3', 'Paul Back', 'Paul Barrie', 'Podsafe Music Network', 'Pokemon Ruby', 'Radio Detective Story Hour', 'Ricky', 'Ricky Brigante', 'Saturday Morning', 'Super NESes', 'The Disney Year', 'Viva Pinata', 'Zelda:', 'regulars'], -1: ['CJ', 'Review', 'WindowtotheMagic', 'iTunes'], 2: ['Dan', 'Greg', 'Microsoft', 'Nintendo', 'Phil', 'Sony'], 1: ['German', 'New Year']}]",0.630648,"[CJ, Review, WindowtotheMagic, iTunes]","[- Murder Clinic, Anne Farnsworth, Atari, Auld Lang Synes, Disney Year, Episode, Gears of War, Gran Turismo, Murder Clinic, NESes, New Year Resolutions, PS3, Paul Back, Paul Barrie, Podsafe Music Network, Pokemon Ruby, Radio Detective Story Hour, Ricky, Ricky Brigante, Saturday Morning, Super NESes, The Disney Year, Viva Pinata, Zelda:, regulars]","[German, New Year]","[Dan, Greg, Microsoft, Nintendo, Phil, Sony]",,,,
1,2007-01-02,"Prager, Who Really Cares: The suprising, Dennis Prager, Arthur Brooks, Syracuse University, 8MB, Dennis Pragers, julienne, Italian, Italian, New Year Resolutions","{'Prager': 1, 'Who Really Cares: The suprising': 1, 'Dennis Prager': 1, 'Arthur Brooks': 1, 'Syracuse University': 1, '8MB': 1, 'Dennis Pragers': 1, 'julienne': 1, 'Italian': 2, 'New Year Resolutions': 1, 'Idiotchild - Mouth Watering': 1, 'IndieFeed Alternative Modern Rock': 2, 'Chloe Day -': 1, 'Chloe Day': 1, 'IndieFeed Indie Pop': 2, 'Caltech Lecture Series': 1, 'Sam Harris': 2, 'Skepticality': 1, 'Symposium on Science': 1, 'Religion & Politics': 1, 'Caltech': 1, 'New York Times': 1, 'The End of Faith': 1, 'Christian': 1, 'Molly Ringwald': 1, 'Sweet Charity': 1, 'Spanish': 4, 'Speeding Bullets': 1, 'DC': 1, 'Raging Bullets of': 1, 'Jim': 1, 'Sean': 1, 'Manhunter Challenge': 1, 'Justice Society of America': 1, 'Brent AKA Knightwingbk': 1, 'Comic Timing Time': 1, '0:00 Opening': 1, '1:17:47 Manhunter Challenge': 1, 'Comic Timings Ian Levenstein': 1, 'John Mayo': 1, 'Irish & Celtic Music Podcast': 1, 'Celtic': 1, 'Sarah Dinan': 2, 'Hounds': 1, 'Coyote Run': 2, 'Marc Gunn': 4, 'Brian Boru': 3, 'Great Big Sea': 1, 'Cara Dillon': 2, 'Mary Knickle': 2, 'The Tea Merchants': 1, 'Heather Dale': 2, 'Celticana': 1, 'Clandestine': 1, 'Paisley Close': 2, 'Niamh Parsons': 2, 'Susan Hamlin': 2, 'iTunes': 1, 'Celtic Music Magazine': 1, 'Irish': 1, 'Celtic Music Podcast': 1, 'Celtic Music': 1, 'The Brobdingnagian Bards'': 1, 'Ireland': 1, 'Guinness': 1, 'Dublin': 1, 'Rock of Cashel': 1, 'Killarney': 1, 'Galway': 1, 'Bunratty Castle': 1, 'http://www.thebards.net/ireland/': 1, 'Myspace Irish & Celtic Music Podcast': 1, 'Myspace': 1, 'Circled By Hounds': 1, '8:04 Mairi Mac': 1, 'Soul of a Harper 15:13': 1, 'The Wearing of the Green': 1, 'Irish Pipe Band': 1, 'Kidd': 1, 'Men of Erin': 1, 'American': 1, 'The Wind That Shakes the Barley': 1, 'Welcome Into the Morning': 1, 'Ye Jacobites': 1, '43:04 Weave': 1, 
'Weave 47:00': 1, 'The High Fiddle Reels': 1, 'Mordred': 1, '1:00:32 Peggy': 1, 'The Fox/Gravel Walk': 1, 'Heart': 1, 'Salley Gardens': 1, 'Sun': 1, 'Celtfather': 1, 'Honi Soit qui Mal y Pense': 1, 'AP': 1, 'French': 1, 'SX': 1, 'UVA': 1, 'SPF': 1, ...}","{'Italian': 2, 'IndieFeed Alternative Modern Rock': 2, 'IndieFeed Indie Pop': 2, 'Sam Harris': 2, 'Spanish': 4, 'Sarah Dinan': 2, 'Coyote Run': 2, 'Marc Gunn': 4, 'Brian Boru': 3, 'Cara Dillon': 2, 'Mary Knickle': 2, 'Heather Dale': 2, 'Paisley Close': 2, 'Niamh Parsons': 2, 'Susan Hamlin': 2, 'Snowden': 4}","[{0: ['8MB', 'Arthur Brooks', 'Dennis Pragers', 'New Year Resolutions', 'Prager', 'Syracuse University', 'Who Really Cares: The suprising', 'julienne'], -1: ['Dennis Prager'], 1: ['Italian']}]",0.795786,[Dennis Prager],"[8MB, Arthur Brooks, Dennis Pragers, New Year Resolutions, Prager, Syracuse University, Who Really Cares: The suprising, julienne]",[Italian],,,,,
2,2007-01-03,"Idiotchild - Mouth Watering, IndieFeed Alternative Modern Rock, Chloe Day -, Chloe Day, IndieFeed Indie Pop, Caltech Lecture Series, Sam Harris, Skepticality, Sam Harris, Symposium on Science, Religion & Politics, Caltech, New York Times, The End of Faith, Christian, Molly Ringwald, Sweet Charity, Spanish, Spanish","{'Idiotchild - Mouth Watering': 1, 'IndieFeed Alternative Modern Rock': 2, 'Chloe Day -': 1, 'Chloe Day': 1, 'IndieFeed Indie Pop': 2, 'Caltech Lecture Series': 1, 'Sam Harris': 2, 'Skepticality': 1, 'Symposium on Science': 1, 'Religion & Politics': 1, 'Caltech': 1, 'New York Times': 2, 'The End of Faith': 1, 'Christian': 1, 'Molly Ringwald': 1, 'Sweet Charity': 1, 'Spanish': 4, 'Speeding Bullets': 1, 'DC': 1, 'Raging Bullets of': 1, 'Jim': 1, 'Sean': 1, 'Manhunter Challenge': 1, 'Justice Society of America': 1, 'Brent AKA Knightwingbk': 1, 'Comic Timing Time': 1, '0:00 Opening': 1, '1:17:47 Manhunter Challenge': 1, 'Comic Timings Ian Levenstein': 1, 'John Mayo': 1, 'Irish & Celtic Music Podcast': 1, 'Celtic': 1, 'Sarah Dinan': 2, 'Hounds': 1, 'Coyote Run': 2, 'Marc Gunn': 4, 'Brian Boru': 3, 'Great Big Sea': 1, 'Cara Dillon': 2, 'Mary Knickle': 2, 'The Tea Merchants': 1, 'Heather Dale': 2, 'Celticana': 1, 'Clandestine': 1, 'Paisley Close': 2, 'Niamh Parsons': 2, 'Susan Hamlin': 2, 'iTunes': 1, 'Celtic Music Magazine': 1, 'Irish': 1, 'Celtic Music Podcast': 1, 'Celtic Music': 1, 'The Brobdingnagian Bards'': 1, 'Ireland': 1, 'Guinness': 1, 'Dublin': 1, 'Rock of Cashel': 1, 'Killarney': 1, 'Galway': 1, 'Bunratty Castle': 1, 'http://www.thebards.net/ireland/': 1, 'Myspace Irish & Celtic Music Podcast': 1, 'Myspace': 1, 'Circled By Hounds': 1, '8:04 Mairi Mac': 1, 'Soul of a Harper 15:13': 1, 'The Wearing of the Green': 1, 'Irish Pipe Band': 1, 'Kidd': 1, 'Men of Erin': 1, 'American': 1, 'The Wind That Shakes the Barley': 1, 'Welcome Into the Morning': 1, 'Ye Jacobites': 1, '43:04 Weave': 1, 'Weave 47:00': 1, 'The High Fiddle Reels': 1, 
'Mordred': 1, '1:00:32 Peggy': 1, 'The Fox/Gravel Walk': 1, 'Heart': 1, 'Salley Gardens': 1, 'Sun': 1, 'Celtfather': 1, 'Honi Soit qui Mal y Pense': 1, 'AP': 1, 'French': 1, 'SX': 1, 'UVA': 1, 'SPF': 1, 'Jenny M.': 1, 'Cindy': 1, 'Motherland': 1, 'Compton Terrace': 1, 'Amphitheatre Chandler': 1, 'AZ': 1, 'Valley Road': 1, 'Sam': 1, 'Tony & Tino': 1, 'Tom': 1, ...}","{'IndieFeed Alternative Modern Rock': 2, 'IndieFeed Indie Pop': 2, 'Sam Harris': 2, 'New York Times': 2, 'Spanish': 4, 'Sarah Dinan': 2, 'Coyote Run': 2, 'Marc Gunn': 4, 'Brian Boru': 3, 'Cara Dillon': 2, 'Mary Knickle': 2, 'Heather Dale': 2, 'Paisley Close': 2, 'Niamh Parsons': 2, 'Susan Hamlin': 2, 'Snowden': 4, 'Ann Druyan': 2, 'Carl Sagan': 4}","[{0: ['Caltech', 'Caltech Lecture Series', 'Chloe Day', 'Chloe Day -', 'Idiotchild - Mouth Watering', 'Molly Ringwald', 'Religion & Politics', 'Sweet Charity', 'Symposium on Science', 'The End of Faith'], -1: ['Christian', 'New York Times'], 1: ['IndieFeed Alternative Modern Rock', 'IndieFeed Indie Pop', 'Skepticality'], 2: ['Sam Harris'], 3: ['Spanish']}]",0.726953,"[Christian, New York Times]","[Caltech, Caltech Lecture Series, Chloe Day, Chloe Day -, Idiotchild - Mouth Watering, Molly Ringwald, Religion & Politics, Sweet Charity, Symposium on Science, The End of Faith]","[IndieFeed Alternative Modern Rock, IndieFeed Indie Pop, Skepticality]",[Sam Harris],[Spanish],,,
3,2007-01-04,"Speeding Bullets, DC, Raging Bullets of, Jim, Sean, Manhunter Challenge, Justice Society of America, Brent AKA Knightwingbk, Comic Timing Time, 0:00 Opening, 1:17:47 Manhunter Challenge, Comic Timings Ian Levenstein, John Mayo","{'Speeding Bullets': 1, 'DC': 1, 'Raging Bullets of': 1, 'Jim': 1, 'Sean': 1, 'Manhunter Challenge': 1, 'Justice Society of America': 1, 'Brent AKA Knightwingbk': 1, 'Comic Timing Time': 1, '0:00 Opening': 1, '1:17:47 Manhunter Challenge': 1, 'Comic Timings Ian Levenstein': 1, 'John Mayo': 1, 'Irish & Celtic Music Podcast': 1, 'Celtic': 1, 'Sarah Dinan': 2, 'Hounds': 1, 'Coyote Run': 2, 'Marc Gunn': 4, 'Brian Boru': 3, 'Great Big Sea': 1, 'Cara Dillon': 2, 'Mary Knickle': 2, 'The Tea Merchants': 1, 'Heather Dale': 2, 'Celticana': 1, 'Clandestine': 1, 'Paisley Close': 2, 'Niamh Parsons': 2, 'Susan Hamlin': 2, 'iTunes': 2, 'Celtic Music Magazine': 1, 'Irish': 1, 'Celtic Music Podcast': 1, 'Celtic Music': 1, 'The Brobdingnagian Bards'': 1, 'Ireland': 1, 'Guinness': 1, 'Dublin': 1, 'Rock of Cashel': 1, 'Killarney': 1, 'Galway': 1, 'Bunratty Castle': 1, 'http://www.thebards.net/ireland/': 1, 'Myspace Irish & Celtic Music Podcast': 1, 'Myspace': 1, 'Circled By Hounds': 1, '8:04 Mairi Mac': 1, 'Soul of a Harper 15:13': 1, 'The Wearing of the Green': 1, 'Irish Pipe Band': 1, 'Kidd': 1, 'Men of Erin': 1, 'American': 2, 'The Wind That Shakes the Barley': 1, 'Welcome Into the Morning': 1, 'Ye Jacobites': 1, '43:04 Weave': 1, 'Weave 47:00': 1, 'The High Fiddle Reels': 1, 'Mordred': 1, '1:00:32 Peggy': 1, 'The Fox/Gravel Walk': 1, 'Heart': 1, 'Salley Gardens': 1, 'Sun': 1, 'Celtfather': 1, 'Honi Soit qui Mal y Pense': 1, 'AP': 1, 'French': 1, 'Spanish': 2, 'SX': 1, 'UVA': 1, 'SPF': 1, 'Jenny M.': 1, 'Cindy': 1, 'Motherland': 1, 'Compton Terrace': 1, 'Amphitheatre Chandler': 1, 'AZ': 1, 'Valley Road': 1, 'Sam': 1, 'Tony & Tino': 1, 'Tom': 1, 'Boardgamegeek': 1, 'Battlelore Hill Giant': 1, 'Hawthorne': 1, 'Heroscape': 1, 'Mary': 1, 
'Yspahan': 1, 'Jason': 1, 'Ticket to Ride': 1, 'Effortless English': 1, 'Snowden': 4, 'IndieFeed Alternative Modern Rock': 1, 'IndieFeed Indie Pop': 1, 'Ben': 1, 'Q & A': 1, 'Pixie': 1, 'Ann Druyan': 2, ...}","{'Sarah Dinan': 2, 'Coyote Run': 2, 'Marc Gunn': 4, 'Brian Boru': 3, 'Cara Dillon': 2, 'Mary Knickle': 2, 'Heather Dale': 2, 'Paisley Close': 2, 'Niamh Parsons': 2, 'Susan Hamlin': 2, 'iTunes': 2, 'American': 2, 'Spanish': 2, 'Snowden': 4, 'Ann Druyan': 2, 'Carl Sagan': 4, 'Gil Fronsdal': 2, 'Matt White Band': 2, 'Joe Kroll': 2}","[{0: ['0:00 Opening', '1:17:47 Manhunter Challenge', 'Brent AKA Knightwingbk', 'Comic Timing Time', 'Comic Timings Ian Levenstein', 'John Mayo', 'Justice Society of America', 'Manhunter Challenge', 'Raging Bullets of', 'Speeding Bullets'], 1: ['DC', 'Jim', 'Sean']}]",0.791652,,"[0:00 Opening, 1:17:47 Manhunter Challenge, Brent AKA Knightwingbk, Comic Timing Time, Comic Timings Ian Levenstein, John Mayo, Justice Society of America, Manhunter Challenge, Raging Bullets of, Speeding Bullets]","[DC, Jim, Sean]",,,,,
4,2007-01-05,"Irish & Celtic Music Podcast, Celtic, Sarah Dinan, Hounds, Coyote Run, Marc Gunn, Brian Boru, Great Big Sea, Cara Dillon, Mary Knickle, The Tea Merchants, Heather Dale, Celticana, Clandestine, Paisley Close, Niamh Parsons, Susan Hamlin, iTunes, Celtic Music Magazine, Irish, Celtic Music Podcast, Celtic Music, The Brobdingnagian Bards', Ireland, Guinness, Dublin, Rock of Cashel, Killarney, Galway, Bunratty Castle, http://www.thebards.net/ireland/, Marc Gunn, Myspace Irish & Celtic Music Podcast, Myspace, Sarah Dinan, Circled By Hounds, 8:04 Mairi Mac, Coyote Run, Marc Gunn, Soul of a Harper 15:13, The Wearing of the Green, Brian Boru, Irish Pipe Band, Brian Boru, Kidd, Men of Erin, American, The Wind That Shakes the Barley, Welcome Into the Morning, Ye Jacobites, Cara Dillon, 43:04 Weave, Mary Knickle, Weave 47:00, The High Fiddle Reels, Mordred, Heather Dale, 1:00:32 Peggy, The Fox/Gravel Walk, Paisley Close, Niamh Parsons, Heart, Salley Gardens, Susan Hamlin, Sun, Marc Gunn, Celtfather, Honi Soit qui Mal y Pense, AP, French, Spanish, SX, UVA, SPF, Jenny M., Cindy, Motherland, Compton Terrace, Amphitheatre Chandler, AZ, Valley Road, Sam, Tony & Tino, Tom, Boardgamegeek, Battlelore Hill Giant, Hawthorne, Heroscape, Mary, Yspahan, Jason, Ticket to Ride, Effortless English, Snowden, Snowden, IndieFeed Alternative Modern Rock, Snowden, Snowden, IndieFeed Indie Pop, Spanish, Ben","{'Irish & Celtic Music Podcast': 1, 'Celtic': 1, 'Sarah Dinan': 2, 'Hounds': 1, 'Coyote Run': 2, 'Marc Gunn': 4, 'Brian Boru': 3, 'Great Big Sea': 1, 'Cara Dillon': 2, 'Mary Knickle': 2, 'The Tea Merchants': 1, 'Heather Dale': 2, 'Celticana': 1, 'Clandestine': 1, 'Paisley Close': 2, 'Niamh Parsons': 2, 'Susan Hamlin': 2, 'iTunes': 3, 'Celtic Music Magazine': 1, 'Irish': 1, 'Celtic Music Podcast': 1, 'Celtic Music': 1, 'The Brobdingnagian Bards'': 1, 'Ireland': 1, 'Guinness': 1, 'Dublin': 1, 'Rock of Cashel': 1, 'Killarney': 1, 'Galway': 1, 'Bunratty Castle': 1, 
'http://www.thebards.net/ireland/': 1, 'Myspace Irish & Celtic Music Podcast': 1, 'Myspace': 1, 'Circled By Hounds': 1, '8:04 Mairi Mac': 1, 'Soul of a Harper 15:13': 1, 'The Wearing of the Green': 1, 'Irish Pipe Band': 1, 'Kidd': 1, 'Men of Erin': 1, 'American': 2, 'The Wind That Shakes the Barley': 1, 'Welcome Into the Morning': 1, 'Ye Jacobites': 1, '43:04 Weave': 1, 'Weave 47:00': 1, 'The High Fiddle Reels': 1, 'Mordred': 1, '1:00:32 Peggy': 1, 'The Fox/Gravel Walk': 1, 'Heart': 1, 'Salley Gardens': 1, 'Sun': 1, 'Celtfather': 1, 'Honi Soit qui Mal y Pense': 1, 'AP': 1, 'French': 1, 'Spanish': 2, 'SX': 1, 'UVA': 1, 'SPF': 1, 'Jenny M.': 1, 'Cindy': 1, 'Motherland': 1, 'Compton Terrace': 1, 'Amphitheatre Chandler': 1, 'AZ': 1, 'Valley Road': 1, 'Sam': 1, 'Tony & Tino': 1, 'Tom': 1, 'Boardgamegeek': 1, 'Battlelore Hill Giant': 1, 'Hawthorne': 1, 'Heroscape': 1, 'Mary': 1, 'Yspahan': 1, 'Jason': 1, 'Ticket to Ride': 1, 'Effortless English': 1, 'Snowden': 4, 'IndieFeed Alternative Modern Rock': 2, 'IndieFeed Indie Pop': 1, 'Ben': 1, 'Q & A': 1, 'Pixie': 1, 'Ann Druyan': 2, 'Carl Sagan': 4, 'Emmy Award Winning and the Peabody Award Winning television series Cosmos': 1, 'NASA': 1, 'Voyager Interstellar Record Project': 1, 'Voyager': 1, 'Spacecrafts': 1, 'Contact': 1, 'A Famous Broken Heart': 1, 'New York Times': 1, 'Carl Sagan's': 1, 'D.J. 
Grothe': 1, 'The Varieties of Scientific Experience': 1, 'Italian': 2, ...}","{'Sarah Dinan': 2, 'Coyote Run': 2, 'Marc Gunn': 4, 'Brian Boru': 3, 'Cara Dillon': 2, 'Mary Knickle': 2, 'Heather Dale': 2, 'Paisley Close': 2, 'Niamh Parsons': 2, 'Susan Hamlin': 2, 'iTunes': 3, 'American': 2, 'Spanish': 2, 'Snowden': 4, 'IndieFeed Alternative Modern Rock': 2, 'Ann Druyan': 2, 'Carl Sagan': 4, 'Italian': 2, 'Gil Fronsdal': 2, 'Matt White Band': 2, 'Joe Kroll': 2, 'North': 2, 'Paul': 2, 'Brian Nixon': 2}","[{1: ['1:00:32 Peggy', '43:04 Weave', '8:04 Mairi Mac', 'Amphitheatre Chandler', 'Battlelore Hill Giant', 'Boardgamegeek', 'Brian Boru', 'Bunratty Castle', 'Cara Dillon', 'Celticana', 'Circled By Hounds', 'Clandestine', 'Compton Terrace', 'Coyote Run', 'Effortless English', 'Galway', 'Great Big Sea', 'Guinness', 'Hawthorne', 'Heart', 'Heather Dale', 'Heroscape', 'Honi Soit qui Mal y Pense', 'Hounds', 'Irish Pipe Band', 'Jenny M.', 'Kidd', 'Killarney', 'Mary Knickle', 'Men of Erin', 'Mordred', 'Motherland', 'Myspace', 'Myspace Irish & Celtic Music Podcast', 'Niamh Parsons', 'Paisley Close', 'Rock of Cashel', 'SPF', 'SX', 'Salley Gardens', 'Sarah Dinan', 'Snowden', 'Soul of a Harper 15:13', 'Susan Hamlin', 'The Brobdingnagian Bards'', 'The Fox/Gravel Walk', 'The High Fiddle Reels', 'The Tea Merchants', 'The Wearing of the Green', 'The Wind That Shakes the Barley', 'Ticket to Ride', 'Tony & Tino', 'UVA', 'Valley Road', 'Weave 47:00', 'Welcome Into the Morning', 'Ye Jacobites', 'Yspahan', 'http://www.thebards.net/ireland/'], 0: ['AP', 'AZ', 'American', 'Ben', 'Celtfather', 'Celtic', 'Celtic Music', 'Celtic Music Magazine', 'Celtic Music Podcast', 'Cindy', 'Dublin', 'French', 'IndieFeed Alternative Modern Rock', 'IndieFeed Indie Pop', 'Ireland', 'Irish', 'Irish & Celtic Music Podcast', 'Jason', 'Marc Gunn', 'Mary', 'Sam', 'Spanish', 'Sun', 'Tom', 'iTunes']}]",0.659777,,"[AP, AZ, American, Ben, Celtfather, Celtic, Celtic Music, Celtic Music Magazine, Celtic Music Podcast, 
Cindy, Dublin, French, IndieFeed Alternative Modern Rock, IndieFeed Indie Pop, Ireland, Irish, Irish & Celtic Music Podcast, Jason, Marc Gunn, Mary, Sam, Spanish, Sun, Tom, iTunes]","[1:00:32 Peggy, 43:04 Weave, 8:04 Mairi Mac, Amphitheatre Chandler, Battlelore Hill Giant, Boardgamegeek, Brian Boru, Bunratty Castle, Cara Dillon, Celticana, Circled By Hounds, Clandestine, Compton Terrace, Coyote Run, Effortless English, Galway, Great Big Sea, Guinness, Hawthorne, Heart, Heather Dale, Heroscape, Honi Soit qui Mal y Pense, Hounds, Irish Pipe Band, Jenny M., Kidd, Killarney, Mary Knickle, Men of Erin, Mordred, Motherland, Myspace, Myspace Irish & Celtic Music Podcast, Niamh Parsons, Paisley Close, Rock of Cashel, SPF, SX, Salley Gardens, Sarah Dinan, Snowden, Soul of a Harper 15:13, Susan Hamlin, The Brobdingnagian Bards', The Fox/Gravel Walk, The High Fiddle Reels, The Tea Merchants, The Wearing of the Green, The Wind That Shakes the Barley, Ticket to Ride, Tony & Tino, UVA, Valley Road, Weave 47:00, Welcome Into the Morning, Ye Jacobites, Yspahan, http://www.thebards.net/ireland/]",,,,,
5,2007-01-06,"Q & A, Pixie, Ann Druyan, Carl Sagan, Ann Druyan, Carl Sagan, Emmy Award Winning and the Peabody Award Winning television series Cosmos, NASA, Voyager Interstellar Record Project, Voyager, Spacecrafts, Contact, Carl Sagan, A Famous Broken Heart, New York Times, Carl Sagan's, D.J. Grothe, The Varieties of Scientific Experience, Carl Sagan","{'Q & A': 1, 'Pixie': 1, 'Ann Druyan': 2, 'Carl Sagan': 4, 'Emmy Award Winning and the Peabody Award Winning television series Cosmos': 1, 'NASA': 1, 'Voyager Interstellar Record Project': 1, 'Voyager': 1, 'Spacecrafts': 1, 'Contact': 1, 'A Famous Broken Heart': 1, 'New York Times': 1, 'Carl Sagan's': 1, 'D.J. Grothe': 1, 'The Varieties of Scientific Experience': 1, 'Italian': 4, 'Zencast 86 - Right Livelihood': 1, 'Gil Fronsdal': 2, 'Matt White Band': 2, 'iTunes': 3, 'Drs Chris': 1, 'Dave': 1, 'Helen': 1, 'London University': 1, 'Roger Corder': 1, 'Bristol Universitys Peter Rogers': 1, 'University of St Louis': 1, 'Jeffrey Gordon': 1, 'Nena': 1, 'Ray Parker Jr.': 1, 'European Union': 1, 'E.U.': 1, 'Rick': 1, 'American': 1, 'Rick Steves': 1, 'Joe Kroll': 2, 'Nonprofit Boot Camp': 1, 'The Kingdom of God': 1, 'Kingdom': 1, 'Photoshop CS3 Way': 1, 'Photoshop CS3 Public Beta': 1, 'Radio Detective Story Hour': 1, 'North': 2, 'Nick': 1, 'Nora Charles': 1, 'Frances': 1, 'Richard Lockridge': 1, 'Jerry North': 1, 'Joseph Curtain': 1, 'Pam North': 1, 'Alice Frost': 1, 'Uncut - Kissme': 1, 'Uncut': 1, 'IndieFeed Alternative Modern Rock': 1, 'WindowtotheMagic': 1, 'A Binaural Adventure': 1, 'Animal Kingdom': 1, 'Walt Disney World': 1, 'Mousefest': 1, 'Expedition Everest': 1, 'Patrick': 1, 'Paul': 2, 'BINAURAL': 1, 'PS3': 1, 'Castlevania Portrait of Ruin': 1, 'Viva Pinata': 1, 'Phantasy Star Universe': 1, 'Wii Sports': 1, 'Zelda': 1, 'Karaoke Revolution:': 1, 'American Idol': 1, 'Microsoft': 1, 'GameSpot': 1, 'Greg Kasavin': 1, 'MySpace': 1, 'Digg': 1, 'Brand New Paul': 1, 'Core Security': 1, 'Syngress': 1, 'Astaro': 1, 'Astaro 
Security Gateway': 1, 'map': 1, 'Security Weekly Gear': 1, 'Larry ""Uncle Larry"" Pesce': 1, 'Paul Asadoorian': 1, 'Nick ""Twitchy"" Depetrillo': 1, 'Joe': 1, 'Conlin Email': 1, 'Justice Society of America': 1, 'Brian Nixon': 2, 'Review of Justice Society of America': 1, '37:34 Review': 1, '1:13:52': 1, 'Special Education': 1, '1:31:17': 1, 'Collected Comics Library': 1, 'David Wallace': 1, '2:01:30 Closing': 1, 'Apple': 1, 'Steve Jobs': 1, ...}","{'Ann Druyan': 2, 'Carl Sagan': 4, 'Italian': 4, 'Gil Fronsdal': 2, 'Matt White Band': 2, 'iTunes': 3, 'Joe Kroll': 2, 'North': 2, 'Paul': 2, 'Brian Nixon': 2, 'Ben': 3}","[{1: ['A Famous Broken Heart', 'Ann Druyan', 'Carl Sagan', 'Carl Sagan's', 'Contact', 'Emmy Award Winning and the Peabody Award Winning television series Cosmos', 'Pixie', 'Spacecrafts', 'The Varieties of Scientific Experience', 'Voyager', 'Voyager Interstellar Record Project'], 0: ['D.J. Grothe', 'NASA', 'New York Times', 'Q & A']}]",0.640159,,"[D.J. Grothe, NASA, New York Times, Q & A]","[A Famous Broken Heart, Ann Druyan, Carl Sagan, Carl Sagan's, Contact, Emmy Award Winning and the Peabody Award Winning television series Cosmos, Pixie, Spacecrafts, The Varieties of Scientific Experience, Voyager, Voyager Interstellar Record Project]",,,,,
6,2007-01-07,"Italian, Zencast 86 - Right Livelihood, Gil Fronsdal, Gil Fronsdal, Matt White Band, Matt White Band, iTunes, Drs Chris, Dave, Helen, London University, Roger Corder, Bristol Universitys Peter Rogers, University of St Louis, Jeffrey Gordon, Nena, Ray Parker Jr., European Union, E.U., Rick, American, Rick Steves, Joe Kroll, Nonprofit Boot Camp, Joe Kroll, The Kingdom of God, Kingdom","{'Italian': 6, 'Zencast 86 - Right Livelihood': 1, 'Gil Fronsdal': 2, 'Matt White Band': 2, 'iTunes': 3, 'Drs Chris': 1, 'Dave': 1, 'Helen': 1, 'London University': 1, 'Roger Corder': 1, 'Bristol Universitys Peter Rogers': 1, 'University of St Louis': 1, 'Jeffrey Gordon': 1, 'Nena': 1, 'Ray Parker Jr.': 1, 'European Union': 1, 'E.U.': 1, 'Rick': 1, 'American': 1, 'Rick Steves': 1, 'Joe Kroll': 2, 'Nonprofit Boot Camp': 1, 'The Kingdom of God': 1, 'Kingdom': 1, 'Photoshop CS3 Way': 1, 'Photoshop CS3 Public Beta': 1, 'Radio Detective Story Hour': 1, 'North': 2, 'Nick': 1, 'Nora Charles': 1, 'Frances': 1, 'Richard Lockridge': 1, 'Jerry North': 1, 'Joseph Curtain': 1, 'Pam North': 1, 'Alice Frost': 1, 'Uncut - Kissme': 1, 'Uncut': 1, 'IndieFeed Alternative Modern Rock': 2, 'WindowtotheMagic': 1, 'A Binaural Adventure': 1, 'Animal Kingdom': 1, 'Walt Disney World': 1, 'Mousefest': 1, 'Expedition Everest': 1, 'Patrick': 1, 'Paul': 2, 'BINAURAL': 1, 'PS3': 1, 'Castlevania Portrait of Ruin': 1, 'Viva Pinata': 1, 'Phantasy Star Universe': 1, 'Wii Sports': 1, 'Zelda': 1, 'Karaoke Revolution:': 1, 'American Idol': 1, 'Microsoft': 1, 'GameSpot': 1, 'Greg Kasavin': 1, 'MySpace': 1, 'Digg': 1, 'Brand New Paul': 1, 'Core Security': 1, 'Syngress': 1, 'Astaro': 1, 'Astaro Security Gateway': 1, 'map': 1, 'Security Weekly Gear': 1, 'Larry ""Uncle Larry"" Pesce': 1, 'Paul Asadoorian': 1, 'Nick ""Twitchy"" Depetrillo': 1, 'Joe': 1, 'Conlin Email': 1, 'Justice Society of America': 1, 'Brian Nixon': 2, 'Review of Justice Society of America': 1, '37:34 Review': 1, '1:13:52': 1, 'Special 
Education': 1, '1:31:17': 1, 'Collected Comics Library': 1, 'David Wallace': 1, '2:01:30 Closing': 1, 'Apple': 1, 'Steve Jobs': 1, 'Macworld Conference & Expo': 1, 'San Francisco': 1, 'Moscone West': 1, 'Paramount': 1, 'Apple TV': 1, 'Mac': 1, 'iPhone': 1, 'Ben': 3, 'Deathly Hallows': 1, 'Hallow': 1, 'gallow': 1, 'James hanging': 1, 'OOTP': 1, 'Hermiones': 1, 'Jo': 1, ...}","{'Italian': 6, 'Gil Fronsdal': 2, 'Matt White Band': 2, 'iTunes': 3, 'Joe Kroll': 2, 'North': 2, 'IndieFeed Alternative Modern Rock': 2, 'Paul': 2, 'Brian Nixon': 2, 'Ben': 3, 'Spanish': 2, 'Guava Duff': 2}","[{0: ['American', 'Dave', 'European Union', 'Italian', 'Rick', 'Rick Steves', 'iTunes'], 1: ['Bristol Universitys Peter Rogers', 'Drs Chris', 'E.U.', 'Helen', 'Jeffrey Gordon', 'Joe Kroll', 'Kingdom', 'London University', 'Matt White Band', 'Nena', 'Nonprofit Boot Camp', 'Ray Parker Jr.', 'Roger Corder', 'The Kingdom of God', 'University of St Louis', 'Zencast 86 - Right Livelihood'], 2: ['Gil Fronsdal']}]",0.624332,,"[American, Dave, European Union, Italian, Rick, Rick Steves, iTunes]","[Bristol Universitys Peter Rogers, Drs Chris, E.U., Helen, Jeffrey Gordon, Joe Kroll, Kingdom, London University, Matt White Band, Nena, Nonprofit Boot Camp, Ray Parker Jr., Roger Corder, The Kingdom of God, University of St Louis, Zencast 86 - Right Livelihood]",[Gil Fronsdal],,,,
7,2007-01-08,"Photoshop CS3 Way, Photoshop CS3 Public Beta, Radio Detective Story Hour, North, Nick, Nora Charles, North, Frances, Richard Lockridge, Jerry North, Joseph Curtain, Pam North, Alice Frost, Uncut - Kissme, Uncut, IndieFeed Alternative Modern Rock, Italian, WindowtotheMagic, A Binaural Adventure, Animal Kingdom, Walt Disney World, Mousefest, Expedition Everest, Patrick, Paul, BINAURAL, PS3, Castlevania Portrait of Ruin, Viva Pinata, Phantasy Star Universe, Wii Sports, Zelda, Karaoke Revolution:, American Idol, Microsoft, GameSpot, Greg Kasavin, MySpace, Digg, iTunes, Paul, Brand New Paul, Core Security, Syngress, Astaro, Astaro Security Gateway, map, Security Weekly Gear, Larry ""Uncle Larry"" Pesce, Paul Asadoorian, Nick ""Twitchy"" Depetrillo, Joe, Conlin Email, Justice Society of America, Brian Nixon, Review of Justice Society of America, Brian Nixon, 37:34 Review, 1:13:52, Special Education, 1:31:17, Collected Comics Library, David Wallace, 2:01:30 Closing","{'Photoshop CS3 Way': 1, 'Photoshop CS3 Public Beta': 1, 'Radio Detective Story Hour': 1, 'North': 2, 'Nick': 1, 'Nora Charles': 1, 'Frances': 1, 'Richard Lockridge': 1, 'Jerry North': 1, 'Joseph Curtain': 1, 'Pam North': 1, 'Alice Frost': 1, 'Uncut - Kissme': 1, 'Uncut': 1, 'IndieFeed Alternative Modern Rock': 2, 'Italian': 7, 'WindowtotheMagic': 1, 'A Binaural Adventure': 1, 'Animal Kingdom': 1, 'Walt Disney World': 1, 'Mousefest': 1, 'Expedition Everest': 1, 'Patrick': 1, 'Paul': 2, 'BINAURAL': 1, 'PS3': 1, 'Castlevania Portrait of Ruin': 1, 'Viva Pinata': 1, 'Phantasy Star Universe': 1, 'Wii Sports': 1, 'Zelda': 1, 'Karaoke Revolution:': 1, 'American Idol': 1, 'Microsoft': 1, 'GameSpot': 1, 'Greg Kasavin': 1, 'MySpace': 1, 'Digg': 1, 'iTunes': 2, 'Brand New Paul': 1, 'Core Security': 1, 'Syngress': 1, 'Astaro': 1, 'Astaro Security Gateway': 1, 'map': 1, 'Security Weekly Gear': 1, 'Larry ""Uncle Larry"" Pesce': 1, 'Paul Asadoorian': 1, 'Nick ""Twitchy"" Depetrillo': 1, 'Joe': 1, 'Conlin 
Email': 1, 'Justice Society of America': 1, 'Brian Nixon': 2, 'Review of Justice Society of America': 1, '37:34 Review': 1, '1:13:52': 1, 'Special Education': 1, '1:31:17': 1, 'Collected Comics Library': 1, 'David Wallace': 1, '2:01:30 Closing': 1, 'Apple': 1, 'Steve Jobs': 1, 'Macworld Conference & Expo': 1, 'San Francisco': 1, 'Moscone West': 1, 'Paramount': 1, 'Apple TV': 1, 'Mac': 1, 'iPhone': 1, 'Ben': 3, 'Deathly Hallows': 1, 'Hallow': 1, 'gallow': 1, 'James hanging': 1, 'OOTP': 1, 'Hermiones': 1, 'Jo': 1, 'Jamie': 1, 'Andrew': 1, 'Eric': 1, 'Laura': 1, 'Hogwarts': 1, 'Phys Ed': 1, 'Chicken Soup': 1, 'MuggleCast Soul': 1, 'Spanish': 2, 'Spain': 1, 'English': 1, 'Unnateral Helpers - Gettin'': 1, 'Guava Duff': 2, 'Federal Reserve': 1, 'El Fuego': 1, 'Fed': 1, 'NASDAQ': 1, 'Chris Farmer': 1, 'Freesound': 1, 'Patreon': 1, 'Role Playing Public Radio': 1, 'Polaroid': 1, ...}","{'North': 2, 'IndieFeed Alternative Modern Rock': 2, 'Italian': 7, 'Paul': 2, 'iTunes': 2, 'Brian Nixon': 2, 'Ben': 3, 'Spanish': 2, 'Guava Duff': 2}","[{0: ['1:13:52', '1:31:17', '2:01:30 Closing', '37:34 Review', 'A Binaural Adventure', 'Alice Frost', 'American Idol', 'Animal Kingdom', 'Astaro', 'Astaro Security Gateway', 'BINAURAL', 'Brand New Paul', 'Brian Nixon', 'Castlevania Portrait of Ruin', 'Collected Comics Library', 'Conlin Email', 'David Wallace', 'Digg', 'Expedition Everest', 'Frances', 'GameSpot', 'Greg Kasavin', 'Jerry North', 'Joseph Curtain', 'Justice Society of America', 'Karaoke Revolution:', 'Mousefest', 'MySpace', 'Nick ""Twitchy"" Depetrillo', 'Nora Charles', 'North', 'PS3', 'Pam North', 'Phantasy Star Universe', 'Photoshop CS3 Public Beta', 'Photoshop CS3 Way', 'Radio Detective Story Hour', 'Review of Justice Society of America', 'Richard Lockridge', 'Special Education', 'Syngress', 'Uncut', 'Uncut - Kissme', 'Viva Pinata', 'Walt Disney World', 'Wii Sports', 'map'], 1: ['Core Security', 'IndieFeed Alternative Modern Rock', 'Italian', 'Joe', 'Larry ""Uncle Larry"" 
Pesce', 'Microsoft', 'Nick', 'Patrick', 'Paul', 'Paul Asadoorian', 'Security Weekly Gear', 'WindowtotheMagic', 'Zelda', 'iTunes']}]",0.657359,,"[1:13:52, 1:31:17, 2:01:30 Closing, 37:34 Review, A Binaural Adventure, Alice Frost, American Idol, Animal Kingdom, Astaro, Astaro Security Gateway, BINAURAL, Brand New Paul, Brian Nixon, Castlevania Portrait of Ruin, Collected Comics Library, Conlin Email, David Wallace, Digg, Expedition Everest, Frances, GameSpot, Greg Kasavin, Jerry North, Joseph Curtain, Justice Society of America, Karaoke Revolution:, Mousefest, MySpace, Nick ""Twitchy"" Depetrillo, Nora Charles, North, PS3, Pam North, Phantasy Star Universe, Photoshop CS3 Public Beta, Photoshop CS3 Way, Radio Detective Story Hour, Review of Justice Society of America, Richard Lockridge, Special Education, Syngress, Uncut, Uncut - Kissme, Viva Pinata, Walt Disney World, Wii Sports, map]","[Core Security, IndieFeed Alternative Modern Rock, Italian, Joe, Larry ""Uncle Larry"" Pesce, Microsoft, Nick, Patrick, Paul, Paul Asadoorian, Security Weekly Gear, WindowtotheMagic, Zelda, iTunes]",,,,,
8,2007-01-09,"Italian, Italian, Apple, Steve Jobs, Macworld Conference & Expo, San Francisco, Moscone West, Paramount, iTunes, Apple TV, Mac, iPhone, Ben, Ben, Deathly Hallows, Hallow, gallow, James hanging, OOTP, Hermiones, Jo, Jamie, Andrew, Eric, Laura, Hogwarts, Phys Ed, Chicken Soup, MuggleCast Soul, Spanish, Ben, Spain","{'Italian': 8, 'Apple': 1, 'Steve Jobs': 1, 'Macworld Conference & Expo': 1, 'San Francisco': 1, 'Moscone West': 1, 'Paramount': 1, 'iTunes': 1, 'Apple TV': 1, 'Mac': 1, 'iPhone': 1, 'Ben': 3, 'Deathly Hallows': 1, 'Hallow': 1, 'gallow': 1, 'James hanging': 1, 'OOTP': 1, 'Hermiones': 1, 'Jo': 1, 'Jamie': 1, 'Andrew': 1, 'Eric': 1, 'Laura': 1, 'Hogwarts': 1, 'Phys Ed': 1, 'Chicken Soup': 1, 'MuggleCast Soul': 1, 'Spanish': 3, 'Spain': 1, 'English': 1, 'Unnateral Helpers - Gettin'': 1, 'IndieFeed Alternative Modern Rock': 1, 'Guava Duff': 2, 'Federal Reserve': 1, 'El Fuego': 1, 'Fed': 1, 'NASDAQ': 1, 'Chris Farmer': 1, 'Freesound': 1, 'Patreon': 1, 'Role Playing Public Radio': 1, 'Polaroid': 1, 'InDesign': 1, 'New Year': 1, 'The Advanced Selling Podcast': 1, 'The Commandments of Selling': 1, 'Bill': 1, 'Bryan': 1, 'Julie': 1, 'Henry': 1, 'ELOISE': 1, 'Paris': 1, 'Donde esta mi esposo': 1, 'Jen': 1, 'pizahnkyeh': 1, 'Ukrainian': 1, 'pysanka Kiska': 1, 'pl = Kisky': 1, 'Al': 1, 'Christian': 1, 'Curtis Hixon Hall - Tampa': 1, 'FL': 1, 'Prophet': 1, 'Wharf Rat': 1, 'Moritz': 1, 'Magic Realm': 1, 'Hill Giant': 1, 'Hawthorne': 1, 'Mary': 1, 'Sam': 1, 'Bonobo Beach': 1, 'Tom': 1, 'Hustle - Energy of Death': 1, 'Hustle': 1, 'Beirut': 2, 'IndieFeed Indie Pop': 1, 'New Years Eve': 1, 'Eugene Burger - Magic': 1, 'Eugene Burger': 2, 'Stagebill': 1, 'Magic Castle': 1, 'Hollywood': 1, 'California': 1, 'Magic': 1, 'How Magicians Think': 1, 'Great Britain': 1, 'Canada': 1, 'Belgium': 1, 'Finland': 1, 'Japan': 1, 'PBS': 1, 'The Art of Magic': 1, 'The Learning Channel': 1, 'CNN': 1, 'D.J. 
Grothe': 1, 'Burger': 1, 'Spirit Theater': 1, 'Israeli': 1, 'Uri Geller': 1, 'American': 1, ...}","{'Italian': 8, 'Ben': 3, 'Spanish': 3, 'Guava Duff': 2, 'Beirut': 2, 'Eugene Burger': 2}","[{-1: ['Andrew', 'Eric', 'San Francisco', 'iTunes'], 1: ['Apple', 'Mac', 'iPhone'], 2: ['Apple TV', 'Chicken Soup', 'Deathly Hallows', 'Hallow', 'Hermiones', 'Hogwarts', 'James hanging', 'Jamie', 'Jo', 'Laura', 'Macworld Conference & Expo', 'Moscone West', 'MuggleCast Soul', 'OOTP', 'Paramount', 'Phys Ed', 'Steve Jobs', 'gallow'], 3: ['Ben'], 0: ['Italian', 'Spain', 'Spanish']}]",0.620065,"[Andrew, Eric, San Francisco, iTunes]","[Italian, Spain, Spanish]","[Apple, Mac, iPhone]","[Apple TV, Chicken Soup, Deathly Hallows, Hallow, Hermiones, Hogwarts, James hanging, Jamie, Jo, Laura, Macworld Conference & Expo, Moscone West, MuggleCast Soul, OOTP, Paramount, Phys Ed, Steve Jobs, gallow]",[Ben],,,
9,2007-01-10,"English, Unnateral Helpers - Gettin', IndieFeed Alternative Modern Rock, Guava Duff, Guava Duff, Italian, Italian, Federal Reserve, El Fuego, Spanish, Fed, NASDAQ, Chris Farmer, Freesound, Patreon, Role Playing Public Radio","{'English': 1, 'Unnateral Helpers - Gettin'': 1, 'IndieFeed Alternative Modern Rock': 1, 'Guava Duff': 2, 'Italian': 7, 'Federal Reserve': 1, 'El Fuego': 1, 'Spanish': 2, 'Fed': 1, 'NASDAQ': 1, 'Chris Farmer': 1, 'Freesound': 1, 'Patreon': 1, 'Role Playing Public Radio': 1, 'Polaroid': 1, 'InDesign': 1, 'New Year': 1, 'The Advanced Selling Podcast': 1, 'The Commandments of Selling': 1, 'Bill': 1, 'Bryan': 1, 'Julie': 1, 'Henry': 1, 'ELOISE': 1, 'Paris': 1, 'Donde esta mi esposo': 1, 'Jen': 1, 'pizahnkyeh': 1, 'Ukrainian': 1, 'pysanka Kiska': 1, 'pl = Kisky': 1, 'Al': 1, 'Christian': 1, 'Curtis Hixon Hall - Tampa': 1, 'FL': 1, 'Prophet': 1, 'Wharf Rat': 1, 'Moritz': 1, 'Magic Realm': 1, 'Hill Giant': 1, 'Hawthorne': 1, 'Mary': 1, 'Sam': 1, 'Bonobo Beach': 1, 'Tom': 1, 'Hustle - Energy of Death': 1, 'Hustle': 1, 'Beirut': 2, 'IndieFeed Indie Pop': 1, 'New Years Eve': 1, 'Eugene Burger - Magic': 1, 'Eugene Burger': 2, 'Stagebill': 1, 'Magic Castle': 1, 'Hollywood': 1, 'California': 1, 'Magic': 1, 'How Magicians Think': 1, 'Great Britain': 1, 'Canada': 1, 'Belgium': 1, 'Finland': 1, 'Japan': 1, 'PBS': 1, 'The Art of Magic': 1, 'The Learning Channel': 1, 'CNN': 1, 'D.J. 
Grothe': 1, 'Burger': 1, 'Spirit Theater': 1, 'Israeli': 1, 'Uri Geller': 1, 'American': 1, 'John Edward': 1, 'Rebecca Sheir's': 1, 'The End as Beginning:': 1, 'Jewish': 1, 'Black Jack Justice': 1, 'The Trouble With Doubles': 1, 'Jack': 1, 'Florida': 1, 'Trixie': 1, 'Q & A': 1, 'Pixie': 1, 'Knit From Your Stash 2007': 1, 'DCU': 1, 'Star Superman 5': 1, 'Action Comics': 1, 'Justice': 1, 'Elm Street': 1, 'Brent AKA Knightwingbk': 1, '1:40:31': 1, 'Star Superman 5 and 6 Review': 1, 'Superman': 1, '2:42:39': 1}","{'Guava Duff': 2, 'Italian': 7, 'Spanish': 2, 'Beirut': 2, 'Eugene Burger': 2}","[{1: ['Chris Farmer', 'El Fuego', 'Fed', 'Federal Reserve', 'Freesound', 'Guava Duff', 'IndieFeed Alternative Modern Rock', 'NASDAQ', 'Role Playing Public Radio', 'Unnateral Helpers - Gettin''], 0: ['English', 'Italian', 'Spanish'], -1: ['Patreon']}]",0.588905,[Patreon],"[English, Italian, Spanish]","[Chris Farmer, El Fuego, Fed, Federal Reserve, Freesound, Guava Duff, IndieFeed Alternative Modern Rock, NASDAQ, Role Playing Public Radio, Unnateral Helpers - Gettin']",,,,,


In [0]:
from datetime import timedelta
# Show complete frames when rendering results in this notebook:
# no row/column truncation and terminal-width auto-detection.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# None means "do not truncate cell contents". The previous value -1 has been
# deprecated since pandas 1.0 in favour of None.
pd.set_option('display.max_colwidth', None)

def event_duration(df, n_events=7, verbose=True):
    '''
    Chain events across consecutive rows (time windows) of ``df``.

    Takes in the dataframe from "expand_groups_to_event_columns". The code
    reads a 'dates' column and the event columns labelled 0 .. n_events-1,
    whose cells are concatenated with ``+`` (presumably lists of features).

    For every two consecutive rows R1 and R2, each of the n_events x n_events
    pairs (R1[j], R2[k]) is scored with "percent_overlap"; the pair score is
    the max of the three values it returns, or -1 when it raises TypeError
    (presumably an empty / NaN cell - verify against "percent_overlap").

    * If the best score of the window is non-zero, the first pair reaching it
      is the match. When the matched R1 event is the one matched as R2 event
      in the previous window, the last recorded event is extended (features
      concatenated, duration + 1, end + 1 day, overlap averaged with the new
      score); otherwise a new 2-day event R1[j] + R2[k] is recorded.
    * If the best score is 0, every R1 event that scored 0 against some R2
      event is recorded as its own 1-day event, and the running chain is
      closed so a later match cannot be merged into an unrelated row.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide frame as described above, rows in chronological order.
    n_events : int, default 7
        Number of event columns (labels 0 .. n_events-1) to compare per row.
    verbose : bool, default True
        Print ``i`` and the set of unmatched R1 events for zero-overlap rows.

    Returns
    -------
    pandas.DataFrame
        Columns ['features', 'duration', 'start', 'end', 'overlap'], one row
        per event; empty when ``df`` has fewer than two rows.
    '''
    columns = ['features', 'duration', 'start', 'end', 'overlap']
    # Collect plain records and build the frame once at the end:
    # DataFrame.append is quadratic and no longer exists in pandas 2.x.
    records = []
    # R2 column matched in the previous window; NaN means "no open chain"
    # (NaN never compares equal to a column index).
    prev_group = np.nan

    # Compare events for every two consecutive rows in the dataset
    for i in range(len(df) - 1):
        R1 = df.iloc[i, :]
        R2 = df.iloc[i + 1, :]

        # m[j][k] is the overlap score of R1 event j vs R2 event k
        m = []
        for j in range(n_events):
            row_scores = []
            for k in range(n_events):
                try:
                    OL, OM, OR = percent_overlap(R1[j], R2[k])
                    row_scores.append(max(OL, OM, OR))
                except TypeError:
                    # pair could not be compared; -1 sorts below any real score
                    row_scores.append(-1)
            m.append(row_scores)

        max_score = max(score for row_scores in m for score in row_scores)
        m_arr = np.array(m)

        if max_score != 0:
            # NOTE(review): when no pair is comparable max_score is -1 and this
            # branch still runs, exactly as before - confirm that is intended.
            # np.where scans row-major, so ties resolve to the lowest (j, k).
            matches = list(zip(*np.where(m_arr == max_score)))
            r1_idx, r2_idx = (int(v) for v in matches[0])

            if r1_idx == prev_group:
                # continuous event: extend the most recently recorded event
                last = records[-1]
                records[-1] = {
                    'features': last['features'] + R2[r2_idx],
                    'duration': last['duration'] + 1,
                    'start': last['start'],
                    'end': timedelta(days=1) + last['end'],
                    'overlap': (last['overlap'] + max_score) / 2,
                }
            else:
                # new event spanning the two windows
                records.append({
                    'features': R1[r1_idx] + R2[r2_idx],
                    'duration': 2,
                    'start': R1['dates'],
                    'end': R2['dates'],
                    'overlap': max_score,
                })
            prev_group = r2_idx

        else:
            # Zero overlap: every R1 event that was comparable becomes an
            # individual one-day event.
            unmatched = sorted({int(j) for j in np.where(m_arr == 0)[0]})
            if verbose:
                print(i, set(unmatched))
            for e in unmatched:
                # Read from R1 (positional row i) so both branches address
                # the same row even if df's index is not 0..n-1.
                records.append({
                    'features': R1[e],
                    'duration': 1,
                    'start': R1['dates'],
                    'end': R1['dates'],
                    'overlap': 0,
                })
            # The chain is broken here; without this reset a stale column
            # index could merge a later match into an unrelated singleton.
            prev_group = np.nan

    return pd.DataFrame(records, columns=columns)
        
       
# Build the event table from `test` (a frame defined in an earlier cell, not shown here)
# and leave it as the cell's last expression so the notebook renders it.
testout = event_duration(test)
testout

1 {0, 1}
2 {0, 1, 2, 3}
3 {0, 1}
4 {0, 1}
5 {0, 1}
11 {0, 1}
12 {0}
13 {0, 1, 2}
14 {0, 1}
15 {0, 1}
16 {0, 1}
18 {0, 1, 2, 3}
19 {0, 1}
20 {0, 1, 2, 3, 4, 5, 6}
21 {0, 1, 2, 3, 4, 5, 6}
22 {0, 1, 2}
23 {0}
25 {0, 1}
26 {0}
27 {0, 1, 2}
28 {0, 1}
29 {0}
30 {0, 1, 2}
31 {0, 1}
32 {0, 1, 2}
33 {0, 1, 2, 3}
36 {0, 1, 2}
37 {0, 1}
38 {0, 1, 2, 3}
40 {0, 1, 2}
46 {0, 1, 2, 3, 4, 5}
47 {0, 1}


Unnamed: 0,features,duration,start,end,overlap
0,"[- Murder Clinic, Anne Farnsworth, Atari, Auld Lang Synes, Disney Year, Episode, Gears of War, Gran Turismo, Murder Clinic, NESes, New Year Resolutions, PS3, Paul Back, Paul Barrie, Podsafe Music Network, Pokemon Ruby, Radio Detective Story Hour, Ricky, Ricky Brigante, Saturday Morning, Super NESes, The Disney Year, Viva Pinata, Zelda:, regulars, 8MB, Arthur Brooks, Dennis Pragers, New Year Resolutions, Prager, Syracuse University, Who Really Cares: The suprising, julienne]",2,2007-01-01,2007-01-02,12.5
1,"[8MB, Arthur Brooks, Dennis Pragers, New Year Resolutions, Prager, Syracuse University, Who Really Cares: The suprising, julienne]",1,2007-01-02,2007-01-02,0.0
2,[Italian],1,2007-01-02,2007-01-02,0.0
3,"[Caltech, Caltech Lecture Series, Chloe Day, Chloe Day -, Idiotchild - Mouth Watering, Molly Ringwald, Religion & Politics, Sweet Charity, Symposium on Science, The End of Faith]",1,2007-01-03,2007-01-03,0.0
4,"[IndieFeed Alternative Modern Rock, IndieFeed Indie Pop, Skepticality]",1,2007-01-03,2007-01-03,0.0
5,[Sam Harris],1,2007-01-03,2007-01-03,0.0
6,[Spanish],1,2007-01-03,2007-01-03,0.0
7,"[0:00 Opening, 1:17:47 Manhunter Challenge, Brent AKA Knightwingbk, Comic Timing Time, Comic Timings Ian Levenstein, John Mayo, Justice Society of America, Manhunter Challenge, Raging Bullets of, Speeding Bullets]",1,2007-01-04,2007-01-04,0.0
8,"[DC, Jim, Sean]",1,2007-01-04,2007-01-04,0.0
9,"[AP, AZ, American, Ben, Celtfather, Celtic, Celtic Music, Celtic Music Magazine, Celtic Music Podcast, Cindy, Dublin, French, IndieFeed Alternative Modern Rock, IndieFeed Indie Pop, Ireland, Irish, Irish & Celtic Music Podcast, Jason, Marc Gunn, Mary, Sam, Spanish, Sun, Tom, iTunes]",1,2007-01-05,2007-01-05,0.0


### Step 3: Assign scores

In [0]:
### Step 3: Assign scores
def is_event_feature(df):
    '''
    Return the entities that are event features.

    All entities come from "get_all_unique_entities"; the entities flagged as
    non-features come from "not_event_feature" applied to the 'groups' column
    of df. The result is the difference between the two.

    Parameters
    ----------
    df : dataframe passed through to "not_event_feature" (must have a
         'groups' column).

    Returns
    -------
    set
        Entities that appear in the corpus and are not flagged as noise.
    '''
    sentences = get_all_unique_entities()
    all_entities = {item for sublist in sentences for item in sublist}
    # The previous code called is_event_feature(df, 'groups') here, i.e. itself
    # with a wrong arity; the (wide_df, flat_list) pair comes from not_event_feature.
    _, not_features_flat = not_event_feature(df, 'groups')
    # Lists do not support "-"; use a set difference.
    return all_entities - set(not_features_flat)

def sort_features():
    '''
    Flatten the nested lists returned by "get_sentences_2" for the
    module-level "df_window" into a single list and return it sorted
    in ascending order.
    '''
    flattened = []
    for window_features in get_sentences_2(df_window):
        flattened.extend(window_features)
    return sorted(flattened)

def feature_score(sorted_features):
    '''
    Planned behaviour: loop through each item (a dictionary) in the list,
    add a weight to the value of every key that is an event feature
    (value = value + weight), and return sorted_features.

    NOTE: not implemented yet - the body consists only of this docstring,
    so calling the function currently returns None.
    '''

def event_score(events_df):
    '''
    Planned behaviour: take the events_df returned by "event_duration",
    flatten it so that each row is a feature and its duration, and return
    that mapping as the "duration" dictionary.

    NOTE: not implemented yet - the body consists only of this docstring,
    so calling the function currently returns None.
    '''

def final_score(sorted_features, duration):
    '''
    Scale feature scores by event duration.

    Takes the sorted_features output from "feature_score" and, for each key,
    multiplies its value by that key's value in the duration dictionary.

    Parameters
    ----------
    sorted_features : dict or list of dict
        Mapping(s) of feature -> score.
    duration : dict
        Mapping of feature -> duration.

    Returns
    -------
    dict or list of dict
        Same structure as sorted_features with every value scaled; the
        inputs are not modified.

    Features missing from duration keep their score (factor 1).
    NOTE(review): factor 1 is an assumption (1 is the minimum duration
    recorded by "event_duration") - confirm the intended treatment.
    '''
    # The previous body ignored both arguments, called the undefined name
    # "features_score" and multiplied the results of two unimplemented stubs.
    def _scale(scores):
        return {feature: value * duration.get(feature, 1)
                for feature, value in scores.items()}

    if isinstance(sorted_features, dict):
        return _scale(sorted_features)
    return [_scale(scores) for scores in sorted_features]
    
    
def get_all_unique_entities():
    '''
    Split every entry of the module-level df_window['entity'] on ", " and
    return the resulting lists, one per entry, in order (labels 0 .. n-1).
    No de-duplication is performed.
    '''
    entity_col = df_window['entity']
    return [entity_col[pos].split(", ") for pos in range(len(entity_col))]
    
def not_event_feature(df, col):
    '''
    Takes in a dataframe, expands the grouped dictionary column horizontally,
    Returns the wide dataframe and a flat list of entities that are not event features

    Parameters
    ----------
    df  : dataframe holding the grouped column.
    col : name of the column passed to json_normalize (one dictionary of
          group label -> entities per row).

    Returns
    -------
    (wide_df, not_features_flat)
        wide_df is df with the normalized group columns concatenated on the
        right; not_features_flat is the flattened content of the column
        labelled -1, or an empty list when there is no such column.
    '''
    df2 = json_normalize(df[col])
    wide_df = pd.concat([df, df2], axis=1)
    # dbscan has decided is noise with group label -1
    if -1 not in wide_df.columns:
        return wide_df, []
    # Rows without a noise group hold NaN in this column; iterating a float
    # raises TypeError, so only real containers are flattened.
    not_features_flat = [
        item
        for sublist in wide_df[-1]
        if isinstance(sublist, (list, tuple, set))
        for item in sublist
    ]
    return wide_df, not_features_flat