In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt

In [2]:
from plotly.offline import plot, init_notebook_mode, iplot
import plotly.figure_factory as ff
# Initialise plotly's offline notebook mode before any iplot() call below.
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead of
# embedding it in the notebook, which would require network access — confirm.
init_notebook_mode(connected=True)

### Data Preprocessing

In [3]:
# Load the tags of interest (one tag per line).
# The context manager guarantees the file handle is closed, and blank lines
# are skipped so that an empty string never ends up in the column list.
with open("tags-of-interest.txt", "r") as ftags:
    tags = [tag for tag in (line.rstrip() for line in ftags) if tag]
columns = ["clip_id"] + tags + ["mp3_path"]       # Clip ids + mood tags + mp3 paths

In [4]:
# Read the annotations file and keep only the columns of interest
data = pd.read_csv("annotations.csv")[columns]

In [5]:
# Preview the first 10 rows of the selected columns
data.head(10)

Unnamed: 0,clip_id,heavy,eerie,spacey,quiet,electric,happy,airy,space,loud,...,sad,slow,scary,jazzy,calm,different,upbeat,soft,water,mp3_path
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...
1,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...
2,10,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...
3,11,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...
4,12,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...
5,14,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,c/lvx_nova-lvx_nova-01-contimune-30-59.mp3
6,19,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,c/lvx_nova-lvx_nova-01-contimune-175-204.mp3
7,21,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,c/lvx_nova-lvx_nova-01-contimune-233-262.mp3
8,23,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,c/lvx_nova-lvx_nova-01-contimune-291-320.mp3
9,25,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0/american_bach_soloists-j_s__bach__cantatas_v...


In [6]:
def mood_count(row, tags):
    """Return how many of the given mood tags are present in a row.

    Each ``row[tag]`` value is converted with ``int()`` and the results are
    added up, so a row with no tag set yields 0.

    Args:
        row: mapping-like object indexable by tag name (e.g. a DataFrame row).
        tags: iterable of tag names to look up in ``row``.

    Returns:
        The integer sum of ``int(row[tag])`` over all ``tags``.
    """
    return sum(int(row[name]) for name in tags)

In [7]:
# Remove rows without any mood info.
# The per-row tag count is computed with a vectorised column-wise sum rather
# than a Python-level loop over every row; values are cast to integers first,
# as the row-wise helper does with int().
tag_totals = data[tags].astype("int64").sum(axis=1)
# assign() returns a new frame, so no column is written into a frame that was
# itself obtained by selecting a subset of columns.
data = data.assign(mood_count=tag_totals)
data = data[data["mood_count"] != 0]

### Statistics

In [8]:
# Create a table with two counts per tag:
#   - "non unique": rows where the tag is set, whatever other tags are set
#   - "unique": rows where the tag is the only mood tag set
count_plot = [["Mood Tag", "Count (non unique)", "Count (unique)"]]

unique_set = data[data["mood_count"] == 1] # Keep only rows with exactly one label

for tag in tags:
    # Summing a boolean mask yields 0 when the tag never occurs, whereas
    # value_counts()[1] would raise a KeyError in that case.
    total = int((data[tag] == 1).sum())
    alone = int((unique_set[tag] == 1).sum())
    count_plot.append([tag, total, alone])

# Rows are listed in the same order as `tags`.
table = ff.create_table(count_plot)
iplot(table)

In [9]:
# Count the rows for each mood_count value, i.e. how many rows have exactly k mood tags set
data["mood_count"].value_counts()

1    4748
2    1447
3     492
4     118
5      41
6      10
7       4
8       2
Name: mood_count, dtype: int64

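If we get rid of datapoints with more than one class, we're left with 4748 datapoints, i.e. an average of about 4748/27 ≈ 176 datapoints per tag, which is not a lot. The split is also very uneven: "slow" alone already takes 2135 examples, leaving only about (4748 − 2135)/26 ≈ 100 datapoints per tag for the remaining 26 tags. (Too little data?...)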