In [1]:
import functools
import random
import math
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, export_graphviz

In [2]:
# Variables

# File prefix
file_pre = 'normw_data_'

# File suffix
file_suf = '.csv'

# Start year
start = 1990

# Number of years to process
end = 21 

# Number of iterations
runs = 5

# Training/Test percent split
training_percent = 0.85

In [3]:
# Create file list
all_files = []
for i in range(end+1):
    all_files.append(file_pre + str(start + i) + file_suf)

random.shuffle(all_files)

# Need data for if hit or not

hits = pd.read_csv('labelled_data.csv', encoding = "ISO-8859-1")
hits['song'] = hits['title'].str.lower() 
hits['artist_name'] = hits['artist_name'].str.lower() 


# Functions for Metrics
def acc(data):
    in_ac = [float(1) if a==b else 0 for (a,b) in data]
    return functools.reduce(lambda x, y: x + y, in_ac) / len(in_ac)
def pr(data):
    l = len(data)
    tp = functools.reduce(lambda x, y: x + y, [float(1) if a==b else 0 for (a,b) in data])
    fp = functools.reduce(lambda x, y: x + y, [float(1) if a==0 and p==1 else 0 for (a,p) in data])
    fn = functools.reduce(lambda x, y: x + y, [float(1) if a==1 and p==0 else 0 for (a,p) in data])
    
    pr = tp / (tp + fp)
    rc = tp / (tp + fn)
    return (pr, rc)

In [4]:
# Ingest files, randomize order, train model, predict for test set, and score.

for i in range(runs):
    print ("Run #" + str(i))
    
    # Ingest and randomize row order
    random.shuffle(all_files)
    df = pd.DataFrame()
    for file in all_files:
        chunks = pd.read_csv(file, encoding = "ISO-8859-1", chunksize=1000, low_memory=False,
                           usecols=['artist_name', 'song', 'artist_nid','song_nid', 'bars', 'beats',
                                    'sections_start', 'segments', 'segments_loudness_max', 'tatums'])
        tmp = pd.concat((chunk for chunk in chunks), ignore_index=True)
        df = pd.concat([df, tmp], ignore_index=True)
        
    df = pd.merge(df, hits, on=['song', 'artist_name'], how='inner').reindex(np.random.permutation(df.index))
    
    # Split training/testing dataset
    train_num = math.ceil (training_percent * len(df.index))
    test_num = len(df.index) - train_num
    train_df = df.head(train_num)
    test_df = df.tail(test_num)
    
    # Train Model
    dt = DecisionTreeClassifier(min_samples_split=100, min_samples_leaf=100, max_depth=4, max_leaf_nodes=15)
    y = train_df["hit"] 
    X = train_df[['bars', 'beats', 'sections_start', 'segments', 'segments_loudness_max', 'tatums']]
    dt.fit(X, y)
    
    # Predict and score
    sample_size = 0 # Use whole test data set
    sample = test_df
    if sample_size:
        sample = sample.sample(sample_size)
    results = []
    for index, row in sample.iterrows():
        tmp_df = pd.DataFrame([row])
        pred = dt.predict(tmp_df[['bars', 'beats', 'sections_start', 'segments', 'segments_loudness_max', 'tatums']])
        results.append((row['hit'], pred[0]))
    print ("acc: " + str(acc(results)))
    print ("precision/recall:" + str(pr(results)))
    export_graphviz(dt, out_file='dt' + str(i) + '.dot')

Run #0
acc: 0.9700441483166762
precision/recall:(1.0, 0.9700441483166762)
Run #1
acc: 0.9700170634598196
precision/recall:(1.0, 0.9700170634598196)
Run #2
acc: 0.9694753663226889
precision/recall:(1.0, 0.9694753663226889)
Run #3
acc: 0.969529536036402
precision/recall:(1.0, 0.969529536036402)
Run #4
acc: 0.9681752931935754
precision/recall:(1.0, 0.9681752931935754)
