In [17]:
import functools
import random
import math
import pandas as pd
from sklearn.neural_network import MLPClassifier

In [18]:
# Variables

# File prefix
file_pre = 'normw_data_'

# File suffix
file_suf = '.csv'

# Start year
start = 1990

# Number of years to process
end = 21 

# Number of iterations
runs = 5

# Training/Test percent split
training_percent = 0.85

In [19]:
# Create file list
all_files = []
for i in range(end+1):
    all_files.append(file_pre + str(start + i) + file_suf)

random.shuffle(all_files)

# Need data for if hit or not

hits = pd.read_csv('labelled_data.csv', encoding = "ISO-8859-1")
hits['song'] = hits['title'].str.lower() 
hits['artist_name'] = hits['artist_name'].str.lower() 


# Functions for Metrics
def acc(data):
    in_ac = [float(1) if a==b else 0 for (a,b) in data]
    return functools.reduce(lambda x, y: x + y, in_ac) / len(in_ac)
def pr(data):
    l = len(data)
    tp = functools.reduce(lambda x, y: x + y, [float(1) if a==b else 0 for (a,b) in data])
    fp = functools.reduce(lambda x, y: x + y, [float(1) if a==0 and p==1 else 0 for (a,p) in data])
    fn = functools.reduce(lambda x, y: x + y, [float(1) if a==1 and p==0 else 0 for (a,p) in data])
    
    pr = tp / (tp + fp)
    rc = tp / (tp + fn)
    return (pr, rc)

In [21]:
for i in range(runs):
    print ("Run #" + str(i))
    
    # Ingest and randomize row order
    random.shuffle(all_files)
    train_num = math.ceil (training_percent * len(all_files))
    file_list = all_files[:train_num]
    test_list = all_files[train_num:]
    print (str(file_list))
    print (str(test_list))
    df = pd.DataFrame()
    for file in file_list:
        chunks = pd.read_csv(file, encoding = "ISO-8859-1", chunksize=1000, low_memory=False,
                           usecols=['artist_name', 'song', 'artist_nid','song_nid', 'bars', 'beats',
                                    'sections_start', 'segments', 'segments_loudness_max', 'tatums'])
        tmp = pd.concat((chunk for chunk in chunks), ignore_index=True)
        df = pd.concat([df, tmp], ignore_index=True)
    df = pd.merge(df, hits, on=['song', 'artist_name'], how='inner')

    # Train Model
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                        hidden_layer_sizes=(6), random_state=1)
    y = df["hit"].values
    X = df[['bars', 'beats', 'sections_start', 'segments', 'segments_loudness_max', 'tatums']].values
    clf.fit(X, y)

    # Predict and score
    test_df = pd.DataFrame()
    sample_size = 0 # Use whole test data set
    for file in test_list:
        chunks = pd.read_csv(file, encoding = "ISO-8859-1", chunksize=1000, low_memory=False,
                           usecols=['artist_name', 'song', 'artist_nid','song_nid', 'bars', 'beats',
                                    'sections_start', 'segments', 'segments_loudness_max', 'tatums'])
        tmp = pd.concat((chunk for chunk in chunks), ignore_index=True)
        test_df = pd.concat([test_df, tmp], ignore_index=True)
    sample = test_df
    if sample_size:
        sample = sample.sample(sample_size)
    sample = pd.merge(sample, hits, on=['song', 'artist_name'], how='inner')
    results = []
    for index, row in sample.iterrows():
        tmp_df = pd.DataFrame([row])
        pred = clf.predict(tmp_df[['bars', 'beats', 'sections_start', 'segments', 'segments_loudness_max', 'tatums']].values)
        results.append((row['hit'], pred[0]))
    print ("acc: " + str(acc(results)))
    print ("precision/recall:" + str(pr(results)))

Run #0
['normw_data_2003.csv', 'normw_data_2005.csv', 'normw_data_1994.csv', 'normw_data_1997.csv', 'normw_data_2011.csv', 'normw_data_2001.csv', 'normw_data_1992.csv', 'normw_data_2010.csv', 'normw_data_1990.csv', 'normw_data_1993.csv', 'normw_data_2007.csv', 'normw_data_2008.csv', 'normw_data_1991.csv', 'normw_data_2002.csv', 'normw_data_2006.csv', 'normw_data_2000.csv', 'normw_data_2004.csv', 'normw_data_1998.csv', 'normw_data_1999.csv']
['normw_data_2009.csv', 'normw_data_1995.csv', 'normw_data_1996.csv']
acc: 0.9697557635243095
precision/recall:(0.9998823252530007, 0.9698664536011871)
Run #1
['normw_data_1994.csv', 'normw_data_2005.csv', 'normw_data_1997.csv', 'normw_data_2004.csv', 'normw_data_2009.csv', 'normw_data_2001.csv', 'normw_data_1996.csv', 'normw_data_1991.csv', 'normw_data_2000.csv', 'normw_data_2002.csv', 'normw_data_2011.csv', 'normw_data_2010.csv', 'normw_data_2006.csv', 'normw_data_2003.csv', 'normw_data_1999.csv', 'normw_data_1998.csv', 'normw_data_1993.csv', 'nor