In [1]:
import pandas as pd
import numpy as np
import time
from scipy.spatial.distance import pdist
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from multiprocessing import Pool
from scipy.sparse import coo_matrix
from scipy.sparse import save_npz

In [2]:
embedding_cols = [
    'year',
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'instrumentalness',
    'key',
    'liveness',
    'loudness',
    'mode',
    'speechiness',
    'tempo',
    'time_signature',
    'valence'
]

msd = pd.read_hdf('data/full_msd_with_audio_features.h5', key='df')[['song_id'] + embedding_cols]
# this is because song_id is in the format:  "b'SOSIYAD12A8C14097F'"
msd['song_id'] = msd['song_id'].str.slice(start=2, stop=-1)
msd.head()

Unnamed: 0,song_id,year,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,SOSIYAD12A8C14097F,2006,0.0142,0.471,254537,0.91,0.764,7,0.403,-4.847,1,0.109,175.816,4,0.282
1,SOHPHTP12A8C13BF53,0,0.843,0.479,162173,0.315,4e-06,9,0.0911,-12.951,0,0.0991,97.886,4,0.309
2,SOFVVGL12A8C13C32F,1999,0.307,0.678,188493,0.787,8e-05,9,0.714,-6.344,1,0.0355,128.181,4,0.969
3,SOHXIRQ12AAA15CF81,2008,0.147,0.804,278600,0.676,0.919,3,0.0797,-8.48,0,0.0437,94.994,4,0.527
4,SOJHDEN12AB018B650,2006,0.129,0.604,267200,0.603,0.0,11,0.185,-4.419,0,0.0507,124.088,4,0.399


In [3]:
col_names = ['user_id', 'song_id', 'play_count']
train_triplets = pd.read_csv('data/train_triplets.txt', sep='\t', names=col_names)
test_visible_triplets = pd.read_csv('data/EvalDataYear1MSDWebsite/year1_test_triplets_visible.txt', sep='\t', names=col_names)
test_hidden_triplets = pd.read_csv('data/EvalDataYear1MSDWebsite/year1_test_triplets_hidden.txt', sep='\t', names=col_names)

train_triplets['is_test'] = False
test_visible_triplets['is_test'] = True
test_hidden_triplets['is_test'] = True

train_triplets['is_hidden'] = False
test_visible_triplets['is_hidden'] = False
test_hidden_triplets['is_hidden'] = True

In [4]:
all_triplets = train_triplets.append(test_visible_triplets, ignore_index=True).append(test_hidden_triplets, ignore_index=True)
all_niplets = all_triplets.merge(msd, left_on='song_id', right_on='song_id')

visible_niplets = all_niplets.loc[all_niplets['is_hidden'] == False]
hidden_niplets = all_niplets.loc[all_niplets['is_hidden'] == True]

In [5]:
all_niplets.shape

(20639742, 19)

In [6]:
visible_niplets.shape

(20107787, 19)

In [7]:
hidden_niplets.shape

(531955, 19)

In [8]:
#This is to remove the users that have test data but not hidden data
visible_users = visible_niplets['user_id'].drop_duplicates()
hidden_users = hidden_niplets['user_id'].drop_duplicates()
blacklisted_users = hidden_users[~hidden_users.isin(visible_users)].values

hidden_niplets = hidden_niplets.loc[~hidden_niplets['user_id'].isin(blacklisted_users)]
all_niplets = all_niplets.loc[~all_niplets['user_id'].isin(blacklisted_users)]

In [9]:
len(blacklisted_users)

4202

In [10]:
all_niplets.shape

(20629059, 19)

In [11]:
visible_niplets.shape

(20107787, 19)

In [12]:
hidden_niplets.shape

(521272, 19)

In [13]:
all_niplets.head()

Unnamed: 0,user_id,song_id,play_count,is_test,is_hidden,year,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,False,False,0,0.656,0.486,112680,0.231,0.577,7,0.128,-15.423,1,0.0445,123.685,4,0.389
1,7c86176941718984fed11b7c0674ff04c029b480,SOAKIMP12A8C130995,1,False,False,0,0.656,0.486,112680,0.231,0.577,7,0.128,-15.423,1,0.0445,123.685,4,0.389
2,76235885b32c4e8c82760c340dc54f9b608d7d7e,SOAKIMP12A8C130995,3,False,False,0,0.656,0.486,112680,0.231,0.577,7,0.128,-15.423,1,0.0445,123.685,4,0.389
3,250c0fa2a77bc6695046e7c47882ecd85c42d748,SOAKIMP12A8C130995,1,False,False,0,0.656,0.486,112680,0.231,0.577,7,0.128,-15.423,1,0.0445,123.685,4,0.389
4,3f73f44560e822344b0fb7c6b463869743eb9860,SOAKIMP12A8C130995,6,False,False,0,0.656,0.486,112680,0.231,0.577,7,0.128,-15.423,1,0.0445,123.685,4,0.389


In [14]:
# Get the full dataset since I will need to put this in COO matrix. It will be difficult matching up the song ids of test and train
# if they are treated independently. Putting them together and then splitting the train and test set when creating the COO matrix is the
# easiest way of ensuring that the indices remain correct.

# Build the train and test CSR matrices.

all_niplets['user_id'] = all_niplets['user_id'].astype("category")
all_niplets['song_id'] = all_niplets['song_id'].astype("category")

num_users = all_niplets['user_id'].cat.codes.unique().shape[0]
num_songs = all_niplets['song_id'].cat.codes.unique().shape[0]

train_data = all_niplets.loc[all_niplets['is_hidden'] == False]
test_data = all_niplets.loc[all_niplets['is_hidden'] == True]

rows_train = train_data['song_id'].cat.codes.copy()
cols_train = train_data['user_id'].cat.codes.copy()
data_train = train_data['play_count'].astype(np.float32)

rows_test = test_data['song_id'].cat.codes.copy()
cols_test = test_data['user_id'].cat.codes.copy()
data_test = test_data['play_count'].astype(np.float32)

train_plays = coo_matrix((data_train, (rows_train, cols_train)), shape=(num_songs, num_users)).tocsr()
test_plays = coo_matrix((data_test, (rows_test, cols_test)), shape=(num_songs, num_users)).tocsr()

user_mapping = np.vstack((all_niplets['user_id'].cat.codes.copy().values,
                          all_niplets['user_id'].values)).T
song_mapping = np.vstack((all_niplets['song_id'].cat.codes.copy().values,
                          all_niplets['song_id'].values)).T

user_id_to_user_index = pd.DataFrame(columns=['sparse_index', 'user_id'], data=user_mapping).drop_duplicates()
song_id_to_song_index = pd.DataFrame(columns=['sparse_index', 'song_id'], data=song_mapping).drop_duplicates()

In [15]:
user_id_to_user_index.head()

Unnamed: 0,sparse_index,user_id
0,796068,b80344d063b5ccb3212f76538f3d9e43d87dca9e
1,538494,7c86176941718984fed11b7c0674ff04c029b480
2,510903,76235885b32c4e8c82760c340dc54f9b608d7d7e
3,159634,250c0fa2a77bc6695046e7c47882ecd85c42d748
4,273980,3f73f44560e822344b0fb7c6b463869743eb9860


In [16]:
user_id_to_user_index.shape

(1107613, 2)

In [17]:
song_id_to_song_index.head()

Unnamed: 0,sparse_index,song_id
0,4785,SOAKIMP12A8C130995
2457,7052,SOAPDEY12A81C210A9
3238,14453,SOBFOVM12A58A7D494
3710,20354,SOBSUJE12A6D4F8CF5
4409,21408,SOBVFZR12A6D4F8AE3


In [18]:
song_id_to_song_index.shape

(168493, 2)

In [19]:
# Save to file
user_id_to_user_index.to_hdf('data/user_mapping.h5', key='df', mode='w')
song_id_to_song_index.to_hdf('data/song_mapping.h5', key='df', mode='w')

save_npz('data/train_sparse', train_plays)
save_npz('data/test_sparse', test_plays)

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block0_values] [items->['sparse_index', 'user_id']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block0_values] [items->['sparse_index', 'song_id']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


In [20]:
# Using multiprocessing.Pool.map and play_count weightings
# LET n = number of songs
# LET m = number of audio features

feature_MUSIC_dict = {
    'danceability': np.array([-0.37, 0.05, -0.35, 0.08, 0.43]),
    'energy': np.array([-0.64, -0.46, -0.13, 0.66, -0.03]),
    'instrumentalness': np.array([0.20, -0.47, 0.28, 0.09, -0.01]),
    'liveness': np.array([-0.69, -0.12, -0.07, 0.43, 0.02]),
    'loudness': np.array([-0.58, -0.19, -0.44, 0.79, -0.21]),
    'valence': np.array([-0.04, 0.18, 0.24, -0.34, 0.18]),
}
# feature_MUSIC_matrix -> m x 5 matrix, where m is the number of audio features in feature_MUSIC_dict
feature_MUSIC_matrix = [MUSIC for MUSIC in feature_MUSIC_dict.values()]

# TODO: use play counts and scale song_vectors before calculating pdist
def get_cosine_list_dissimilarity(sub_df):
    # song_vectors -> n x m matrix, where m is the number of audio features in the embedding_cols
    song_vectors = sub_df[embedding_cols].values
    if len(song_vectors) == 1:
        return None
    return np.mean(pdist(song_vectors, 'cosine'))

def get_MUSIC(sub_df):
    # song_vectors -> n x m matrix, where m is the number of audio features in feature_MUSIC_dict
    song_vectors = sub_df[list(feature_MUSIC_dict.keys())].values
    
    # unweighted_MUSIC_vals -> n x 5 matrix
    unweighted_MUSIC_vals = song_vectors @ feature_MUSIC_matrix
    
    # returns the average MUSIC values weighted by their play_counts
    return list(np.average(unweighted_MUSIC_vals, weights=sub_df['play_count'].values, axis=0))

def get_is_test(sub_df):
    return sub_df['is_test'].values[0]

def get_song_ids(sub_df):
    return sub_df['song_id'].to_list()
#     # this is because each song_id looks like this: "b'SOSIYAD12A8C14097F'" 
#     return [song_id_str[2:-1] for song_id_str in sub_df['song_id'].to_list()]

gb = visible_niplets.groupby('user_id')

def get_row(gb_item):
    user_id, sub_df = gb_item
    return {
        'user_id': user_id,
        'MUSIC': get_MUSIC(sub_df),
        'num_songs': sub_df.shape[0],
        'is_test': get_is_test(sub_df),
        'song_ids': get_song_ids(sub_df)
    }
    
user_df_data = Pool().map(func=get_row, iterable=list(gb), chunksize=625)
user_df = pd.DataFrame(user_df_data)
user_df.head()

Unnamed: 0,MUSIC,is_test,num_songs,song_ids,user_id
0,"[4.229812033333332, 1.4042373883333334, 3.7184...",False,3,"[SOBSSGK12A6D4F9EF1, SOCZQCY12AC468E40F, SOCTX...",00000b722001882066dff9d2da8a775658053ea0
1,"[4.910766939999999, 1.6201183160000001, 4.2151...",False,6,"[SOFXSRW12A6D4F3B77, SOFFWTH12A6310D9E8, SOLOD...",00001638d6189236866af9bbf309ae6c2347ffdc
2,"[3.9929606913333338, 1.3756423253666668, 3.569...",False,6,"[SOBYRTY12AB0181EDB, SOYWZXA12A8C138274, SOYFP...",0000175652312d12576d9e6b84f600caa24c4715
3,"[4.160212249999999, 1.38550505, 3.48416005, -6...",False,3,"[SOBDRND12A8C13FD08, SODRFRJ12A8C144167, SOMMJ...",00001cf0dce3fb22b0df0f3a1d9cd21e38385372
4,"[5.020851199999999, 1.2992664299999999, 4.2901...",False,9,"[SOBMSCQ12AAF3B51B7, SOJERWB12A8C13E654, SOMCH...",0000267bde1b3a70ea75cf2b2d216cb828e3202b


In [21]:
user_df.shape

(1107613, 5)

In [22]:
user_df_with_sparse_index = user_df.merge(user_id_to_user_index, left_on='user_id', right_on='user_id')

In [23]:
user_df_with_sparse_index.head()

Unnamed: 0,MUSIC,is_test,num_songs,song_ids,user_id,sparse_index
0,"[4.229812033333332, 1.4042373883333334, 3.7184...",False,3,"[SOBSSGK12A6D4F9EF1, SOCZQCY12AC468E40F, SOCTX...",00000b722001882066dff9d2da8a775658053ea0,0
1,"[4.910766939999999, 1.6201183160000001, 4.2151...",False,6,"[SOFXSRW12A6D4F3B77, SOFFWTH12A6310D9E8, SOLOD...",00001638d6189236866af9bbf309ae6c2347ffdc,1
2,"[3.9929606913333338, 1.3756423253666668, 3.569...",False,6,"[SOBYRTY12AB0181EDB, SOYWZXA12A8C138274, SOYFP...",0000175652312d12576d9e6b84f600caa24c4715,2
3,"[4.160212249999999, 1.38550505, 3.48416005, -6...",False,3,"[SOBDRND12A8C13FD08, SODRFRJ12A8C144167, SOMMJ...",00001cf0dce3fb22b0df0f3a1d9cd21e38385372,3
4,"[5.020851199999999, 1.2992664299999999, 4.2901...",False,9,"[SOBMSCQ12AAF3B51B7, SOJERWB12A8C13E654, SOMCH...",0000267bde1b3a70ea75cf2b2d216cb828e3202b,4


In [24]:
user_df_with_sparse_index.shape

(1107613, 6)

In [25]:
song_df = song_id_to_song_index.merge(msd, left_on='song_id', right_on='song_id')

In [26]:
song_df.head()

Unnamed: 0,sparse_index,song_id,year,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,4785,SOAKIMP12A8C130995,0,0.656,0.486,112680,0.231,0.577,7,0.128,-15.423,1,0.0445,123.685,4,0.389
1,7052,SOAPDEY12A81C210A9,1974,0.588,0.809,156200,0.831,0.0135,9,0.197,-4.027,0,0.0288,102.83,4,0.92
2,14453,SOBFOVM12A58A7D494,0,0.000111,0.598,141440,0.76,0.244,7,0.101,-7.372,0,0.0293,143.948,4,0.833
3,20354,SOBSUJE12A6D4F8CF5,2006,0.119,0.607,246410,0.504,0.000412,0,0.102,-8.176,0,0.0327,126.051,4,0.0396
4,21408,SOBVFZR12A6D4F8AE3,2002,0.172,0.722,171173,0.501,0.534,1,0.119,-10.62,0,0.0314,123.195,4,0.402


In [27]:
song_df.shape

(175566, 16)

In [28]:
user_df_with_sparse_index.to_hdf('data/user_df.h5', key='df', mode='w')
song_df.to_hdf('data/song_df.h5', key='df', mode='w')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block2_values] [items->['MUSIC', 'song_ids', 'user_id', 'sparse_index']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block2_values] [items->['sparse_index', 'song_id']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


# Garbage/Test Code

In [90]:
# using pandas groupby agg

def get_cosine_list_dissimilarity(song_ids):
    song_vectors = msd.loc[msd['song_id'].isin(song_ids)][embedding_cols].values
    if len(song_vectors) == 1:
        return None
    return np.mean(pdist(song_vectors, 'cosine'))

def get_MUSIC(song_ids):
    song_vectors = msd.loc[msd['song_id'].isin(song_ids)][list(feature_MUSIC_dict.keys())].values
    return list(np.mean(np.dot(song_vectors, feature_MUSIC_matrix), axis=0))

def num_songs(song_ids):
    return len(song_ids)

start = time.time()
gb = triplets.head(50000).groupby('user_id')
df = gb.agg([get_cosine_list_dissimilarity, get_MUSIC, 'count'])
df.columns = df.columns.droplevel(0)
df = df.dropna() # Removes all users who only listened to 1 song
df = df.reset_index()
df = df.rename(index=str, columns={
    "get_cosine_list_dissimilarity": "cosine_dissim",
    "get_MUSIC": "MUSIC",
    "count": "num_songs",
})
df['div_pref'] = StandardScaler().fit_transform(df['cosine_dissim'].values.reshape(-1,1))/2 + 0.5
print(time.time()-start)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  ret, rcount, out=ret, casting='unsafe', subok=False)


62.2878041267395


In [21]:
yolo = pd.DataFrame([{'a': True}])
yolo['b'] = False
yolo

Unnamed: 0,a,b
0,True,False


In [55]:
# yolo = msd.head()
# yolo
# print(yolo['song_id'].to_list())
# print(yolo['song_id'].str[2:-1].to_list())
# yolo
# yolo['song_id']
# print([song_id_str[2:-1] for song_id_str in yolo['song_id'].to_list()])

Unnamed: 0,song_id,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,YAD12A8C1409,0.0142,0.471,254537,0.91,0.764,7,0.403,-4.847,1,0.109,175.816,4,0.282
1,HTP12A8C13BF,0.843,0.479,162173,0.315,4e-06,9,0.0911,-12.951,0,0.0991,97.886,4,0.309
2,VGL12A8C13C3,0.307,0.678,188493,0.787,8e-05,9,0.714,-6.344,1,0.0355,128.181,4,0.969
3,IRQ12AAA15CF,0.147,0.804,278600,0.676,0.919,3,0.0797,-8.48,0,0.0437,94.994,4,0.527
4,DEN12AB018B6,0.129,0.604,267200,0.603,0.0,11,0.185,-4.419,0,0.0507,124.088,4,0.399


In [44]:
lol = pd.read_hdf('data/full_msd_with_audio_features.h5', key='df')[['song_id'] + embedding_cols]

[song_id_str[2:-1] for song_id_str in lol['song_id'].to_list()]

['SOSIYAD12A8C14097F',
 'SOHPHTP12A8C13BF53',
 'SOFVVGL12A8C13C32F',
 'SOHXIRQ12AAA15CF81',
 'SOJHDEN12AB018B650',
 'SONVBWO12AB0187B35',
 'SOGLAKB12AB017DF39',
 'SOTAQLI12A8C13CFD5',
 'SOANPML12AB017D645',
 'SOBYKSS12A58A7C17C',
 'SOSJYVH12A8C1424B5',
 'SOCMLQF12A8C145701',
 'SOQCFUK12A8C142F17',
 'SOJTODJ12AB018ADC2',
 'SOKOCYW12AB0184A65',
 'SOSHJEP12A6D4F5886',
 'SOVOLSW12A6D4F8BE9',
 'SODSWWL12AC9618263',
 'SOHRSRU12A8C136D7A',
 'SOFJOMT12A8C137393',
 'SOFENSY12A6D4FB766',
 'SOAZIKY12A6D4FB469',
 'SOAWYTV12AB018A6E8',
 'SOBGWYD12A6D4FD600',
 'SOJEIKW12A6D4F762B',
 'SOBMJHH12AB017DF32',
 'SOPLLWC12A8C138B25',
 'SOKIABX12A8C131124',
 'SONKTHE12AB018246E',
 'SOCNGSC12AB0187A7C',
 'SOKLZDQ12A8C143D9D',
 'SOPOTYN12AB0181CD4',
 'SOJLZCA12A8C133112',
 'SOGGERY12AB01837E9',
 'SORTHCE12AB0182A78',
 'SOWPNTP12AAF3B1D86',
 'SORRXBN12A58A7C684',
 'SOJAWBG12A8C137499',
 'SOJAWBG12A8C137499',
 'SOJAWBG12A8C137499',
 'SOJAWBG12A8C137499',
 'SOIDWYM12AB0187B62',
 'SOAJJBD12A8C133D78',
 'SOLCUIC12

In [None]:
# TEST CODE

# triplets.head(120)['user_id'].nunique()
gb = triplets.head(5).groupby('user_id')
def yolo(vals):
    print(vals)
    return len(vals)
def polo(vals):
#     print(vals)
    return [len(vals), 1]

df = gb.agg([yolo, polo])
df.columns = df.columns.droplevel(0)
df = df.reset_index()
# df.columns
df.head()

In [7]:
gb = triplets.groupby('song_id').agg('count')
gb

Unnamed: 0_level_0,user_id,play_count,is_test
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
b'SOAAADD12AB018A9DD',14,14,14
b'SOAAADE12A6D4F80CC',9,9,9
b'SOAAADF12A8C13DF62',6,6,6
b'SOAAADZ12A8C1334FB',8,8,8
b'SOAAAFI12A6D4F9C66',59,59,59
b'SOAAAGK12AB0189572',55,55,55
b'SOAAAGN12AB017D672',2,2,2
b'SOAAAGO12A67AE0A0E',3,3,3
b'SOAAAGP12A6D4F7D1C',4,4,4
b'SOAAAGQ12A8C1420C8',933,933,933


In [8]:
yolo = gb.merge(msd, left_on='song_id', right_on='song_id')
yolo

Unnamed: 0,song_id,user_id,play_count,is_test,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,b'SOAAADD12AB018A9DD',14,14,14,0.888000,0.225,321133,0.13500,0.657000,3,0.0510,-26.040,0,0.0382,81.807,4,0.0377
1,b'SOAAADE12A6D4F80CC',9,9,9,0.273000,0.509,199088,0.34700,0.032800,4,0.2980,-16.711,1,0.0273,102.604,4,0.5220
2,b'SOAAADF12A8C13DF62',6,6,6,0.313000,0.339,200880,0.78700,0.019600,9,0.7100,-6.285,0,0.0539,157.042,4,0.4200
3,b'SOAAAFI12A6D4F9C66',59,59,59,0.091400,0.453,243533,0.55900,0.188000,4,0.0950,-6.575,0,0.0304,104.391,4,0.4300
4,b'SOAAAGK12AB0189572',55,55,55,0.727000,0.761,191480,0.11700,0.462000,6,0.1040,-17.982,1,0.0354,123.049,4,0.0600
5,b'SOAAAGN12AB017D672',2,2,2,0.077800,0.879,262533,0.78900,0.000324,1,0.1390,-4.123,1,0.2100,91.180,4,0.7050
6,b'SOAAAGO12A67AE0A0E',3,3,3,0.870000,0.320,155747,0.38600,0.881000,10,0.2930,-9.174,1,0.0275,81.872,4,0.1440
7,b'SOAAAGP12A6D4F7D1C',4,4,4,0.055600,0.555,200293,0.83100,0.000033,4,0.1530,-7.945,0,0.0464,164.984,4,0.8890
8,b'SOAAAGQ12A8C1420C8',933,933,933,0.809000,0.258,372853,0.00302,0.859000,9,0.1210,-21.968,0,0.0452,110.639,4,0.0575
9,b'SOAAAKE12A8C1397E9',1,1,1,0.399000,0.827,210200,0.46400,0.000274,0,0.1100,-10.614,1,0.0391,123.701,4,0.8390


In [12]:
niplets['user_id'].nunique()

1107613

In [13]:
triplets['user_id'].nunique()

1119318

In [15]:
niplets['user_id'].unique()

array(['b80344d063b5ccb3212f76538f3d9e43d87dca9e',
       '7c86176941718984fed11b7c0674ff04c029b480',
       '76235885b32c4e8c82760c340dc54f9b608d7d7e', ...,
       'be2a2ab05038b1b898ba571befa503e00b346789',
       'e2463e1cd898d090d919728a67471ba771d14f2b',
       'faa2bc9a5f80530ee6d0eb48260573e1fbe208f0'], dtype=object)

In [49]:
a = np.arange(3)
b = np.arange(3)
np.vstack((a,b)).T

array([[0, 0],
       [1, 1],
       [2, 2]])