# Import packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit

# Load data

In [2]:
sub = pd.read_csv('./data/raw/submission_history.csv')

In [3]:
sub.head()

Unnamed: 0,id,pk,track_id,track_info,band_id,influencer_id,influencer_kind,influencer_feedback,decision,score
0,7312,7312,324,test tim,303,102,Label,"Bonjour, \nle track surf sur les codes ""austra...",['give feedback on your tune'],0.0
1,7313,7313,324,test tim,303,103,Radio,"Bonjour, merci pour votre envoi. Le morceau n'...",['give feedback on your tune'],0.0
2,7314,7314,324,test tim,303,104,Journalist,Le morceau est à lui tout seul une succession ...,['give feedback on your tune'],0.0
3,7315,7315,324,test tim,303,105,Channel,Très bonne pop aux airs de Tame Impala et Pond...,"['share it on social media', 'add it to a play...",1.0
4,7316,7316,324,test tim,303,106,Media,"La production est assurément excellente, mais ...",['give feedback on your tune'],0.0


# Split data

In [4]:
N_FOLDS = 5
RANDOM_SEED = 42

In [5]:
X = sub.drop(columns=['score', 'influencer_feedback', 'decision'])
y = sub.score
skf = StratifiedKFold(n_splits=N_FOLDS, random_state=RANDOM_SEED)



In [6]:
skf.get_n_splits(X, y)
print(skf)

StratifiedKFold(n_splits=5, random_state=42, shuffle=False)


# Load data

In [7]:
band = pd.read_csv('./data/raw/band_content.csv')
content = pd.read_csv('./data/raw/influencer_content.csv')

In [10]:
dataset = sub[['id', 'track_id', 'band_id', 'influencer_id', 'influencer_kind', 'score']].merge(
    band.drop(columns=['id', 'biography_fr', 'biography_en']),
    how='left',
    on='band_id',
).merge(
    content.drop(columns=['id', 'description_fr', 'description_en', 'preferences_fr', 'preferences_en']),
    how='left',
    on='influencer_id',
    suffixes=('_band', '_influencer')
)

In [11]:
dataset.columns.tolist()

['id',
 'track_id',
 'band_id',
 'influencer_id',
 'influencer_kind',
 'score',
 'Acid house_band',
 'African music_band',
 'Alternative rock_band',
 'Ambient_band',
 'Blues_band',
 'Bossa Nova_band',
 'Chill-out_band',
 'Classical Music_band',
 'Coldwave_band',
 'Country_band',
 'Dance music_band',
 'Dance-pop_band',
 'Deep house_band',
 'Disco_band',
 'Dream Pop_band',
 'Dub_band',
 'Electro swing_band',
 'Electronic rock_band',
 'Electronica_band',
 'Electropop_band',
 'Experimental_band',
 'Experimental Jazz_band',
 'Experimental rock_band',
 'Film Music_band',
 'French house_band',
 'Funk_band',
 'Future house_band',
 'Garage rock_band',
 'Grime_band',
 'Hard rock_band',
 'Hip hop_band',
 'House music_band',
 'Indie folk_band',
 'Indie pop_band',
 'Indie rock_band',
 'Instrumental_band',
 'International Pop_band',
 'Latin music_band',
 'Lo-Fi_band',
 'Metal_band',
 'Minimal_band',
 'Modern Jazz_band',
 'New wave_band',
 'Noise rock_band',
 'Nouvelle Scène_band',
 'Nu-disco_band',


In [12]:
dataset.influencer_kind.value_counts()

Media          35288
Radio           9872
Label           9155
Playlist        8501
Journalist      4268
Channel         4150
Booker          3405
Mentor          2288
Manager         2094
Springboard     2087
Publisher       1698
Supervisor       572
Event            328
Name: influencer_kind, dtype: int64

In [13]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
dataset['influencer_kind'] = le.fit_transform(dataset['influencer_kind'])

In [14]:
dataset.head()

Unnamed: 0,id,track_id,band_id,influencer_id,influencer_kind,score,Acid house_band,African music_band,Alternative rock_band,Ambient_band,...,Singer-songwriter_influencer,Soul_influencer,Surf rock_influencer,Synthpop_influencer,Synthwave_influencer,Techno_influencer,Traditional Music_influencer,Trap_influencer,Trip hop_influencer,Variété Française_influencer
0,7312,324,303,102,4,0.0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1,7313,324,303,103,10,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,7314,324,303,104,3,0.0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,1
3,7315,324,303,105,1,1.0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,7316,324,303,106,6,0.0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [15]:
X, y = (
    dataset.drop(columns='score').rename(
        {'Variété Française_band': 'Variete Francaise_band', 'Variété Française_influencer': 'Variete Francaise_influencer'},
        axis=1
    ).values,
    dataset.score.values
)

# Preprocess text

* choose english vs french
* remove stop words
* tokenize
* choose max_len
* pad

# Keras models

In [17]:
from keras.layers import Input, Embedding, Flatten, Dot, Dense, Concatenate, Dropout
from keras.models import Model

In [18]:
i_data = dataset.filter(regex='_influencer')
b_data = dataset.filter(regex='_band')

In [19]:
i_data_idx = [dataset.drop(columns='score').columns.get_loc(c) for c in dataset.filter(regex='_influencer')]
b_data_idx = [dataset.drop(columns='score').columns.get_loc(c) for c in dataset.filter(regex='_band')]

In [151]:
def build_model_2(i_emb_dim=50, b_emb_dim=50, kind_emb_dim=5, last_dense=20, dropout=0.2):
    """
    """
    # Influencer embedding
    influencer_input = Input(shape=[i_data.shape[1]], name="Influencer-Input")
    influencer_embedding = Dense(i_emb_dim, activation='tanh', name="Influencer-Embedding")(influencer_input)
    
    # Influencer kind categorical embedding
    influencer_kind_input = Input(shape=[1], name="Influencer-Kind-Input")
    influencer_kind_emb = Embedding(14, kind_emb_dim, name="Influencer-Kind-Embedding")(influencer_kind_input)
    
    # Concatenate influencer emb with influencer kind emb to get full influencer emb
    influencer_full_emb = Concatenate(axis=-1)([influencer_embedding, Flatten(name='Flatten')(influencer_kind_emb)])
    
    # Band embedding
    band_input = Input(shape=[b_data.shape[1]], name="Band-Input")
    band_embedding = Dense(b_emb_dim, activation='tanh', name="Band-Embedding")(band_input)
    
    # Concatenate and create product
    prod = Concatenate(name="Concat", axis=-1)([influencer_full_emb, band_embedding])
    prod2 = Dense(last_dense, activation='tanh', name="Dense1")(prod)
    dropout = Dropout(rate=dropout)(prod2)
    
    # Dropout
    prod3 = Dense(1, activation='tanh', name="Dense2")(dropout)
    model = Model([influencer_input, band_input, influencer_kind_input], prod3)
    model.compile('adam', 'mean_squared_error')
    
    return model

In [70]:
nn_score = 0.0

for train_index, test_index in skf.split(X, (y * 100).astype(int)):
    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=RANDOM_SEED)
    tridx, vidx = next(sss.split(X_train, (y_train * 100).astype(int)))
    X_train, X_valid = X_train[tridx], X_train[vidx]
    y_train, y_valid = y_train[tridx], y_train[vidx]
    
    model = build_model()
    model.fit([X_train[:, i_data_idx], X_train[:, b_data_idx]], y_train,
              validation_data=([X_valid[:, i_data_idx], X_valid[:, b_data_idx]], y_valid), 
              batch_size=64,
              epochs=10,
              verbose=1)
    
    nn_score += np.sqrt(mean_squared_error(model.predict([X_test[:, i_data_idx], X_test[:, b_data_idx]]), y_test))

nn_score /= N_FOLDS

print(nn_score)

Train on 46874 samples, validate on 20090 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 46875 samples, validate on 20090 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 46875 samples, validate on 20090 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 46875 samples, validate on 20090 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 46875 samples, validate on 20090 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.35810610043049085


In [181]:
def build_model_3(i_emb_dim=50, b_emb_dim=50, kind_emb_dim=10, last_dense=40, dropout=0.5):
    """
    """
    # Influencer embedding
    influencer_input = Input(shape=[i_data.shape[1]], name="Influencer-Input")
    influencer_embedding = Dense(i_emb_dim, activation='tanh', name="Influencer-Embedding1")(influencer_input)
    influencer_embedding = Dense(last_dense, activation='tanh', name="Dense1")(influencer_embedding)
    influencer_embedding = Dense(i_emb_dim-10, activation='tanh', name="Influencer-Embedding")(influencer_embedding)
    
    # Influencer kind categorical embedding
    influencer_kind_input = Input(shape=[1], name="Influencer-Kind-Input")
    influencer_kind_emb = Embedding(14, kind_emb_dim, name="Influencer-Kind-Embedding")(influencer_kind_input)
    
    # Concatenate influencer emb with influencer kind emb to get full influencer emb
    influencer_full_emb = Concatenate(axis=-1)([influencer_embedding, Flatten(name='Flatten')(influencer_kind_emb)])
    
    # Band embedding
    band_input = Input(shape=[b_data.shape[1]], name="Band-Input")
    band_embedding = Dense(b_emb_dim, activation='tanh', name="Band-Embedding1")(band_input)
    band_embedding = Dense(last_dense, activation='tanh', name="Dense2")(band_embedding)
    band_embedding = Dense(b_emb_dim-10, activation='tanh', name="Band-Embedding")(band_embedding)
    
    # Concatenate and create product
    prod = Concatenate(name="Concat", axis=-1)([influencer_full_emb, band_embedding])
    prod2 = Dense(last_dense, activation='tanh', name="Dense0")(prod)
    dropout = Dropout(rate=dropout)(prod2)
    
    # Dropout
    prod3 = Dense(1, activation='tanh', name="Dense3")(dropout)
    model = Model([influencer_input, band_input, influencer_kind_input], prod3)
    model.compile('adam', 'mean_squared_error')
    
    return model

In [182]:
dataset.head()

Unnamed: 0,id,track_id,band_id,influencer_id,influencer_kind,score,Acid house_x,African music_x,Alternative rock_x,Ambient_x,...,Singer-songwriter_y,Soul_y,Surf rock_y,Synthpop_y,Synthwave_y,Techno_y,Traditional Music_y,Trap_y,Trip hop_y,Variété Française_y
0,7312,324,303,102,4,0.0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1,7313,324,303,103,10,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,7314,324,303,104,3,0.0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,1
3,7315,324,303,105,1,1.0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,7316,324,303,106,6,0.0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [183]:
EPOCHS = 100
PATIENCE = 10
BATCH_SIZE = 32

In [None]:
nn_score = 0.0

for train_index, test_index in skf.split(X, (y * 100).astype(int)):
    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=RANDOM_SEED)
    tridx, vidx = next(sss.split(X_train, (y_train * 100).astype(int)))
    X_train, X_valid = X_train[tridx], X_train[vidx]
    y_train, y_valid = y_train[tridx], y_train[vidx]
    
    # Build model
    model = build_model_3()
    
    # Early stoppnig callback
    es = keras.callbacks.EarlyStopping(
        monitor='val_loss', 
        mode='min', 
        patience=PATIENCE,
        restore_best_weights=True,
        verbose=1
    )
    
    # Fit
    model.fit([X_train[:, i_data_idx], X_train[:, b_data_idx], X_train[:, 5]], y_train,
              validation_data=([X_valid[:, i_data_idx], X_valid[:, b_data_idx], X_valid[:, 5]], y_valid), 
              batch_size=BATCH_SIZE,
              epochs=EPOCHS,
              callbacks=[es],
              verbose=1)
    
    nn_score += np.sqrt(mean_squared_error(
        model.predict([X_test[:, i_data_idx], X_test[:, b_data_idx], X_test[:, 5]]), y_test
    ))

nn_score /= N_FOLDS

print(nn_score)

Train on 46874 samples, validate on 20090 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Restoring model weights from the end of the best epoch
Epoch 00049: early stopping
Train on 46875 samples, validate on 20090 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epo

* Check pipe
* Add L2 reg
* look for text embeddings
* preprocess text
* build archi
* print archi
* pretrained emb
* tqdm notebook

Content based : no interaction influencer/artist taken into account (no embedding for them)

advantage : cold start allowed
disadvantage : interesting info lossed

==> hybrid recommender system

label encoded influencer kind : in production, a category 'Other' can be created to account for potential kinds not present in current dataset.

Grid search