# Import packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit

# Load data

In [2]:
sub = pd.read_csv('./data/raw/submission_history.csv')

In [3]:
sub.head()

Unnamed: 0,id,pk,track_id,track_info,band_id,influencer_id,influencer_kind,influencer_feedback,decision,score
0,7312,7312,324,test tim,303,102,Label,"Bonjour, \nle track surf sur les codes ""austra...",['give feedback on your tune'],0.0
1,7313,7313,324,test tim,303,103,Radio,"Bonjour, merci pour votre envoi. Le morceau n'...",['give feedback on your tune'],0.0
2,7314,7314,324,test tim,303,104,Journalist,Le morceau est à lui tout seul une succession ...,['give feedback on your tune'],0.0
3,7315,7315,324,test tim,303,105,Channel,Très bonne pop aux airs de Tame Impala et Pond...,"['share it on social media', 'add it to a play...",1.0
4,7316,7316,324,test tim,303,106,Media,"La production est assurément excellente, mais ...",['give feedback on your tune'],0.0


# Split data

In [4]:
N_FOLDS = 5
RANDOM_SEED = 42

In [5]:
X = sub.drop(columns=['score', 'influencer_feedback', 'decision'])
y = sub.score
skf = StratifiedKFold(n_splits=N_FOLDS, random_state=RANDOM_SEED)



In [6]:
skf.get_n_splits(X, y)
print(skf)

StratifiedKFold(n_splits=5, random_state=42, shuffle=False)


# Load data

In [7]:
band = pd.read_csv('./data/raw/band_content.csv')
content = pd.read_csv('./data/raw/influencer_content.csv')

In [8]:
import nltk
from nltk.corpus import stopwords

In [9]:
stop_words = stopwords.words('french')
print(stop_words)

['au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'ils', 'je', 'la', 'le', 'les', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'étante', 'étants', 'étantes', 'suis', 'es', 'est', 'sommes', 'êtes', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient', 'fusse', 'fusses', 'fût', 'fussions', 'fussiez', 'fussent', 'ayant', 'ayante', 'ayantes', 'ayants', 'eu', 'eue', 'eues', 'eus', 'ai', 'as', 'avons', 'avez', 'ont', 'aurai', 'auras', 'aura', 'aurons', 'aur

In [10]:
content.head()

Unnamed: 0,id,influencer_id,description_fr,description_en,preferences_fr,preferences_en,Acid house,African music,Alternative rock,Ambient,...,Singer-songwriter,Soul,Surf rock,Synthpop,Synthwave,Techno,Traditional Music,Trap,Trip hop,Variété Française
0,96,96,"Ex-BSC NEWS, nouveau magazine culturel franc-t...","Ex-BSC NEWS, nouveau magazine culturel franc-t...",Musique Comtemporaine et Jazz,,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
1,97,97,Underdog Records is a french alternative label...,Underdog Records is a french alternative label...,"Folk, soul, blues, rock&roll, indie pop","Folk, soul, blues, rock&roll, indie pop",0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
2,102,102,HIGHLIFE is a music publishing company + Indep...,HIGHLIFE Recordings has a wide open philosophy...,Déjà de la maturité,Already mature and original,0,0,0,1,...,0,0,0,0,0,1,0,0,1,0
3,103,103,Nectar est une émission radio musicale et hebd...,Nectar is a weekly music radio program about f...,Folk,Folk,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,104,104,Ecrit pour Konbini et Noisey (Vice). Défricheu...,"Writes for Konbini and Noisey (Vice). Rap, Hip...",Rap,Rap,0,0,0,0,...,0,1,0,0,0,0,0,1,0,1


In [11]:
content.isnull().sum()

id                     0
influencer_id          0
description_fr       219
description_en       241
preferences_fr       311
                    ... 
Techno                 0
Traditional Music      0
Trap                   0
Trip hop               0
Variété Française      0
Length: 77, dtype: int64

In [12]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/pa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
band.biography_fr.isnull().sum()

6161

In [14]:
dataset = sub[['id', 'track_id', 'band_id', 'influencer_id', 'influencer_kind', 'score']].merge(
    band.drop(columns=['id', 'biography_fr', 'biography_en']),
    how='left',
    on='band_id',
).merge(
    content.drop(columns=['id', 'description_fr', 'description_en', 'preferences_fr', 'preferences_fr']),
    how='left',
    on='influencer_id',
    suffixes=('_band', '_influencer')
).drop(columns=['id', 'track_id', 'band_id', 'influencer_id'])

In [15]:
dataset

Unnamed: 0,influencer_kind,score,Acid house_band,African music_band,Alternative rock_band,Ambient_band,Blues_band,Bossa Nova_band,Chill-out_band,Classical Music_band,...,Singer-songwriter_influencer,Soul_influencer,Surf rock_influencer,Synthpop_influencer,Synthwave_influencer,Techno_influencer,Traditional Music_influencer,Trap_influencer,Trip hop_influencer,Variété Française_influencer
0,Label,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1,Radio,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Journalist,0.0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,1
3,Channel,1.0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,Media,0.0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83701,Journalist,1.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
83702,Manager,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
83703,Label,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
83704,Journalist,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
dataset['influencer_kind'] = le.fit_transform(dataset['influencer_kind'])

In [17]:
dataset.head()

Unnamed: 0,influencer_kind,score,Acid house_band,African music_band,Alternative rock_band,Ambient_band,Blues_band,Bossa Nova_band,Chill-out_band,Classical Music_band,...,Singer-songwriter_influencer,Soul_influencer,Surf rock_influencer,Synthpop_influencer,Synthwave_influencer,Techno_influencer,Traditional Music_influencer,Trap_influencer,Trip hop_influencer,Variété Française_influencer
0,4,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1,10,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0.0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,1
3,1,1.0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,6,0.0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [18]:
X, y = (
    dataset.drop(columns='score').rename(
        {'Variété Française_band': 'Variete Francaise_band', 'Variété Française_influencer': 'Variete Francaise_influencer'},
        axis=1
    ).values,
    dataset.score.values
)

# Preprocess text

* choose english vs french
* remove stop words
* tokenize
* choose max_len
* pad

In [19]:
import en_core_web_sm
nlp = en_core_web_sm.load()

In [20]:
#sample = dataset.sample(frac=1.0)
sample = dataset

### Treat preferences

In [22]:
%%time

series = sample.preferences_en.fillna('Unknown').apply(nlp)

CPU times: user 6min 27s, sys: 908 ms, total: 6min 28s
Wall time: 6min 28s


In [23]:
sequences = [[vec.vector for vec in sequence]for sequence in series]

In [24]:
sample.preferences_en.iloc[0]

'Already mature and original'

In [25]:
max_length = max([len(sequence) for sequence in sequences])
print(max_length)

31


In [26]:
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [27]:
padded_docs = pad_sequences(sequences, maxlen=max_length, padding='post')

In [28]:
padded_docs.shape

(83706, 31, 96)

In [222]:
dataset = sample

In [30]:
content.head()

Unnamed: 0,id,influencer_id,description_fr,description_en,preferences_fr,preferences_en,Acid house,African music,Alternative rock,Ambient,...,Singer-songwriter,Soul,Surf rock,Synthpop,Synthwave,Techno,Traditional Music,Trap,Trip hop,Variété Française
0,96,96,"Ex-BSC NEWS, nouveau magazine culturel franc-t...","Ex-BSC NEWS, nouveau magazine culturel franc-t...",Musique Comtemporaine et Jazz,,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
1,97,97,Underdog Records is a french alternative label...,Underdog Records is a french alternative label...,"Folk, soul, blues, rock&roll, indie pop","Folk, soul, blues, rock&roll, indie pop",0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
2,102,102,HIGHLIFE is a music publishing company + Indep...,HIGHLIFE Recordings has a wide open philosophy...,Déjà de la maturité,Already mature and original,0,0,0,1,...,0,0,0,0,0,1,0,0,1,0
3,103,103,Nectar est une émission radio musicale et hebd...,Nectar is a weekly music radio program about f...,Folk,Folk,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,104,104,Ecrit pour Konbini et Noisey (Vice). Défricheu...,"Writes for Konbini and Noisey (Vice). Rap, Hip...",Rap,Rap,0,0,0,0,...,0,1,0,0,0,0,0,1,0,1


### Treat description

In [32]:
%%time

series = content.description_en.fillna('Unknown').apply(nlp)

CPU times: user 8.1 s, sys: 16 ms, total: 8.12 s
Wall time: 8.12 s


In [34]:
sequences = [[vec.vector for vec in sequence]for sequence in series]

In [35]:
sample.preferences_en.iloc[0]

'Already mature and original'

In [36]:
max_length = max([len(sequence) for sequence in sequences])
print(max_length)

547


In [37]:
padded_docs = pad_sequences(sequences, maxlen=max_length, padding='post')

### Treat biography

In [40]:
%%time

series = band.biography_en.fillna('Unknown').apply(nlp)

CPU times: user 1min 12s, sys: 0 ns, total: 1min 12s
Wall time: 1min 12s


In [34]:
sequences = [[vec.vector for vec in sequence]for sequence in series]

In [35]:
sample.preferences_en.iloc[0]

'Already mature and original'

In [36]:
max_length = max([len(sequence) for sequence in sequences])
print(max_length)

547


In [37]:
padded_docs = pad_sequences(sequences, maxlen=max_length, padding='post')

# Keras models

In [223]:
from sklearn.metrics import mean_squared_error

In [224]:
from keras.layers import Input, Embedding, Flatten, Dot, Dense, Concatenate, Dropout, LSTM
from keras.models import Model

In [225]:
i_data = dataset.filter(regex='_influencer')
b_data = dataset.filter(regex='_band')

In [226]:
i_data_idx = [dataset.drop(columns='score').columns.get_loc(c) for c in dataset.filter(regex='_influencer')]
b_data_idx = [dataset.drop(columns='score').columns.get_loc(c) for c in dataset.filter(regex='_band')]

In [233]:
def build_hybrid_model(i_emb_dim=50, b_emb_dim=50, kind_emb_dim=5, lstm=10, last_dense=20, dropout=0.2):
    """
    """
    # Influencer embedding
    influencer_input = Input(shape=[i_data.shape[1]], name="Influencer-Input")
    influencer_embedding = Dense(i_emb_dim, activation='tanh', name="Influencer-Embedding")(influencer_input)
    
    # Influencer kind categorical embedding
    influencer_kind_input = Input(shape=[1], name="Influencer-Kind-Input")
    influencer_kind_emb = Embedding(14, kind_emb_dim, name="Influencer-Kind-Embedding")(influencer_kind_input)
    
    # Influencer preferences_en LSTM embedding
    influencer_preferences_input = Input(shape=[31, 96], name="Influencer-Sequence-Input")
    sequence_emb = LSTM(lstm)(influencer_preferences_input)
    
    # Concatenate influencer emb with influencer kind emb to get full influencer emb
    influencer_full_emb = Concatenate(axis=-1)([influencer_embedding, 
                                                Flatten(name='Flatten')(influencer_kind_emb),
                                                sequence_emb])
    
    # Band embedding
    band_input = Input(shape=[b_data.shape[1]], name="Band-Input")
    band_embedding = Dense(b_emb_dim, activation='tanh', name="Band-Embedding")(band_input)
    
    # Concatenate and create product
    prod = Concatenate(name="Concat", axis=-1)([influencer_full_emb, band_embedding])
    prod2 = Dense(last_dense, activation='tanh', name="Dense1")(prod)
    dropout = Dropout(rate=dropout)(prod2)
    
    # Dropout
    prod3 = Dense(1, activation='tanh', name="Dense2")(dropout)
    model = Model([influencer_input, band_input, influencer_kind_input, influencer_preferences_input], prod3)
    model.compile('adam', 'mean_squared_error')
    
    return model

In [288]:
X

array([[4, 0, 0, ..., 0, 1, 0],
       [10, 0, 0, ..., 0, 0, 0],
       [3, 0, 0, ..., 1, 0, 1],
       ...,
       [4, 0, 0, ..., 0, 0, 0],
       [3, 0, 0, ..., 0, 0, 0],
       [10, 0, 0, ..., 1, 1, 1]], dtype=object)

In [291]:
dataset.dtypes.tolist()

[dtype('int64'),
 dtype('float64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int6

In [344]:
X, y = (
    dataset.drop(columns='score').rename(
        {'Variété Française_band': 'Variete Francaise_band', 'Variété Française_influencer': 'Variete Francaise_influencer'},
        axis=1
    ).values,
    dataset.score.values
)

In [236]:
nn_score = 0.0

for train_index, test_index in skf.split(X, (y * 100).astype(int)):
    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    docs_train, docs_test = padded_docs[train_index], padded_docs[test_index]
    
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=RANDOM_SEED)
    tridx, vidx = next(sss.split(X_train, (y_train * 100).astype(int)))
    X_train, X_valid = X_train[tridx], X_train[vidx]
    y_train, y_valid = y_train[tridx], y_train[vidx]
    
    docs_train, docs_valid = docs_train[tridx], docs_train[vidx]
    
    model = build_hybrid_model()
    model.fit([X_train[:, i_data_idx], X_train[:, b_data_idx], X_train[:, 0], docs_train], y_train,
              validation_data=([X_valid[:, i_data_idx], X_valid[:, b_data_idx], X_valid[:, 0], docs_valid], y_valid), 
              batch_size=64,
              epochs=5,
              verbose=1)
    
    nn_score += np.sqrt(mean_squared_error(model.predict([X_test[:, i_data_idx], 
                                                          X_test[:, b_data_idx],
                                                          X_test[:, 0],
                                                          docs_test]), y_test))

nn_score /= N_FOLDS

print(nn_score)

Train on 46874 samples, validate on 20090 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 46875 samples, validate on 20090 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 46875 samples, validate on 20090 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 46875 samples, validate on 20090 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 46875 samples, validate on 20090 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.3245228152861984


In [245]:
EPOCHS = 10
PATIENCE = 5
BATCH_SIZE = 128

In [258]:
nn_score = 0.0

for train_index, test_index in skf.split(X, (y * 100).astype(int)):
    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=RANDOM_SEED)
    tridx, vidx = next(sss.split(X_train, (y_train * 100).astype(int)))
    X_train, X_valid = X_train[tridx], X_train[vidx]
    y_train, y_valid = y_train[tridx], y_train[vidx]
    
    # Build model
    model = build_model_3()
    
    # Early stoppnig callback
    es = keras.callbacks.EarlyStopping(
        monitor='val_loss', 
        mode='min', 
        patience=PATIENCE,
        restore_best_weights=True,
        verbose=1
    )
    
    # Fit
    model.fit([X_train[:, i_data_idx], X_train[:, b_data_idx], X_train[:, 0]], y_train,
              validation_data=([X_valid[:, i_data_idx], X_valid[:, b_data_idx], X_valid[:, 0]], y_valid), 
              batch_size=BATCH_SIZE,
              epochs=EPOCHS,
              callbacks=[es],
              verbose=1)
    
    nn_score += np.sqrt(mean_squared_error(
        model.predict([X_test[:, i_data_idx], X_test[:, b_data_idx], X_test[:, 0]]), y_test
    ))

nn_score /= N_FOLDS

print(nn_score)

Train on 60267 samples, validate on 6697 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 60268 samples, validate on 6697 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Restoring model weights from the end of the best epoch
Epoch 00008: early stopping
Train on 60268 samples, validate on 6697 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 60268 samples, validate on 6697 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 60268 samples, validate on 6697 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.32747562200339536


In [272]:
EPOCHS = 100
PATIENCE = 10
BATCH_SIZE = 128

In [260]:
nn_score = 0.0

for train_index, test_index in skf.split(X, (y * 100).astype(int)):
    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    docs_train, docs_test = padded_docs[train_index], padded_docs[test_index]
    
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=RANDOM_SEED)
    tridx, vidx = next(sss.split(X_train, (y_train * 100).astype(int)))
    X_train, X_valid = X_train[tridx], X_train[vidx]
    y_train, y_valid = y_train[tridx], y_train[vidx]
    docs_train, docs_valid = docs_train[tridx], docs_train[vidx]
    
    # Build model
    model = build_hybrid_model()
    
    # Early stoppnig callback
    es = keras.callbacks.EarlyStopping(
        monitor='val_loss', 
        mode='min', 
        patience=PATIENCE,
        restore_best_weights=True,
        verbose=1
    )
    
    # Fit
    model.fit([X_train[:, i_data_idx], X_train[:, b_data_idx], X_train[:, 0], docs_train], y_train,
              validation_data=([X_valid[:, i_data_idx], X_valid[:, b_data_idx], X_valid[:, 0], docs_valid], y_valid), 
              batch_size=BATCH_SIZE,
              epochs=EPOCHS,
              callbacks=[es],
              verbose=1)
    
    nn_score += np.sqrt(mean_squared_error(
        model.predict([X_test[:, i_data_idx], X_test[:, b_data_idx], X_test[:, 0], docs_test]), y_test
    ))

nn_score /= N_FOLDS

print(nn_score)

Train on 60267 samples, validate on 6697 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 60268 samples, validate on 6697 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 60268 samples, validate on 6697 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 60268 samples, validate on 6697 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 60268 samples, validate on 6697 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.3236429465753356


In [241]:
import keras

In [242]:
def build_model_3(i_emb_dim=60, b_emb_dim=60, kind_emb_dim=10, last_dense=40, dropout=0.3):
    """
    """
    # Influencer embedding
    influencer_input = Input(shape=[i_data.shape[1]], name="Influencer-Input")
    influencer_embedding = Dense(i_emb_dim, activation='relu', name="Influencer-Embedding1")(influencer_input)
    influencer_embedding = Dense(last_dense, activation='relu', name="Dense1")(influencer_embedding)
    influencer_embedding = Dense(i_emb_dim-10, activation='relu', name="Influencer-Embedding")(influencer_embedding)
    
    # Influencer kind categorical embedding
    influencer_kind_input = Input(shape=[1], name="Influencer-Kind-Input")
    influencer_kind_emb = Embedding(14, kind_emb_dim, name="Influencer-Kind-Embedding")(influencer_kind_input)
    
    # Concatenate influencer emb with influencer kind emb to get full influencer emb
    influencer_full_emb = Concatenate(axis=-1)([influencer_embedding, Flatten(name='Flatten')(influencer_kind_emb)])
    
    # Band embedding
    band_input = Input(shape=[b_data.shape[1]], name="Band-Input")
    band_embedding = Dense(b_emb_dim, activation='relu', name="Band-Embedding1")(band_input)
    band_embedding = Dense(last_dense, activation='relu', name="Dense2")(band_embedding)
    band_embedding = Dense(b_emb_dim-10, activation='relu', name="Band-Embedding")(band_embedding)
    
    # Concatenate and create product
    prod = Concatenate(name="Concat", axis=-1)([influencer_full_emb, band_embedding])
    prod2 = Dense(last_dense, activation='relu', name="Dense0")(prod)
    dropout = Dropout(rate=dropout)(prod2)
    
    # Dropout
    prod3 = Dense(1, activation='relu', name="Dense3")(dropout)
    model = Model([influencer_input, band_input, influencer_kind_input], prod3)
    model.compile('adam', 'mean_squared_error')
    
    return model

In [348]:
def build_hybrid_model2(i_emb_dim=50, b_emb_dim=50, kind_emb_dim=4, lstm=16, last_dense=64, dropout=0.3):
    """
    """
    # Influencer embedding
    influencer_input = Input(shape=[i_data.shape[1]], name="Influencer-Input")
    influencer_embedding = Dense(i_emb_dim, activation='relu', name="Influencer-Embedding1")(influencer_input)
    influencer_embedding = Dense(i_emb_dim - 10, activation='relu', name="Dense0")(influencer_embedding)
    influencer_embedding = Dropout(rate=dropout)(influencer_embedding)
    
    # Influencer kind categorical embedding
    influencer_kind_input = Input(shape=[1], name="Influencer-Kind-Input")
    influencer_kind_emb = Embedding(14, kind_emb_dim, name="Influencer-Kind-Embedding")(influencer_kind_input)
    
    # Influencer preferences_en LSTM embedding
    influencer_preferences_input = Input(shape=[31, 96], name="Influencer-Sequence-Input")
    sequence_emb = LSTM(lstm)(influencer_preferences_input)
    sequence_emb = Dropout(rate=dropout)(sequence_emb)
    
    # Concatenate influencer emb with influencer kind emb to get full influencer emb
    influencer_full_emb = Concatenate(axis=-1)([influencer_embedding, 
                                                Flatten(name='Flatten')(influencer_kind_emb),
                                                sequence_emb])
    
    # Band embedding
    band_input = Input(shape=[b_data.shape[1]], name="Band-Input")
    band_embedding = Dense(b_emb_dim, activation='relu', name="Band-Embedding1")(band_input)
    band_embedding = Dense(b_emb_dim - 10, activation='relu', name="Dense2")(band_embedding)
    band_embedding = Dropout(rate=dropout)(band_embedding)
    
    # Concatenate and create product
    prod = Concatenate(name="Concat", axis=-1)([influencer_full_emb, band_embedding])
    prod2 = Dense(last_dense, activation='tanh', name="Dense1")(prod)
    dropout = Dropout(rate=dropout)(prod2)
    
    # Dropout
    prod3 = Dense(1, activation='tanh', name="Dense3")(dropout)
    model = Model([influencer_input, band_input, influencer_kind_input, influencer_preferences_input], prod3)
    model.compile('adam', 'mean_squared_error')
    
    return model

In [345]:
EPOCHS = 100
PATIENCE = 10
BATCH_SIZE = 32

In [350]:
nn_score = 0.0

for train_index, test_index in skf.split(X, (y * 100).astype(int)):
    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    docs_train, docs_test = padded_docs[train_index], padded_docs[test_index]
    
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=RANDOM_SEED)
    tridx, vidx = next(sss.split(X_train, (y_train * 100).astype(int)))
    X_train, X_valid = X_train[tridx], X_train[vidx]
    y_train, y_valid = y_train[tridx], y_train[vidx]
    docs_train, docs_valid = docs_train[tridx], docs_train[vidx]
    
    # Build model
    model = build_hybrid_model2(i_emb_dim=60, b_emb_dim=60, kind_emb_dim=8, lstm=8, last_dense=64, dropout=0.5)
    
    # Early stoppnig callback
    es = keras.callbacks.EarlyStopping(
        monitor='val_loss', 
        mode='min', 
        patience=PATIENCE,
        restore_best_weights=True,
        verbose=1
    )
    
    # Fit
    model.fit([X_train[:, i_data_idx], X_train[:, b_data_idx], X_train[:, 0], docs_train], y_train,
              validation_data=([X_valid[:, i_data_idx], X_valid[:, b_data_idx], X_valid[:, 0], docs_valid], y_valid), 
              batch_size=BATCH_SIZE,
              epochs=EPOCHS,
              callbacks=[es],
              verbose=1)
    
    nn_score += np.sqrt(mean_squared_error(
        model.predict([X_test[:, i_data_idx], X_test[:, b_data_idx], X_test[:, 0], docs_test]), y_test
    ))

nn_score /= N_FOLDS

print(nn_score)

Train on 53571 samples, validate on 13393 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Restoring model weights from the end of the best epoch
Epoch 00063: early stopping
Train on 53572 samples, validate on 13393 samples
Epoch 1

Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Restoring model weights from the end of the best epoch
Epoch 00039: early stopping
Train on 53572 samples, validate on 13393 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Restoring model weights from the end of the best epoch
Epoch 00030: early stopping
Train on 53572 samples, validate on 13393 samples
Epoch 1/100
Epo

In [274]:
nn_score = 0.0

for train_index, test_index in skf.split(X, (y * 100).astype(int)):
    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    docs_train, docs_test = padded_docs[train_index], padded_docs[test_index]
    
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=RANDOM_SEED)
    tridx, vidx = next(sss.split(X_train, (y_train * 100).astype(int)))
    X_train, X_valid = X_train[tridx], X_train[vidx]
    y_train, y_valid = y_train[tridx], y_train[vidx]
    docs_train, docs_valid = docs_train[tridx], docs_train[vidx]
    
    # Build model
    model = build_hybrid_model2()
    
    # Early stoppnig callback
    es = keras.callbacks.EarlyStopping(
        monitor='val_loss', 
        mode='min', 
        patience=PATIENCE,
        restore_best_weights=True,
        verbose=1
    )
    
    # Fit
    model.fit([X_train[:, i_data_idx], X_train[:, b_data_idx], X_train[:, 0], docs_train], y_train,
              validation_data=([X_valid[:, i_data_idx], X_valid[:, b_data_idx], X_valid[:, 0], docs_valid], y_valid), 
              batch_size=BATCH_SIZE,
              epochs=EPOCHS,
              callbacks=[es],
              verbose=1)
    
    nn_score += np.sqrt(mean_squared_error(
        model.predict([X_test[:, i_data_idx], X_test[:, b_data_idx], X_test[:, 0], docs_test]), y_test
    ))

nn_score /= N_FOLDS

print(nn_score)

Train on 60267 samples, validate on 6697 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Restoring model weights from the end of the best epoch
Epoch 00061: early stopping
Train on 60268 samples, validate on 6697 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100

Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Restoring model weights from the end of the best epoch
Epoch 00054: early stopping
0.32737252168938963


In [275]:
nn_score = 0.0

for train_index, test_index in skf.split(X, (y * 100).astype(int)):
    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=RANDOM_SEED)
    tridx, vidx = next(sss.split(X_train, (y_train * 100).astype(int)))
    X_train, X_valid = X_train[tridx], X_train[vidx]
    y_train, y_valid = y_train[tridx], y_train[vidx]
    
    # Build model
    model = build_model_3()
    
    # Early stoppnig callback
    es = keras.callbacks.EarlyStopping(
        monitor='val_loss', 
        mode='min', 
        patience=PATIENCE,
        restore_best_weights=True,
        verbose=1
    )
    
    # Fit
    model.fit([X_train[:, i_data_idx], X_train[:, b_data_idx], X_train[:, 0]], y_train,
              validation_data=([X_valid[:, i_data_idx], X_valid[:, b_data_idx], X_valid[:, 0]], y_valid), 
              batch_size=BATCH_SIZE,
              epochs=EPOCHS,
              callbacks=[es],
              verbose=1)
    
    nn_score += np.sqrt(mean_squared_error(
        model.predict([X_test[:, i_data_idx], X_test[:, b_data_idx], X_test[:, 0]]), y_test
    ))

nn_score /= N_FOLDS

print(nn_score)

Train on 60267 samples, validate on 6697 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100

KeyboardInterrupt: 

In [276]:
from lightgbm import LGBMRegressor

In [306]:
X.shape

(83706, 144)

In [305]:
X[10]

array([6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 'Indie, emerging', 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1], dtype=object)

In [322]:
X = np.delete(X, 72, 1)

In [340]:
lgbm_score =  0.0


for train_index, test_index in skf.split(X, (y * 100).astype(int)):
    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=RANDOM_SEED)
    tridx, vidx = next(sss.split(X_train, (y_train * 100).astype(int)))
    X_train, X_valid = X_train[tridx], X_train[vidx]
    y_train, y_valid = y_train[tridx], y_train[vidx]
    
    lgbm = LGBMRegressor(
        num_leaves=2**9,
        learning_rate=0.05,
        subsample=0.8,
        reg_alpha=1.0,
        reg_lambda=1.0,
        n_estimators=10000,
        silent=False
    )
    lgbm.fit(X_train, y_train, eval_set=(X_valid, y_valid), 
             eval_metric='mse', early_stopping_rounds=100,
             verbose=100)
    lgbm_score += np.sqrt(mean_squared_error(lgbm.predict(X_test), y_test))

lgbm_score /= N_FOLDS

print(lgbm_score)

Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 0.089253
[200]	valid_0's l2: 0.0889512
[300]	valid_0's l2: 0.08903
Early stopping, best iteration is:
[212]	valid_0's l2: 0.0889244
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 0.092336
[200]	valid_0's l2: 0.0922979
Early stopping, best iteration is:
[147]	valid_0's l2: 0.0922423
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 0.0935227
[200]	valid_0's l2: 0.0931942
[300]	valid_0's l2: 0.0931966
Early stopping, best iteration is:
[236]	valid_0's l2: 0.0931803
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 0.094163
[200]	valid_0's l2: 0.0938801
Early stopping, best iteration is:
[189]	valid_0's l2: 0.0938587
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 0.0951689
[200]	valid_0's l2: 0.0950135
Early stopping, best iteration is:
[158]	valid_0's l2: 0.0949667
0.32319608

* Check pipe
* Add L2 reg
* look for text embeddings
* preprocess text
* build archi
* print archi
* pretrained emb
* tqdm notebook

also:
* Visualize embeddings

English chosen because in french description, mix of english and french

Content based : no interaction influencer/artist taken into account (no embedding for them)

advantage : cold start allowed
disadvantage : interesting info lossed

==> hybrid recommender system

label encoded influencer kind : in production, a category 'Other' can be created to account for potential kinds not present in current dataset.

Grid search