In [6]:
import numpy as np
import math
import pandas as pd

from tqdm import tqdm
import keras
from keras.utils import np_utils
from keras.models import Model
from keras.layers import Input, Dense, Dropout, Embedding, Flatten
from keras.callbacks import ModelCheckpoint
import cPickle as pickle
import progressbar
import os
import argparse

In [7]:
def load_trained_model(model_file):
    '''This method loads the song embeddings model into memory'''
    model = keras.models.load_model(model_file)
    return model

In [8]:
def load_test_dataset(test_file):
    embedded_dataset = pd.read_csv(test_file).fillna('')
    return embedded_dataset

def load_songs_members_dataset():
    songs_dataset = pd.read_csv('../../new_data/Data/songs.csv')
    members_dataset = pd.read_csv('../Data/members.csv')
    return songs_dataset, members_dataset

In [9]:
def read_from_pickle(filename):
    return pickle.load(open(filename, 'r'))

In [10]:
def load_song_mapper_picke():
    song_mapper = pickle.load(open('../New_Data/model_song_embeddings/song_mapper_py2.pkl', 'rb'))
    return song_mapper

def load_user_mapper_pickle():
    user_mapper = pickle.load(open('../New_Data/model_user_embeddings/msno_mapper_py2.pkl', 'rb'))
    return user_mapper

def create_mapper(values):
    mapper = dict()
    for v in values:
        mapper[v] = len(v)
    return mapper

In [11]:
def input_generator(data):
    num_rows = data.shape[0]
    X_msno = np.empty(num_rows)
    X_song_id = np.empty(num_rows)
    X_source_system_tab = np.empty(num_rows)
    X_source_screen_name = np.empty(num_rows)
    X_source_type = np.empty(num_rows)
    batch = num_rows
    input_genre_ids_count = np.empty(batch)
    input_artist_count = np.empty(batch)
    input_lyricist_count = np.empty(batch)
    
    input_composer_count = np.empty(batch)
    input_song_count_played = np.empty(batch)
    input_artist_count_played = np.empty(batch)
    
    count = 0
    
    song_mapper = load_song_mapper_picke()
    user_mapper = load_user_mapper_pickle()
    source_tab_mapper = create_mapper(data.source_system_tab.unique())
    s_scr_name_mapper = create_mapper(data.source_screen_name.unique())
    s_type_mapper = create_mapper(data.source_type.unique())

    bar = progressbar.ProgressBar()
    print 'Generating inputs...'
    
    for _, row in bar(data.iterrows()):
        curr_msno = user_mapper[row['msno']]
        if curr_song_id not in song_mapper:
            X_song_id[count, ] = song_mapper[data.iloc[0]['song_id']]
        else:    
            X_song_id[count, ] = song_mapper[row['song_id']]
        X_msno[count,] = curr_msno
        X_source_system_tab[count, ] = source_tab_mapper[row['source_system_tab']]
        X_source_screen_name[count, ] = s_scr_name_mapper[row['source_screen_name']]
        X_source_type[count, ] = s_type_mapper[row['source_type']]
        
        input_genre_ids_count[count, ] = row['genre_ids_count']
        input_artist_count[count, ] = row['artist_count']
        input_lyricist_count[count, ] = row['lyricists_count']
        
        input_composer_count[count, ] = row['composer_count']
        input_song_count_played[count, ] = row['count_song_played']
        input_artist_count_played[count, ] = row['count_artist_played']
        
        count += 1
        
    return X_msno, X_song_id, X_source_system_tab, X_source_screen_name, X_source_type, input_genre_ids_count, input_artist_count, 
            input_lyricist_count, input_composer_count, input_song_count_played, input_artist_count_played
    

IndentationError: unexpected indent (<ipython-input-11-6d47519b1ae7>, line 50)

In [None]:
def generate_submission_file(preds, preds_vals):
    submission = pd.DataFrame(columns=['id', 'target', 'preds'])
    submission['id'] = range(len(preds))
    submission['target'] = preds
    submission['preds'] = preds_vals
    return submission

In [12]:
def write_to_csv(filename, data):
    data.to_csv(filename, index=False)

In [15]:
if __name__=="__main__":
    parser = argparse.ArgumentParser(description='Enter the model file path and test file path')
    parser.add_argument('-model', type=str, required=True)
    parser.add_argument('-test', type=str, required=True)
    parser.add_argument('-batch_size', type=str, required=True)
    args = parser.parse_args()
    
    print 'Loading the model'
    trained_model = load_trained_model(args.model)
    print trained_model.summary()
    test_dataset = load_test_dataset(args.test)
    songs_dataset, members_dataset = load_songs_members_dataset()
    test_dataset = test_dataset.merge(members_dataset, on='msno', how='left')
    
    test_dataset = test_dataset.merge(songs_dataset, on='song_id', how='left')
    
    
    def genre_id_count(x):
        if x == 'no_genre_id':
            return 0
        else:
            return x.count('|') + 1

    train_dataset = test_dataset

    train_dataset['genre_ids'].fillna('no_genre_id',inplace=True)
    train_dataset['genre_ids_count'] = train_dataset['genre_ids'].apply(genre_id_count).astype(np.int32)
    train_dataset['genre_ids_count'] = train_dataset['genre_ids_count'] / max(train_dataset['genre_ids_count'])


# In[90]:


    def artist_count(x):
        if x == 'no_artist':
            return 0
        else:
            return x.count('and') + x.count(',') + x.count('feat') + x.count('&')

    train_dataset['artist_name'].fillna('no_artist',inplace=True)
    train_dataset['artist_count'] = train_dataset['artist_name'].map(str).apply(artist_count).astype(np.int32)
    train_dataset['artist_count'] = train_dataset['artist_count'] / max(train_dataset['artist_count'])


    # In[91]:


    def lyricist_count(x):
        if x == 'no_lyricist':
            return 0
        else:
            return sum(map(x.count, ['|', '/', '\\', ';'])) + 1
        return sum(map(x.count, ['|', '/', '\\', ';']))

    train_dataset['lyricist'].fillna('no_lyricist',inplace=True)
    train_dataset['lyricists_count'] = train_dataset['lyricist'].map(str).apply(lyricist_count).astype(np.int32)
    train_dataset['lyricists_count'] = train_dataset['lyricists_count'] / max(train_dataset['lyricists_count'])


    # In[92]:


    def composer_count(x):
        if x == 'no_composer':
            return 0
        else:
            return sum(map(x.count, ['|', '/', '\\', ';'])) + 1

    train_dataset['composer'].fillna('no_composer',inplace=True)
    train_dataset['composer_count'] = train_dataset['composer'].map(str).apply(composer_count).astype(np.int8)
    train_dataset['composer_count'] = train_dataset['composer_count'] / max(train_dataset['composer_count'])


    # In[93]:


    # number of times a song has been played before
    _dict_count_song_played_train = {k: v for k, v in train_dataset['song_id'].value_counts().iteritems()}
    def count_song_played(x):
        try:
            return _dict_count_song_played_train[x]
        except KeyError:
                return 0    

    train_dataset['count_song_played'] = train_dataset['song_id'].map(str).apply(count_song_played).astype(np.int64)
    train_dataset['count_song_played'] = train_dataset['count_song_played'] / max(train_dataset['count_song_played'])


    # In[94]:


    # number of times an artist has been played
    _dict_count_artist_played_train = {k: v for k, v in train_dataset['artist_name'].value_counts().iteritems()}
    def count_artist_played(x):
        try:
            return _dict_count_artist_played_train[x]
        except KeyError:
                return 0

    train_dataset['count_artist_played'] = train_dataset['artist_name'].map(str).apply(count_artist_played).astype(np.int64)
    train_dataset['count_artist_played'] = train_dataset['count_artist_played'] / max(train_dataset['count_artist_played'])


    # In[95]:


    embedded_dataset = train_dataset
    embedded_dataset = embedded_dataset.fillna('')
    test_dataset = embedded_dataset

    
    
    test_X_msno, test_song_id, test_source_tab, test_screen_name, test_source_type, input_genre_ids_count, input_artist_count, input_lyricist_count, input_composer_count, input_song_count_played, input_artist_count_played = input_generator(test_dataset)
    print 'Generating predictions'
    preds = trained_model.predict(x = [test_X_msno, test_song_id, test_source_tab, test_screen_name, test_source_type, input_genre_ids_count, input_artist_count, input_lyricist_count, input_composer_count, input_song_count_played, input_artist_count_played])
    predictions = [1.0 if p > 0.5 else 0.0 for p in preds]
    submission = generate_submission_file(predictions, preds)
    write_to_csv(args.model + '_submission.csv', submission)
    print 'Submission written to csv file'


usage: ipykernel_launcher.py [-h] -model MODEL -test TEST -batch_size
                             BATCH_SIZE
ipykernel_launcher.py: error: argument -model is required


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [16]:
!ipython nbconvert --to script test_script_lysto_no_embeddings_md_counts.ipynb

[NbConvertApp] Converting notebook test_script_lysto_no_embeddings_md_counts.ipynb to script
[NbConvertApp] Writing 8758 bytes to test_script_lysto_no_embeddings_md_counts.py
