In [0]:
# Mount Google Drive inside the Colab runtime at /content/drive; the dataset
# paths used by the following cells all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
import glob
import math
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.preprocessing import scale, MinMaxScaler, LabelEncoder
from sklearn.decomposition import PCA

In [0]:
# Root folder (on the mounted Drive) that holds every dataset of this notebook.
data_path = '/content/drive/My Drive/Mineria de datos/datasets/'

# Sub-folders with the session logs and with the per-track feature tables.
training_path = '{}training_set/'.format(data_path)
track_features_path = '{}track_features/'.format(data_path)

# CSV files of each kind, sorted by name so the order is stable between runs.
training_logs = glob.glob('{}log_*.csv'.format(training_path))
training_logs.sort()
track_logs = glob.glob('{}tf_*.csv'.format(track_features_path))
track_logs.sort()

In [0]:
# Session-level numeric columns that are min-max scaled further down.
session_features_to_scale = [
    'hist_user_behavior_n_seekfwd',
    'hist_user_behavior_n_seekback',
    'hour_of_day',
]

# Track-level numeric columns that are standardised and then reduced with PCA.
track_features_to_scale = [
    'acousticness',
    'beat_strength',
    'bounciness',
    'danceability',
    'dyn_range_mean',
    'energy',
    'flatness',
    'instrumentalness',
    'liveness',
    'loudness',
    'mechanism',
    'organism',
    'speechiness',
    'tempo',
    'time_signature',
    'valence',
] + ['acoustic_vector_{}'.format(component) for component in range(8)]

## TRACKS

In [0]:
# Read every track-feature CSV and stack them into one frame with a fresh 0..n-1 index.
TRACKS = pd.concat(
    [pd.read_csv(track_file) for track_file in track_logs],
    ignore_index=True,
)

In [0]:
# Min-max scale popularity and duration; the fitted scaler is kept in `scaler1`.
minmax_columns = ['us_popularity_estimate', 'duration']
scaler1 = MinMaxScaler()
TRACKS[minmax_columns] = scaler1.fit_transform(TRACKS[minmax_columns])

# Standardise the remaining numeric track features with sklearn's `scale`.
TRACKS[track_features_to_scale] = scale(
    TRACKS[track_features_to_scale]
)

In [0]:
# Reduce the standardised track features to 15 principal components
# named pca1 .. pca15.
pca = PCA(n_components=15)
tracks_principal_components_columns = ['pca' + str(k) for k in range(1, 16)]
tracks_principal_components = pd.DataFrame(
    data=pca.fit_transform(TRACKS[track_features_to_scale]),
    columns=tracks_principal_components_columns,
)

# Replace the original feature columns with the component columns.
# Both frames carry a default 0..n-1 index, so the column-wise concat lines up row by row.
TRACKS.drop(columns=track_features_to_scale, inplace=True)
TRACKS = pd.concat([TRACKS, tracks_principal_components], axis=1)

In [0]:
# These two track columns are not used as features downstream.
TRACKS.drop(columns=['release_year', 'key'], inplace=True)

## SESSIONS

In [0]:
# Selects which pair of training log files is loaded below:
# 0 -> files 0 and 1, 1 -> files 2 and 3, and so on.
# Increase it by one to build the dataframe for the next two log files.
log_index = 0

In [0]:
# Load the two log files selected by `log_index` and stack them into one frame.
SESSIONS = pd.concat(
    [
        pd.read_csv(log_file)
        for log_file in training_logs[2 * log_index: 2 * log_index + 2]
    ],
    ignore_index=True,
)

In [0]:
# Indicator columns that are cast to boolean dtype.
session_flag_columns = [
    'skip_1', 'skip_2', 'skip_3', 'not_skipped', 'context_switch',
    'no_pause_before_play', 'short_pause_before_play',
    'long_pause_before_play', 'hist_user_behavior_is_shuffle', 'premium',
]
SESSIONS[session_flag_columns] = SESSIONS[session_flag_columns].astype('bool')

# Min-max scale the numeric session features.
# NOTE(review): the scaler is fitted on the currently loaded log files only, so
# the scaling may differ between `log_index` chunks -- confirm this is intended.
scaler = MinMaxScaler()
SESSIONS[session_features_to_scale] = scaler.fit_transform(
    SESSIONS[session_features_to_scale]
)

In [0]:
# Target column: a row counts as skipped when skip_1 or skip_2 is set.
SESSIONS['skip'] = SESSIONS['skip_1'] | SESSIONS['skip_2']

# The raw skip indicators and the date column are no longer needed.
SESSIONS.drop(
    columns=['skip_1', 'skip_2', 'skip_3', 'not_skipped', 'date'],
    inplace=True,
)


## MERGE

In [0]:
# Attach the track features to every session row (left join keeps every session row),
# then drop the join key coming from TRACKS, which duplicates 'track_id_clean'.
DATA_ALL = SESSIONS.merge(
    TRACKS,
    how='left',
    left_on='track_id_clean',
    right_on='track_id',
).drop(columns='track_id')

In [0]:
#DATA_ALL.to_pickle(data_path + 'preprocessed/{}.pkl'.format('data_all' + str(log_index)))

In [0]:
#DATA_ALL = pd.read_pickle(data_path + 'preprocessed/{}.pkl'.format('data_all' + str(log_index)))

In [0]:
# Preview the first rows of the merged session + track table.
DATA_ALL.head()

In [0]:
 features_column_names = [
       'context_switch', 'no_pause_before_play',
       'short_pause_before_play', 'long_pause_before_play',
       'hist_user_behavior_n_seekfwd', 'hist_user_behavior_n_seekback',
       'hist_user_behavior_is_shuffle', 'hour_of_day', 'premium',
       'context_type', 'hist_user_behavior_reason_start',
       'hist_user_behavior_reason_end', 'duration',
       'us_popularity_estimate', 'mode', 'pca1', 'pca2', 'pca3', 'pca4',
       'pca5', 'pca6', 'pca7', 'pca8', 'pca9', 'pca10', 'pca11', 'pca12',
       'pca13', 'pca14', 'pca15']

In [0]:
# Three copies of the feature names, each with a numeric suffix appended
# ("1", "2", "3"); columns_names[k] holds the names with suffix k + 1.
columns_names = [
    [feature + str(suffix) for feature in features_column_names]
    for suffix in range(1, 4)
]

In [0]:
## run this after, to free the ram
# Drops the large intermediates left over from a previous pass so their memory
# can be reclaimed. Names that do not exist yet are skipped, so this cell no
# longer raises NameError on a fresh kernel (Restart & Run All).
for _stale_name in ('DATASET', 'df1', 'df2', 'wts'):  # 'df3' is not created by this notebook
    globals().pop(_stale_name, None)
del _stale_name

In [0]:
# Feature-only view of the merged table, with columns in `features_column_names` order.
wts = DATA_ALL.loc[:, features_column_names]

# 1

In [0]:
# df1: the feature table pushed down by one row (its first row is duplicated on
# top), so row i of df1 holds the features of row i-1 of `wts`.
# Columns take the "1" suffix.
df1 = pd.concat([wts.iloc[[0]], wts], ignore_index=True)
df1.columns = columns_names[0]

# df2: the feature table with its last row duplicated at the bottom, so row i of
# df2 still holds row i of `wts`; the extra row only matches df1's length.
# Columns take the "2" suffix.
df2 = pd.concat([wts, wts.iloc[[-1]]], ignore_index=True)
df2.columns = columns_names[1]


In [0]:
# Put the shifted ("1") and unshifted ("2") feature tables side by side.
DATASET = pd.concat((df1, df2), axis='columns')

In [0]:
# Remove the padding row at the end (index label n-1) added when building df1/df2.
DATASET.drop(index=len(DATASET) - 1, inplace=True)

In [24]:
# Inspect the suffixed column names produced by the side-by-side concat.
DATASET.columns

Index(['context_switch1', 'no_pause_before_play1', 'short_pause_before_play1',
       'long_pause_before_play1', 'hist_user_behavior_n_seekfwd1',
       'hist_user_behavior_n_seekback1', 'hist_user_behavior_is_shuffle1',
       'hour_of_day1', 'premium1', 'context_type1',
       'hist_user_behavior_reason_start1', 'hist_user_behavior_reason_end1',
       'duration1', 'us_popularity_estimate1', 'mode1', 'pca11', 'pca21',
       'pca31', 'pca41', 'pca51', 'pca61', 'pca71', 'pca81', 'pca91', 'pca101',
       'pca111', 'pca121', 'pca131', 'pca141', 'pca151', 'context_switch2',
       'no_pause_before_play2', 'short_pause_before_play2',
       'long_pause_before_play2', 'hist_user_behavior_n_seekfwd2',
       'hist_user_behavior_n_seekback2', 'hist_user_behavior_is_shuffle2',
       'hour_of_day2', 'premium2', 'context_type2',
       'hist_user_behavior_reason_start2', 'hist_user_behavior_reason_end2',
       'duration2', 'us_popularity_estimate2', 'mode2', 'pca12', 'pca22',
       'pca32',

In [0]:
# Renumber rows 0..n-1 so they align with DATA_ALL's index, then copy over the
# session identifiers and the target column.
DATASET.reset_index(drop=True, inplace=True)

session_key_columns = ['session_id', 'session_position', 'session_length']
DATASET[session_key_columns] = DATA_ALL[session_key_columns]
DATASET['skip'] = DATA_ALL['skip']

In [0]:
# Row labels of the first track of every session (session_position == 1).
first_indexes = DATASET.index[DATASET['session_position'] == 1].tolist()
#last_indexes = list(DATASET.loc[DATASET['session_position'] == DATASET['session_length']].index)

In [0]:
# Map every "2"-suffixed column name to its "1"-suffixed counterpart.
c1 = dict(zip(columns_names[1], columns_names[0]))

# The first track of a session has no previous track inside that session, so its
# "1" columns (filled from the preceding row) are overwritten with the row's own
# "2" values; the rename makes the column labels line up for the assignment.
own_features = DATASET.loc[first_indexes, columns_names[1]].rename(columns=c1)
DATASET.loc[first_indexes, columns_names[0]] = own_features

In [0]:
# Drop the session-level "2" columns; the track-level "2" columns
# (duration2, us_popularity_estimate2, mode2, pca*2) are kept.
DATASET.drop(
    columns=[
        'context_switch2',
        'no_pause_before_play2',
        'short_pause_before_play2',
        'long_pause_before_play2',
        'hist_user_behavior_n_seekfwd2',
        'hist_user_behavior_n_seekback2',
        'hist_user_behavior_is_shuffle2',
        'hour_of_day2',
        'premium2',
        'context_type2',
        'hist_user_behavior_reason_start2',
        'hist_user_behavior_reason_end2',
    ],
    inplace=True,
)

# SPLIT TRAINING / TEST

In [0]:
# `math` and `random` are imported in the notebook's import cell.

# Index rows by session id so whole sessions can be selected by label.
# Guarded so that re-running this cell does not raise KeyError once
# 'session_id' has already been moved into the index.
if DATASET.index.name != 'session_id':
    DATASET.set_index('session_id', inplace=True, drop=True)

# Share of sessions (not rows) assigned to the training split.
fraction = 0.8

# Sorted unique session ids; np.unique returns them in sorted order.
sessions_indexes = list(np.unique(DATASET.index.values))

# Number of sessions that go to training, rounded up.
train_cant = math.ceil(len(sessions_indexes) * fraction)

In [0]:
# Shuffle the session ids with a fixed seed so the train/test split is the same
# on every Restart & Run All. A dedicated Random instance is used so the global
# `random` state is left untouched.
# Note: re-running only this cell shuffles the already-shuffled list again.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(sessions_indexes)

In [0]:
# Training split: all rows of the first `train_cant` shuffled sessions.
TRAINING = DATASET.loc[sessions_indexes[:train_cant]]

In [0]:
# Test split: all rows of the remaining sessions.
TEST = DATASET.loc[sessions_indexes[train_cant:]]

In [0]:
# Move 'session_id' back from the index into a regular column in both splits.
TRAINING = TRAINING.reset_index()
TEST = TEST.reset_index()

In [0]:
#TRAINING.to_pickle('/content/drive/My Drive/Mineria de datos/datasets/ready/{}.pkl'.format('training' + str(log_index)))

In [0]:
#TEST.to_pickle('/content/drive/My Drive/Mineria de datos/datasets/ready/{}.pkl'.format('test' + str(log_index)))