In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report
from sklearn.metrics import make_scorer, accuracy_score, roc_auc_score 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GroupShuffleSplit 
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import top_k_accuracy_score

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Abro datasets y hago split en base a artistas al azar. Genero sets de entrenamiento, validación y testeo.

In [None]:
df_train = pd.read_parquet('/content/drive/MyDrive/ODD tp 1 dataset/train.parquet')

In [None]:
df_test = pd.read_parquet('/content/drive/MyDrive/ODD tp 1 dataset/test.parquet')

In [None]:
splitter = GroupShuffleSplit(test_size=.20, n_splits=2, random_state = 26)
split = splitter.split(df_train, groups=df_train['artist'])
train_inds, valid_inds = next(split)

train = df_train.iloc[train_inds]
valid = df_train.iloc[valid_inds]

In [None]:
y_train = train.genre
X_train = train.drop(["track_name", "a_genres", "did", "genre", 'artist'], axis = 1)

In [None]:
y_valid = valid.genre
X_valid = valid.drop(["track_name", "a_genres", "did", "genre", 'artist'], axis = 1)

In [None]:
y_test = df_test.genre
X_test = df_test.drop(["track_name", "a_genres", "did", "genre", 'artist'], axis = 1)

# Imputación de nulos + OHE

In [None]:
X_valid['language'] = X_valid['language'].fillna(X_train['language'].mode()[0])
X_train['language'] = X_train['language'].fillna(X_train['language'].mode()[0])
X_test['language'] = X_test['language'].fillna(X_train['language'].mode()[0])

In [None]:
encoder = OneHotEncoder(drop='first', handle_unknown='ignore' )
lenguaje_train = encoder.fit_transform(X_train[['language']]).todense().astype(int)
encoder.get_feature_names_out()

array(['language_de', 'language_en', 'language_es', 'language_fr',
       'language_ga', 'language_gl', 'language_is', 'language_it',
       'language_pt', 'language_rw'], dtype=object)

In [None]:
lenguaje_valid = encoder.transform(X_valid[['language']]).todense().astype(int)
encoder.get_feature_names_out()



array(['language_de', 'language_en', 'language_es', 'language_fr',
       'language_ga', 'language_gl', 'language_is', 'language_it',
       'language_pt', 'language_rw'], dtype=object)

In [None]:
lenguaje_test = encoder.transform(X_test[['language']]).todense().astype(int)
encoder.get_feature_names_out()

array(['language_de', 'language_en', 'language_es', 'language_fr',
       'language_ga', 'language_gl', 'language_is', 'language_it',
       'language_pt', 'language_rw'], dtype=object)

In [None]:
X_train = X_train.reset_index().drop(['index', 'language'], axis=1).join(pd.DataFrame(lenguaje_train, columns = ('language_de', 'language_en', 'language_es', 'language_fr',
       'language_ga', 'language_gl', 'language_is', 'language_it',
       'language_pt', 'language_rw')))
X_valid = X_valid.reset_index().drop(['index', 'language'], axis=1).join(pd.DataFrame(lenguaje_valid, columns = ('language_de', 'language_en', 'language_es', 'language_fr',
       'language_ga', 'language_gl', 'language_is', 'language_it',
       'language_pt', 'language_rw')))
X_test = X_test.reset_index().drop(['index', 'language'], axis=1).join(pd.DataFrame(lenguaje_test, columns = ('language_de', 'language_en', 'language_es', 'language_fr',
       'language_ga', 'language_gl', 'language_is', 'language_it',
       'language_pt', 'language_rw')))

In [None]:
X_valid['s-label'] = X_valid['s-label'].fillna(X_train['s-label'].median())
X_train['s-label'] = X_train['s-label'].fillna(X_train['s-label'].median())
X_test['s-label'] = X_test['s-label'].fillna(X_train['s-label'].median())

In [None]:
key_train =  encoder.fit_transform(X_train[['key']]).todense().astype(int)
encoder.get_feature_names_out()

array(['key_A#', 'key_B', 'key_C', 'key_C#', 'key_D', 'key_D#', 'key_E',
       'key_F', 'key_F#', 'key_G', 'key_G#'], dtype=object)

In [None]:
key_valid =  encoder.transform(X_valid[['key']]).todense().astype(int)
encoder.get_feature_names_out()

array(['key_A#', 'key_B', 'key_C', 'key_C#', 'key_D', 'key_D#', 'key_E',
       'key_F', 'key_F#', 'key_G', 'key_G#'], dtype=object)

In [None]:
key_test =  encoder.transform(X_test[['key']]).todense().astype(int)
encoder.get_feature_names_out()

array(['key_A#', 'key_B', 'key_C', 'key_C#', 'key_D', 'key_D#', 'key_E',
       'key_F', 'key_F#', 'key_G', 'key_G#'], dtype=object)

In [None]:
X_train = X_train.reset_index().drop(['index', 'key'], axis=1).join(pd.DataFrame(key_train, columns = ('key_A#', 'key_B', 'key_C', 'key_C#', 'key_D', 'key_D#', 'key_E',
       'key_F', 'key_F#', 'key_G', 'key_G#')))
X_valid = X_valid.reset_index().drop(['index', 'key'], axis=1).join(pd.DataFrame(key_valid, columns = ('key_A#', 'key_B', 'key_C', 'key_C#', 'key_D', 'key_D#', 'key_E',
       'key_F', 'key_F#', 'key_G', 'key_G#')))
X_test = X_test.reset_index().drop(['index', 'key'], axis=1).join(pd.DataFrame(key_test, columns = ('key_A#', 'key_B', 'key_C', 'key_C#', 'key_D', 'key_D#', 'key_E',
       'key_F', 'key_F#', 'key_G', 'key_G#')))

# Mean Encoding

In [None]:
X_train.loc[(X_train['mode'] == 'Major'),'mode']=int(1)
X_train.loc[(X_train['mode'] == 'Minor'),'mode']=int(0)

X_valid.loc[(X_valid['mode'] == 'Major'),'mode']=int(1)
X_valid.loc[(X_valid['mode'] == 'Minor'),'mode']=int(0)

X_test.loc[(X_test['mode'] == 'Major'),'mode']=int(1)
X_test.loc[(X_test['mode'] == 'Minor'),'mode']=int(0)

In [None]:
X_train['mode'] = X_train['mode'].astype(int)
X_valid['mode'] = X_valid['mode'].astype(int)
X_test['mode'] = X_test['mode'].astype(int)

In [None]:
mean_encoding = X_train.groupby(['time_signature'])['mode'].mean().to_dict()
  
X_train['time_signature'] =  X_train['time_signature'].map(mean_encoding)
X_valid['time_signature'] =  X_train['time_signature'].map(mean_encoding)
X_test['time_signature'] =  X_train['time_signature'].map(mean_encoding)

# TF-IDF Vectorizer

In [None]:
X_train["lyric"] = X_train["lyric"].fillna("")
X_valid["lyric"] = X_valid["lyric"].fillna("")
X_test["lyric"] = X_test["lyric"].fillna("")

In [None]:
countIDF = TfidfVectorizer(lowercase=True, stop_words='english', max_features=50)

In [None]:
vectorizado_train = countIDF.fit_transform(X_train["lyric"]).todense().astype(int)
vectorizado_valid = countIDF.transform(X_valid["lyric"]).todense().astype(int)
vectorizado_test = countIDF.transform(X_test["lyric"]).todense().astype(int)

In [None]:
X_train = X_train.reset_index().drop(['index', 'lyric'], axis=1).join(pd.DataFrame(vectorizado_train))
X_valid = X_valid.reset_index().drop(['index', 'lyric'], axis=1).join(pd.DataFrame(vectorizado_valid))
X_test = X_test.reset_index().drop(['index', 'lyric'], axis=1).join(pd.DataFrame(vectorizado_test))

In [None]:
escalador = MinMaxScaler()

X_train_escalado = escalador.fit_transform(X_train)
X_train = pd.DataFrame(X_train_escalado, columns = X_train.columns)

X_valid_escalado = escalador.transform(X_valid)
X_valid = pd.DataFrame(X_valid_escalado, columns = X_valid.columns)

X_test_escalado = escalador.transform(X_test)
X_test = pd.DataFrame(X_test_escalado, columns = X_test.columns)



# Busco hiper parametros y hago modelo de XGBoost

In [None]:
# Busqueda de hiperparametros

parametros = {
    'learning_rate':[0.05, 0.1, 0.2, 0.3,0.5],
     'n_estimators':[15, 30, 50, 75, 100],
     'max_depth':[2,3,5,8,10],
    'colsample_bytree':[0.1,0.3,0.5,1],
    'gamma':[0.5,1,3,5, 7]
    }

xgb = XGBClassifier(random_state = 26)
resultados = RandomizedSearchCV(xgb, parametros, cv=3, scoring='accuracy', n_iter=40, n_jobs=1, verbose=10, random_state=26)
resultados.fit(X_train, y_train)
print(resultados.best_estimator_)
print(resultados.best_params_)

Fitting 3 folds for each of 40 candidates, totalling 120 fits
[CV 1/3; 1/40] START colsample_bytree=0.1, gamma=1, learning_rate=0.3, max_depth=8, n_estimators=30




[CV 1/3; 1/40] END colsample_bytree=0.1, gamma=1, learning_rate=0.3, max_depth=8, n_estimators=30;, score=0.338 total time=  14.6s
[CV 2/3; 1/40] START colsample_bytree=0.1, gamma=1, learning_rate=0.3, max_depth=8, n_estimators=30
[CV 2/3; 1/40] END colsample_bytree=0.1, gamma=1, learning_rate=0.3, max_depth=8, n_estimators=30;, score=0.259 total time=  13.1s
[CV 3/3; 1/40] START colsample_bytree=0.1, gamma=1, learning_rate=0.3, max_depth=8, n_estimators=30
[CV 3/3; 1/40] END colsample_bytree=0.1, gamma=1, learning_rate=0.3, max_depth=8, n_estimators=30;, score=0.350 total time=  13.2s
[CV 1/3; 2/40] START colsample_bytree=0.1, gamma=5, learning_rate=0.2, max_depth=3, n_estimators=100
[CV 1/3; 2/40] END colsample_bytree=0.1, gamma=5, learning_rate=0.2, max_depth=3, n_estimators=100;, score=0.382 total time=  29.5s
[CV 2/3; 2/40] START colsample_bytree=0.1, gamma=5, learning_rate=0.2, max_depth=3, n_estimators=100
[CV 2/3; 2/40] END colsample_bytree=0.1, gamma=5, learning_rate=0.2, max_

In [None]:
# Resultados con hiperparametros default 

model = XGBClassifier(random_state = 26)
modelo = model.fit(X_train,y_train)

In [None]:
print(f'El mejor score de validación obtenido con hiperparametros default es {top_k_accuracy_score(y_valid, model.predict_proba(X_valid), k=2, labels=modelo.classes_)}')

El mejor score de validación obtenido con hiperparametros default es 0.5241682360326428


In [None]:
# Prueba con hiperparametros buscados por RandomSearch

model_posta2 = XGBClassifier(colsample_bytree=0.5, gamma=0.5, learning_rate=0.5, max_depth=8,
              objective='multi:softprob', random_state=26)
modelo_postalina2 = model_posta2.fit(X_train,y_train)

In [None]:
print(f'El mejor score de validación obtenido con los hiperparametros buscados por RandomSearch es {top_k_accuracy_score(y_valid, model_posta2.predict_proba(X_valid), k=2, labels=modelo_postalina2.classes_)}')

El mejor score de validación obtenido con los hiperparametros buscados por RandomSearch es 0.4844632768361582


# Para el mejor modelo de ambos, ¿cuál es el score en test?

In [None]:
# Uso el modelo de XG Boost con parametros default porque es el que mejor resultado en validacion dio de todos los modelos

print(f'El score en test es {top_k_accuracy_score(y_test, model.predict_proba(X_test), k=2, labels=modelo.classes_)}') 

El score en test es 0.478334461746784
