In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score
import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE
from collections import Counter

In [3]:
# Load the playlist dataset from a comma-separated file.
# NOTE(review): this is an absolute path on one developer's machine, so the
# cell only runs there; consider a path relative to the notebook or a
# configurable data directory.
DATASET_PATH = "/Users/matias/Documents/Posgrado/mis_trabajos/posgrado_ia/Bimestre 3/ML2/tp_final_ML2/data_playlist.csv"
# Alternative location, presumably for running on Google Colab:
# DATASET_PATH = "/content/sample_data/data_playlist.csv"
df = pd.read_csv(DATASET_PATH, sep=",")

FileNotFoundError: [Errno 2] No such file or directory: '/Users/matias/Documents/Posgrado/mis_trabajos/posgrado_ia/AMq1/tp_final/data_playlist.csv'

In [None]:
#from google.colab import drive
#  drive.mount('/content/drive')

In [None]:
df.head(5)

In [None]:
df.describe()

Analizamos el tipo de variables

In [None]:
df.dtypes

Analizamos si tenemos datos nulos

In [None]:
df.isnull().sum()

Analizamos correlaciones entre las variables

In [None]:
# Pairwise scatter plots of every column against every other column.
# The result is bound to a name, presumably so the returned array of axes is
# not echoed as the cell output.
sm = pd.plotting.scatter_matrix(df, figsize=(20, 20))

In [None]:
def display_dataset_distributions(dataset):
    """Plot one histogram per column of ``dataset`` and show the figure.

    Parameters
    ----------
    dataset
        Object exposing a pandas-style ``hist`` method (the notebook passes a
        DataFrame).

    Returns
    -------
    None
        The histograms are rendered through ``plt.show()``.
    """
    axes_grid = dataset.hist(figsize=(22, 10), xlabelsize=12, ylabelsize=12)
    # ``hist`` returns an array of axes; enlarge the title of each subplot.
    for subplot in axes_grid.ravel():
        subplot.title.set_size(14)
    plt.tight_layout()
    plt.show()

In [None]:
display_dataset_distributions(df)

In [None]:
# Spearman rank-correlation matrix of all columns, rounded to 2 decimals.
correlation_matrix = df.corr(method='spearman').round(2)
fig, axes = plt.subplots(1, 1, figsize=(20, 8))
# Name the method in the title: the notebook draws one heatmap per correlation
# method and they cannot be told apart without a caption.
axes.set_title('Spearman correlation matrix')
sns.heatmap(data=correlation_matrix, annot=True, ax=axes);

In [None]:
# Pearson correlation matrix of all columns, rounded to 2 decimals.
correlation_matrix = df.corr(method='pearson').round(2)
fig, axes = plt.subplots(1, 1, figsize=(20, 8))
# Name the method in the title: the notebook draws one heatmap per correlation
# method and they cannot be told apart without a caption.
axes.set_title('Pearson correlation matrix')
sns.heatmap(data=correlation_matrix, annot=True, ax=axes);

In [None]:
# Kendall rank-correlation matrix of all columns, rounded to 2 decimals.
correlation_matrix = df.corr(method='kendall').round(2)
fig, axes = plt.subplots(1, 1, figsize=(20, 8))
# Name the method in the title: the notebook draws one heatmap per correlation
# method and they cannot be told apart without a caption.
axes.set_title('Kendall correlation matrix')
sns.heatmap(data=correlation_matrix, annot=True, ax=axes);

Analizamos si podemos identificar, a simple vista, features que sean categóricos

In [None]:
# For every column, print how many rows it has and how many distinct values
# it takes; a low distinct count hints at a categorical feature.
for column_name in df.columns:
    column_values = np.array(df[column_name].values)
    distinct_count = np.unique(column_values).size
    print('column:', column_name, 'rows:', len(column_values), 'y clases: ', distinct_count)

Identificamos a simple vista 3 variables categóricas

Además, creamos un mapping para TEMPO que lo transforma en una variable categórica y mapeamos cada categoría a un código binario

In [None]:
# Tempo ranges as (lower bound, upper bound) mapped to a three-character
# code made of 0s and 1s; the trailing comment names the tempo marking.
# NOTE(review): the bounds are presumably beats per minute -- confirm.
tempo_mappings = {
    (40, 60): '000',    # slow
    (60, 66): '001',    # Larghetto
    (66, 76): '010',    # Adagio
    (76, 108): '011',   # Andante
    (108, 120): '100',  # Moderato
    (120, 168): '101',  # Allegro
    (168, 200): '110',  # Presto
    (200, 216): '111',  # Prestissimo
}


def map_tempos(x):
    """Return the code of the first tempo range that contains ``x``.

    Both bounds of a range are inclusive and the ranges are tried in the
    order they are declared in ``tempo_mappings``, so a value sitting on a
    shared edge (for example 60) receives the code of the lower range.

    Parameters
    ----------
    x
        Numeric tempo value.

    Returns
    -------
    str or None
        The three-character code, or ``None`` when ``x`` lies outside every
        declared range (below 40, above 216, or NaN).
    """
    for (low, high), code in tempo_mappings.items():
        if low <= x <= high:
            return code
    return None

# Replace every numeric tempo with the code of the range it falls in.
# NOTE(review): the column is overwritten in place, so running this cell a
# second time feeds the codes (not the numbers) back into map_tempos; run it
# once per kernel session.
df['tempo'] = df['tempo'].apply(map_tempos)

df.head(5)

Ahora intentaremos categorizar las columnas 'liveness' e 'instrumentalness' para analizar nuevamente las métricas. Para ello nos valemos de los gráficos en histogramas y luego de un plot para elegir una región a partir de la cual tomaremos instrumentalness como 1 o 0.

In [None]:
df.instrumentalness.hist(bins=10)

In [None]:
# Binarise instrumentalness: values strictly above 0.8 become 1, everything
# else (including exactly 0.8) becomes 0.
df['instrumentalness'] = (df['instrumentalness'] > 0.8).astype('int64')
df.head(5)


Hacemos el mismo analisis para liveness

In [None]:
df.liveness.hist(bins=10)

In [None]:
# Binarise liveness with the same rule used for instrumentalness: values
# strictly above 0.8 become 1, everything else becomes 0.
df['liveness'] = (df['liveness'] > 0.8).astype('int64')
df.head(5)

Analizamos si el dataset esta balanceado

In [None]:
def getLabelCount(df, target):
    """Count the rows of ``df`` for each distinct value of column ``target``.

    Parameters
    ----------
    df
        DataFrame holding the column to summarise.
    target
        Name of the column whose values are counted.

    Returns
    -------
    list of tuple
        ``(value, row_count)`` pairs sorted in ascending order.
    """
    labels = df[target]
    # Summing the boolean mask counts the rows equal to each distinct value.
    counts = [(value, int((labels == value).sum())) for value in labels.unique()]
    counts.sort()
    return counts

getLabelCount(df,'label')

In [None]:
# Separate the target column from the features, then hold out 10% of the rows
# for testing; the seed is fixed so the split is repeatable.
y = df['label']
X = df.drop(columns=['label'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=2022
)

In [None]:
# Aplicamos SMOTE

In [None]:
# Oversample the training split only; the test split keeps its original class
# distribution.
# random_state is fixed to the same seed used for the split and the model:
# SMOTE generates its synthetic samples randomly, so without a seed every run
# produces a different training set and therefore different scores.
over_sampler = SMOTE(k_neighbors=2, random_state=2022)
X_res, y_res = over_sampler.fit_resample(X_train, y_train)
print(f"Training target statistics: {Counter(y_res)}")
print(f"Testing target statistics: {Counter(y_test)}")

# Aplicamos PyCaret para ver el modelo que mejor se ajustaría

In [None]:
## ------ ESTO ESTA HECHO EN EL FILE DE COLAB

In [None]:
# Random forest with trees capped at depth 10, trained on the resampled
# training data (X_res, y_res); the seed is fixed for repeatable results.
clf = RandomForestClassifier(max_depth=10, random_state=2022)
clf.fit(X_res, y_res)

In [None]:
# Predict labels for the held-out test split; the metrics cell reads y_pred.
y_pred = clf.predict(X_test)



In [None]:
# Rank the features by the importance the fitted random forest assigned them.
feature_importances_df = pd.DataFrame(
    {"feature": list(X.columns), "importance": clf.feature_importances_}
).sort_values("importance", ascending=False)

# Bar chart with one bar per feature (x axis) whose height is the importance
# score (y axis).
sns.barplot(x=feature_importances_df.feature, y=feature_importances_df.importance)
# Label each axis after the data plotted on it: feature names run along x,
# importance scores along y.
plt.xlabel("Features")
plt.ylabel("Feature Importance Score")
plt.title("Visualizing Important Features")
# Tilt the feature names so they do not overlap.
plt.xticks(
    rotation=45, horizontalalignment="right", fontweight="light", fontsize="x-large"
)
plt.show()

In [None]:
# Test-split score of the classifier, followed by recall under each
# averaging strategy.
print('score', clf.score(X_test, y_test))
for averaging in ('macro', 'micro', 'weighted'):
    print(f'{averaging} recall', recall_score(y_test, y_pred, average=averaging))

In [None]:
# df2 = X_res  
# y_res_frame = y_res.to_frame()
# df2['label'] =  y_res_frame['label']
# df2.head()

In [None]:
#df2.to_csv(DATASET_PATH+"/data_playlist_pre-processed.csv", encoding='utf-8')
# df2.to_csv("/content/sample_data/data_playlist_pre-processed.csv", encoding='utf-8')



# Grid Search de Hiperparámetros



In [None]:

# Sweep max_depth to see how tree depth affects the test score; change the
# range bounds to explore other depths.
# NOTE(review): the depth is being compared on the test split, so the best
# score printed here is optimistic; a validation split or cross-validation
# would give an unbiased choice.
for search_depth in range(1, 50):
    clf_grid_search = RandomForestClassifier(max_depth=search_depth, random_state=2022)
    clf_grid_search.fit(X_res, y_res)
    # Score directly rather than storing predictions in y_pred: that name
    # holds the predictions of `clf` read by the metrics cell, and overwriting
    # it here would make a re-run of that cell mix two different models.
    # The printed label says "nodos", but the value swept is max_depth.
    print('score con', search_depth, 'nodos->', clf_grid_search.score(X_test, y_test))
