In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import resample
from sklearn.datasets import load_iris, load_breast_cancer, load_wine
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score,precision_score, recall_score
import nltk
import pickle
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
# Establecer semilla para reproducibilidad
np.random.seed(42)

In [2]:

def gini_impurity(y):
    """Calcular la impureza Gini de un array de etiquetas"""
    if len(y) == 0:
        return 0
    # Proporción de cada clase
    clases = np.unique(y)
    probabilities = [np.mean(y == c) for c in clases]
    gini = 1 - sum(p ** 2 for p in probabilities)
    return gini

def entropy(y):
    """Calcular la entropía de un array de etiquetas"""
    if len(y) == 0:
        return 0
    # Proporción de cada clase
    clases = np.unique(y)
    probabilities = [np.mean(y == c) for c in clases]
    # Usamos log2 para calcular la entropía
    entropy_value = -sum(p * np.log2(p) for p in probabilities if p > 0)
    return entropy_value

def information_gain(y, left_indices, right_indices, impurity_function=gini_impurity):
    """Calcular la ganancia de información de una división utilizando impureza Gini o entropía"""
    parent_impurity = impurity_function(y)

    # Subconjuntos izquierdo y derecho
    left_impurity = impurity_function(y[left_indices])
    right_impurity = impurity_function(y[right_indices])

    # Peso de los subconjuntos izquierdo y derecho
    left_weight = len(left_indices) / len(y)
    right_weight = len(right_indices) / len(y)

    # Impureza ponderada de los hijos
    weighted_impurity = left_weight * left_impurity + right_weight * right_impurity

    # Ganancia de información
    info_gain = parent_impurity - weighted_impurity
    return info_gain

class DecisionTree:
    def __init__(self, max_depth=None, impurity_function=gini_impurity, min_samples_split=2,
                 min_samples_leaf=1):
        self.max_depth = max_depth
        self.impurity_function = impurity_function
        self.tree = None

        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
    def fit(self, X, y):
        """Ajustar el árbol de decisión a los datos"""
        self.tree = self._build_tree(X, y, depth=0)

    def _build_tree(self, X, y, depth):
        n_samples, n_features = X.shape
        n_labels = len(np.unique(y))

        if (n_samples < self.min_samples_split
            or len(np.unique(y)) == 1
            or (self.max_depth and depth == self.max_depth)):
            return np.bincount(y).argmax()
        # Si ya es un nodo hoja o alcanzamos la profundidad máxima ya no dividimos
        if n_labels == 1 or (self.max_depth and depth == self.max_depth):
            return np.unique(y)[0]

        # Buscamos el atributo que gane más información
        best_gain = -1 # Valor de inicio
        best_split = None
        best_left_indices = None
        best_right_indices = None

        # Por cada caracterísitca o dimensión
        for feature in range(n_features):
            # Limitar cantidad de thresholds por feature usando percentiles
            # Esto para limitar la cantidad de separaciones
            valores = np.percentile(X[:,feature], np.linspace(5, 95, 19))
            valores = np.unique(valores)
            # Sacamos las ramas
            for i in range(1, len(valores)):
                # Para que funcione con datos continuos sin la necesidad de discretizar
                threshold = (valores[i - 1] + valores[i]) / 2
                left_indices = np.where(X[:, feature] <= threshold)[0]
                right_indices = np.where(X[:, feature] > threshold)[0]
                # Descarta divisiones que dejen muy pocas muestras en una rama
                if (len(left_indices) < self.min_samples_leaf
                    or len(right_indices) < self.min_samples_leaf):
                    continue

                if len(left_indices) == 0 or len(right_indices) == 0:
                    continue

                gain = information_gain(y, left_indices, right_indices, self.impurity_function)

                if gain > best_gain:
                    best_gain = gain
                    best_split = {
                        "feature": feature,
                        "value": threshold
                    }
                    best_left_indices = left_indices
                    best_right_indices = right_indices
             # Si no se encontró una mejor división, retornamos la clase mayoritaria
        if best_gain == -1:
            return np.bincount(y).argmax()

        # Construir recursivamente las ramas
        left_subtree = self._build_tree(X[best_left_indices], y[best_left_indices], depth + 1)
        right_subtree = self._build_tree(X[best_right_indices], y[best_right_indices], depth + 1)
        return {
        "feature": best_split["feature"],
        "value": best_split["value"],
        "left": left_subtree,
        "right": right_subtree
        }

    def predict_one(self, x, node=None):
        """Predecir una muestra individual"""
        if node is None:
            node = self.tree

        if not isinstance(node, dict):
            return node

        feature = node["feature"]
        value = node["value"]

        if x[feature] <= value:
            return self.predict_one(x, node["left"])
        else:
            return self.predict_one(x, node["right"])
    def get_params(self, deep=True):
        """Obtener los parámetros del modelo"""
        return {"max_depth": self.max_depth, "impurity_function": self.impurity_function}

    def set_params(self, **params):
        """Configurar los parámetros del modelo"""
        for param, value in params.items():
            setattr(self, param, value)
        return self

    def predict(self, X):
        """Predecir múltiples muestras"""
        return np.array([self.predict_one(x) for x in X])




In [3]:
dataset_path = "./data/spotify_dataset.csv"
df = pd.read_csv(dataset_path)
print(df.columns)
#print(df["Explicit"].unique)
#print(df["Danceability"].unique)
print(df["Loudness (db)"].unique)


Index(['Artist(s)', 'song', 'text', 'Length', 'emotion', 'Genre', 'Album',
       'Release Date', 'Key', 'Tempo', 'Loudness (db)', 'Time signature',
       'Explicit', 'Popularity', 'Energy', 'Danceability', 'Similar Artist 1'],
      dtype='object')
<bound method Series.unique of 0        -13.78db
1        -10.54db
2        -11.63db
3        -10.26db
4        -10.46db
           ...   
65102     -8.38db
65103     -5.61db
65104        -6db
65105     -8.44db
65106     -12.7db
Name: Loudness (db), Length: 65107, dtype: object>


In [4]:
import nltk
# Comprueba ruta:
print(nltk.data.path[0]) 
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')


/home/pioran/nltk_data


[nltk_data] Downloading package punkt to /home/pioran/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/pioran/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /home/pioran/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:

from text_preprocessing import TextPreprocessor, TFIDFVectorizer

preprocessor = TextPreprocessor()
# Juntar columnas
columns_to_use = ['text', 'song', 'Artist(s)', 'Album', 'Similar Artist 1', 'Genre']
df['combined_text'] = df[columns_to_use].fillna('').agg(' '.join, axis=1)

preprocessor = TextPreprocessor()
df['text_processed'] = df['combined_text'].apply(preprocessor.preprocess)

vectorizer = TFIDFVectorizer(max_features=1000)
X = vectorizer.fit_transform(df['text_processed'].tolist())



In [None]:
pd.DataFrame(X).to_csv("X_tfidf_las_df.csv", index=False)

: 

In [None]:
df['Loudness (db)'] = df['Loudness (db)'].astype(str).str.replace('db', '', regex=False)
df['Loudness (db)'] = pd.to_numeric(df['Loudness (db)'], errors='coerce')
# Asegurarse que tenga data
df[['Danceability', 'Loudness (db)']] = df[['Danceability', 'Loudness (db)']].fillna(0)
scaler = StandardScaler()
X_num = scaler.fit_transform(df[['Danceability', 'Loudness (db)']])

X = pd.read_csv("X_tfidf_las_df.csv").values
X = np.concatenate([X, X_num], axis=1)



In [None]:
print(X.shape)

(65107, 1002)


In [None]:
df['Explicit_binary'] = df['Explicit'].map({'Yes': 1, 'No': 0})
y = df['Explicit_binary'].values

depths = [10]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


for depth in depths:
    tree = DecisionTree(max_depth=depth)
    tree.fit(X_train, y_train)
    y_pred = tree.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    # Mostrar las métricas
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(cm)
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Not Explicit', 'Explicit'],
                yticklabels=['Not Explicit', 'Explicit'])

    # Etiquetas personalizadas
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'DT C type Confusion Matrix (Depth={depth}) - Explicit Content Detection')

    plt.tight_layout()

    # Guardar imagen
    plt.savefig( str(depth) + "confusion_matrix_C_type.png")
    plt.close()

    with open(f"decision_tree_model_depth_{depth}.pkl", "wb") as f:
        pickle.dump(tree, f)


    

Accuracy: 0.8794
Precision: 0.8292
F1 Score: 0.7514
[[9077  489]
 [1082 2374]]
Accuracy: 0.8846
Precision: 0.8245
F1 Score: 0.7675
[[9038  528]
 [ 975 2481]]
Accuracy: 0.8829
Precision: 0.8173
F1 Score: 0.7653
[[9010  556]
 [ 969 2487]]
Accuracy: 0.8827
Precision: 0.8180
F1 Score: 0.7647
[[9014  552]
 [ 975 2481]]
Accuracy: 0.8820
Precision: 0.8166
F1 Score: 0.7631
[[9010  556]
 [ 981 2475]]
Accuracy: 0.8796
Precision: 0.8099
F1 Score: 0.7588
[[8987  579]
 [ 989 2467]]
Accuracy: 0.8787
Precision: 0.7995
F1 Score: 0.7602
[[8938  628]
 [ 952 2504]]


Accuracy: 0.8829
Precision: 0.8173
F1 Score: 0.7653
[[9010  556]
 [ 969 2487]]
