In [1]:
import os
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
#cambiar el directorio de trabajo actual
os.chdir('/content/drive/MyDrive/datos')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree  import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from imblearn.over_sampling import RandomOverSampler

In [5]:
data=pd.read_csv('WineQT.csv')
data = data_clean(data)

In [4]:
def data_clean(dataframe):
    # Categorizar niveles de alcohol
    bins = [0, 10, 12, float('inf')]  # Ejemplo: bajos (<10), medios (10-12), altos (>12)
    labels = [1, 2, 3]

    df = (dataframe
          .drop_duplicates()  # Eliminar duplicados
          .assign(alcohol_bins=dataframe['alcohol'].pipe(pd.cut, bins=bins, labels=labels))  # Agregar categorías de alcohol
          .reset_index(drop=True)
          .astype({
              'fixed acidity': 'float32',
              'volatile acidity': 'float32',
              'citric acid': 'float32',
              'residual sugar': 'float32',
              'chlorides': 'float32',
              'free sulfur dioxide': 'uint8',
              'total sulfur dioxide': 'uint16',
              'density': 'float32',
              'pH': 'float32',
              'sulphates': 'float32',
              'alcohol': 'float32',
              'quality': 'uint8',
              'alcohol_bins': 'uint8'  # Nueva columna categórica
          }))
    return df


PREPROCESAMIENTO

In [7]:
# df.shape
print(f"Num rows: {len(data)}")
print(f"Num columns: {len(data.columns)}")

Num rows: 1143
Num columns: 14


In [8]:
# Imprimir valores únicos para cada columna
for column in data.columns:
    unique_values = data[column].unique()
    print("Unique values for column", column, ":")
    print(unique_values)
    print()

Unique values for column fixed acidity :
[ 7.4  7.8 11.2  7.9  7.3  6.7  5.6  8.5  7.6  6.9  6.3  7.1  8.3  5.2
  8.1  8.8  7.5  4.6  7.7  6.6  8.6 10.2  7.   7.2  8.   6.8  6.2  9.3
  5.   8.4 10.1  9.4  8.2  5.8  9.2  8.9  6.4 11.5  9.6 12.8  9.7  8.7
 12.  15.  10.8 11.1 10.  12.5 10.3 11.4  9.9 11.6 10.4 13.3 10.6  9.
  9.8 13.4 11.9 10.9 13.8  9.1 13.5  6.1 10.5 12.6 13.7  9.5 12.2 12.3
 15.6  5.3 11.3 13.   6.5 10.7 12.9 14.3 12.4 15.5 12.7 13.2 15.9  5.1
 11.8  4.9  5.9 11.7  5.4  5.7  6. ]

Unique values for column volatile acidity :
[0.7   0.88  0.76  0.28  0.66  0.6   0.65  0.58  0.615 0.61  0.32  0.39
 0.43  0.49  0.4   0.41  0.71  0.645 0.675 0.655 0.38  0.45  0.52  0.935
 0.5   0.51  0.42  0.59  0.69  0.735 0.725 0.705 0.785 0.75  0.625 0.3
 0.55  0.62  1.02  0.775 0.9   0.545 0.575 0.63  0.56  0.31  0.54  1.07
 0.695 1.33  0.745 1.04  0.415 0.34  0.67  0.68  0.33  0.95  0.53  0.64
 0.885 0.805 0.73  0.37  1.09  0.44  0.57  0.48  1.    0.635 0.21  0.35
 0.975 0.87  0.18  0

In [9]:
# Función para mostrar el conteo de valores por columna
def v_counts(dataframe):
    for i in dataframe:
        print(dataframe[i].value_counts())
        print("_____________________________________________________________________________")
v_counts(data)

fixed acidity
7.2     43
7.1     41
7.0     40
7.8     40
7.5     37
        ..
4.6      1
13.7     1
13.4     1
13.5     1
12.2     1
Name: count, Length: 91, dtype: int64
_____________________________________________________________________________
volatile acidity
0.600    32
0.500    32
0.430    31
0.390    29
0.580    28
         ..
1.035     1
0.565     1
0.865     1
0.965     1
0.160     1
Name: count, Length: 135, dtype: int64
_____________________________________________________________________________
citric acid
0.00    99
0.49    47
0.24    42
0.02    35
0.01    26
        ..
0.61     1
0.72     1
1.00     1
0.75     1
0.62     1
Name: count, Length: 77, dtype: int64
_____________________________________________________________________________
residual sugar
2.00    107
2.10    103
1.80     92
2.20     88
1.90     80
       ... 
7.30      1
7.20      1
2.95      1
3.65      1
4.40      1
Name: count, Length: 80, dtype: int64
_________________________________________________

VERIFICACION DE DATOS NULOS

In [10]:
data.isnull().sum()

Unnamed: 0,0
fixed acidity,0
volatile acidity,0
citric acid,0
residual sugar,0
chlorides,0
free sulfur dioxide,0
total sulfur dioxide,0
density,0
pH,0
sulphates,0


Como podemos ver no existen valores nulos dentro de los campos

VERIFICACION DE DATOS DUPLICADOS

In [14]:
print("Duplicados:", data.duplicated().sum())

Duplicados: 0


In [12]:
data.drop_duplicates(inplace=True)

In [15]:
print("Duplicados después de limpieza:", data.duplicated().sum())

Duplicados después de limpieza: 0


CLASIFICADOR

Para este seccion hemos seleccionaod el calsificador de arbol de desicion

Para este proceso lo que primero vamos a hacer es Scalar las caracteristicas(X) y separar el objetivo (Y)

In [17]:
# Separación de características (X) y objetivo (y)
X = data.drop(['quality'], axis=1)  # Usamos 'quality' como objetivo
y = data['quality']

# Balanceo de datos con RandomOverSampler
oversampler = RandomOverSampler()
X_resampled, y_resampled = oversampler.fit_resample(X, y)

In [18]:
scaler = StandardScaler()
scaler.fit(X)

VERIFICAMOS

In [19]:
scaled_features = scaler.transform(X)
X = pd.DataFrame(scaled_features,columns=data.columns[1:])
X.head(10)

Unnamed: 0,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Id,alcohol_bins
0,-0.52158,0.939332,-1.365027,-0.466421,-0.231395,-0.450466,-0.363601,0.55585,1.270695,-0.573658,-0.963382,-1.735618,-0.966826
1,-0.292593,1.941813,-1.365027,0.05006,0.234246,0.916307,0.643498,0.036167,-0.708927,0.130881,-0.593601,-1.733462,-0.966826
2,-0.292593,1.273492,-1.161568,-0.171289,0.107253,-0.05996,0.246762,0.140091,-0.325775,-0.045254,-0.593601,-1.731306,-0.966826
3,1.653789,-1.399789,1.4834,-0.466421,-0.25256,0.135294,0.429871,0.659805,-0.964362,-0.456235,-0.593601,-1.72915,-0.966826
4,-0.52158,0.939332,-1.365027,-0.466421,-0.231395,-0.450466,-0.363601,0.55585,1.270695,-0.573658,-0.963382,-1.726993,-0.966826
5,-0.52158,0.716559,-1.365027,-0.540205,-0.25256,-0.255213,-0.180492,0.55585,1.270695,-0.573658,-0.963382,-1.724837,-0.966826
6,-0.235347,0.382399,-1.059838,-0.687771,-0.379554,-0.05996,0.399353,-0.171712,-0.07034,-1.160774,-0.963382,-1.722681,-0.966826
7,-0.578826,0.660865,-1.365027,-0.982903,-0.464216,-0.05996,-0.760338,-1.107153,0.50439,-1.102062,-0.408711,-1.720525,-0.966826
8,-0.292593,0.271012,-1.263297,-0.392638,-0.294891,-0.64572,-0.851892,0.036167,0.312812,-0.514946,-0.870937,-1.718369,-0.966826
9,-0.922306,0.271012,-0.958109,-0.540205,0.213081,-0.05996,0.582462,-0.431569,-0.198057,-0.691081,-1.148272,-1.714057,-0.966826


In [21]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.20, random_state=0)

In [22]:
# Entrenamiento del modelo
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

In [23]:
print(model.score(X_train , y_train))
print(model.score(X_test, y_test))

1.0
0.8517241379310345


In [24]:
# Calcular la matriz de confusión

y_pred = model.predict(X_test)

confusion = confusion_matrix(y_test, y_pred)
print("Matriz de Confusión:")
print(confusion)


# Calcular la precisión del clasificador
accuracy = accuracy_score(y_test, y_pred)
print("Precisión del Clasificador:", accuracy)

Matriz de Confusión:
[[ 99   0   0   0   0   0]
 [  0  79   0   0   0   0]
 [  0   3  60  27   7   2]
 [  0   3  23  63  12   2]
 [  0   1   4   2 101   0]
 [  0   0   0   0   0  92]]
Precisión del Clasificador: 0.8517241379310345


PRIMERA EJECUCION

Splits: al menos 100 asignaciones, la mediana de la confiabilidad
Académico (primera ejecucion) 80(train)/20(test) – Investigación 50/50 (segunda
ejecución)

In [25]:
accuracy_list=[]
for i in range(100):
  X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.20, random_state=0)
  model= DecisionTreeClassifier()
  model.fit(X_train , y_train)
  y_pred = model.predict(X_test)
  confusion = confusion_matrix(y_test, y_pred)
  print("Matriz de Confusión:")
  print(confusion)
  accuracy = accuracy_score(y_test, y_pred)
  print("Precisión del Clasificador:", accuracy)
  accuracy_list.append(accuracy)
  print("__________________________________________________")
print(accuracy_list)
print(np.mean(accuracy_list))

Matriz de Confusión:
[[ 99   0   0   0   0   0]
 [  0  79   0   0   0   0]
 [  0   5  61  27   4   2]
 [  0   2  20  65  14   2]
 [  0   1   4   2 101   0]
 [  0   0   0   0   0  92]]
Precisión del Clasificador: 0.8568965517241379
__________________________________________________
Matriz de Confusión:
[[ 99   0   0   0   0   0]
 [  0  79   0   0   0   0]
 [  0   4  59  28   6   2]
 [  0   2  26  62  11   2]
 [  0   1   4   2 101   0]
 [  0   0   0   0   0  92]]
Precisión del Clasificador: 0.8482758620689655
__________________________________________________
Matriz de Confusión:
[[ 99   0   0   0   0   0]
 [  0  79   0   0   0   0]
 [  0   3  60  28   6   2]
 [  0   3  20  65  13   2]
 [  0   1   4   2 101   0]
 [  0   0   0   0   0  92]]
Precisión del Clasificador: 0.8551724137931035
__________________________________________________
Matriz de Confusión:
[[ 99   0   0   0   0   0]
 [  0  79   0   0   0   0]
 [  0   3  59  30   4   3]
 [  0   2  26  63  11   1]
 [  0   1   5   2 100   0

SEGUNDA EJECUCION

In [30]:
accuracy_list=[]
for i in range(100):
  X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.50, random_state=0)
  model= DecisionTreeClassifier()
  model.fit(X_train , y_train)
  y_pred = model.predict(X_test)
  confusion = confusion_matrix(y_test, y_pred)
  print("Matriz de Confusión:")
  print(confusion)
  accuracy = accuracy_score(y_test, y_pred)
  print("Precisión del Clasificador:", accuracy)
  accuracy_list.append(accuracy)
  print("__________________________________________________")
print(accuracy_list)
print(np.mean(accuracy_list))


Matriz de Confusión:
[[230   0   0   0   0   0]
 [  0 234   0   0   0   0]
 [  0  18 156  56  16   0]
 [  0  14  66 112  48   6]
 [  0   0   7  20 206   7]
 [  0   0   0   0   0 253]]
Precisión del Clasificador: 0.8219461697722568
__________________________________________________
Matriz de Confusión:
[[230   0   0   0   0   0]
 [  0 234   0   0   0   0]
 [  0  15 165  52  14   0]
 [  0  18  68 113  41   6]
 [  0   0   7  21 207   5]
 [  0   0   0   0   0 253]]
Precisión del Clasificador: 0.8295376121463078
__________________________________________________
Matriz de Confusión:
[[230   0   0   0   0   0]
 [  0 234   0   0   0   0]
 [  1  17 165  47  16   0]
 [  2  17  61 113  47   6]
 [  0   0   4  17 214   5]
 [  0   0   0   0   0 253]]
Precisión del Clasificador: 0.8343685300207039
__________________________________________________
Matriz de Confusión:
[[230   0   0   0   0   0]
 [  0 234   0   0   0   0]
 [  0  17 156  54  19   0]
 [  0  14  74 111  43   4]
 [  0   0   8  14 211   7