In [70]:
import plotly
import pandas as pd
# now we train a decision tree on the columns of interest
from sklearn import tree
from sklearn import metrics
import plotly.graph_objects as go
import numpy as np
from scipy import special,stats
from sklearn import ensemble
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score



In [71]:
df = pd.read_csv('winequality-red.csv')# download dataset -> https://www.kaggle.com/datasets/piyushgoyal443/red-wine-dataset

# since the dataset contains the target variable in a range between 3 and 8, we map them from 1 to 5.
quality_mapping = {
    3: 0,
    4: 1,
    5: 2,
    6: 3,
    7: 4,
    8: 5
}

df.loc[:, 'quality'] = df.quality.map(quality_mapping)

# split the dataset in two portions, training and test sets. 
# since the dataset has 1599 examples, we'll use 1000 for training and 599 for test
# we use frac=1 to shuffle the data and reset the index
df = df.sample(frac=1).reset_index(drop=True)

df_train = df.head(1000)
df_test = df.tail(599)


In [72]:
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,8.2,1.00,0.09,2.3,0.065,7.0,37.0,0.99685,3.32,0.55,9.0,3
1,6.8,0.64,0.03,2.3,0.075,14.0,31.0,0.99545,3.36,0.58,10.4,3
2,7.0,0.69,0.08,1.8,0.097,22.0,89.0,0.99590,3.34,0.54,9.2,3
3,11.6,0.41,0.58,2.8,0.096,25.0,101.0,1.00024,3.13,0.53,10.0,2
4,6.8,0.63,0.12,3.8,0.099,16.0,126.0,0.99690,3.28,0.61,9.5,2
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.1,0.64,0.02,2.4,0.069,26.0,46.0,0.99358,3.47,0.45,11.0,2
1595,6.9,0.39,0.24,2.1,0.102,4.0,7.0,0.99462,3.44,0.58,11.4,1
1596,7.1,0.56,0.14,1.6,0.078,7.0,18.0,0.99592,3.27,0.62,9.3,2
1597,7.5,0.52,0.42,2.3,0.087,8.0,38.0,0.99720,3.58,0.61,10.5,3


In [73]:
fig = go.Figure(data=[go.Table(
    header=dict(values=list(df.columns),
                align='left'),
    cells=dict(values=[df[col] for col in df.columns],
               align='left'))
])

fig.show()

## 1 - Simple model to explain overfitting 

In [92]:

cols = [
    'fixed acidity', 'volatile acidity', 'citric acid','residual sugar', 'chlorides', 'free sulfur dioxide',
    'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol',
]

# train the model
clf = tree.DecisionTreeClassifier(max_depth=3) 
clf.fit(df_train[cols], df_train.quality)

# create predictions
train_predictions = clf.predict(df_train[cols])
test_predictions = clf.predict(df_test[cols])

# compute accuracy
train_accuracy = metrics.accuracy_score(df_train.quality, train_predictions)
test_accuracy = metrics.accuracy_score(df_test.quality, test_predictions)

print(f"Train accuracy: {round(train_accuracy, 3)}")
print(f"Test accuracy: {round(test_accuracy, 3)}")

Train accuracy: 0.597
Test accuracy: 0.568


In [101]:
train_accs = []
test_accs = []

# init a loop where we dynamically change the value of max_depth
for depth in range(1, 200):
    clf = tree.DecisionTreeClassifier(max_depth=depth)
    clf.fit(df_train[cols], df_train.quality)
    train_predictions = clf.predict(df_train[cols])
    test_predictions = clf.predict(df_test[cols])
    
    train_acc = metrics.accuracy_score(df_train.quality, train_predictions)
    test_acc = metrics.accuracy_score(df_test.quality, test_predictions)
    
    # append the accuracies to the lists
    train_accs.append(train_acc)
    test_accs.append(test_acc)
    


In [103]:
# plot the data
fig = go.Figure()

# Add the train accuracy line trace
fig.add_trace(go.Scatter(x=list(range(0, 30, 1)), y=train_accs, mode='lines', name='train accuracy'))

# Add the test accuracy line trace
fig.add_trace(go.Scatter(x=list(range(0, 30, 1)), y=test_accs, mode='lines', name='test accuracy'))

# Update layout
fig.update_layout(
    title='Train and Test Accuracy vs. max_depth - Decision Tree',
    xaxis_title='max_depth',
    yaxis_title='accuracy',
    legend=dict(x=0, y=1, bgcolor='rgba(255, 255, 255, 0.5)'),
    font=dict(size=15),
    width=800,
    height=500
)

# Show the figure
fig.show()

## 2 - Complex Model to dig into performance and real business value

In [95]:
new_df  = df.copy()
skew_df = new_df[columns].skew().to_frame().rename(columns={0:"Skewness"})

columns = df.columns.tolist()
columns.remove("quality")
skewness_transformation = {}

for col in columns:
    transformed_log = np.log(df[col])                        # Log Transformation
    transformed_boxcox = special.boxcox1p(df[col], 0.15)     # Box-Cox Transformation with lambda=0.15
    transformed_inverse = 1 / df[col]                        # Inverse Transformation
    transformed_yeojohnson, _ = stats.yeojohnson(df[col])    # Yeo-Johnson Transformation
    transformed_cbrt = np.cbrt(df[col])                      # Cube Root Transformation

    # Create a dictionary for the skewness values of each transformation
    transformation_skewness = {
        "Log Transformation": stats.skew(transformed_log),
        "Box-Cox Transformation": stats.skew(transformed_boxcox),
        "Inverse Transformation": stats.skew(transformed_inverse),
        "Yeo Johnson Transformation": stats.skew(transformed_yeojohnson),
        "Cube Root Transformation": stats.skew(transformed_cbrt)}

    # Store the transformation skewness values for the column
    
skewness_transformation[col] = transformation_skewness
result_df = pd.DataFrame.from_dict(skewness_transformation, orient='index')
result_df = pd.concat([skew_df["Skewness"], result_df], axis=1)

for col in columns:
    transformed_col,_ = stats.yeojohnson(df[col])
    df[col] = transformed_col


divide by zero encountered in log


invalid value encountered in subtract



In [1]:
df["quality"].unique()

NameError: name 'tree' is not defined

In [97]:
#Condition of Splitting: If quality > 6.5 => "good" ELSE => "bad"

bin_edges = [0,6.5,10]
group_names = ["Bad","Good"]

df["quality"] = pd.cut(df["quality"], bins=bin_edges, labels=group_names)
df["quality"].unique()

[NaN]
Categories (2, object): ['Bad' < 'Good']

In [98]:
df["quality"] = df["quality"].replace({"Bad":0,"Good":1})

In [54]:
df.sample(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
730,0.967431,0.39348,0.265471,0.456221,0.045429,2.900674,4.856523,0.057393,1.157232,0.212592,0.268967,0
1451,0.953699,0.27339,0.265471,0.430199,0.037887,1.704353,2.255848,0.057392,1.145198,0.205677,0.268995,0
301,0.985192,0.246567,0.318722,0.444508,0.040956,1.45634,1.991797,0.057393,1.132822,0.216684,0.26899,0
456,0.961815,0.535426,-0.0,0.461218,0.045278,2.239687,2.931184,0.057393,1.154586,0.194115,0.268974,0
180,1.024424,0.290411,0.451752,0.447288,0.04343,2.085786,3.050348,0.057393,1.117206,0.198363,0.268967,0


In [90]:
df2 = df.drop(columns="quality")
df2 = df2.dropna()
n_splits = 5  # Set the number of folds
stratified_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

In [91]:
for train_index, test_index in stratified_kfold.split(df2, df["quality"]):
    x_train, x_test = df2.iloc[train_index], df2.iloc[test_index]
    y_train, y_test = df["quality"].iloc[train_index], df["quality"].iloc[test_index]

ValueError: Input y contains NaN.

10