# Import

## Setup

In [195]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display, Markdown

sns.set_theme(style="darkgrid")
pd.set_option('display.max_columns', None)  

import sys, os, yaml

DATASET = "Apple-Quality"
COLAB = 'google.colab' in sys.modules

DEBUG = False
SEED = 666

In [196]:
COLAB = 'google.colab' in sys.modules

if COLAB:
  from google.colab import drive
  if not os.path.isdir("/content/gdrive"):
    drive.mount("/content/gdrive")
    d = "/content/gdrive/MyDrive/datasets"
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d)
  ROOT = f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/"
  if not os.path.isdir(ROOT): os.makedirs(ROOT)
else:
  ROOT = "./"

def makedirs(d):
  if COLAB:
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d)
  else:
    if not os.path.isdir(ROOT+d): os.makedirs(ROOT+d, mode=0o777, exist_ok=True)

for d in ['orig','data','output']: makedirs(d)

## Load Dataset

In [197]:
df = pd.read_pickle(f"{ROOT}/data/apple.pickle")

In [198]:
print(df.shape)
df.head()

(4000, 8)


Unnamed: 0,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity,Target
0,-1.798424,-0.950373,2.993421,-1.42415,0.690545,-0.089872,-0.269415,0
1,-0.35906,-1.154404,2.127698,0.429746,0.176767,0.19702,-0.378997,0
2,0.109445,-0.225759,-0.652507,-0.946892,1.205422,-0.286156,1.206044,1
3,-0.079977,-0.800146,0.923916,-0.772399,1.619575,-2.08732,0.338315,0
4,0.968573,-0.19164,0.044164,-1.096894,1.305025,-0.961548,0.201472,0


# Baseline Model

In [199]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import roc_curve, auc

## Split

In [200]:
X = df.drop('Target', axis=True)
X.head()

Unnamed: 0,Size,Weight,Sweetness,Crunchiness,Juiciness,Ripeness,Acidity
0,-1.798424,-0.950373,2.993421,-1.42415,0.690545,-0.089872,-0.269415
1,-0.35906,-1.154404,2.127698,0.429746,0.176767,0.19702,-0.378997
2,0.109445,-0.225759,-0.652507,-0.946892,1.205422,-0.286156,1.206044
3,-0.079977,-0.800146,0.923916,-0.772399,1.619575,-2.08732,0.338315
4,0.968573,-0.19164,0.044164,-1.096894,1.305025,-0.961548,0.201472


In [201]:
y = df.Target
y.head()

0    0
1    0
2    1
3    0
4    0
Name: Target, dtype: int64

In [202]:
from sklearn.model_selection import train_test_split 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)



## Train

In [203]:
classifiers = {
    "KNN" : KNeighborsClassifier(),
    "KNN(3)" : KNeighborsClassifier(3),
    "SVC" : SVC(),
    "LGBM" : LGBMClassifier(),
    "RF" : RandomForestClassifier(),
    "RF(n_est = 300)" : RandomForestClassifier(n_estimators=300),
    "XGB": XGBClassifier(),
}

## Test

In [204]:
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

for name, model in classifiers.items():
    
    model.fit(X_train, y_train)
    
    # Scoring on SEEN data - effectively "useless"
    y_pred = model.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_pred)
    
    # Scoring on UNSEEN data - important
    y_pred = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    
    print(f"{name:20s} accuracy\ttrain = {train_accuracy:.2%} \ttest = {test_accuracy:.2%}")

KNN                  accuracy	train = 92.81% 	test = 91.50%
KNN(3)               accuracy	train = 94.03% 	test = 89.75%


SVC                  accuracy	train = 90.22% 	test = 90.25%
[LightGBM] [Info] Number of positive: 1586, number of negative: 1614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000197 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1785
[LightGBM] [Info] Number of data points in the train set: 3200, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495625 -> initscore=-0.017500
[LightGBM] [Info] Start training from score -0.017500
LGBM                 accuracy	train = 99.12% 	test = 89.62%
RF                   accuracy	train = 100.00% 	test = 90.50%
RF(n_est = 300)      accuracy	train = 100.00% 	test = 92.25%
XGB                  accuracy	train = 100.00% 	test = 90.75%


## Resultados

Foi obtido bons resultados (>0.9) já na primeira abordagem. Isso indica que o dataset está bem equilibrado e parametrizado. Para uma possível melhora nos resultados será necessário uma manipulação nos Hyperparamters e Avaliação das Features mais significantes.

https://www.kaggle.com/code/alkidiarete/apple-quality-roc-0-97

# Questões

**Pergunta:** Qual é a diferença entre dados de treinamento e dados de teste ao treinar um modelo de classificação?


Dados de treinamento são usados para treinar o modelo, enquanto dados de teste são usados para avaliar sua performance e capacidade de generalização para novos dados.

**Pergunta:** Com base no dataset fornecido, a abordagem do treinamento do modelo deve ser `supervisionada` ou `não-supervisionada`? Por quê?


Deve ser supervisionada porque o conjunto de dados fornecido está rotulado, o que significa que já há o "gabarito"  associados às entradas.