In [1]:
# import general libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# import helper functions
from src.utils.DataSetup import DataSetup
import src.model.modeling
# set up plot parameters
plt.rcParams.update({'font.size': 16})


# Data Cleaning

## Wine Data set-up

In [2]:
# load wine data
wine_data = pd.read_csv('../data/wine-data/wine.data')
wine_data.columns = ['class','alcohol','malic_acid','ash','alcalinity_of_ash','magnesium',
                     'total_phenols','flavanoids','nonflavanoid_phenols','proanthocyanins',
                     'color_intensity','hue','OD280/OD315_of_diluted_wines','proline']

In [3]:
# initialize data setup object to split to train and test
wine = DataSetup(wine_data, 'class')

# check out dataset
wine.describe_dataset(sort_index=False)

# initialize split on all features
wine.save_train_test_split(X=wine.X,
                            y=wine.y,
                            test_split=0.2,
                            random_state=2024,
                            store_splits=True)

Datapoints: 177
Features: alcohol, malic_acid, ash, alcalinity_of_ash, magnesium, total_phenols, flavanoids, nonflavanoid_phenols, proanthocyanins, color_intensity, hue, OD280/OD315_of_diluted_wines, proline (13 attributes)
Missing Values: 0
--------------Target Counts--------------
Target Variable: class
2    71 (40.11%)
1    58 (32.77%)
3    48 (27.12%)
dtype: object
-----------------------------------------


TypeError: DataSetup.save_train_test_split() got an unexpected keyword argument 'X'

In [None]:
wine_data.columns

In [None]:
# tsne viz on wine
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
tsne_results = tsne.fit_transform(wine_data.drop('class', axis=1))

pca = PCA(n_components=3)
pca_result = pca.fit_transform(wine_data.drop('class', axis=1))

wine_data['pca-one'] = pca_result[:,0]
wine_data['pca-two'] = pca_result[:,1]
wine_data['pca-three'] = pca_result[:,1]
wine_data['tsne-2d-one'] = tsne_results[:,0]
wine_data['tsne-2d-two'] = tsne_results[:,1]

sns.scatterplot(
    x="tsne-2d-one", y="tsne-2d-two",
    hue="class",
    data=wine_data,
    legend="full",

)

In [None]:
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import ListedColormap

# axes instance
fig = plt.figure(figsize=(6,6))
ax = Axes3D(fig, auto_add_to_figure=False)
fig.add_axes(ax)

# get colormap from seaborn
cmap = ListedColormap(sns.color_palette().as_hex())

# plot
sc = ax.scatter(wine_data['pca-one'],
                wine_data['pca-two'],
                wine_data['pca-three'], s=40,
                c=wine_data['class'], marker='o', cmap=cmap, alpha=1)

# legend
plt.legend(*sc.legend_elements(), bbox_to_anchor=(1.05, 1), loc=2)

ax.set_xlabel('X Label')
ax.set_ylabel('Y Label')
ax.set_zlabel('Z Label')


## Poker Data set-up

In [None]:
# read poker data
poker_data = pd.read_csv('../data/poker-data/poker-hand.data', header=None)
poker_data.columns = ['S1','C1','S2','C2','S3','C3','S4','C4','S5','C5','hand']

# save numerical version of data ("one-hot" encoding)
poker_num = poker_data.copy()

# dictionaries for cleaning
poker_suits = {1:'Hearts',2:'Spades',3:'Diamonds',4:'Clubs'}
poker_hands = {0:'Nothing in hand',1:'One pair',2:'Two pairs',3:'Three of a kind',
                             4:'Straight',5:'Flush',6:'Full house',7:'Four of a kind',
                             8:'Straight flush',9:'Royal flush'}
# clean suit columns
for i in range(1,6):
    poker_data['S'+str(i)] = poker_data['S'+str(i)].map(poker_suits)

# clean hand categories
poker_data.hand = poker_data.hand.map(poker_hands)

In [None]:
# initialize data setup object
poker = DataSetup(poker_data, 'hand')

# save numerical version of data ("one-hot" encoding)
# poker.dataset_onehot = poker_num

# check out dataset
poker.describe_dataset(sort_index=False)

# initialize split on all features
poker.save_train_test_split(X=poker.dataset_onehot[poker.onehot_features],
                            y=poker.y,
                            test_split=0.2,
                            random_state=2024,
                            store_splits=True)

In [None]:
poker.num_features

## Student Performance Data set-up

In [None]:
# read student data
student_data = pd.read_csv('../data/student-data/student.csv', sep=';')

In [None]:
student_data.G3


In [None]:
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
tsne_results = tsne.fit_transform(student_data.drop('G3', axis=1))

pca = PCA(n_components=3)
pca_result = pca.fit_transform(student.drop('G3', axis=1))

wine_data['pca-one'] = pca_result[:,0]
wine_data['pca-two'] = pca_result[:,1]
wine_data['pca-three'] = pca_result[:,1]
wine_data['tsne-2d-one'] = tsne_results[:,0]
wine_data['tsne-2d-two'] = tsne_results[:,1]

sns.scatterplot(
    x="tsne-2d-one", y="tsne-2d-two",
    hue="G3",
    data=student_data,
    legend="full",

)

In [None]:
import math
import numpy as np
from scipy.stats import lognorm
import statsmodels.api as sm
import matplotlib.pyplot as plt

#make this example reproducible
np.random.seed(1)

#generate dataset that contains 1000 log-normal distributed values
lognorm_dataset = lognorm.rvs(s=.5, scale=math.exp(1), size=1000)

#create Q-Q plot with 45-degree line added to plot
output=student_data.loc[:,'G3']
output.value_counts().sort_index().plot(kind='bar')
plt.title(f"Student Dataset "
              f"G3 Grade Distribution")
plt.xlabel(f"G3")
plt.ylabel("Count")
plt.savefig(f"student_target_distribution_unclean.png")
plt.clf()
# fig = sm.qqplot(output, line='45')

plt.show()

In [None]:
# create overlapping loss curves
names = ['wine','student']
import seaborn as sns

# create loss curves
for name in names:
    # initialize data setup object
    loss = pd.read_csv(f"results/{name}/final/losses_{name}.csv")

    # # create lineplot with confidence intervals
    sns.lineplot(data=loss, x='fold', y='loss', markers=True,
                 dashes=False, legend=True, label=name)
    # sns.scatterplot(data=loss_avg, x='fold', y='loss', size=2)

        # add labels
    plt.legend()
    plt.title(f'Loss Curve of Neural Network (Split into {5} Folds)')
    plt.ylabel('Loss')
    plt.xlabel('Epochs')

In [None]:
    # initialize data setup object
student = DataSetup(student_data, 'G3')

# check out dataset
student.describe_dataset(sort_index=False)

In [None]:
training_features = [
            'school_MS',
            'sex_M',
            'address_U', 'famsize_LE3', 'Pstatus_T',
            'Mjob_health', 'Mjob_other', 'Mjob_services',
            'Mjob_teacher',
            'Fjob_health', 'Fjob_other', 'Fjob_services',
            'Fjob_teacher',
            'reason_home', 'reason_other', 'reason_reputation',
            'guardian_mother', 'guardian_other',
            'schoolsup_yes', 'famsup_yes',
            'paid_yes', 'activities_yes',
            'nursery_yes', 'higher_yes',
            'internet_yes', 'romantic_yes',
            'age',
            'Medu', 'Fedu',
            'traveltime', 'studytime',
            'failures',
            'famrel', 'freetime', 'goout', 'Dalc', 'Walc',
            'health', 'absences',
            'G1', 'G2',
        ]

In [None]:
len(training_features)

In [None]:
student.dataset_onehot[student.num_features+student.onehot_features]

In [None]:
poker.dataset_onehot

# Decision Trees

# per model
1. Validation curve for 2 hyperparameters (fix range and choice)
2. Learning curve
3. Have interesting findings
4. Grid search to get optimal model in the end

In [None]:
# import modeling
from importlib import reload
reload(modeling)

In [None]:
# initialize the classifier
modeling.plot_validation_curve('DecisionTree',
                               wine,
                               'max_depth',
                               range(1, 10),
                               k_folds=5,
                               scoring='f1_weighted')

In [None]:
# initialize the classifier
modeling.plot_validation_curve('DecisionTree',
                               wine,
                               'min_samples_split',
                               np.arange(4,40,4),
                               k_folds=5,
                               scoring='f1_weighted')

In [None]:
strings = '[0.0001, 0.05, 0.01]'

In [None]:
eval('DecisionTreeClassifier()')

- Split test set
- From train set, use cross_validate and KFold (or StratifiedKFold) to get cv metrics (train scores, test scores)
  - either do a learning curve or a validation curve
    - learning curve: plot train and test scores vs. dataset size
        - sample the dataset (e.g. 10%, 20%, 30%, ..., 100%), then run cv
    - validation curve: plot train and test scores vs. hyperparameter value

In [None]:
from sklearn.model_selection import (
    cross_validate,
    GridSearchCV,
    KFold
)

In [None]:
cv = KFold(n_splits=5, shuffle=True, random_state=2024)
ci_cv = pd.DataFrame()
plot=True
k_folds=5

for train_percent in np.linspace(0.1, 1, 10):
    # cut down train set
    X_train_temp, y_train_temp = poker.X_train.sample(frac=train_percent), poker.y_train.sample(frac=train_percent)

    # create decision tree
    clf, y_pred = model_algos.decision_tree_setup(X_train_temp,
                                                  poker.X_test,
                                                  y_train_temp,
                                                  poker.y_test,
                                                  max_depth=8,
                                                  show_metrics=False)
    # results from tree
    results = cross_validate(clf, X_train_temp, y_train_temp, cv=cv,
                                   return_train_score=True)

    # fix results into dataframe
    ci_cv_temp = pd.DataFrame(results)
    ci_cv_temp['percentage'] = train_percent
    ci_cv = pd.concat([ci_cv, ci_cv_temp], axis=0)

    # summarize and get average
    summary_cv = ci_cv.groupby('percentage').mean().reset_index()

# plot results
if plot:
    # create lineplot with confidence intervals
    sns.lineplot(data=ci_cv, x='percentage', y='train_score', markers=True,
                 dashes=False, legend=True, label='Train')
    sns.lineplot(data=ci_cv, x='percentage', y='test_score', markers=True,
                 dashes=False, legend=True, label='Test')
    sns.scatterplot(data=summary_cv, x='percentage', y='train_score')
    sns.scatterplot(data=summary_cv, x='percentage', y='test_score')

    # add labels
    plt.title(f'Decision Tree Cross Validation ({k_folds} Folds)')
    plt.ylabel('Accuracy')
    plt.xlabel('Percentage of Training Data Used')
    plt.xlim(0.1,1)

    plt.show()

In [None]:
ci_cv

In [None]:
# use cross validate
for k in range(3,6):
    cross_validate = model_algos.decision_tree_depth(depths=range(1, 10),
                                                     X_train=poker.X_train,
                                                     X_test=poker.X_test,
                                                     y_train=poker.y_train,
                                                     y_test=poker.y_test,
                                                     k_folds=k)

# Appendix

In [None]:
# read heart disease data
heart_disease_data = pd.read_csv('../heart-disease-data/processed.cleveland.data', header=None)
heart_disease_data.columns = ['age','sex','cp','trestbps','chol','fbs','restecg','thalach',
                              'exang','oldpeak','slope','ca','thal','num']

# clean using strings
heart_disease_data.sex = heart_disease_data.sex.map({1:'male',0:'female'})
heart_disease_data.cp = heart_disease_data.cp.map({1:'typical angina',2:'atypical angina',
                                                   3:'non-anginal pain',4:'asymptomatic'})
heart_disease_data.fbs = heart_disease_data.fbs.map({1:True,0:False})   # if fasting blood sugar > 120mg/dl
heart_disease_data.restecg = heart_disease_data.restecg.map({0:'normal',1:'ST-T wave abnormality',
                                                             2:'left ventricular hypertrophy'})
heart_disease_data.exang = heart_disease_data.exang.map({1:True,0:False})   # if exercise induced angina
heart_disease_data.slope = heart_disease_data.slope.map({1:'upsloping',2:'flat',3:'downsloping'}) # slope of peak exercise ST segment
heart_disease_data.thal = heart_disease_data.thal.map({3:'normal',6:'fixed defect',7:'reversable defect'})

# remove fields with lots of missing data
heart_disease_data = heart_disease_data.drop(['thal'], axis=1)

# check out dataset
helper.describe_dataset(heart_disease_data, "num")