# UCI Poker dataset classification with Scikit-learn
**Student Name: Nguyen Minh Khoi**\
**Student ID: 21127081**



The libraries for this kernel are:

• Pandas

• Seaborn

• Matplotlib

• Graphviz

• Scikit-Learn

In [None]:
import pandas as pd
import graphviz
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load data and merge manually

Load the training and testing files into pandas DataFrames, merge them, and save the merged data to poker-hand-data.csv

In [None]:
# Read both UCI Poker Hand files; neither has a header row.
train_data = pd.read_csv('./input/poker-hand-training-true.data', header=None)
test_data = pd.read_csv('./input/poker-hand-testing.data', header=None)

# Stack the two files into one data set. ignore_index=True gives every row a
# unique label instead of repeating the labels of each source file.
merged_data = pd.concat([train_data, test_data], ignore_index=True)

# Persist the merged set without index or header. to_csv returns None when it
# is given a path, so there is nothing worth binding to a name here.
merged_data.to_csv('poker-hand-data.csv', index=False, header=False)

# Preparing the data sets


We need to prepare four subsets from merged data:

• features_train

• features_test

• labels_train

• labels_test

In [None]:
# Every column except the last is an input; the last column is the class.
features = merged_data.iloc[:, :-1]
labels = merged_data.iloc[:, -1]

# Each ratio is the fraction of rows kept for TRAINING; the remainder
# (1 - ratio) becomes the test set.
split_ratios = [0.4, 0.6, 0.8, 0.9]

# Maps ratio -> (features_train, features_test, labels_train, labels_test).
datasets = {}

for split_ratio in split_ratios:
    # stratify keeps the class proportions in both subsets; the fixed seed
    # makes the splits (and every report built on them) reproducible.
    datasets[split_ratio] = train_test_split(
        features,
        labels,
        test_size=1 - split_ratio,
        stratify=labels,
        shuffle=True,
        random_state=42,
    )

In [None]:
# Display names of the ten poker-hand classes; position i names class i.
class_names = [
    'Nothing',
    'One pair',
    'Two pairs',
    'Three of a kind',
    'Straight',
    'Flush',
    'Full house',
    'Four of a kind',
    'Straight flush',
    'Royal flush',
]

Visualize the class distribution of the original data set and of every training and test set with bar charts

In [None]:
def _plot_class_counts(class_counts, title):
    """Draw one annotated bar chart of per-class sample counts.

    Args:
        class_counts: Series of counts indexed by integer class id.
        title: Title shown above the chart.
    """
    # Make sure every class gets a bar, even one absent from this subset.
    class_counts = class_counts.reindex(range(len(class_names)), fill_value=0)

    plt.figure(figsize=(6, 6))
    # Plain sequences are passed so the bars follow class_names positionally
    # and nothing depends on index alignment.
    plots = sns.barplot(x=class_names, y=class_counts.to_numpy())

    # Write the count just above each bar; counts are whole numbers, so no
    # decimals are shown.
    for bar in plots.patches:
        plots.annotate(
            format(bar.get_height(), '.0f'),
            (bar.get_x() + bar.get_width() / 2, bar.get_height()),
            ha='center',
            va='center',
            size=8,
            xytext=(0, 6),
            textcoords='offset points',
        )

    plt.title(title)
    plt.xticks(range(len(class_names)), class_names, rotation=90)
    plt.xlabel('Class')
    plt.ylabel('Count')
    plt.show()


def visualize_barplot(type_set):
    """Plot class distributions as bar charts.

    Args:
        type_set: "original" plots the merged data set once (read back from
            poker-hand-data.csv); "train" plots the training labels of every
            split in ``split_ratios``; any other value plots the test labels
            of every split.
    """
    if type_set == "original":
        # The original set does not depend on a split, so plot it once.
        merged = pd.read_csv('poker-hand-data.csv', header=None)
        counts = merged.iloc[:, -1].value_counts().sort_index()
        _plot_class_counts(counts, 'Distribution of the original set')
        return

    for split_ratio in split_ratios:
        _, _, labels_train, labels_test = datasets[split_ratio]
        if type_set == "train":
            counts = labels_train.value_counts().sort_index()
            title = f'Distribution of training sets for split ratio {round(split_ratio*100)}%'
        else:
            counts = labels_test.value_counts().sort_index()
            title = f'Distribution of test sets for split ratio {round((1- split_ratio)*100)}%'
        _plot_class_counts(counts, title)

# Plot the merged data set first, then every training and every test subset.
for subset_kind in ("original", "train", "test"):
    visualize_barplot(subset_kind)

# Building the decision tree classifiers

DecisionTreeClassifier is a class capable of performing multi-class classification on a dataset.

Classification criteria: Entropy

We use Graphviz to render each decision tree to a PDF file.

In [None]:
# Column names from the UCI Poker Hand data set description: S<i> is the suit
# and C<i> the rank of card i, stored in the order S1, C1, ..., S5, C5.
feature_names = [f"{kind}{card}" for card in range(1, 6) for kind in ("S", "C")]

# Only the top levels are drawn; the full tree is too large to read.
depth = 5

for ratio in split_ratios:
    features_train, features_test, labels_train, labels_test = datasets[ratio]
    # Fixed seed so the rendered tree is the same on every run.
    dtc = DecisionTreeClassifier(criterion='entropy', random_state=42)
    dtc.fit(features_train, labels_train)

    dot_data = export_graphviz(
        dtc,
        max_depth=depth,
        # Split nodes test feature columns, so they get the card attribute
        # names; the poker-hand names belong to the predicted classes.
        feature_names=feature_names,
        class_names=[class_names[label] for label in dtc.classes_],
        rounded=True,
        out_file=None,
        filled=True,
        special_characters=True,
    )
    graph = graphviz.Source(dot_data)
    # round() rather than int() so a ratio like 0.29 cannot truncate to 28.
    graph.render(directory="decision-tree-graph", filename=f"graph_{round(ratio*100)}")
    

# Evaluating the decision tree classifiers

Classification Report: A classification report is a summary of the performance of a classification model.

Confusion Matrix: A confusion matrix is a table that is used to evaluate the performance of a classification model.

To visualize the Confusion Matrix, we use a heatmap. Heatmaps use color-coding to represent the values in the matrix, making it easier to interpret the results of the classification model.

In [None]:
for ratio in split_ratios:
    features_train, features_test, labels_train, labels_test = datasets[ratio]
    # Fixed seed so the reported metrics are the same on every run.
    dtc = DecisionTreeClassifier(criterion='entropy', random_state=42)
    dtc.fit(features_train, labels_train)
    labels_pred_dtc = dtc.predict(features_test)

    print(f"Classification report for {ratio}")
    # zero_division=0 reports 0 instead of warning for classes that are
    # never predicted.
    print(classification_report(labels_test, labels_pred_dtc, zero_division=0))

    # Compute the matrix once; rows are true classes, columns are predictions.
    table = pd.DataFrame(confusion_matrix(labels_test, labels_pred_dtc))
    plt.title(f"UCI Poker Heat Map for {round(ratio*100)}%",fontsize=16)
    # The cells are integer counts, so annotate them without decimals.
    sns.heatmap(table, annot=True, fmt='d', cmap='viridis', annot_kws={'size': 6.5})
    plt.xlabel('Predicted class')
    plt.ylabel('True class')
    plt.show()

# The depth and accuracy of a decision tree

In [None]:
# Key into `datasets`. Despite the name, the key is the TRAINING fraction
# (80% train / 20% test), matching how `datasets` is filled.
test_ratio = 0.8
features_train, features_test, labels_train, labels_test = datasets[test_ratio]

# None lets the tree grow without a depth limit.
list_depths = [None, 2, 3, 4, 5, 6, 7]

# Column names from the UCI Poker Hand data set description: S<i> is the suit
# and C<i> the rank of card i, stored in the order S1, C1, ..., S5, C5.
feature_names = [f"{kind}{card}" for card in range(1, 6) for kind in ("S", "C")]

for depth in list_depths:
    dtc = DecisionTreeClassifier(criterion='entropy', max_depth=depth, random_state=42)
    dtc.fit(features_train, labels_train)

    # An unlimited tree is too large to draw in full, so only its first five
    # levels are rendered; depth-limited trees are drawn completely.
    visual_depth = 5 if depth is None else None

    dot_data = export_graphviz(
        dtc,
        max_depth=visual_depth,
        # Split nodes test feature columns, so they get the card attribute
        # names; the poker-hand names belong to the predicted classes.
        feature_names=feature_names,
        class_names=[class_names[label] for label in dtc.classes_],
        rounded=True,
        out_file=None,
        filled=True,
        special_characters=True,
    )
    graph = graphviz.Source(dot_data)
    # Name the file after the split actually used here, not a loop variable
    # left over from another cell.
    graph.render(directory="decision-tree-graph", filename=f"graph_{round(test_ratio*100)}_{depth}")

    labels_pred_dtc = dtc.predict(features_test)
    print(f"Accuracy score for {test_ratio} in depth {depth}: ")
    print(accuracy_score(labels_test, labels_pred_dtc))