In [1]:
# Import Modules
!pip install ucimlrepo



In [16]:
import pandas as pd
import matplotlib as plt
import sklearn as skl
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.preprocessing import StandardScaler
from ucimlrepo import fetch_ucirepo
import hvplot.pandas
from sklearn.cluster import KMeans

In [3]:
# fetch dataset
poker_hand = fetch_ucirepo(id=158)

# data (as pandas dataframes)
X = poker_hand.data.features
y = poker_hand.data.targets

In [4]:
# metadata
print(poker_hand.metadata)

{'uci_id': 158, 'name': 'Poker Hand', 'repository_url': 'https://archive.ics.uci.edu/dataset/158/poker+hand', 'data_url': 'https://archive.ics.uci.edu/static/public/158/data.csv', 'abstract': 'Purpose is to predict poker hands', 'area': 'Games', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 1025010, 'num_features': 10, 'feature_types': ['Categorical', 'Integer'], 'demographics': [], 'target_col': ['CLASS'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2002, 'last_updated': 'Sat Mar 09 2024', 'dataset_doi': '10.24432/C5KW38', 'creators': ['Robert Cattral', 'Franz Oppacher'], 'intro_paper': None, 'additional_info': {'summary': 'Each record is an example of a hand consisting of five playing cards drawn from a standard deck of 52. Each card is described using two attributes (suit and rank), for a total of 10 predictive attributes. There is one Class attribute that describes the "Poker Hand". T

In [5]:
poker_hand_df = poker_hand["data"]["original"]
poker_hand_df

Unnamed: 0,S1,C1,S2,C2,S3,C3,S4,C4,S5,C5,CLASS
0,1,10,1,11,1,13,1,12,1,1,9
1,2,11,2,13,2,10,2,12,2,1,9
2,3,12,3,11,3,13,3,10,3,1,9
3,4,10,4,11,4,1,4,13,4,12,9
4,4,1,4,13,4,12,4,11,4,10,9
...,...,...,...,...,...,...,...,...,...,...,...
1025005,3,1,1,12,2,9,4,9,2,6,1
1025006,3,3,4,5,2,7,1,4,4,3,1
1025007,1,11,4,7,3,9,1,13,2,7,1
1025008,3,11,1,8,1,1,3,13,2,8,1


In [6]:
y_reshaped = np.ravel(y)

In [7]:
y_reshaped.shape

(1025010,)

In [8]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y_reshaped, random_state=1)

In [9]:
X_train.shape

(768757, 10)

In [10]:
X_test.shape

(256253, 10)

In [11]:
# Create the StandardScaler instance
scaler = StandardScaler()
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [12]:
# Create a a list to store inertia values
inertia = []

# Create a a list to store the values of k
k = list(range(1, 100, 5))

# Create a for loop where each value of k is evaluated using the K-means algorithm
# Fit the model using the home_sales_df DataFrame
# Append the value of the computed inertia from the `inertia_` attribute of the K-means model instance
for i in k:
    k_model = KMeans(n_clusters=i, random_state=1)
    k_model.fit(poker_hand_df)
    inertia.append(k_model.inertia_)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super().

In [17]:
# Create a dictionary that holds the list values for k and inertia
elbow_data = {"k": k, "inertia": inertia}

# Create a DataFrame using the elbow_data dictionary
df_elbow = pd.DataFrame(elbow_data)

# Plot the DataFrame
df_elbow.hvplot.line(
    x="k", 
    y="inertia", 
    title="Elbow Curve", 
    xticks=k
) 

In [25]:
# Instantiate the model with k = 6 neighbors
model = KNeighborsClassifier(n_neighbors=6)

In [26]:
# Instantiate the model with k = 16 neighbors
model2 = KNeighborsClassifier(n_neighbors=16)

In [27]:
# Train the model
model.fit(X_train_scaled, y_train)

In [28]:
model2.fit(X_train_scaled, y_train)

In [24]:
# Create predictions
y_pred = model.predict(X_test_scaled)

# Review the predictions
display(y_pred)

array([0, 1, 0, ..., 0, 0, 1])

In [29]:
# Create predictions
y_pred2 = model2.predict(X_test_scaled)

# Review the predictions
display(y_pred2)

array([0, 1, 1, ..., 0, 0, 1])

In [30]:
# Print confusion matrix
confusion_matrix(y_pred,y_test)

array([[96749, 62989,  5302,  1599,   240,   160,    83,     2,     0,
            1],
       [31599, 44487,  6387,  3576,   725,    14,   210,    46,     4,
            0],
       [  267,   648,   281,   110,    29,     0,    24,    11,     0,
            0],
       [   40,   200,    52,    65,     4,     0,     5,     2,     0,
            0],
       [    1,     7,     4,     0,     1,     0,     0,     0,     0,
            0],
       [    3,     0,     0,     0,     0,   319,     0,     0,     4,
            1],
       [    0,     1,     1,     0,     0,     0,     0,     0,     0,
            0],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0]])

In [31]:
# Print confusion matrix
confusion_matrix(y_pred2,y_test)

array([[97072, 58822,  4770,  1132,   115,   329,    55,     1,     2,
            2],
       [31572, 49413,  7177,  4175,   882,    22,   258,    54,     4,
            0],
       [   14,    81,    71,    38,     2,     0,     8,     6,     0,
            0],
       [    1,    15,     9,     5,     0,     0,     1,     0,     0,
            0],
       [    0,     1,     0,     0,     0,     0,     0,     0,     0,
            0],
       [    0,     0,     0,     0,     0,   142,     0,     0,     2,
            0],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0]])

In [32]:
# Print classification report
print(classification_report(y_pred,y_test))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.75      0.58      0.65    167125
           1       0.41      0.51      0.46     87048
           2       0.02      0.21      0.04      1370
           3       0.01      0.18      0.02       368
           4       0.00      0.08      0.00        13
           5       0.65      0.98      0.78       327
           6       0.00      0.00      0.00         2
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0

    accuracy                           0.55    256253
   macro avg       0.18      0.25      0.20    256253
weighted avg       0.63      0.55      0.58    256253



  _warn_prf(average, modifier, msg_start, len(result))


In [33]:
print(classification_report(y_pred2,y_test))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.75      0.60      0.67    162300
           1       0.46      0.53      0.49     93557
           2       0.01      0.32      0.01       220
           3       0.00      0.16      0.00        31
           4       0.00      0.00      0.00         1
           5       0.29      0.99      0.45       144
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0

    accuracy                           0.57    256253
   macro avg       0.15      0.26      0.16    256253
weighted avg       0.64      0.57      0.60    256253



  _warn_prf(average, modifier, msg_start, len(result))
