# Machine Learning Classification

by Assoc. Prof. Kobkiat Saraubon, Ph.D. | Computer Science, KMUTNB

In [1]:
# ws03
# seaborn provides the example dataset and the statistical plots;
# matplotlib.pyplot is used for titles and for showing figures.
import seaborn as sns
import matplotlib.pyplot as plt

# Load the "iris" example dataset into a DataFrame and preview the first rows
# (four numeric measurements plus the `species` label).
df = sns.load_dataset('iris')
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [2]:
# NOTE: a notebook cell renders only its last expression, so the result of
# df.tail() is discarded here and only the 10-row random sample is displayed.
df.tail()
df.sample(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
83,6.0,2.7,5.1,1.6,versicolor
124,6.7,3.3,5.7,2.1,virginica
132,6.4,2.8,5.6,2.2,virginica
49,5.0,3.3,1.4,0.2,setosa
107,7.3,2.9,6.3,1.8,virginica
113,5.7,2.5,5.0,2.0,virginica
129,7.2,3.0,5.8,1.6,virginica
79,5.7,2.6,3.5,1.0,versicolor
37,4.9,3.6,1.4,0.1,setosa
43,5.0,3.5,1.6,0.6,setosa


In [3]:
# Count missing values per column to check whether any cleaning is needed.
df.isnull().sum()  # NaN

Unnamed: 0,0
sepal_length,0
sepal_width,0
petal_length,0
petal_width,0
species,0


In [None]:
# Sepal length vs. sepal width, one colour per species.
# sns.set_style('whitegrid')  # optional: enable a grid background
sns.scatterplot(
    data=df,
    x='sepal_length',
    y='sepal_width',
    hue='species',
    palette='Set1',
    s=70,
)
plt.title('Sepal')
plt.show()

In [None]:
# Petal length vs. petal width using seaborn's default palette and marker size.
# To match the sepal plot, additionally pass: palette='Set1', s=70
sns.scatterplot(data=df, x='petal_length', y='petal_width', hue='species')
plt.title('Petal')
plt.show()

In [None]:
# Same petal plot, now with the 'Set1' palette and larger markers.
sns.scatterplot(
    data=df,
    x='petal_length',
    y='petal_width',
    hue='species',
    palette='Set1',
    s=70,
)
plt.title('Petal')
plt.show()

In [None]:
# Pairwise scatter plots of all numeric columns, coloured by species.
sns.pairplot(data=df, hue='species', height=1.5)
plt.show()

In [None]:
# Build the feature matrix by naming the four measurement columns explicitly.
feature_cols = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
]
X = df.loc[:, feature_cols]
X.head()

In [4]:
# Feature matrix: every column except the target label. The k-means section
# later adds a derived `cluster` column to df, so drop it too (when present);
# otherwise re-running this cell would leak cluster ids into the features.
X = df.drop(columns=['species', 'cluster'], errors='ignore')
X.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [5]:
# Target vector: the species label of each flower.
y = df['species']
y.head()

Unnamed: 0,species
0,setosa
1,setosa
2,setosa
3,setosa
4,setosa


In [None]:
# (rows, columns) of the feature matrix.
X.shape

In [None]:
# Length of the target vector; should match the row count of X.
y.shape

In [7]:
# ws06
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB

# 10-fold cross-validation of Gaussian Naive Bayes on the full dataset.
# cross_val_score fits the estimator for every fold itself, so no explicit
# model.fit call is needed.
model = GaussianNB()
cvs = cross_val_score(estimator=model, X=X, y=y, cv=10)

fold_scores = cvs.round(3)
mean_pct = cvs.mean() * 100
print('cross val scores {}'.format(fold_scores))
print('mean (%) = {:.3f}'.format(mean_pct))

cross val scores [0.933 0.933 1.    0.933 0.933 0.933 0.867 1.    1.    1.   ]
mean (%) = 95.333


# Train-test Split

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Hold out 30% of the rows as a test set; random_state pins the shuffle so the
# same split is produced on every run.
X_train, X_test,y_train,y_test = train_test_split(X, y, test_size=0.3, random_state=11)

In [None]:
# Shapes of the training and test feature matrices.
X_train.shape, X_test.shape

In [9]:
# ws07
import numpy as np

# Tally how many test samples fall into each class and print the result as
# (label, count) rows.
label, count = np.unique(y_test, return_counts=True)
freq = np.transpose(np.asarray([label, count]))
print(freq)

[['setosa' 14]
 ['versicolor' 14]
 ['virginica' 17]]


In [None]:
# Fit Gaussian Naive Bayes on the training split and evaluate it on the
# held-out test split.
model = GaussianNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy Score: {:.4f}'.format(test_accuracy))
print(classification_report(y_test, y_pred))

# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)
print(cm)

In [None]:
# Number of test samples whose prediction equals the true label.
y_test[(y_test == y_pred)].count()

In [None]:
import pandas as pd

# Put true and predicted labels side by side (indexed like y_test), flag the
# matches, then show only the misclassified rows.
dy = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})
dy['result'] = dy['y_test'] == dy['y_pred']

dy[~dy['result']]

In [12]:
# ws07
import numpy as np

# Two unseen flowers to classify; each row holds four measurements in the same
# column order as the training features.
new_samples = [
    [4.7, 3.6, 2.0, 0.8],
    [6.5, 2.5, 4.8, 1.4],
]
X_input = np.asarray(new_samples)

# Single-sample alternatives:
# X_input = np.array([[4.7,3.6,2.0,0.8]])
# X_input = np.array([[6.5,2.5,4.8,1.4]])

X_input

array([[4.7, 3.6, 2. , 0.8],
       [6.5, 2.5, 4.8, 1.4]])

In [None]:
# Classify the new samples with the currently fitted `model`.
# model.fit(X, y)  # optional: refit on the full dataset before predicting
print(model.predict(X_input))

In [None]:
# Class-membership probabilities for the new samples. The columns of
# predict_proba follow model.classes_ (sorted labels), so print that as the
# header; y.unique() is in order of appearance and only matches by coincidence.
print(model.classes_)
print(model.predict_proba(X_input).round(3))

In [None]:
# ws08
# Sepal coordinates of the new samples as flat arrays:
# column 0 -> x (sepal length), column 1 -> y (sepal width).
xsp = X_input[:, 0]
ysp = X_input[:, 1]
xsp, ysp

In [None]:
# Sepal scatter of the dataset with the new samples overlaid as blue squares.
sns.scatterplot(
    data=df,
    x='sepal_length',
    y='sepal_width',
    hue='species',
    palette='Set1',
    s=70,
)
plt.title('New Points (Sepal)')
sns.scatterplot(x=xsp, y=ysp, color='blue', marker='s', s=180)
plt.show()

In [None]:
# Petal coordinates of the new samples as flat arrays:
# column 2 -> x (petal length), column 3 -> y (petal width).
xpe = X_input[:, 2]
ype = X_input[:, 3]
xpe, ype

In [None]:
# Petal scatter of the dataset with the new samples overlaid as blue squares.
sns.scatterplot(
    data=df,
    x='petal_length',
    y='petal_width',
    hue='species',
    palette='Set1',
    s=70,
)
plt.title('New Points (Petal)')
sns.scatterplot(x=xpe, y=ype, color='blue', marker='s', s=180)
plt.show()

# ws09: SVC & save model

In [10]:
# ws09
from sklearn.svm import SVC

# 10-fold cross-validation of a support vector classifier with an RBF kernel.
# Alternative kernel to try: SVC(kernel='linear')
model = SVC(kernel='rbf')
cvs = cross_val_score(estimator=model, X=X, y=y, cv=10)

fold_scores = cvs.round(3)
mean_pct = cvs.mean() * 100
print('cross val scores {}'.format(fold_scores))
print('mean (%) = {:.3f}'.format(mean_pct))

cross val scores [1.    0.933 1.    1.    1.    0.933 0.933 0.933 1.    1.   ]
mean (%) = 97.333


In [None]:
# Mount Google Drive at /content/drive. The google.colab module is only
# available when the notebook runs inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

In [13]:
# Train the current `model` on the full dataset (no hold-out) and classify the
# new samples.
model.fit(X, y)
print(model.predict(X_input))

['setosa' 'versicolor']




In [14]:
# prompt: save model to model.pkl using pickle

import pickle

# Serialise the fitted estimator held in `model` to model.pkl in the current
# working directory.
with open('model.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)


In [None]:
# ws10
from sklearn.neighbors import KNeighborsClassifier
# 10-fold cross-validation of a 3-nearest-neighbours classifier.
model = KNeighborsClassifier(n_neighbors=3)
cvs = cross_val_score(estimator=model, X=X, y=y, cv=10)

fold_scores = cvs.round(3)
mean_pct = cvs.mean() * 100
print('cross val scores {}'.format(fold_scores))
print('mean (%) = {:.3f}'.format(mean_pct))

In [None]:
# Train the current `model` on the full dataset and classify the new samples.
model.fit(X, y)
print(model.predict(X_input))

In [None]:
# ws11
# Number of rows in the training split.
len(X_train)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 10-fold cross-validation of a random forest with default hyper-parameters.
# Alternative to try: RandomForestClassifier(n_estimators=80)
model = RandomForestClassifier()
cvs = cross_val_score(estimator=model, X=X, y=y, cv=10)

fold_scores = cvs.round(3)
mean_pct = cvs.mean() * 100
print('cross val scores {}'.format(fold_scores))
print('mean (%) = {:.3f}'.format(mean_pct))

In [None]:
# import scikitplot as skplot
from sklearn.ensemble import RandomForestClassifier

# Train an 80-tree random forest on the training split. As the last expression
# of the cell, model.fit(...) also displays the fitted estimator.
model = RandomForestClassifier(n_estimators=80)
model.fit(X_train, y_train)

In [None]:
# Display names for the classes, in sorted label order. scikit-learn orders
# report rows and confusion-matrix rows/columns by sorted label, whereas
# unique() alone returns order of appearance, which would mislabel them if the
# data were ever shuffled.
class_names = sorted(df.species.unique())
class_names

In [None]:
# Evaluate the fitted `model` on the held-out test split.
y_predict = model.predict(X_test)
test_score = model.score(X_test, y_test)

print('Score -> {:.4f}'.format(test_score))
print(classification_report(y_test, y_predict, target_names=class_names))
print(confusion_matrix(y_test, y_predict))



In [None]:
# prompt: plot confusion matrix

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Plot the confusion matrix of the current model's test predictions.
# Use y_predict (set by the evaluation cell for the current model); y_pred
# still holds the earlier GaussianNB predictions and would plot a stale matrix.
cm = confusion_matrix(y_test, y_predict)

plt.figure(figsize=(4, 3))
# Rows are true classes, columns are predicted classes; fmt='d' prints counts
# as integers.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Train a decision tree on the training split and evaluate it on the test split.
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

# Predict with this model. Without this line the report and confusion matrix
# below would silently reuse y_predict from the previously evaluated model.
y_predict = model.predict(X_test)

print('Score: {:.4f}' . format(model.score(X_test, y_test)))
print(classification_report(y_test, y_predict, target_names=class_names))
print(confusion_matrix(y_test, y_predict))




In [None]:
# ws12
# The distinct species labels, for comparison with the clusters found below.
print(y.unique())

In [None]:
from sklearn.cluster import KMeans
# Cluster the samples into three groups without using the labels.
# random_state makes the arbitrary cluster numbering reproducible (later cells
# refer to clusters by id), and n_init is set explicitly because its default
# differs between scikit-learn versions.
# NOTE(review): the cluster-id -> species mapping still has to be checked
# against the pivot table; it is not guaranteed by the seed.
model = KMeans(n_clusters=3, n_init=10, random_state=11)
y_kmeans = model.fit_predict(X)

In [None]:
# Cluster ids assigned to the first 80 samples.
y_kmeans[:80]

In [None]:
# Store each sample's cluster id next to its true species. This mutates df:
# from here on it has an extra `cluster` column.
df['cluster'] = y_kmeans
df.sample(8)

In [None]:
# First rows that were assigned to cluster 0.
df[df.cluster==0].head()

In [None]:

# Species-by-cluster contingency table: each cell counts the rows of a species
# that landed in a cluster (petal_length is only the column being counted);
# combinations with no rows are shown as 0.
pvt = df.pivot_table(index='species',columns=['cluster'], values='petal_length',
                     fill_value=0, aggfunc='count')
pvt

In [None]:
# Centroid coordinates, one row per cluster and one column per feature of X.
model.cluster_centers_

In [None]:
# Take the first two centroid coordinates for plotting (presumably
# sepal_length and sepal_width, given X's column order - confirm).
x_centroids, y_centroids = model.cluster_centers_[:, :2].T

In [None]:
# x coordinate of each cluster centroid.
x_centroids

In [None]:
# Ground truth: sepal scatter coloured by the real species.
# x and y are passed by keyword because seaborn >= 0.12 rejects them as
# positional arguments (TypeError).
sns.scatterplot(x='sepal_length', y='sepal_width', data=df, hue='species', s=50, palette='Set1')
plt.title('Actual')
plt.show()

In [None]:

# Colour used for the points of each cluster id.
color_dict = {0: 'blue', 1: 'red', 2: 'g'}

# Colours of the centroid markers, one per cluster.
centroid_colors = ['r', 'b', 'm']

# Sepal scatter coloured by assigned cluster, with each centroid drawn as a
# large 'x'. x and y are passed by keyword because seaborn >= 0.12 rejects them
# as positional arguments (TypeError).
sns.scatterplot(x='sepal_length', y='sepal_width', data=df, hue='cluster', palette=color_dict, s=80)
plt.scatter(x_centroids, y_centroids, s=1200, marker='x', c=centroid_colors)
plt.title('k-Means Clustering')

plt.show()

In [None]:
# Assign the new samples to clusters with the current `model`; if that is the
# k-means model, the output is cluster ids rather than species names.
print(model.predict(X_input))

In [None]:
# Samples whose cluster disagrees with their species.
# NOTE(review): assumes cluster 0 corresponds to versicolor and cluster 2 to
# virginica; k-means numbering is arbitrary, so confirm against the
# species-by-cluster table before trusting these selections.
w1 = df[(df.cluster==0) & (df.species!='versicolor')] #.count()
w2 = df[(df.cluster==2) & (df.species!='virginica')] #.count()

# Colour used for the points of each cluster id.
color_dict = {0: 'blue', 1: 'red', 2: 'g'}

# x and y are passed by keyword because seaborn >= 0.12 rejects them as
# positional arguments (TypeError).
sns.scatterplot(x='sepal_length', y='sepal_width', data=df, hue='cluster', palette=color_dict, s=80, alpha=0.7)

# Mark the disagreeing samples with large crosses (red for w1, black for w2).
plt.scatter(w1.sepal_length, w1.sepal_width, marker='x',s=250,c='red')
plt.scatter(w2.sepal_length, w2.sepal_width, marker='x',s=250,c='black')
plt.title('k-Means Clustering')
# Save before plt.show() so the figure written to disk is not empty.
plt.savefig('kmean-clustering-wrong',dpi=120)
plt.show()

In [17]:
# First test rows; the index keeps the original row labels from df, not 0..n-1.
X_test.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
112,6.8,3.0,5.5,2.1
145,6.7,3.0,5.2,2.3
133,6.3,2.8,5.1,1.5
56,6.3,3.3,4.7,1.6
111,6.4,2.7,5.3,1.9


In [18]:
# True labels of the first test rows, indexed by the same original row labels.
y_test.head()

Unnamed: 0,species
112,virginica
145,virginica
133,virginica
56,versicolor
111,virginica


In [34]:
# Feature values of the first test row (by position) as a plain Python list.
X_test.iloc[0].tolist()

[6.8, 3.0, 5.5, 2.1]

In [None]:
# prompt: get X_test at 56

# 56 is an index label carried over from df, not a position: the test split
# has only 45 rows, so positional .iloc[56] raises IndexError. Label-based
# .loc returns the row whose index is 56.
X_test.loc[56]
