In [None]:
# importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cluster import KMeans

In [None]:
# Load the wholesale-customers CSV into a DataFrame and preview the first rows.
# NOTE(review): the path is absolute and machine-specific; the cell only runs
# where this exact file exists.
csv_path = r"C:\Users\Purushotham\Desktop\deloitte\machinelearning\datasets\whole_sale_customers_data.csv"
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Show pandas' summary statistics for the loaded frame, to compare the scale
# of the raw columns before standardising them.
data.describe()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every column of `data`. The fitted scaler is kept in `scaler`
# and the transformed values are stored in `ds` for the clustering cells.
scaler = StandardScaler().fit(data)
ds = scaler.transform(data)

In [None]:
# Summary statistics of the standardised features. Wrapping `ds` in a
# DataFrame without column labels would show the features as 0, 1, 2, ...,
# so the labels of the source frame are re-attached to keep the table readable.
pd.DataFrame(ds, columns=data.columns).describe()

In [None]:
# Fit a baseline 2-cluster K-Means model on the standardised data.
# random_state is pinned (same seed as the other seeded KMeans calls in this
# notebook) so centroids, labels and inertia are identical on every re-run.
kmeans = KMeans(n_clusters=2, init='k-means++', random_state=42)
kmeans.fit(ds)

In [None]:
# Inertia (within-cluster sum of squared distances) of the most recently
# fitted `kmeans` -- the 2-cluster model when the cells are run in order.
kmeans.inertia_

### Elbow Method

In [None]:
# Elbow method: fit K-Means for k = 1..19 and record each model's inertia
# (within-cluster sum of squared distances) in SSE, ordered by k.
SSE = []
for cluster in range(1, 20):
    # A fixed seed makes the curve reproducible and ensures differences
    # between k values are not just an artefact of random initialisation.
    kmeans = KMeans(n_clusters=cluster, init='k-means++', random_state=42)
    kmeans.fit(ds)
    SSE.append(kmeans.inertia_)

In [None]:
# Plot inertia against the number of clusters to look for the "elbow".
# The cluster counts are derived from len(SSE) (k starts at 1) instead of a
# second hard-coded range, so both columns always have the same length.
frame = pd.DataFrame({'Cluster': range(1, len(SSE) + 1), 'SSE': SSE})
plt.figure(figsize=(12,6))
plt.plot(frame['Cluster'], frame['SSE'], marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()  # render the figure without echoing the Text(...) repr of ylabel

In [None]:
# Refit with 6 clusters and report the resulting inertia.
# random_state is pinned so this model -- and any score computed from its
# labels afterwards -- is reproducible across kernel restarts.
kmeans = KMeans(n_clusters=6, init='k-means++', random_state=42)
kmeans.fit(ds)
kmeans.inertia_

### Silhouette Score

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette score of the current `kmeans` model: the labels it assigned
# to the rows of `ds` are scored against the same standardised data.
cluster_labels = kmeans.labels_
silhouette_score(ds, cluster_labels)

In [None]:
# Location of the sample image read by the following cells.
# NOTE(review): absolute, machine-specific Windows path -- the image cells
# only run where this exact file exists; consider a notebook-relative path.
img = r"C:\Users\Purushotham\Desktop\deloitte\machinelearning\handson-ml2-master\images\unsupervised_learning\ladybug.png"

In [None]:
from matplotlib.image import imread
# Read the image file at `img` into an array and display its shape.
# presumably (height, width, 3): a later cell reshapes it to (-1, 3) -- confirm
image = imread(img)
image.shape

In [None]:
# Display the unmodified image for reference.
plt.imshow(image)

In [None]:
# Colour quantisation: treat every pixel as one sample and cluster the colours.
# The channel count is read from the image itself rather than hard-coded to 3,
# so a 4-channel (RGBA) image is flattened correctly as well.
# assumes image is (height, width, channels) -- TODO confirm for other layouts
X = image.reshape(-1, image.shape[-1])
kmeans = KMeans(n_clusters=8, random_state=42).fit(X)
# Replace each pixel with the centre of the cluster it was assigned to,
# then restore the original image layout.
segmented_img = kmeans.cluster_centers_[kmeans.labels_]
segmented_img = segmented_img.reshape(image.shape)

In [None]:
# Repeat the colour quantisation for several palette sizes and collect one
# image-shaped array per size, in the same order as `n_colors`.
# `kmeans` and `segmented_img` are deliberately plain loop variables: after
# the loop they still refer to the model/result of the last palette size.
n_colors = (10, 8, 6, 4, 2)
segmented_imgs = []
for n_clusters in n_colors:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(X)
    segmented_img = kmeans.cluster_centers_[kmeans.labels_]
    segmented_imgs.append(segmented_img.reshape(image.shape))

In [None]:
# Per-pixel cluster labels of the current `kmeans`. When the cells run in
# order this is the last model fitted by the loop above (n_clusters=2).
kmeans.labels_

In [None]:
# Draw a 2x3 grid: the original image in the first panel, followed by one
# panel per palette size with the corresponding quantised image.
plt.figure(figsize=(10, 5))
plt.subplots_adjust(wspace=0.05, hspace=0.1)

# Panel 1: the untouched source image.
plt.subplot(2, 3, 1)
plt.imshow(image)
plt.title("Original image")
plt.axis('off')

# Panels 2..6: quantised versions, in the order of `n_colors`.
for idx, (n_clusters, quantized) in enumerate(zip(n_colors, segmented_imgs)):
    plt.subplot(2, 3, idx + 2)
    plt.imshow(quantized)
    plt.title("{} colors".format(n_clusters))
    plt.axis('off')

plt.show()

### Using Clustering for Pre-processing

In [None]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

# Load the digits dataset as features/targets and hold out a test split.
# The split is seeded so train and test sets are the same on every run.
digits = load_digits()
X_digits = digits.data
y_digits = digits.target
X_train, X_test, y_train, y_test = train_test_split(
    X_digits, y_digits, random_state=42
)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Baseline: one-vs-rest logistic regression trained directly on the raw
# digit features.
# LogisticRegression's `multi_class="ovr"` argument is deprecated in recent
# scikit-learn releases; wrapping the estimator in OneVsRestClassifier is the
# documented replacement and keeps the same one-vs-rest strategy.
from sklearn.multiclass import OneVsRestClassifier

log_reg = OneVsRestClassifier(
    LogisticRegression(solver="lbfgs", max_iter=5000, random_state=42)
)
log_reg.fit(X_train, y_train)

In [None]:
# Score of the baseline classifier on the held-out test split; kept in a
# variable so it can be compared with the clustering pipeline's score.
log_reg_score = log_reg.score(X_test, y_test)
log_reg_score

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline

# Clustering as pre-processing: the KMeans step (50 clusters) transforms the
# inputs, and the classifier step is trained on that transformed output.
# The classifier uses OneVsRestClassifier instead of LogisticRegression's
# deprecated `multi_class="ovr"` argument; the one-vs-rest strategy and the
# step name "log_reg" are unchanged.
pipeline = Pipeline([
    ("kmeans", KMeans(n_clusters=50, random_state=42)),
    ("log_reg", OneVsRestClassifier(
        LogisticRegression(solver="lbfgs", max_iter=5000, random_state=42)
    )),
])
pipeline.fit(X_train, y_train)

In [None]:
# Score of the KMeans + logistic-regression pipeline on the same held-out
# test split used for the baseline.
pipeline_score = pipeline.score(X_test, y_test)
pipeline_score