In [None]:
from wrangle_zillow import *


from  acq import *
from  wrangle_mall import *

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns

from sklearn.cluster import KMeans


# Seed NumPy's global random number generator so a fresh top-to-bottom run starts from the same state.
np.random.seed(4)

from itertools import combinations


# NOTE(review): sql_database_info_probe is not defined in this notebook; presumably it is
# provided by one of the star imports above and inspects the 'iris_db' schema — confirm.
sql_database_info_probe(schema_input='iris_db')




def acquire_iris(cache_path='iris_db.csv'):
    '''
    Acquire the iris measurements joined to their species rows.

    Reads the cached CSV at `cache_path` when that file exists. Otherwise the
    query below is run against the `iris_db` MySQL database, the result is
    written to `cache_path` (without the index) and then returned.

    parameters:
        cache_path: path of the CSV cache; defaults to "iris_db.csv" in the
            current working directory.

    returns: pandas DataFrame with the query result (or the cached copy of it).
    '''
    # Imported locally so the function does not depend on `os` leaking in
    # through one of the notebook's star imports.
    import os

    query='''
    select * from measurements
	left join species
    using (species_id)
    ;
    '''

    # Cached copy available: skip the database round trip entirely.
    if os.path.exists(cache_path):
        return pd.read_csv(cache_path)

    database = 'iris_db'
    # NOTE(review): `env` is not imported in this notebook; presumably it arrives via a
    # star import and exposes username/password/host — confirm.
    url = f'mysql+pymysql://{env.username}:{env.password}@{env.host}/{database}'
    df = pd.read_sql(query, url)
    # Cache the first read so later runs do not need database access.
    df.to_csv(cache_path, index=False)
    return df




In [None]:
# Load the iris data; acquire_iris reads 'iris_db.csv' when it exists and queries the database otherwise.
iris=acquire_iris()


In [None]:

# Columns that must not be used as clustering features in this notebook.
excluded_columns = {'petal_length', 'sepal_length', 'species_id', 'measurement_id', 'species_name'}

# Filter the columns in frame order instead of taking a set difference: set iteration
# order is arbitrary and can change between kernel sessions, which made the feature
# order (and therefore the column order of X and the centroids) unstable.
differentlist = [col for col in iris.columns if col not in excluded_columns]
differentlist


In [None]:
# Feature matrix: only the columns selected in `differentlist`.
X = iris[differentlist]

# Fit a 3-cluster k-means model on those features.
kmeans = KMeans(n_clusters=3)
kmeans.fit(X)

# Predict once and store the label of every row on the frame
# (a second, discarded predict call added nothing and was dropped).
iris['cluster'] = kmeans.predict(X)


In [None]:
# Raw center coordinates of the fitted model; the next cell labels them with the feature names.
kmeans.cluster_centers_


In [None]:
# Wrap the center coordinates in a DataFrame whose columns carry the
# feature names of X, so individual features can be selected by name.
centroids = pd.DataFrame(
    data=kmeans.cluster_centers_,
    columns=X.columns,
)
centroids


In [None]:
# Per-cluster mean of each clustered feature, for comparison with the centroids above.
iris.groupby('cluster')[differentlist].mean()


In [None]:
# Scatter the observations, coloured by cluster, and overlay the fitted centroids.
fig, ax = plt.subplots(figsize=(14, 9))

for cluster, subset in iris.groupby('cluster'):
    # Plot the same two features that the centroids and the axis labels use.
    # The points were previously drawn on sepal_length/petal_length, which put
    # them on different axes than the centroids shown in the same figure.
    ax.scatter(subset.sepal_width, subset.petal_width, label='cluster ' + str(cluster), alpha=.6)

centroids.plot.scatter(y='petal_width', x='sepal_width', c='black', marker='x', s=1000, ax=ax, label='centroid')

ax.legend()
ax.set_xlabel('sepal_width')
ax.set_ylabel('petal_width')
ax.set_title('Visualizing Cluster Centers');



In [None]:
# Re-cluster the same feature matrix with k=2 and overwrite the stored labels.
kmeans = KMeans(n_clusters=2).fit(X)
two_cluster_labels = kmeans.predict(X)
iris['cluster'] = two_cluster_labels

# Colour each observation by its k=2 cluster.
sns.relplot(x='sepal_width', y='petal_width', hue='cluster', data=iris)


In [None]:
# Re-cluster with k=5.
kmeans = KMeans(n_clusters=5)
kmeans.fit(X)
iris['cluster'] = kmeans.predict(X)

# Turn the 0-based integer labels into 1-based string labels ('cluster_1', 'cluster_2', ...).
one_based = iris['cluster'] + 1
iris['cluster'] = 'cluster_' + one_based.astype('str')

sns.relplot(x='sepal_width', y='petal_width', hue='cluster', data=iris)





In [None]:
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and newer releases
# no longer accept the old name, so pick whichever spelling this installation provides.
darkgrid_style = next(
    (name for name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid') if name in plt.style.available),
    'default',
)

# Elbow plot: inertia of a fresh k-means fit on X for each k from 2 to 11.
with plt.style.context(darkgrid_style):
    plt.figure(figsize=(9, 6))
    inertia_by_k = pd.Series({k: KMeans(n_clusters=k).fit(X).inertia_ for k in range(2, 12)})
    inertia_by_k.plot(marker='x')
    plt.xticks(range(2, 12))
    plt.xlabel('k')
    plt.ylabel('inertia')
    plt.title('Change in inertia as k increases')




In [None]:
# 2x2 grid of scatter plots, one panel per k in 2..5, sharing both axes.
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(13, 13), sharex=True, sharey=True)

for k, ax in zip(range(2, 6), axs.ravel()):
    # Fit a fresh model for this k and colour the points by their assigned cluster.
    model = KMeans(k).fit(X)
    clusters = model.predict(X)
    ax.scatter(X.sepal_width, X.petal_width, c=clusters)
    ax.set(title=f'k = {k}', xlabel=' sepal_width', ylabel='petal_width ')



In [None]:


# NOTE(review): get_mallcustomer_data is not defined in this notebook; presumably it comes
# from the wrangle_mall star import and returns the mall customers DataFrame — confirm.
mall = get_mallcustomer_data()

# Number of distinct values in each column.
mall.nunique()


In [None]:


# Mall customer columns that are clustered pairwise in the cells below.
col_list = ['age', 'annual_income', 'spending_score']



    

In [None]:
# for i in col_list:
#     mall[f'{i}_bin']=pd.cut(mall[i],bins=5,labels=[1,2,3,4,5])


# mall['gender']=pd.get_dummies(mall['gender'],drop_first=True)

In [None]:
# Every unordered pair of the columns in col_list, as a list of 2-tuples.
twocombolist = [pair for pair in combinations(col_list, 2)]

twocombolist

In [None]:

# For every feature pair: store a 3-cluster labelling on `mall`, then draw a 2x2 grid
# of scatter plots showing the clustering for k = 2..5.
for i in twocombolist:
    x_col, y_col = i
    X = mall[list(i)]

    # Predict once and keep the labels (the extra, discarded predict call was dropped).
    kmeans = KMeans(n_clusters=3)
    kmeans.fit(X)
    mall[f'{x_col},{y_col}cluster'] = kmeans.predict(X)

    fig, axs = plt.subplots(2, 2, figsize=(13, 13), sharex=True, sharey=True)
    for ax, k in zip(axs.ravel(), range(2, 6)):
        clusters = KMeans(n_clusters=k).fit(X).predict(X)
        # Select each column by its label. Indexing with a set literal (X[{name}]) is
        # rejected by pandas 2.x and only ever produced a one-column frame.
        ax.scatter(X[x_col], X[y_col], c=clusters)
        ax.set(title='k = {}'.format(k), xlabel=f'{x_col}', ylabel=f'{y_col}')




In [None]:
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and newer releases
# no longer accept the old name, so pick whichever spelling this installation provides.
darkgrid_style = next(
    (name for name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid') if name in plt.style.available),
    'default',
)

# One elbow plot per feature pair: inertia of a fresh k-means fit for each k from 2 to 11.
for i in twocombolist:
    X = mall[list(i)]
    with plt.style.context(darkgrid_style):
        plt.figure(figsize=(9, 6))
        inertia_by_k = pd.Series({k: KMeans(n_clusters=k).fit(X).inertia_ for k in range(2, 12)})
        inertia_by_k.plot(marker='x')
        plt.xticks(range(2, 12))
        plt.xlabel('k')
        plt.ylabel('inertia')
        # The title's words were scrambled ("Change in for ... clusterinertia"); restored here.
        plt.title(f'Change in inertia for {i[0]} and {i[1]} clusters as k increases')



### Exercises

Do your work for this exercise in a notebook named modeling within your clustering-exercises repository.



1. Clustering with the Iris Dataset \
Using this lesson as a guide, perform clustering on the iris dataset.
    1. Choose features other than the ones used in the lesson.
    2. Visualize the results of your clustering.
    3. Use the elbow method to visually select a good value for k.
    4. Repeat the clustering, this time with 3 different features.
2. Use the techniques discussed in this lesson, as well as the insights gained from the exploration exercise to perform clustering on the mall customers dataset. Be sure to visualize your results!
3. How does scaling impact the results of clustering? \
Compare k-means clustering results on scaled and unscaled data (you can choose any dataset for this exercise OR use the data/steps outlined in the bonus below). You can show how the resulting clusters differ either with descriptive statistics or visually.  \
**Bonus** \
a) Read in the data from this google sheet: https://docs.google.com/spreadsheets/d/1j5EgXVTR5ikUj3G5ZCQmkq6ziz_gvtASGAdw23-5_6M/edit?usp=sharing \
b) Visualize the data and guess the number and shape of the clusters. \
c) Implement the KMeans algorithm on unscaled data and visualize the clusters. \
d) Repeat the step above but with scaled data.   \
e) Write down the takeaways from this exercise.

 error.
