In [11]:
import pandas as pd
from path import Path

In [12]:
file_path= Path('iris.csv')
iris_df= pd.read_csv(file_path)
iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [13]:
new_iris_df= iris_df.drop(['class'], axis=1) 
new_iris_df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [14]:
#reorganize order of columns. length first then width after
new_iris_df= new_iris_df[['sepal_length', 'petal_length', 'sepal_width', 'petal_width']]

new_iris_df.head()

Unnamed: 0,sepal_length,petal_length,sepal_width,petal_width
0,5.1,1.4,3.5,0.2
1,4.9,1.4,3.0,0.2
2,4.7,1.3,3.2,0.2
3,4.6,1.5,3.1,0.2
4,5.0,1.4,3.6,0.2


In [15]:
output_file_path= 'new_iris_data.csv'
new_iris_df.to_csv(output_file_path, index= False)

PREPROCESSING DATA

In [16]:
file_path= 'shopping_data.csv'
df_shopping= pd.read_csv(file_path, encoding='ISO-8859-1')
df_shopping.head(5)

Unnamed: 0,CustomerID,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,Yes,19.0,15000,39.0
1,2,Yes,21.0,15000,81.0
2,3,No,20.0,16000,6.0
3,4,No,23.0,16000,77.0
4,5,No,31.0,17000,40.0


In [17]:
#columns
df_shopping.columns

Index(['CustomerID', 'Card Member', 'Age', 'Annual Income',
       'Spending Score (1-100)'],
      dtype='object')

In [18]:
df_shopping.dtypes

CustomerID                  int64
Card Member                object
Age                       float64
Annual Income               int64
Spending Score (1-100)    float64
dtype: object

In [19]:
#find count of null values
for column in df_shopping.columns:
    print(f'column{column} has {df_shopping[column].isnull().sum()} null values')

columnCustomerID has 0 null values
columnCard Member has 2 null values
columnAge has 2 null values
columnAnnual Income has 0 null values
columnSpending Score (1-100) has 1 null values


In [20]:
#drop null rows
df_shopping= df_shopping.dropna()
df_shopping.head()

Unnamed: 0,CustomerID,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,Yes,19.0,15000,39.0
1,2,Yes,21.0,15000,81.0
2,3,No,20.0,16000,6.0
3,4,No,23.0,16000,77.0
4,5,No,31.0,17000,40.0


In [21]:
#find duplicate entries
print(f'duplicate entries:{df_shopping.duplicated().sum()}')

duplicate entries:0


In [22]:
df_shopping.drop(columns=['CustomerID'], inplace= True)
df_shopping.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,Yes,19.0,15000,39.0
1,Yes,21.0,15000,81.0
2,No,20.0,16000,6.0
3,No,23.0,16000,77.0
4,No,31.0,17000,40.0


In [23]:
#transform string column into numerical
def change_string(member):
    if member =='Yes':
        return 1
    else:
        return 0

df_shopping['Card Member'] = df_shopping['Card Member'].apply(change_string)
df_shopping.head()        

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15000,39.0
1,1,21.0,15000,81.0
2,0,20.0,16000,6.0
3,0,23.0,16000,77.0
4,0,31.0,17000,40.0


In [24]:
#transform annual income to scale
df_shopping['Annual Income']= df_shopping['Annual Income'] / 1000

df_shopping.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


In [25]:
#rename df headers so no spaces or numbers
df_shopping.rename(columns={
    'Card Member': 'Card_Member',
    'Annual Income': 'Annual_Income',
    'Spending Score (1-100)': 'Spending_Score'}, inplace= True)

df_shopping.head()


Unnamed: 0,Card_Member,Age,Annual_Income,Spending_Score
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


In [26]:
#saving cleaned data as a csv
outgoing_file_path= 'shopping_data_cleaned.csv'
df_shopping.to_csv(outgoing_file_path, index=False)

K-Means Algorithm

In [27]:
import plotly.express as px
import hvplot.pandas
from sklearn.cluster import KMeans

In [28]:
#load the cleaned iris data
file_path= Path('new_iris_data.csv')
df_iris= pd.read_csv(file_path)
df_iris.head()

Unnamed: 0,sepal_length,petal_length,sepal_width,petal_width
0,5.1,1.4,3.5,0.2
1,4.9,1.4,3.0,0.2
2,4.7,1.3,3.2,0.2
3,4.6,1.5,3.1,0.2
4,5.0,1.4,3.6,0.2


In [29]:
#initialize model with K=3 (since we already know 3 classes of iris plants exist)
#we normally wont know how many classes so trial and error method will be used
model= KMeans(n_clusters=3, random_state=5)
model

KMeans(n_clusters=3, random_state=5)

In [30]:
#fit the model
#notice how it isnt being split into training and testing data? 
#the algorithm will look for the best centroid for each cluster
model.fit(df_iris)

KMeans(n_clusters=3, random_state=5)

In [31]:
#get predictions
predictions= model.predict(df_iris)
print(predictions)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 2 2 2 2 0 2 2 2 2
 2 2 0 0 2 2 2 2 0 2 0 2 0 2 2 0 0 2 2 2 2 2 0 2 2 2 2 0 2 2 2 0 2 2 2 0 2
 2 0]


In [32]:
#add a new class column to df_iris
df_iris['class']= model.labels_
df_iris.head()

Unnamed: 0,sepal_length,petal_length,sepal_width,petal_width,class
0,5.1,1.4,3.5,0.2,1
1,4.9,1.4,3.0,0.2,1
2,4.7,1.3,3.2,0.2,1
3,4.6,1.5,3.1,0.2,1
4,5.0,1.4,3.6,0.2,1


Visualizing Findings

In [33]:
import plotly.express as px
import hvplot.pandas

In [34]:
#plotting the clusters with two features (2D)
df_iris.hvplot.scatter(x='sepal_length', y='sepal_width', by='class')

In [35]:
#plotting ther clusters with three features (3D)
fig= px.scatter_3d(df_iris, x='petal_width', y='sepal_length', z='petal_length', symbol='class', size='sepal_width', width=800, color= 'class')
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

Test by Trial and Error


In [36]:
#load data
file_path= Path('shopping_data_cleaned.csv')
df_shopping= pd.read_csv(file_path)
df_shopping.head()

Unnamed: 0,Card_Member,Age,Annual_Income,Spending_Score
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


In [37]:
#plotting all data points
df_shopping.hvplot.scatter(x='Annual_Income', y='Spending_Score')

In [38]:
# Function to cluster and plot dataset
def test_cluster_amount(df, clusters):
    model = KMeans(n_clusters=clusters, random_state=5)
    model

    # Fitting model
    model.fit(df)

    # Add a new class column to df_iris
    df["class"] = model.labels_

In [39]:
#test function (2D) - 2 clusters
test_cluster_amount(df_shopping, 2)
df_shopping.hvplot.scatter(x='Annual_Income', y='Spending_Score', by='class')

In [40]:
#test function (3D)
fig= px.scatter_3d(df_shopping, x='Annual_Income', y='Spending_Score', z='Age', color='class', symbol='class', width=800)

fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [41]:
#test function (2D) - 4 clusters
test_cluster_amount(df_shopping, 4)
df_shopping.hvplot.scatter(x='Annual_Income', y='Spending_Score', by='class')

In [42]:
#test function (3D)
fig= px.scatter_3d(df_shopping, x='Annual_Income', y='Spending_Score', z='Age', color='class', symbol='class', width=800)

fig.update_layout(legend=dict(x=0, y=1))
fig.show()

Elbow Curve

In [43]:
#initial imports
import pandas as pd
from path import Path
from sklearn.cluster import KMeans
import plotly.express as px
import hvplot.pandas

In [44]:
#load data
file_path= Path('new_iris_data.csv')
df_iris= pd.read_csv(file_path)

df_iris.head()

Unnamed: 0,sepal_length,petal_length,sepal_width,petal_width
0,5.1,1.4,3.5,0.2
1,4.9,1.4,3.0,0.2
2,4.7,1.3,3.2,0.2
3,4.6,1.5,3.1,0.2
4,5.0,1.4,3.6,0.2


In [53]:
#stores vallue of K to Plot
inertia= []
k= list(range(1,11))

In [54]:
#looking for the best k value
for i in k:
    km= KMeans(n_clusters=i, random_state=0)
    km.fit(df_iris)
inertia.append(km.inertia_)


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.



In [55]:
#define a dataframe to plot the elbow curve using hvplot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

ValueError: arrays must all be same length

In [49]:
elbow_data

{'k': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'inertia': [26.04820224804435]}