In [18]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans

## Where the data lives: the bucket name and the csv's key inside that bucket
bucket_name = 'data-445'
file_key = 'Fall_2021/In_Class_Assignments/customers.csv'

## Connecting to S3 and pointing at the bucket
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)

## Fetching the csv object; the 'Body' entry of the response is the stream handed to pandas
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Loading the stream into a data-frame and previewing the first rows
customers = pd.read_csv(file_content_stream)
customers.head()

Unnamed: 0,ID,Visit_Time,Average_Expense,Sex,Age
0,1,3,5.7,0,10
1,2,5,14.5,0,27
2,3,16,33.5,0,32
3,4,5,15.9,0,30
4,5,16,24.9,0,23


In [19]:
## Dropping ID (it is not one of the variables used for clustering below).
## `columns=` already names the axis, so no separate `axis` argument is needed.
## errors = 'ignore' makes the cell safe to re-run: because `customers` is
## reassigned from itself, a second run would otherwise raise KeyError on 'ID'.
customers = customers.drop(columns = ['ID'], errors = 'ignore')
customers.head()

Unnamed: 0,Visit_Time,Average_Expense,Sex,Age
0,3,5.7,0,10
1,5,14.5,0,27
2,16,33.5,0,32
3,5,15.9,0,30
4,16,24.9,0,23


In [20]:
## Rescaling the numeric variables to the 0-1 range
## Each scaled copy is stored next to the original under '<name>_0_1'
numeric_columns = ['Visit_Time', 'Average_Expense', 'Age']
scaled_columns = [column + '_0_1' for column in numeric_columns]

scaler = MinMaxScaler()
customers[scaled_columns] = scaler.fit_transform(customers[numeric_columns])
customers.head()

Unnamed: 0,Visit_Time,Average_Expense,Sex,Age,Visit_Time_0_1,Average_Expense_0_1,Age_0_1
0,3,5.7,0,10,0.117647,0.041096,0.051282
1,5,14.5,0,27,0.235294,0.342466,0.487179
2,16,33.5,0,32,0.882353,0.993151,0.615385
3,5,15.9,0,30,0.235294,0.390411,0.564103
4,16,24.9,0,23,0.882353,0.69863,0.384615


In [21]:
## Clustering the data
## Features: the three 0-1 scaled variables plus Sex (left unscaled).
cluster_features = ['Visit_Time_0_1', 'Average_Expense_0_1', 'Sex', 'Age_0_1']

## random_state fixes the centroid initialisation. Without it the cluster
## numbering can change from run to run, and the cells below select clusters
## by number (cluster == 0, cluster == 1, ...), so their summaries would
## silently describe different groups after a kernel restart.
## NOTE: outputs saved from an earlier unseeded run may use a different numbering.
four_clusters = KMeans(n_clusters = 4, n_init = 20, random_state = 42).fit(customers[cluster_features])

## Appending cluster label
customers['cluster'] = four_clusters.labels_
customers.head()

Unnamed: 0,Visit_Time,Average_Expense,Sex,Age,Visit_Time_0_1,Average_Expense_0_1,Age_0_1,cluster
0,3,5.7,0,10,0.117647,0.041096,0.051282,2
1,5,14.5,0,27,0.235294,0.342466,0.487179,2
2,16,33.5,0,32,0.882353,0.993151,0.615385,0
3,5,15.9,0,30,0.235294,0.390411,0.564103,2
4,16,24.9,0,23,0.882353,0.69863,0.384615,0


In [23]:
## Cluster 0: rows labelled 0, re-indexed from zero, then summary statistics
in_cluster_0 = customers['cluster'].eq(0)
cluster_0 = customers.loc[in_cluster_0].reset_index(drop = True)
cluster_0.describe()

Unnamed: 0,Visit_Time,Average_Expense,Sex,Age,Visit_Time_0_1,Average_Expense_0_1,Age_0_1,cluster
count,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
mean,14.375,25.5875,0.0,26.625,0.786765,0.722175,0.477564,0.0
std,1.846812,4.647714,0.0,5.629958,0.108636,0.159168,0.144358,0.0
min,12.0,18.8,0.0,18.0,0.647059,0.489726,0.25641,0.0
25%,13.5,22.95,0.0,22.75,0.735294,0.631849,0.378205,0.0
50%,14.0,25.4,0.0,26.0,0.764706,0.715753,0.461538,0.0
75%,16.0,28.5,0.0,32.25,0.882353,0.821918,0.621795,0.0
max,17.0,33.5,0.0,33.0,0.941176,0.993151,0.641026,0.0


In [24]:
## Cluster 1: rows labelled 1, re-indexed from zero, then summary statistics
in_cluster_1 = customers['cluster'].eq(1)
cluster_1 = customers.loc[in_cluster_1].reset_index(drop = True)
cluster_1.describe()

Unnamed: 0,Visit_Time,Average_Expense,Sex,Age,Visit_Time_0_1,Average_Expense_0_1,Age_0_1,cluster
count,25.0,25.0,25.0,25.0,25.0,25.0,25.0,25.0
mean,5.56,10.928,1.0,15.48,0.268235,0.220137,0.191795,1.0
std,2.484619,4.692238,0.0,4.664762,0.146154,0.160693,0.119609,0.0
min,1.0,4.5,1.0,8.0,0.0,0.0,0.0,1.0
25%,3.0,8.1,1.0,12.0,0.117647,0.123288,0.102564,1.0
50%,6.0,10.9,1.0,16.0,0.294118,0.219178,0.205128,1.0
75%,7.0,12.0,1.0,19.0,0.352941,0.256849,0.282051,1.0
max,10.0,23.8,1.0,23.0,0.529412,0.660959,0.384615,1.0
