In [4]:
import pandas as pd
from sklearn.cluster import KMeans
import numpy as np
import altair as alt
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [3]:
file_url = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter05/DataSet/taxstats2015.csv'
data = pd.read_csv(file_url, usecols=['Postcode', 'Average net tax', 'Average total deductions'])
data.head()

Unnamed: 0,Postcode,Average total deductions,Average net tax
0,2000,2071,27555
1,2006,3804,28142
2,2007,1740,15649
3,2008,3917,53976
4,2009,3433,32430


In [67]:
clusters = pd.DataFrame()
clusters['cluster_range'] = range(1, 10)
inertia = []

for k in clusters['cluster_range']:
    model = KMeans(n_clusters=k, random_state=8).fit(X)
    inertia.append(model.inertia_)
    
clusters['inertia'] = inertia
clusters

Unnamed: 0,cluster_range,inertia
0,1,158645900000.0
1,2,66550560000.0
2,3,37081150000.0
3,4,22539730000.0
4,5,15750120000.0
5,6,11544060000.0
6,7,8776361000.0
7,8,7285696000.0
8,9,6000036000.0


In [68]:
alt.Chart(clusters).mark_line().encode(x='cluster_range', y='inertia')

In [60]:
model = KMeans(random_state=42, n_clusters=3)

In [61]:
X = data[['Average net tax', 'Average total deductions']]

In [62]:
model.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=42, tol=0.0001, verbose=0)

In [63]:
preds = model.predict(X)
preds

array([0, 0, 2, ..., 2, 0, 0])

In [64]:
data['cluster'] = preds
data.head()

Unnamed: 0,Postcode,Average total deductions,Average net tax,cluster,cluster2
0,2000,2071,27555,0,2
1,2006,3804,28142,0,0
2,2007,1740,15649,2,1
3,2008,3917,53976,1,0
4,2009,3433,32430,0,2


In [65]:
data.pivot_table(values=['Average net tax', 'Average total deductions'], index='cluster', aggfunc=np.mean)

Unnamed: 0_level_0,Average net tax,Average total deductions
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,21618.214286,3052.446844
1,45151.967391,6046.967391
2,12393.008432,2383.557055


In [66]:
chart = alt.Chart(data)
scatter_plot = chart.mark_circle()
scatter_plot.encode(x='Average net tax', y='Average total deductions', color='cluster:N',
                   tooltip=['Postcode', 'cluster', 'Average net tax', 'Average total deductions']).interactive()

In [73]:
model = KMeans(random_state=14, n_clusters=3, init='k-means++', n_init=5).fit(X)
data['cluster3'] = model.predict(X)

alt.Chart(data).mark_circle().encode(x='Average net tax', y='Average total deductions', color='cluster3:N',
                                    tooltip=['Postcode', 'cluster', 'Average net tax', 'Average total deductions']).interactive()

## Centroids

In [86]:
file_url = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter05/DataSet/taxstats2015.csv'
df = pd.read_csv(file_url, usecols=['Postcode', 'Average total business income', 'Average total business expenses'])

In [87]:
X = df[['Average total business income', 'Average total business expenses']]

business_income_min = df['Average total business income'].min()
business_income_max = df['Average total business income'].max()

business_expenses_min = df['Average total business expenses'].min()
business_expenses_max = df['Average total business expenses'].max()

print(business_income_min)
print(business_income_max)
print(business_expenses_min)
print(business_expenses_max)

0
876324
0
884659


In [88]:
import random
random.seed(42)

In [89]:
centroids = pd.DataFrame()

In [90]:
centroids['Average total business income'] = random.sample(range(business_income_min, business_income_max), 4)
centroids['Average total business expenses'] = random.sample(range(business_expenses_min, business_expenses_max), 4)

In [91]:
centroids['cluster'] = centroids.index
centroids

Unnamed: 0,Average total business income,Average total business expenses,cluster
0,670487,288389,0
1,116739,256787,1
2,26225,234053,2
3,777572,146316,3


In [95]:
chart1 = alt.Chart(df.head()).mark_circle().encode(x='Average total business income', y='Average total business expenses', 
                                                   color=alt.value('orange'),tooltip=['Postcode', 'Average total business income', 
                                                                                      'Average total business expenses']).interactive()

In [96]:
chart2 = alt.Chart(centroids).mark_circle(size=100).encode(x='Average total business income', y='Average total business expenses', 
                                                           color=alt.value('black'),tooltip=['cluster', 'Average total business income', 
                                                                                             'Average total business expenses']).interactive()

In [97]:
chart1 + chart2

## Atividade

In [43]:
url = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter05/DataSet/german.data-numeric'
data = pd.read_csv(url, header=None, sep= '\s\s+', prefix='X')
data.head()

  


Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X15,X16,X17,X18,X19,X20,X21,X22,X23,X24
0,1,6,4,12,5,5,3,4,1,67,...,0,0,1,0,0,1,0,0,1,1.0
1,2,48,2,60,1,3,2,2,1,22,...,0,0,1,0,0,1,0,0,1,2.0
2,4,12,4,21,1,4,3,3,1,49,...,0,0,1,0,0,1,0,1,0,1.0
3,1,42,2,79,1,4,3,4,2,45,...,0,0,0,0,0,0,0,0,1,1.0
4,1,24,3,49,1,3,3,4,4,53,...,1,0,1,0,0,0,0,0,1,2.0


In [45]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   X0      1000 non-null   int64  
 1   X1      1000 non-null   int64  
 2   X2      1000 non-null   object 
 3   X3      1000 non-null   int64  
 4   X4      1000 non-null   int64  
 5   X5      1000 non-null   int64  
 6   X6      1000 non-null   int64  
 7   X7      1000 non-null   int64  
 8   X8      1000 non-null   int64  
 9   X9      1000 non-null   int64  
 10  X10     1000 non-null   int64  
 11  X11     1000 non-null   int64  
 12  X12     1000 non-null   int64  
 13  X13     1000 non-null   int64  
 14  X14     1000 non-null   int64  
 15  X15     1000 non-null   int64  
 16  X16     1000 non-null   int64  
 17  X17     1000 non-null   int64  
 18  X18     1000 non-null   int64  
 19  X19     1000 non-null   int64  
 20  X20     1000 non-null   int64  
 21  X21     1000 non-null   int64  
 22  X

In [46]:
X = data[['X3', 'X9']]
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [48]:
X[:, :]

array([[-0.74764519,  2.54797287],
       [ 1.54618474, -0.94133873],
       [-0.31755208,  1.15224823],
       ...,
       [-0.93879769,  0.29930539],
       [-0.46091645, -0.86379847],
       [ 0.87715101, -0.55363744]])

In [59]:
cluster = pd.DataFrame()
inertia = []
cluster['cluster_range'] = np.arange(1, 10)

for k in cluster['cluster_range']:
    model = KMeans(random_state=0, n_clusters=k).fit(X)
    inertia.append(model.inertia_)

In [61]:
cluster['inertia'] = inertia

In [62]:
cluster

Unnamed: 0,cluster_range,inertia
0,1,2000.0
1,2,1280.63825
2,3,767.705258
3,4,576.086134
4,5,443.936328
5,6,360.377095
6,7,291.400412
7,8,252.709449
8,9,219.553695


In [63]:
alt.Chart(cluster).mark_line().encode(alt.X('cluster_range'), alt.Y('inertia'))

In [64]:
model = KMeans(random_state=1, n_clusters=5, init='k-means++', n_init=50, max_iter=100)
model.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
       n_clusters=5, n_init=50, n_jobs=None, precompute_distances='auto',
       random_state=1, tol=0.0001, verbose=0)

In [65]:
data['cluster'] = model.predict(X)
data.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X16,X17,X18,X19,X20,X21,X22,X23,X24,cluster
0,1,6,4,12,5,5,3,4,1,67,...,0,1,0,0,1,0,0,1,1.0,2
1,2,48,2,60,1,3,2,2,1,22,...,0,1,0,0,1,0,0,1,2.0,4
2,4,12,4,21,1,4,3,3,1,49,...,0,1,0,0,1,0,1,0,1.0,2
3,1,42,2,79,1,4,3,4,2,45,...,0,0,0,0,0,0,0,1,1.0,0
4,1,24,3,49,1,3,3,4,4,53,...,0,1,0,0,0,0,0,1,2.0,2


In [68]:
scatter_plot = alt.Chart(data).mark_circle()
scatter_plot.encode(x='X3', y='X9', color='cluster:N')