# Protein Consumption - K-Means Clustering
- Author: Oliver Mueller
- Last update: 26.01.2024

## Initialize notebook
Load required packages. Set up workspace, e.g., set theme for plotting and initialize the random number generator.

In [1]:
import warnings
# Install a global 'ignore' filter so that no warnings are printed in the notebook output.
# NOTE(review): this silences every warning category, including deprecation
# notices from the libraries imported below — consider narrowing the filter.
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [2]:
# Select the 'fivethirtyeight' style sheet for matplotlib figures created from here on.
plt.style.use('fivethirtyeight')

## Problem description

We have data about the protein consumption in twenty-five European countries for nine food groups. We want to find out whether there are any groups of countries with similar protein consumption patterns. 

## Load data

In [3]:
# Read the tab-delimited protein consumption table into a DataFrame.
data = pd.read_csv('data/protein.txt', delimiter='\t')

In [4]:
# Preview the first five rows of the raw data.
data.head(n=5)

Unnamed: 0,Country,RedMeat,WhiteMeat,Eggs,Milk,Fish,Cereals,Starch,Nuts,Fr&Veg
0,Albania,10.1,1.4,0.5,8.9,0.2,42.3,0.6,5.5,1.7
1,Austria,8.9,14.0,4.3,19.9,2.1,28.0,3.6,1.3,4.3
2,Belgium,13.5,9.3,4.1,17.5,4.5,26.6,5.7,2.1,4.0
3,Bulgaria,7.8,6.0,1.6,8.3,1.2,56.7,1.1,3.7,4.2
4,Czechoslovakia,9.7,11.4,2.8,12.5,2.0,34.3,5.0,1.1,4.0


## Prepare data

Drop the categorical variable `Country` and scale the data.

In [5]:
# Remove the country name so that only the food-group columns remain as features.
data_prep = data.drop(columns=['Country'])

In [6]:
# Standardize every feature column with StandardScaler and wrap the resulting
# array back into a DataFrame.
scaler = StandardScaler()
data_prep = pd.DataFrame(
    scaler.fit_transform(data_prep),
    # Take the labels from the frame that is actually being scaled. Slicing
    # data.columns[1:] only works while 'Country' happens to be the first
    # column of `data` and would silently mislabel the features otherwise.
    columns=data_prep.columns,
    # Keep the row index so later index-aligned assignments from `data` match.
    index=data_prep.index,
)

In [7]:
# Preview the first five rows of the scaled features.
data_prep.head(n=5)

Unnamed: 0,RedMeat,WhiteMeat,Eggs,Milk,Fish,Cereals,Starch,Nuts,Fr&Veg
0,0.082941,-1.79475,-2.224584,-1.17957,-1.225033,0.934804,-2.295965,1.247968,-1.378251
1,-0.282974,1.686446,1.245621,0.400468,-0.655111,-0.395051,-0.422218,-0.91079,0.092789
2,1.119699,0.387905,1.062979,0.055732,0.064791,-0.525246,0.889405,-0.499598,-0.076947
3,-0.618396,-0.523837,-1.220051,-1.265754,-0.925074,2.273959,-1.983674,0.322786,0.03621
4,-0.039031,0.968104,-0.124197,-0.662467,-0.685107,0.19083,0.452198,-1.013588,-0.076947


## Fit K-Means clustering

In [8]:
# Number of clusters to look for.
k = 5
# random_state fixes the centroid initialization. n_init is pinned explicitly
# because scikit-learn 1.4 changed its default from 10 to 'auto'; without the
# pin the clustering can differ between library versions even with a fixed
# seed (and the corresponding FutureWarning is suppressed in this notebook).
# NOTE(review): 10 is the pre-1.4 default — confirm it matches the version
# that produced the saved outputs.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
kmeans.fit(data_prep)

## Predict cluster membership

In [9]:
# Ask the fitted model for the cluster of every row in data_prep
# (the same rows it was fitted on).
cluster_membership = kmeans.predict(data_prep)

In [10]:
# Display the predicted cluster labels, one entry per row of data_prep.
cluster_membership

array([2, 1, 1, 2, 1, 1, 1, 4, 1, 0, 2, 1, 0, 1, 4, 4, 3, 2, 3, 1, 1, 1,
       4, 1, 2], dtype=int32)

In [11]:
# Attach the cluster label and the country name to every row, then group the
# rows of the same cluster together by sorting.
#
# cluster_membership is a plain NumPy array in the row order of `data`, so it
# is wrapped in a Series that carries data's index. pandas then aligns it by
# index label, exactly as it already does for data['Country']. A bare array is
# assigned by position instead, so re-running this cell after data_prep has
# been sorted would silently pair labels with the wrong countries.
data_prep['Cluster'] = pd.Series(cluster_membership, index=data.index)
data_prep['Country'] = data['Country']
data_prep = data_prep.sort_values('Cluster')

In [12]:
# Display the scaled features together with each row's cluster and country, ordered by cluster.
data_prep

Unnamed: 0,RedMeat,WhiteMeat,Eggs,Milk,Fish,Cereals,Starch,Nuts,Fr&Veg,Cluster,Country
12,-0.252481,-0.772494,-0.032876,-0.490099,-0.265164,0.423322,-1.359091,0.63118,1.450672,0,Italy
9,0.113434,-1.352693,-0.124197,0.070096,0.484734,0.879006,-1.296633,2.430145,1.337515,0,Greece
1,-0.282974,1.686446,1.245621,0.400468,-0.655111,-0.395051,-0.422218,-0.91079,0.092789,1,Austria
2,1.119699,0.387905,1.062979,0.055732,0.064791,-0.525246,0.889405,-0.499598,-0.076947,1,Belgium
21,2.308921,-0.606723,1.610906,0.501016,0.004799,-0.739139,0.264823,0.168589,-0.472996,1,UK
4,-0.039031,0.968104,-0.124197,-0.662467,-0.685107,0.19083,0.452198,-1.013588,-0.076947,1,Czechoslovakia
5,0.235405,0.802333,0.697694,1.133031,1.68457,-0.962332,0.327281,-1.219184,-0.982202,1,Denmark
6,-0.435438,1.023361,0.697694,-0.863563,0.334754,-0.71124,1.389071,-1.167785,-0.303261,1,E Germany
20,0.997727,0.608933,0.149767,0.960663,-0.595119,-0.618243,-0.921884,-0.345401,0.432259,1,Switzerland
8,2.491879,0.553676,0.332409,0.343012,0.424742,-0.385751,0.327281,-0.345401,1.337515,1,France


## Visualize clusters

In [16]:
# Build one profile row per cluster: the mean of each food-group column over
# the rows assigned to that cluster.
clusters = (
    data_prep
    .groupby('Cluster')[
        [
            'RedMeat',
            'WhiteMeat',
            'Eggs',
            'Milk',
            'Fish',
            'Cereals',
            'Starch',
            'Nuts',
            'Fr&Veg',
        ]
    ]
    .mean()
)


In [17]:
import plotly.graph_objects as go
import plotly.offline as pyo

# Radar chart of the cluster profiles: one filled trace per row of `clusters`,
# with the food groups (the columns of `clusters`) as the angular axis.
#
# The traces are generated from the rows that `clusters` actually contains
# instead of hard-coding iloc[0] .. iloc[4]. With a different k the hard-coded
# version either raises an IndexError (fewer than five clusters) or silently
# omits clusters (more than five). Each trace is named after its real cluster
# label taken from the index.
fig = go.Figure(
    data=[
        go.Scatterpolar(
            r=profile.values,
            theta=clusters.columns.values,
            fill='toself',
            name=f"Cluster {label}",
        )
        for label, profile in clusters.iterrows()
    ],
    layout=go.Layout(
        title=go.layout.Title(text='Protein Consumption Profiles'),
        polar={'radialaxis': {'visible': True}},
        showlegend=True
    )
)

# Writes the figure to an HTML file; the returned file name is the cell output.
pyo.plot(fig)

'temp-plot.html'