# Protein Consumption - K-Means Clustering
- Author: Oliver Mueller
- Last update: 26.01.2024

## Initialize notebook
Load required packages. Set up workspace, e.g., set theme for plotting and initialize the random number generator.

In [None]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.offline as pyo

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [None]:
# Apply matplotlib's built-in 'fivethirtyeight' style sheet to subsequent matplotlib plots.
plt.style.use('fivethirtyeight')

## Problem description

We have data about the protein consumption in twenty-five European countries for nine food groups. We want to find out whether there are any groups of countries with similar protein consumption patterns. 

## Load data

In [None]:
# Tab-separated protein consumption table, read straight from the course repository.
DATA_URL = 'https://raw.githubusercontent.com/olivermueller/vhbprodok_datascience/main/protein/data/protein.txt'
data = pd.read_csv(DATA_URL, sep='\t')

In [None]:
# Preview the first rows to check that the file was parsed as expected.
data.head()

## Prepare data

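Drop the categorical variable `Country` and standardize the remaining variables, so that every food group is on the same scale when distances between countries are computed.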

In [None]:
# Keep only the numeric food-group columns; the country name is an identifier, not a feature.
data_prep = data.drop(columns='Country')

In [None]:
# Standardize each feature column with StandardScaler before clustering.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data_prep)

# fit_transform returns a bare array. Restore labels from the frame that was
# actually scaled instead of a positional slice of `data`, so the columns stay
# correct even if 'Country' is not the first column, and keep the row index so
# later index-aligned assignments still match `data`.
data_prep = pd.DataFrame(scaled_values, columns=data_prep.columns, index=data_prep.index)

In [None]:
# Preview the first rows of the prepared feature table.
data_prep.head()

## Fit K-Means clustering

In [None]:
# Number of clusters to extract; K-Means needs this fixed up front.
k = 5

# n_init is pinned explicitly: the library default changed from 10 to 'auto'
# between scikit-learn releases, and warnings are silenced in this notebook,
# so relying on the default could yield different clusters on different installs.
# random_state fixes the centroid initialization for reproducible results.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
kmeans.fit(data_prep)

## Predict cluster membership

In [None]:
# Label every row of the prepared data with the index of its closest fitted cluster centre.
cluster_membership = kmeans.predict(data_prep)

In [None]:
# Display the predicted cluster label of each row.
cluster_membership

In [None]:
# `cluster_membership` is a plain array in the row order of `data`. Wrap it in a
# Series carrying that index so it is attached by label, not by position:
# re-running this cell on the already sorted frame then reproduces the same
# result instead of silently assigning labels to the wrong countries.
cluster_labels = pd.Series(cluster_membership, index=data.index)

# Add the cluster label and the country name, then group rows of the same cluster together.
data_prep = (
    data_prep
    .assign(Cluster=cluster_labels, Country=data['Country'])
    .sort_values('Cluster')
)

In [None]:
# Show the whole table, ordered by cluster, to see which countries ended up together.
data_prep

## Visualize clusters

In [None]:
# Food-group columns that make up a cluster profile.
feature_cols = [
    'RedMeat', 'WhiteMeat', 'Eggs', 'Milk', 'Fish',
    'Cereals', 'Starch', 'Nuts', 'Fr&Veg',
]

# One row per cluster holding the mean of each food-group column.
clusters = data_prep.groupby('Cluster')[feature_cols].mean()


In [None]:
# Radar chart of the cluster profiles. One trace is generated per row of
# `clusters`, so the chart follows however many clusters were fitted instead
# of assuming exactly five.
axis_labels = clusters.columns.values
traces = [
    go.Scatterpolar(r=profile.values, theta=axis_labels, fill='toself', name=f"Cluster {label}")
    for label, profile in clusters.iterrows()
]

fig = go.Figure(
    data=traces,
    layout=go.Layout(
        title=go.layout.Title(text='Protein Consumption Profiles'),
        polar={'radialaxis': {'visible': True}},
        showlegend=True
    )
)

# Offline rendering: writes the figure to a local HTML file and opens it in the browser.
pyo.plot(fig)