In [1]:
# Hypothesis:  The 5-dimensional OCEAN data set can be represented in 3 dimensions visualized by 27 clusters of points in a 3x3x3 cube.
# Experiment:  Apply machine learning tools that use a dimensionality reduction technique like Principal Component Analysis (PCA) to the Kaggle OCEAN data set. Python provides several machine learning libraries that offer implementations of these dimensionality reduction techniques, such as scikit-learn, TensorFlow, and PyTorch. You can leverage these libraries to apply dimensionality reduction to a 5-dimensional dataset and visualize it in a 3-dimensional space.
# Data set:  https://1drv.ms/u/s!Aj7B9GbKP2y3icRAAZT8i8hNRDq5Vg?e=tMAh8W
# (Reference: https://www.kaggle.com/datasets/tunguz/big-five-personality-test )

In [2]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import plotly.express as px
import plotly.graph_objects as go
pd.options.plotting.backend = "plotly"

## Preprocessing / EDA / Data Cleaning

In [3]:
df = pd.read_csv("data/data-subset.csv", sep="\t")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Columns: 110 entries, EXT1 to long_appx_lots_of_err
dtypes: float64(104), int64(2), object(4)
memory usage: 8.4+ MB


In [4]:
# first 50 columns are the columns of interest
df.columns[:50]

Index(['EXT1', 'EXT2', 'EXT3', 'EXT4', 'EXT5', 'EXT6', 'EXT7', 'EXT8', 'EXT9',
       'EXT10', 'EST1', 'EST2', 'EST3', 'EST4', 'EST5', 'EST6', 'EST7', 'EST8',
       'EST9', 'EST10', 'AGR1', 'AGR2', 'AGR3', 'AGR4', 'AGR5', 'AGR6', 'AGR7',
       'AGR8', 'AGR9', 'AGR10', 'CSN1', 'CSN2', 'CSN3', 'CSN4', 'CSN5', 'CSN6',
       'CSN7', 'CSN8', 'CSN9', 'CSN10', 'OPN1', 'OPN2', 'OPN3', 'OPN4', 'OPN5',
       'OPN6', 'OPN7', 'OPN8', 'OPN9', 'OPN10'],
      dtype='object')

In [5]:
df_sub = df[df.columns[:50]]
df_sub.head()
print(df_sub.head().to_markdown())

|    |   EXT1 |   EXT2 |   EXT3 |   EXT4 |   EXT5 |   EXT6 |   EXT7 |   EXT8 |   EXT9 |   EXT10 |   EST1 |   EST2 |   EST3 |   EST4 |   EST5 |   EST6 |   EST7 |   EST8 |   EST9 |   EST10 |   AGR1 |   AGR2 |   AGR3 |   AGR4 |   AGR5 |   AGR6 |   AGR7 |   AGR8 |   AGR9 |   AGR10 |   CSN1 |   CSN2 |   CSN3 |   CSN4 |   CSN5 |   CSN6 |   CSN7 |   CSN8 |   CSN9 |   CSN10 |   OPN1 |   OPN2 |   OPN3 |   OPN4 |   OPN5 |   OPN6 |   OPN7 |   OPN8 |   OPN9 |   OPN10 |
|---:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|--------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|--------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|--------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|--------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|-------:|--------:|
|  0 |      1 |      5 |      1 |      5 |      1 |      1 |      1 |      5

In [6]:
# df_sub.loc[10001] = [4, 1, 5, 2, 5, 1, 5, 2, 4, 1, 1, 4, 4, 2, 2, 2, 2, 2, 3, 2, 2, 5, 2, 4, 2, 3, 2, 4, 3, 4, 3, 4, 3, 2, 2, 4, 4, 2, 4, 4, 5, 1, 4, 1, 4, 1, 5, 3, 4, 5]
# df_sub.loc[10002] = [3, 5, 3, 4, 3, 3, 2, 5, 1, 5, 2, 3, 4, 1, 3, 1, 2, 1, 3, 1, 1, 4, 1, 5, 1, 5, 3, 4, 5, 3, 3, 2, 5, 3, 3, 1, 3, 3, 5, 3, 1, 2, 4, 2, 3, 1, 4, 2, 5, 3]
# df_sub.shape
# df_sub.tail()

In [7]:
# There are 0 values that need to be dealt with
# The data range is 1-5, which means 0 values should be treated as missing
df_sub = df_sub.replace(0, np.NaN)
# df_sub = df_sub.dropna()

In [8]:
# Normalizing data points between -1 & 1
# We want 1 -> -1, 3-> 0, & 5 -> 1
df_norm = (df_sub - 3) / 2 # assuming mean is 3 and range is 1-5
print(df_norm.shape)
df_norm.tail()

(10000, 50)


Unnamed: 0,EXT1,EXT2,EXT3,EXT4,EXT5,EXT6,EXT7,EXT8,EXT9,EXT10,...,OPN1,OPN2,OPN3,OPN4,OPN5,OPN6,OPN7,OPN8,OPN9,OPN10
9995,0.0,-0.5,0.5,-0.5,0.5,-1.0,1.0,0.0,0.5,-0.5,...,1.0,-1.0,1.0,-1.0,0.0,-1.0,0.5,0.5,1.0,1.0
9996,-0.5,1.0,-1.0,0.0,-0.5,1.0,-0.5,-0.5,1.0,1.0,...,0.5,0.0,0.5,-0.5,0.0,-0.5,1.0,0.5,0.5,0.0
9997,1.0,-0.5,0.5,-1.0,1.0,-1.0,1.0,-1.0,1.0,-1.0,...,1.0,-1.0,1.0,0.0,1.0,-1.0,0.0,0.0,0.5,1.0
9998,-1.0,0.0,0.0,1.0,-0.5,0.0,-1.0,-0.5,-0.5,0.5,...,1.0,-1.0,1.0,0.0,0.5,0.5,1.0,0.5,0.0,0.5
9999,-1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,-0.5,0.0,...,-1.0,0.0,0.5,-0.5,0.0,-0.5,0.0,-0.5,1.0,0.5


In [9]:
"""
Polarity of answers

+ EXT1", "I am the life of the party.
- EXT2", "I don't talk a lot.
+ EXT3", "I feel comfortable around people.
- EXT4", "I keep in the background.
+ EXT5", "I start conversations.
- EXT6", "I have little to say.
+ EXT7", "I talk to a lot of different people at parties.
- EXT8", "I don't like to draw attention to myself.
+ EXT9", "I don't mind being the center of attention.
- EXT10", "I am quiet around strangers.

+ EST1", "I get stressed out easily.
- EST2", "I am relaxed most of the time.
+ EST3", "I worry about things.
- EST4", "I seldom feel blue.
+ EST5", "I am easily disturbed.
+ EST6", "I get upset easily.
+ EST7", "I change my mood a lot.
+ EST8", "I have frequent mood swings.
+ EST9", "I get irritated easily.
+ EST10", "I often feel blue.

- AGR1", "I feel little concern for others.
+ AGR2", "I am interested in people.
- AGR3", "I insult people.
+ AGR4", "I sympathize with others' feelings.
- AGR5", "I am not interested in other people's problems.
+ AGR6", "I have a soft heart.
- AGR7", "I am not really interested in others.
+ AGR8", "I take time out for others.
+ AGR9", "I feel others' emotions.
+ AGR10", "I make people feel at ease.

+ CSN1", "I am always prepared.
- CSN2", "I leave my belongings around.
+ CSN3", "I pay attention to details.
- CSN4", "I make a mess of things.
+ CSN5", "I get chores done right away.
- CSN6", "I often forget to put things back in their proper place.
+ CSN7", "I like order.
- CSN8", "I shirk my duties.
+ CSN9", "I follow a schedule.
+ CSN10", "I am exacting in my work.

+ OPN1", "I have a rich vocabulary.
- OPN2", "I have difficulty understanding abstract ideas.
+ OPN3", "I have a vivid imagination.
- OPN4", "I am not interested in abstract ideas.
+ OPN5", "I have excellent ideas.
- OPN6", "I do not have a good imagination.
+ OPN7", "I am quick to understand things.
+ OPN8", "I use difficult words.
+ OPN9", "I spend time reflecting on things.
+ OPN10", "I am full of ideas.
"""

polarity_dict = {
    "EXT1": 1,
    "EXT2": -1,
    "EXT3": 1,
    "EXT4": -1,
    "EXT5": 1,
    "EXT6": -1,
    "EXT7": 1,
    "EXT8": -1,
    "EXT9": 1,
    "EXT10": -1,
    
    "EST1": 1,
    "EST2": -1,
    "EST3": 1,
    "EST4": -1,
    "EST5": 1,
    "EST6": 1,
    "EST7": 1,
    "EST8": 1,
    "EST9": 1,
    "EST10": 1,
    
    "AGR1": -1,
    "AGR2": 1,
    "AGR3": -1,
    "AGR4": 1,
    "AGR5": -1,
    "AGR6": 1,
    "AGR7": -1,
    "AGR8": 1,
    "AGR9": 1,
    "AGR10": 1,
    
    "CSN1": 1,
    "CSN2": -1,
    "CSN3": 1,
    "CSN4": -1,
    "CSN5": 1,
    "CSN6": -1,
    "CSN7": 1,
    "CSN8": -1,
    "CSN9": 1,
    "CSN10": 1,
    
    "OPN1": 1,
    "OPN2": -1,
    "OPN3": 1,
    "OPN4": -1,
    "OPN5": 1,
    "OPN6": -1,
    "OPN7": 1,
    "OPN8": 1,
    "OPN9": 1,
    "OPN10": 1,
}


In [10]:
# accounting for polarity
df_cp = df_norm.copy()
for col in df_norm.columns:
    df_cp[col] = df_norm[col] * polarity_dict[col]
df_cp.tail()

Unnamed: 0,EXT1,EXT2,EXT3,EXT4,EXT5,EXT6,EXT7,EXT8,EXT9,EXT10,...,OPN1,OPN2,OPN3,OPN4,OPN5,OPN6,OPN7,OPN8,OPN9,OPN10
9995,0.0,0.5,0.5,0.5,0.5,1.0,1.0,-0.0,0.5,0.5,...,1.0,1.0,1.0,1.0,0.0,1.0,0.5,0.5,1.0,1.0
9996,-0.5,-1.0,-1.0,-0.0,-0.5,-1.0,-0.5,0.5,1.0,-1.0,...,0.5,-0.0,0.5,0.5,0.0,0.5,1.0,0.5,0.5,0.0
9997,1.0,0.5,0.5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,-0.0,1.0,1.0,0.0,0.0,0.5,1.0
9998,-1.0,-0.0,0.0,-1.0,-0.5,-0.0,-1.0,0.5,-0.5,-0.5,...,1.0,1.0,1.0,-0.0,0.5,-0.5,1.0,0.5,0.0,0.5
9999,-1.0,-0.0,0.0,-1.0,0.0,-0.0,0.0,-1.0,-0.5,-0.0,...,-1.0,-0.0,0.5,0.5,0.0,0.5,0.0,-0.5,1.0,0.5


In [11]:
df_T = df_cp.T
df_T["type"] = df_T.index.str.slice(0,3)
df_E = df_T.groupby("type", sort=False).mean().T
# NOTE: Mean is taken ignoring np.NaN values. 
# Effectively does the same thing as conditional mean imputation
df_E.head()
df_E.tail(2)

type,EXT,EST,AGR,CSN,OPN
9998,-0.4,0.05,0.2,0.5,0.5
9999,-0.35,0.4,0.7,0.4,0.15


In [12]:
print(df_E.isna().sum())
# Still some nan values
# Means that within certain responses, all values were missing for a trait
df_ED = df_E.dropna(axis=0)

type
EXT    44
EST    46
AGR    46
CSN    46
OPN    48
dtype: int64


In [13]:
df_ED.describe()
print(df_ED.describe().to_markdown())

|       |         EXT |         EST |         AGR |         CSN |         OPN |
|:------|------------:|------------:|------------:|------------:|------------:|
| count | 9952        | 9952        | 9952        | 9952        | 9952        |
| mean  |   -0.01278  |    0.029529 |    0.385407 |    0.181257 |    0.437619 |
| std   |    0.454289 |    0.429102 |    0.365382 |    0.36919  |    0.318982 |
| min   |   -1        |   -1        |   -1        |   -1        |   -0.95     |
| 25%   |   -0.35     |   -0.277778 |    0.15     |   -0.1      |    0.2      |
| 50%   |    0        |    0.05     |    0.45     |    0.2      |    0.45     |
| 75%   |    0.3      |    0.35     |    0.65     |    0.45     |    0.7      |
| max   |    1        |    1        |    1        |    1        |    1        |


In [14]:
# Normalization
df_EDN = (df_ED - df_ED.mean()) / df_ED.std()
df_EDN.cov()
print(df_EDN.cov().to_markdown())

| type   |        EXT |        EST |        AGR |        CSN |        OPN |
|:-------|-----------:|-----------:|-----------:|-----------:|-----------:|
| EXT    |  1         | -0.215521  |  0.304984  |  0.0461086 |  0.149932  |
| EST    | -0.215521  |  1         | -0.0573839 | -0.220419  | -0.090065  |
| AGR    |  0.304984  | -0.0573839 |  1         |  0.15626   |  0.114062  |
| CSN    |  0.0461086 | -0.220419  |  0.15626   |  1         |  0.0663948 |
| OPN    |  0.149932  | -0.090065  |  0.114062  |  0.0663948 |  1         |


## PCA

In [15]:
pca = PCA(n_components=3) # 3 principal components
pca_data = pca.fit_transform(df_EDN)
pca_df = pd.DataFrame(pca_data, columns=["PC1", "PC2", "PC3"])

In [16]:
pd.DataFrame(index=["PC1", "PC2", "PC3"], data={"Proportion of Variance Explained": pca.explained_variance_ratio_, "Cumulative Proportion": np.cumsum(pca.explained_variance_ratio_)})
print(pd.DataFrame(index=["PC1", "PC2", "PC3"], data={"Proportion of Variance Explained": pca.explained_variance_ratio_, "Cumulative Proportion": np.cumsum(pca.explained_variance_ratio_)}).to_markdown())

|     |   Proportion of Variance Explained |   Cumulative Proportion |
|:----|-----------------------------------:|------------------------:|
| PC1 |                           0.317035 |                0.317035 |
| PC2 |                           0.206601 |                0.523636 |
| PC3 |                           0.18338  |                0.707016 |


In [17]:
pd.DataFrame(index=["PC1", "PC2", "PC3"], data=pca.components_, columns=df_EDN.columns)
print(pd.DataFrame(index=["PC1", "PC2", "PC3"], data=pca.components_, columns=df_EDN.columns).to_markdown())

|     |       EXT |       EST |       AGR |       CSN |       OPN |
|:----|----------:|----------:|----------:|----------:|----------:|
| PC1 | -0.540406 |  0.444568 | -0.494664 | -0.381453 | -0.346586 |
| PC2 | -0.364322 | -0.51093  | -0.36901  |  0.634714 | -0.25921  |
| PC3 |  0.1468   |  0.160795 |  0.467782 |  0.140572 | -0.844994 |


In [18]:
fig = px.scatter_3d(pca_df.sample(250), x='PC1', y='PC2', z='PC3')
fig.show()

# Hypothesis:  The 5-dimensional OCEAN data set can be represented in 3 dimensions visualized by 27 clusters of points in a 3x3x3 cube.

In [19]:
# fig.write_html("pca-e-250.html")

## Clustering


In [20]:
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN

kmeans = KMeans(n_clusters=27)
kmeans.fit(pca_df)

KMeans(n_clusters=27)

In [21]:
pca_df["KMEANS_Cluster"] = kmeans.labels_
fig = px.scatter_3d(pca_df, x='PC1', y='PC2', z="PC3", color='KMEANS_Cluster')
fig.show()

In [22]:
pca_df["KMEANS_Cluster"].value_counts().sort_index()

0     640
1     483
2     443
3     535
4     494
5     403
6     229
7     316
8     417
9     248
10    316
11    327
12    190
13    321
14    513
15    489
16    301
17    314
18    418
19    317
20    226
21    345
22    524
23    305
24    278
25    175
26    385
Name: KMEANS_Cluster, dtype: int64

In [23]:
pca_df.dtypes

PC1               float64
PC2               float64
PC3               float64
KMEANS_Cluster      int32
dtype: object

In [26]:
fig = go.Figure()

for index in range(27):
    subset = pca_df[pca_df["KMEANS_Cluster"] == index]
    fig.add_trace(
        go.Scatter3d(
            x=subset["PC1"],
            y=subset["PC2"],
            z=subset["PC3"],
            mode="markers",
            name=f"Cluster {index}",
        ),
    )

buttons_list = []
for index in range(27):
    buttons_list.append(
        dict(
            label=f"Show Cluster {index}",
            method="update",
            args=[
                {"visible": [False] * index + [True] + [False] * (26 - index)},
                {"title": f"Cluster {index}"},
            ],
        )
    )

buttons_list.append(
    dict(
        label="Show All Clusters",
        method="update",
        args=[{"visible": [True] * 27}, {"title": "All Clusters"}],
    )
)

fig.update_layout(updatemenus=[dict(buttons=buttons_list)])
fig.update_scenes(
    xaxis_title="PC1",
    xaxis_range=[-5, 5],
    xaxis_autorange=False,
    yaxis_title="PC2",
    yaxis_range=[-5, 5],
    yaxis_autorange=False,
    zaxis_title="PC3",
    zaxis_range=[-5, 5],
    zaxis_autorange=False,
)

fig.update_xaxes(range=[-5, 5])
fig.update_yaxes(range=[-5, 5])
fig.show()

In [25]:
fig.write_html("pca-clustering.html")