In [1]:
# Dimensional Analysis of OCEAN data set                                                          6/25/2023
# Hypothesis:  The 5-dimensional OCEAN data set can be represented in 3 dimensions visualized by 27 clusters of points in a 3x3x3 cube.
# Experiment:  Apply machine learning tools that use a dimensionality reduction technique like Principal Component Analysis (PCA) to the Kaggle OCEAN data set. Python provides several machine learning libraries that offer implementations of these dimensionality reduction techniques, such as scikit-learn, TensorFlow, and PyTorch. You can leverage these libraries to apply dimensionality reduction to a 5-dimensional dataset and visualize it in a 3-dimensional space.
# Data set:  https://1drv.ms/u/s!Aj7B9GbKP2y3icRAAZT8i8hNRDq5Vg?e=tMAh8W
# (Reference: https://www.kaggle.com/datasets/tunguz/big-five-personality-test )
# The five dimensions of the data set are described by the columns:
# 1. EXT, EXT_E
# 2. EST, EST_E
# 3. AGR, AGR_E
# 4. CSN, CSN_E
# 5. OPN, OPN_E

# MENTIONED IN DATA DICTIONARY
# The time spent on each question is also recorded in milliseconds. These are the variables ending in _E. This was calculated by taking the time when the button for the question was clicked minus the time of the most recent other button click.

In [2]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import plotly.express as px
pd.options.plotting.backend = "plotly"

## Preprocessing / EDA / Data Cleaning

In [3]:
df = pd.read_csv("data/data-subset.csv", sep="\t")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Columns: 110 entries, EXT1 to long_appx_lots_of_err
dtypes: float64(104), int64(2), object(4)
memory usage: 8.4+ MB


In [4]:
E_mask = df.columns.str.endswith("_E")
df_E = df[df.columns[E_mask]]
# print(df_E.head().to_markdown())
df_E.head()

Unnamed: 0,EXT1_E,EXT2_E,EXT3_E,EXT4_E,EXT5_E,EXT6_E,EXT7_E,EXT8_E,EXT9_E,EXT10_E,...,OPN1_E,OPN2_E,OPN3_E,OPN4_E,OPN5_E,OPN6_E,OPN7_E,OPN8_E,OPN9_E,OPN10_E
0,3504.0,777.0,792.0,4037.0,1550.0,3172.0,718.0,751.0,929.0,1576.0,...,879.0,1896.0,2151.0,3589.0,1040.0,1838.0,1832.0,1635.0,1449.0,4236.0
1,7369.0,9328.0,32352.0,2440.0,0.0,2793.0,2952.0,6757.0,5376.0,2081.0,...,10551.0,3032.0,2188.0,3320.0,4817.0,8166.0,3644.0,4408.0,2570.0,1282.0
2,5743.0,2483.0,3179.0,1928.0,2386.0,4312.0,37930.0,5480.0,3312.0,11932.0,...,1866.0,1665.0,3831.0,2367.0,1979.0,1782.0,4436.0,1963.0,2788.0,2051.0
3,2615.0,6207.0,4718.0,9277.0,4268.0,9874.0,5894.0,3154.0,2754.0,5625.0,...,6517.0,3408.0,4282.0,2696.0,5798.0,5680.0,3068.0,2467.0,2626.0,2012.0
4,11476.0,3147.0,3962.0,6104.0,3065.0,6817.0,3354.0,2286.0,5751.0,3625.0,...,3547.0,3545.0,2431.0,3839.0,9352.0,5067.0,2448.0,2119.0,1825.0,1643.0


In [5]:
# there are far more than 5 dimensions for the _E columns
# not sure what exactly is intended here so i'll just take the average for each trait section

df_ER = pd.DataFrame()

five_traits = ["EXT", "EST", "AGR", "CSN", "OPN"]
for trait in five_traits:
    trait_cols = df_E.columns[df_E.columns.str.startswith(trait)]
    trait_mean = df_E[trait_cols].mean(axis=1)
    df_ER[f"{trait}_E"] = trait_mean

In [6]:
# First 10 Rows of Cleaned DataFrame
# print(df_ER.head(10).to_markdown())
df_ER.head(10)

Unnamed: 0,EXT_E,EST_E,AGR_E,CSN_E,OPN_E
0,1780.6,1673.4,3700.7,5490.6,2054.5
1,7144.8,3341.5,3748.7,4251.6,4397.8
2,7868.5,7315.1,7616.6,4394.3,2472.8
3,5438.6,4069.4,5439.5,4422.1,3855.4
4,4958.7,3608.7,4431.5,3708.6,3581.6
5,3177.2,3435.7,3131.4,2923.9,2932.7
6,3216.2,2270.2,3035.3,1849.3,2232.5
7,7047.7,3103.9,4707.2,5832.0,3077.4
8,6417.8,7023.9,4461.4,7340.4,6568.5
9,3436.8,2537.2,2991.4,5591.3,2576.8


In [7]:
# Checking for Missingness
# I am assuming that 0 is not a valid value
print(df_ER.isna().sum())
print((df_ER == 0).sum())

EXT_E    18
EST_E    18
AGR_E    18
CSN_E    18
OPN_E    18
dtype: int64
EXT_E    58
EST_E    61
AGR_E    61
CSN_E    61
OPN_E    61
dtype: int64


In [8]:
# Are these missing values in the same rows?
print(df_ER[df_ER.isna().any(axis=1)].shape) # Yes
print(df_ER[(df_ER == 0).any(axis=1)].shape) # No

(18, 5)
(68, 5)


In [9]:
# I could try to identify the missingness mechanisms and then perform imputation accodingly...
# BUT, I'm lazy. Small proportion missing.
df_ERD = df_ER.replace(0, np.NaN)
df_ERD = df_ERD.dropna()

df_ERD.shape

(9914, 5)

In [10]:
df_ERD.describe()
# some crazy outliers
# someone took 7 hours to average to answer each question

Unnamed: 0,EXT_E,EST_E,AGR_E,CSN_E,OPN_E
count,9914.0,9914.0,9914.0,9914.0,9914.0
mean,15302.93,8166.031,9472.9,8403.173,5752.01
std,375903.4,135267.6,202281.9,107314.0,27974.15
min,72.1,132.3,95.2,40.2,68.0
25%,3503.025,2838.125,3152.1,3260.0,2848.75
50%,4599.25,3743.65,4131.8,4357.1,3780.55
75%,6458.35,5277.5,5735.525,6189.85,5250.0
max,26360440.0,8554173.0,17271330.0,7559926.0,2139661.0


In [11]:
# removing outliers
# heavily right skewed
abs_z_scores = ((df_ERD - df_ERD.mean()) / df_ERD.std()).abs()
df_ERDO = df_ERD[(abs_z_scores < 3).all(axis=1)]

df_ERDO.shape

(9855, 5)

In [12]:
df_ERDO.cov()
# EXT_E variance significantly larger than the other columns
# PCA will be dominated by EXT_E (maximizes variance explained)
# Normalization necessary

Unnamed: 0,EXT_E,EST_E,AGR_E,CSN_E,OPN_E
EXT_E,833389300.0,11391060.0,11084010.0,19822970.0,10290780.0
EST_E,11391060.0,120459000.0,11039460.0,18063260.0,9210457.0
AGR_E,11084010.0,11039460.0,156017000.0,12206450.0,8934009.0
CSN_E,19822970.0,18063260.0,12206450.0,106944300.0,13580450.0
OPN_E,10290780.0,9210457.0,8934009.0,13580450.0,21259720.0


In [13]:
# Normalization
df_ERDON = (df_ERDO - df_ERDO.mean()) / df_ERDO.std()
df_ERDON.cov()

Unnamed: 0,EXT_E,EST_E,AGR_E,CSN_E,OPN_E
EXT_E,1.0,0.035952,0.030739,0.0664,0.077312
EST_E,0.035952,1.0,0.080527,0.159147,0.182005
AGR_E,0.030739,0.080527,1.0,0.094498,0.155125
CSN_E,0.0664,0.159147,0.094498,1.0,0.284811
OPN_E,0.077312,0.182005,0.155125,0.284811,1.0


## PCA

In [14]:
pca = PCA(n_components=3) # 3 principal components
pca_data = pca.fit_transform(df_ERDON)
pca_df = pd.DataFrame(pca_data, columns=["PC1", "PC2", "PC3"])

print(pca.explained_variance_ratio_)
print(sum(pca.explained_variance_ratio_))

print(pca.components_)

[0.30433279 0.19607052 0.18667428]
0.6870775854094158
[[ 0.20718773  0.43924379  0.35286461  0.54424389  0.58602329]
 [ 0.95986694 -0.17942143 -0.2068723  -0.02957813 -0.05284308]
 [ 0.10934857 -0.3750974   0.88198042 -0.25915338 -0.04790511]]


In [15]:
fig = px.scatter_3d(pca_df, x='PC1', y='PC2', z='PC3')
fig.show()

# Hypothesis:  The 5-dimensional OCEAN data set can be represented in 3 dimensions visualized by 27 clusters of points in a 3x3x3 cube.
# Not quite so sure how to interpret this, but I don't think it's true (?)
# ¯\_(ツ)_/¯


In [17]:
fig.write_html("pca-normalized.html")