In [2]:
# Dimensional Analysis of OCEAN data set                                                          6/25/2023
# Hypothesis:  The 5-dimensional OCEAN data set can be represented in 3 dimensions visualized by 27 clusters of points in a 3x3x3 cube.
# Experiment:  Apply machine learning tools that use a dimensionality reduction technique like Principal Component Analysis (PCA) to the Kaggle OCEAN data set. Python provides several machine learning libraries that offer implementations of these dimensionality reduction techniques, such as scikit-learn, TensorFlow, and PyTorch. You can leverage these libraries to apply dimensionality reduction to a 5-dimensional dataset and visualize it in a 3-dimensional space.
# Data set:  https://1drv.ms/u/s!Aj7B9GbKP2y3icRAAZT8i8hNRDq5Vg?e=tMAh8W
# (Reference: https://www.kaggle.com/datasets/tunguz/big-five-personality-test )
# The five dimensions of the data set are described by the columns:
# EXT, EXT_E
# EST, EST_E
# AGR, AGR_E
# CSN, CSN_E
# OPN, OPN_E

# MENTIONED IN DATA DICTIONARY
# The time spent on each question is also recorded in milliseconds. These are the variables ending in _E. This was calculated by taking the time when the button for the question was clicked minus the time of the most recent other button click.

In [3]:
import pandas as pd
from sklearn.decomposition import PCA
# import matplotlib.pyplot as plt
# from mpl_toolkits.mplot3d import Axes3D
import plotly.express as px

## Preprocessing / Data Cleaning

In [4]:
# Load the tab-separated data subset, then print a summary of its columns and dtypes.
df = pd.read_csv("data/data-subset.csv", sep="\t")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Columns: 110 entries, EXT1 to long_appx_lots_of_err
dtypes: float64(104), int64(2), object(4)
memory usage: 8.4+ MB


In [5]:
# Keep only the per-question timing columns, i.e. those whose name ends in "_E".
E_mask = df.columns.str.endswith("_E")
df_E = df.loc[:, E_mask]
df_E.head()

Unnamed: 0,EXT1_E,EXT2_E,EXT3_E,EXT4_E,EXT5_E,EXT6_E,EXT7_E,EXT8_E,EXT9_E,EXT10_E,...,OPN1_E,OPN2_E,OPN3_E,OPN4_E,OPN5_E,OPN6_E,OPN7_E,OPN8_E,OPN9_E,OPN10_E
0,3504.0,777.0,792.0,4037.0,1550.0,3172.0,718.0,751.0,929.0,1576.0,...,879.0,1896.0,2151.0,3589.0,1040.0,1838.0,1832.0,1635.0,1449.0,4236.0
1,7369.0,9328.0,32352.0,2440.0,0.0,2793.0,2952.0,6757.0,5376.0,2081.0,...,10551.0,3032.0,2188.0,3320.0,4817.0,8166.0,3644.0,4408.0,2570.0,1282.0
2,5743.0,2483.0,3179.0,1928.0,2386.0,4312.0,37930.0,5480.0,3312.0,11932.0,...,1866.0,1665.0,3831.0,2367.0,1979.0,1782.0,4436.0,1963.0,2788.0,2051.0
3,2615.0,6207.0,4718.0,9277.0,4268.0,9874.0,5894.0,3154.0,2754.0,5625.0,...,6517.0,3408.0,4282.0,2696.0,5798.0,5680.0,3068.0,2467.0,2626.0,2012.0
4,11476.0,3147.0,3962.0,6104.0,3065.0,6817.0,3354.0,2286.0,5751.0,3625.0,...,3547.0,3545.0,2431.0,3839.0,9352.0,5067.0,2448.0,2119.0,1825.0,1643.0


In [6]:
# The "_E" columns hold one timing per question, which is far more than 5 dimensions.
# It is not clear what reduction was intended, so each respondent's timings are
# collapsed to a single mean per trait section (NaNs are skipped by .mean()).
five_traits = ["EXT", "EST", "AGR", "CSN", "OPN"]

df_ER = pd.DataFrame(
    {
        f"{prefix}_E": df_E.loc[:, df_E.columns.str.startswith(prefix)].mean(axis=1)
        for prefix in five_traits
    }
)

In [7]:
# Preview the five per-trait mean response times.
df_ER.head()

Unnamed: 0,EXT_E,EST_E,AGR_E,CSN_E,OPN_E
0,1780.6,1673.4,3700.7,5490.6,2054.5
1,7144.8,3341.5,3748.7,4251.6,4397.8
2,7868.5,7315.1,7616.6,4394.3,2472.8
3,5438.6,4069.4,5439.5,4422.1,3855.4
4,4958.7,3608.7,4431.5,3708.6,3581.6


In [8]:
# Checking for NaN values: count the missing entries in each per-trait column.
df_ER.isna().sum()

EXT_E    18
EST_E    18
AGR_E    18
CSN_E    18
OPN_E    18
dtype: int64

In [9]:
# Are these NaN values all in the same rows?
# Count the rows that contain at least one NaN: if that equals the per-column
# NaN count, every NaN sits in the same set of rows.
df_ER[df_ER.isna().any(axis=1)].shape
# Yep: 18 rows, matching the 18 NaNs reported for every column.

(18, 5)

In [10]:
# I could try to identify the missingness mechanisms and then perform imputation accordingly...
# BUT, I'm lazy. Very small proportion missing (18 of 10,000 rows), so those rows are simply dropped.
df_ERD = df_ER.dropna()

## EDA & More Cleaning

In [15]:
# Summary statistics and shape after dropping the rows with missing values.
print(df_ERD.describe())
print(df_ERD.shape)
# Some clear outliers
# e.g. the EXT_E maximum: 2.636044e+07 milliseconds (about 7 hours) spent on average per question

              EXT_E         EST_E         AGR_E         CSN_E         OPN_E
count  9.982000e+03  9.982000e+03  9.982000e+03  9.982000e+03  9.982000e+03
mean   1.520368e+04  8.112675e+03  9.409165e+03  8.346954e+03  5.715408e+03
std    3.746228e+05  1.348077e+05  2.015931e+05  1.069500e+05  2.788265e+04
min    0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00
25%    3.483200e+03  2.822675e+03  3.137325e+03  3.241400e+03  2.830125e+03
50%    4.581650e+03  3.730500e+03  4.117150e+03  4.338750e+03  3.761400e+03
75%    6.442850e+03  5.264900e+03  5.710500e+03  6.170050e+03  5.227450e+03
max    2.636044e+07  8.554173e+06  1.727133e+07  7.559926e+06  2.139661e+06
(9982, 5)


In [12]:
# Removing outliers: keep a row only if every trait mean lies strictly within
# 3 sample standard deviations of its column mean.
# The data is heavily right skewed, so this shouldn't remove much good data.
# NOTE(review): the column mean/std are themselves inflated by the extreme
# values, so this cut-off is lenient. Confirm it trims enough of the tail.
abs_z_scores = df_ERD.sub(df_ERD.mean()).div(df_ERD.std()).abs()
df_ERD2 = df_ERD.loc[abs_z_scores.lt(3).all(axis=1)]
df_ERD2.shape

(9923, 5)

## PCA

In [13]:
# Reducing from 5 -> 3 dimensions: project the five per-trait timing means onto
# their first three principal components.
#
# svd_solver="full" forces the exact SVD. With the default "auto", scikit-learn
# releases before 1.5 select the randomized solver for data of this shape
# (more than 500 rows and n_components < 0.8 * n_features); with no random_state
# set, that makes the projection differ slightly from run to run.
#
# NOTE(review): the inputs are not standardized. All five columns are in
# milliseconds, so the components reflect raw variance. Confirm this is intended.
pca = PCA(n_components=3, svd_solver="full")
pca_data = pca.fit_transform(df_ERD2)
pca_df = pd.DataFrame(pca_data, columns=["PC1", "PC2", "PC3"])

In [14]:
# Interactive 3-D scatter plot of the three principal components.
fig = px.scatter_3d(pca_df, x='PC1', y='PC2', z='PC3')
fig.show()

# Hypothesis:  The 5-dimensional OCEAN data set can be represented in 3 dimensions visualized by 27 clusters of points in a 3x3x3 cube.
# ¯\_(ツ)_/¯

In [None]:
# Export the scatter figure to pca.html.
fig.write_html("pca.html")