In [2]:
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import statsmodels.api as sm
from sklearn.manifold import TSNE


In [10]:
# load data from processed_data.parquet
df = pd.read_parquet("processed_data.parquet")

## Feature Selection 

<br /> **Principal Component Analysis (PCA)**: Dimensionality reduction technique used to reduce the dimensionality of a dataset while preserving as much variance as possible. It does this by transforming the original variables into a new set of variables called principal components, which are linear combinations of the original variables. Requires numerical encoding!

<br /> **K-mean clustering**: Unsupervised machine learning algorithm used for clustering data into groups or clusters based on similarities in their feature space. It aims to partition the data into k clusters, where each data point belongs to the cluster with the nearest mean (centroid). Requires numerical encoding!

<br /> **t-Distributed Stochastic Neighbor Embedding (t-SNE)**: Visualizing high-dimensional data in lower-dimensional spaces, often in two or three dimensions. Preserves the local structure of the data points, meaning that similar data points in the high-dimensional space should remain close to each other in the low-dimensional embedding. Requires numerical encoding!

<br /> **Latent Class Analysis (LCA)**: Identifies unobserved or latent subgroups within a population based on patterns of observed categorical variables. It belongs to the family of finite mixture models, where each latent class represents a distinct subgroup with its own characteristic response probabilities for the observed variables. Requires numerical encoding!

#### PCA

In [None]:
# Create sample data
X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])

# Initialize PCA with desired number of components
pca = PCA(n_components=2)

# Fit PCA to the data
pca.fit(X)

# Transform the data into the new feature space
X_transformed = pca.transform(X)

# Explained variance ratio
explained_variance_ratio = pca.explained_variance_ratio_

# Print the transformed data and explained variance ratio
print("Transformed data:")
print(X_transformed)
print("\nExplained variance ratio:")
print(explained_variance_ratio)

#### K-mean clustering 

In [None]:
# Generate some random data for demonstration
np.random.seed(0)
X = np.random.randn(100, 2)

# Initialize KMeans with desired number of clusters
kmeans = KMeans(n_clusters=3)

# Fit KMeans to the data
kmeans.fit(X)

# Get cluster centers and labels
cluster_centers = kmeans.cluster_centers_
labels = kmeans.labels_

# Visualize the data and clusters
plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', marker='o', edgecolor='k')
plt.scatter(cluster_centers[:, 0], cluster_centers[:, 1], c='red', marker='x', s=100, label='Cluster Centers')
plt.title('K-means Clustering')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend()
plt.grid(True)
plt.show()

#### t-Distributed Stochastic Neighbor Embedding (t-SNE)

In [1]:
# Load the Iris dataset (for demonstration purposes)
iris = load_iris()
X = iris.data
y = iris.target

# Initialize t-SNE with desired parameters
tsne = TSNE(n_components=2, perplexity=30, random_state=42)

# Fit t-SNE to the data and transform it
X_embedded = tsne.fit_transform(X)

# Visualize the t-SNE embedding
plt.figure(figsize=(8, 6))
plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y, cmap='viridis')
plt.colorbar(label='Target', ticks=range(3), format=plt.FuncFormatter(lambda x, _: iris.target_names[int(x)]))
plt.title('t-SNE Embedding of Iris Dataset')
plt.xlabel('t-SNE Dimension 1')
plt.ylabel('t-SNE Dimension 2')
plt.grid(True)
plt.show()

NameError: name 'load_iris' is not defined

#### Latent class analysis (LCA)

In [None]:
np.random.seed(0)
n_individuals = 1000
n_questions = 5
data = np.random.randint(2, size=(n_individuals, n_questions))
df = pd.DataFrame(data, columns=[f"Question_{i}" for i in range(n_questions)])

# Perform Latent Class Analysis (LCA)
lca_model = sm.LatentClass(df.values, nclasses=2)  # Specify the number of latent classes
lca_results = lca_model.fit()

# Print summary of the LCA results
print(lca_results.summary())