In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the data
df = pd.read_csv('../data/data.csv')

In [9]:
features_float = df.loc[:, 'f_22':'f_28'].values

# Power transform integer variables to make them look more 'normal'
power = 0.47  # Best value
features_int_transformed = ((1 + df.loc[:, 'f_07':'f_13'])**power - 1) / power

# Initialize a 3D array with varying dimensions
num_clusters = 7 * 6
int_covariance = np.zeros((num_clusters, 7, 7), dtype=np.float32)

# Set diagonal elements to 1 for each 2D subarray
for i in range(num_clusters):
    np.fill_diagonal(int_covariance[i], 1.0)

# Similarly, for float_covariance
float_covariance = np.zeros((num_clusters, 5, 5), dtype=np.float32)
for i in range(num_clusters):
    np.fill_diagonal(float_covariance[i], 1.0)

float_means_hardcoded = np.array([
    [1, -1, -1, -1, -1],
    [1, -1, 1, 1, -1],
    [-1, 1, 1, -1, 1],
    [-1, -1, -1, -1, -1],
    [1, -1, -1, -1, -1],
    [-1, 1, -1, -1, -1],
    [-1, -1, 1, 1, -1],
    [-1, -1, -1, 1, -1],
    [1, -1, 1, 1, -1],
    [-1, -1, -1, -1, -1],
    [1, -1, 1, 1, -1],
    [-1, -1, 1, 1, -1],
    [1, 1, -1, -1, -1],
    [1, 1, 1, 1, 1],
    [-1, -1, -1, -1, -1]
], dtype=np.float32).reshape(-1, 5)

int_means = 3.33 * np.ones((num_clusters, 7), dtype=np.float32)  # (num_clusters, 7) mean of integer variables - initialized to mean of all integer variables

uncorrelated_var_indices = np.array([
    [0, 1],
    [3, 4],
    [0, 5],
    [1, 5],
    [2, 5],
    [4, 5],
    [4, 6]
], dtype=int)  # For 7 cluster groups, indices of 2 variables that are uncorrelated with the rest
predicted_probabilities = np.zeros((features_float.shape[0], num_clusters), dtype=np.float32)  # Predicted probability of each point for each cluster

# Custom EM loop
for iteration in range(20):  # 20 iterations are enough to converge
    # 1. Expectation
    for cluster_index in range(num_clusters):  # 42 clusters
        # Integer variables
        int_cov = int_covariance[cluster_index]
        int_mean = int_means[cluster_index]
        int_difference = features_int_transformed - int_mean.reshape(1, -1)
        p1 = np.exp(-0.5 * np.sum((int_difference @ np.linalg.inv(int_cov)) * int_difference, axis=1)) / np.sqrt(np.linalg.det(int_cov))
        
        # Float variables part 1 - 5 correlated variables
        float_cov = float_covariance[cluster_index]
        float_mean = float_means_hardcoded[cluster_index % len(float_means_hardcoded)]
        non_correlated_var_indices = uncorrelated_var_indices[cluster_index // 6]  # Indices of 2 variables that are uncorrelated with the rest
        correlated_var_indices = list(set(range(7)) - set(non_correlated_var_indices))  # Indices of 5 variables that are correlated with each other
        features_float_part1 = features_float[:, correlated_var_indices]
        float_difference_part1 = features_float_part1 - float_mean.reshape(1, -1)
        p2 = np.exp(-0.5 * np.sum((float_difference_part1 @ np.linalg.inv(float_cov)) * float_difference_part1, axis=1)) / np.sqrt(np.linalg.det(float_cov))
        
        # Float variables part 2 - 2 uncorrelated variables with 0 mean and 1 std
        features_float_part2 = features_float[:, non_correlated_var_indices]
        p3 = np.exp(-0.5 * np.sum(features_float_part2 * features_float_part2, axis=1))
        
        predicted_probabilities[:, cluster_index] = p1 * p2 * p3  # Final probability is the product of 3 pieces
    
    cluster_assignments = predicted_probabilities.argmax(axis=1)
    log_prob = np.log(predicted_probabilities.max(axis=1)).mean()  # Log prob - to track convergence
    print(iteration, np.round(log_prob, 4))

    # 2. Maximization
    for group_index in range(7):  # Integer variables - 7 cluster groups
        int_data_block = features_int_transformed[(cluster_assignments // 6) == group_index]  # Points for this cluster block only
        int_means[group_index * 6:(group_index + 1) * 6] = int_data_block.mean(axis=0)  # Integer mean is the same for all subclusters
        int_data_block = int_data_block - int_means[group_index * 6]
        int_covariance[group_index * 6:(group_index + 1) * 6] = int_data_block.T @ int_data_block / int_data_block.shape[0]  # Integer covariance is the same for all subclusters
        int_covariance[group_index * 6:(group_index + 1) * 6][np.abs(int_covariance[group_index * 6:(group_index + 1) * 6]) < 0.01] = 0.0  # Set covariance to 0 if it is close to zero (< 0.01)
    
    for cluster_index in range(num_clusters):  # Float variables - 42 clusters
        non_correlated_var_indices = uncorrelated_var_indices[cluster_index // 6]  # Indices of 2 variables that are uncorrelated with the rest
        correlated_var_indices = list(set(range(7)) - set(non_correlated_var_indices))  # Indices of 5 variables that are correlated with each other
        float_data_block = features_float[cluster_assignments == cluster_index]  # Points for this cluster only
        float_data_block = float_data_block[:, correlated_var_indices] - float_means_hardcoded[cluster_index % len(float_means_hardcoded)]
        float_covariance[cluster_index] = float_data_block.T @ float_data_block / float_data_block.shape[0]

# Submission
submission_df = pd.read_csv('../data/sample_submission.csv')
submission_df['Predicted'] = cluster_assignments // 6
submission_df.to_csv('submission.csv', index=False)

0 -4.2453


  float_covariance[cluster_index] = float_data_block.T @ float_data_block / float_data_block.shape[0]
  r = _umath_linalg.det(a, signature=signature)


1 nan
2 nan
3 nan
4 nan
5 nan
6 nan
7 nan
8 nan
9 nan
10 nan
11 nan
12 nan
13 nan
14 nan
15 nan
16 nan
17 nan
18 nan
19 nan
