In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from itertools import combinations
from utils import calculate_purity,perform_kmeans

In [2]:
# DATA PREPROCESSING
# Load the finance table, drop the "Year" column and enumerate the candidate
# feature subsets that the subset-search loop iterates over.

# Tunable settings for this notebook, gathered in one place.
DATA_PATH = 'data/finance.csv'
SAMPLE_FRAC = None          # e.g. 0.03 to work on a random 3% of the rows; None keeps every row
SAMPLE_SEED = 0             # only used when SAMPLE_FRAC is set, so a sampled run is repeatable
SUBSET_SIZES = range(5, 6)  # feature-subset sizes to enumerate (currently only size 5)

data = pd.read_csv(DATA_PATH)
if SAMPLE_FRAC is not None:
    data = data.sample(frac=SAMPLE_FRAC, random_state=SAMPLE_SEED)
data = data.drop(columns=["Year"])

I = 'Company'  # column kept out of the candidate features and handed to perform_kmeans
column_names = [col for col in data.columns if col != I]
t = len(column_names)+1    # number of candidate feature columns plus one; not used in this cell
scaler = StandardScaler()  # created here but not used in this cell
all_subsets = [subset for subset_length in SUBSET_SIZES
               for subset in combinations(column_names, subset_length)]

# Initial "best so far" state for the subset search.
best_subset = None
best_model = None
best_score = -1

In [3]:
# Exhaustive search over all_subsets: cluster each candidate subset and keep
# the one with the highest score.
#
# Reset the "best so far" state at the top of this cell so that re-running it
# on its own (for example after changing the scoring rule below, or after
# rebuilding all_subsets) cannot compare against a stale score and keep a
# winner left over from a previous execution.
best_subset = None
best_model = None
best_score = -1

for subset in all_subsets:

    # Candidate feature columns plus the I column that perform_kmeans is told about
    subset_data = data[list(subset)+[I]]

    current_model, current_silhouette, current_purity = perform_kmeans(subset_data, I)
    current_score = current_purity  # Could adjust scoring logic

    # Strictly-greater comparison: on ties the earlier subset stays the winner.
    # Single-column subsets are evaluated and reported but never selected.
    if current_score > best_score and len(subset) > 1:
        best_score = current_score
        best_subset = subset
        best_model = current_model

    print(f"Processed data for columns: {subset}, Silhouette Score: {current_silhouette}, Purity: {current_purity}")

Number of clusters:  12
Silhouette Score:  0.7677122362445827
Purity:  0.7577639751552795
Processed data for columns: ('Category', 'Market Cap(in B USD)', 'Revenue', 'Gross Profit', 'Net Income'), Silhouette Score: 0.7677122362445827, Purity: 0.7577639751552795
Number of clusters:  12
Silhouette Score:  0.6963336158974754
Purity:  0.7763975155279503
Processed data for columns: ('Category', 'Market Cap(in B USD)', 'Revenue', 'Gross Profit', 'Earning Per Share'), Silhouette Score: 0.6963336158974754, Purity: 0.7763975155279503
Number of clusters:  12
Silhouette Score:  0.7719917833703721
Purity:  0.7639751552795031
Processed data for columns: ('Category', 'Market Cap(in B USD)', 'Revenue', 'Gross Profit', 'EBITDA'), Silhouette Score: 0.7719917833703721, Purity: 0.7639751552795031
Number of clusters:  12
Silhouette Score:  0.7308514669257276
Purity:  0.7763975155279503
Processed data for columns: ('Category', 'Market Cap(in B USD)', 'Revenue', 'Gross Profit', 'Share Holder Equity'), Silho

In [4]:
# Report the winning feature subset and re-run the clustering on it.
if best_subset is None:
    # Without this guard the cell prints "Columns: None" and then fails on
    # list(None) with an unhelpful TypeError.
    raise RuntimeError(
        "No winning subset is available: run the subset-search cell first "
        "and make sure all_subsets is not empty."
    )

print("\nWinner Subset:")
print(f"Columns: {best_subset}, Best Purity Score: {best_score}")
best_subset = list(best_subset)  # tuple -> list so it can be concatenated with [I]
winner_data = data[[I] + best_subset]

# Third positional argument True: judging by the captured output it also
# prints a preview of the input frame -- confirm against utils.perform_kmeans.
# Left as the last expression so the returned tuple is shown as the cell output.
perform_kmeans(winner_data, I, True)


Winner Subset:
Columns: ('Category', 'Revenue', 'Share Holder Equity', 'Cash Flow from Investing', 'Current Ratio'), Best Purity Score: 0.8944099378881988
  Company Category   Revenue  Share Holder Equity  Cash Flow from Investing  \
0    AAPL       IT  394328.0              50672.0                  -22354.0   
1    AAPL       IT  365817.0              63090.0                  -14545.0   
2    AAPL       IT  274515.0              65339.0                   -4289.0   
3    AAPL       IT  260174.0              90488.0                   45896.0   
4    AAPL       IT  265595.0             107147.0                   16066.0   

   Current Ratio  
0         0.8794  
1         1.0746  
2         1.3636  
3         1.5401  
4         1.1329  
Number of clusters:  12
Silhouette Score:  0.5729423285227238
Purity:  0.8944099378881988


(KMeans(n_clusters=12, random_state=0), 0.5729423285227238, 0.8944099378881988)

In [5]:
# Manual comparison run: cluster on a hand-picked set of columns instead of
# the subset chosen by the search.
test_data = data.loc[:, [I, 'Category', 'Revenue', 'Cash Flow from Investing']]

# Third positional argument True: judging by the captured output it also
# prints a preview of the input frame -- confirm against utils.perform_kmeans.
# Left as the last expression so the returned tuple is shown as the cell output.
perform_kmeans(test_data, I, True)

  Company Category   Revenue  Cash Flow from Investing
0    AAPL       IT  394328.0                  -22354.0
1    AAPL       IT  365817.0                  -14545.0
2    AAPL       IT  274515.0                   -4289.0
3    AAPL       IT  260174.0                   45896.0
4    AAPL       IT  265595.0                   16066.0
Number of clusters:  12
Silhouette Score:  0.7334773099246679
Purity:  0.7888198757763976


(KMeans(n_clusters=12, random_state=0), 0.7334773099246679, 0.7888198757763976)