In [1]:
import os
import pandas as pd
import networkx as nx
import numpy as np
import pickle
import json

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# datasets is a list of available datasets descriptions containing: path, key columns names, and suitable complex network features
from src.data.dataset_info import datasets, cn_measures_type_1, cn_measures_type_2, cn_measures_type_3, cn_measures_type_4, network_features_type_1,network_features_type_2,network_features_type_3,network_features_type_4
from src.graph_level_measures import compute_graph_properties
from src.add_centralities import add_centralities
from src.add_pca_columns import process_clients_with_grouped_pca, evaluate_pca_results, process_clients_with_pca


with_sort_timestamp = True
undersample_classes = True
folder_path = "temp/"
folder_path_prep = "temp/preprocessed/"
output_folder = 'datasets/gdlc'


if not os.path.isdir(folder_path):
    os.mkdir(folder_path)
    os.mkdir(folder_path_prep)
    os.mkdir(output_folder)

# Preparing Datasets

### Reading and Cleaning

In [2]:
dataset1 = datasets[0]
print(f"==>> dataset1.name: {dataset1.name}")
df1 = pd.read_parquet("./datasets/original/cic_ton_iot.parquet")
# df1 = pd.read_parquet("./testing_dfs/cic_ton_iot_5_percent.parquet")
# converting all infinity values into nan then dropping all records containing nan values
df1.replace([np.inf, -np.inf], np.nan, inplace=True)
df1.dropna(axis=0, how='any', inplace=True)

df1.drop_duplicates(subset=list(set(df1.columns) - set([dataset1.timestamp_col, dataset1.flow_id_col])), keep="first", inplace=True)

if dataset1.weak_columns:
    df1 = df1[~df1[dataset1.class_col].isin(dataset1.weak_columns)]

==>> dataset1.name: cic_ton_iot


In [3]:
dataset2 = datasets[1]
print(f"==>> dataset2.name: {dataset2.name}")
df2 = pd.read_parquet("./datasets/original/cic_ids_2017.parquet")
# df2 = pd.read_parquet("./testing_dfs/cic_ids_2017_5_percent.parquet")
# converting all infinity values into nan then dropping all records containing nan values
df2.replace([np.inf, -np.inf], np.nan, inplace=True)
df2.dropna(axis=0, how='any', inplace=True)

df2.drop_duplicates(subset=list(set(df2.columns) - set([dataset2.timestamp_col, dataset2.flow_id_col])), keep="first", inplace=True)

if dataset2.weak_columns:
    df2 = df2[~df2[dataset2.class_col].isin(dataset2.weak_columns)]

==>> dataset2.name: cic_ids_2017


### Attacks Types

In [4]:
classes1 = df1[dataset1.class_col].unique()
print(classes1)

['Benign' 'mitm' 'scanning' 'dos' 'ddos' 'injection' 'password' 'backdoor'
 'ransomware' 'xss']


In [5]:
classes2 = df2[dataset2.class_col].unique()
print(f"==>> classes2: {classes2}")

==>> classes2: ['BENIGN' 'DDoS' 'PortScan' 'Bot' 'Infiltration'
 'Web Attack � Brute Force' 'Web Attack � XSS'
 'Web Attack � Sql Injection' 'FTP-Patator' 'SSH-Patator' 'DoS slowloris'
 'DoS Slowhttptest' 'DoS Hulk' 'DoS GoldenEye' 'Heartbleed']


renaming some attacks to fit the naming in df1

In [6]:
df2[dataset2.class_col] = df2[dataset2.class_col].replace({"BENIGN": "Benign",
                                                            "DDoS": "ddos",
                                                            "Web Attack � Brute Force": "bruteforce",
                                                            "Web Attack � XSS": "xss"})

In [7]:
classes2 = df2[dataset2.class_col].unique()
print(f"==>> classes2: {classes2}")

==>> classes2: ['Benign' 'ddos' 'PortScan' 'Bot' 'Infiltration' 'bruteforce' 'xss'
 'Web Attack � Sql Injection' 'FTP-Patator' 'SSH-Patator' 'DoS slowloris'
 'DoS Slowhttptest' 'DoS Hulk' 'DoS GoldenEye' 'Heartbleed']


In [8]:
classes = set(np.concatenate([classes2,classes1]))
print(f"==>> classes: {classes}")

==>> classes: {'Bot', 'backdoor', 'PortScan', 'Infiltration', 'dos', 'Heartbleed', 'DoS GoldenEye', 'Benign', 'DoS slowloris', 'ddos', 'Web Attack � Sql Injection', 'bruteforce', 'injection', 'scanning', 'xss', 'SSH-Patator', 'ransomware', 'DoS Hulk', 'DoS Slowhttptest', 'FTP-Patator', 'password', 'mitm'}


### Sorting (optional)

In [9]:
if with_sort_timestamp:
    df1[dataset1.timestamp_col] = pd.to_datetime(df1[dataset1.timestamp_col].str.strip(), format=dataset1.timestamp_format)
    df1.sort_values(dataset1.timestamp_col, inplace= True)

if with_sort_timestamp:
    df2[dataset2.timestamp_col] = pd.to_datetime(df2[dataset2.timestamp_col].str.strip(), format=dataset2.timestamp_format)
    df2.sort_values(dataset2.timestamp_col, inplace= True)

### Encoding Attacks into integers

In [10]:
label_encoder = LabelEncoder()
label_encoder.fit(list(classes))

df1[dataset1.class_num_col] = label_encoder.transform(df1[dataset1.class_col])
df2[dataset2.class_num_col] = label_encoder.transform(df2[dataset2.class_col])
labels_names = dict(zip(label_encoder.transform(label_encoder.classes_), label_encoder.classes_))

print(f"==>> labels_names: {labels_names}")

==>> labels_names: {0: 'Benign', 1: 'Bot', 2: 'DoS GoldenEye', 3: 'DoS Hulk', 4: 'DoS Slowhttptest', 5: 'DoS slowloris', 6: 'FTP-Patator', 7: 'Heartbleed', 8: 'Infiltration', 9: 'PortScan', 10: 'SSH-Patator', 11: 'Web Attack � Sql Injection', 12: 'backdoor', 13: 'bruteforce', 14: 'ddos', 15: 'dos', 16: 'injection', 17: 'mitm', 18: 'password', 19: 'ransomware', 20: 'scanning', 21: 'xss'}


### Undersampling classes (optional)

In [11]:
if undersample_classes:
    # Group by the class column and get the count of records in each class
    class_counts = df1.groupby(dataset1.class_col).size()

    # Sort the counts in descending order
    class_counts_sorted = class_counts.sort_values(ascending=False)
    print(f"==>> class_counts_sorted: {class_counts_sorted}")

==>> class_counts_sorted: Attack
Benign        2514059
xss           2149308
password       340208
injection      277696
scanning        36205
backdoor        27145
ransomware       5098
mitm              517
ddos              202
dos               145
dtype: int64


In [12]:
if undersample_classes:
    # Get the classes with the highest number of records (you can choose how many to undersample)
    classes_to_undersample = class_counts_sorted.index[:2]

    # Undersample the classes with the highest number of records
    dfs = []
    for class_label in class_counts_sorted.index:
        print(f"==>> class_label: {class_label}")
        if class_label in classes_to_undersample:
            class_df = df1[df1[dataset1.class_col] == class_label]
            undersampled_df = class_df.sample(frac=0.5)  # Specify the fraction of samples to keep
            dfs.append(undersampled_df)
        else:
            dfs.append(df1[df1[dataset1.class_col] == class_label])

    df1 = []
    # Optional: shuffle the undersampled DataFrame
    df1 = pd.concat(dfs).sample(frac=1).reset_index(drop=True)


==>> class_label: Benign
==>> class_label: xss
==>> class_label: password
==>> class_label: injection
==>> class_label: scanning
==>> class_label: backdoor
==>> class_label: ransomware
==>> class_label: mitm
==>> class_label: ddos
==>> class_label: dos


In [13]:
if undersample_classes:
    # Group by the class column and get the count of records in each class
    class_counts = df1.groupby(dataset1.class_col).size()

    # Sort the counts in descending order
    class_counts_sorted = class_counts.sort_values(ascending=False)
    print(f"==>> class_counts_sorted: {class_counts_sorted}")

==>> class_counts_sorted: Attack
Benign        1257030
xss           1074654
password       340208
injection      277696
scanning        36205
backdoor        27145
ransomware       5098
mitm              517
ddos              202
dos               145
dtype: int64


In [14]:
if undersample_classes:
    # Group by the class column and get the count of records in each class
    class_counts = df2.groupby(dataset2.class_col).size()

    # Sort the counts in descending order
    class_counts_sorted = class_counts.sort_values(ascending=False)
    print(f"==>> class_counts_sorted: {class_counts_sorted}")

==>> class_counts_sorted: Attack
Benign                        2265910
DoS Hulk                       222563
PortScan                       158804
ddos                           128025
DoS GoldenEye                   10293
FTP-Patator                      7935
SSH-Patator                      5897
DoS slowloris                    5769
DoS Slowhttptest                 5499
Bot                              1956
bruteforce                       1507
xss                               652
Infiltration                       36
Web Attack � Sql Injection         21
Heartbleed                         11
dtype: int64


In [15]:
if undersample_classes:
    # Get the classes with the highest number of records (you can choose how many to undersample)
    classes_to_undersample = class_counts_sorted.index[:1]

    # Undersample the classes with the highest number of records
    dfs = []
    for class_label in class_counts_sorted.index:
        if class_label in classes_to_undersample:
            class_df = df2[df2[dataset2.class_col] == class_label]
            undersampled_df = class_df.sample(frac=0.5)  # Specify the fraction of samples to keep
            dfs.append(undersampled_df)
        else:
            dfs.append(df2[df2[dataset2.class_col] == class_label])

    df2 = []
    # Optional: shuffle the undersampled DataFrame
    df2 = pd.concat(dfs).sample(frac=1).reset_index(drop=True)


In [16]:
if undersample_classes:
    # Group by the class column and get the count of records in each class
    class_counts = df2.groupby(dataset2.class_col).size()

    # Sort the counts in descending order
    class_counts_sorted = class_counts.sort_values(ascending=False)
    print(f"==>> class_counts_sorted: {class_counts_sorted}")

==>> class_counts_sorted: Attack
Benign                        1132955
DoS Hulk                       222563
PortScan                       158804
ddos                           128025
DoS GoldenEye                   10293
FTP-Patator                      7935
SSH-Patator                      5897
DoS slowloris                    5769
DoS Slowhttptest                 5499
Bot                              1956
bruteforce                       1507
xss                               652
Infiltration                       36
Web Attack � Sql Injection         21
Heartbleed                         11
dtype: int64


### saving labels encodings and datasets properties

In [17]:
with open(folder_path + '/labels_names.pkl', 'wb') as f:
    pickle.dump([labels_names, classes], f)

In [18]:
total_count = len(df1)

properties = {
    "name": dataset1.name,
    "length": total_count,
}

num_benign = len(df1[df1['Label'] == 0])
num_attack = len(df1[df1['Label'] == 1])

properties["num_benign"] = num_benign
properties["percentage_of_benign_records"] = ((num_benign * 100)/total_count)

properties["num_attack"] = num_attack
properties["percentage_of_attack_records"] = ((num_attack * 100)/total_count)

properties["attacks"] = list(df1["Attack"].unique()) 


G = nx.from_pandas_edgelist(
    df1,
    source=dataset1.src_ip_col,
    target=dataset1.dst_ip_col,
    create_using=nx.DiGraph()
)

properties["number_of_nodes"] = G.number_of_nodes() 
properties["number_of_edges"] = G.number_of_edges()

with open(folder_path + '/df1_properties.txt', 'w') as f:
    json.dump(properties, f)
    
properties


{'name': 'cic_ton_iot',
 'length': 3018900,
 'num_benign': 1257030,
 'percentage_of_benign_records': 41.6386763390639,
 'num_attack': 1761870,
 'percentage_of_attack_records': 58.3613236609361,
 'attacks': ['xss',
  'Benign',
  'password',
  'injection',
  'scanning',
  'ransomware',
  'backdoor',
  'mitm',
  'dos',
  'ddos'],
 'number_of_nodes': 109340,
 'number_of_edges': 209275}

In [19]:
total_count = len(df2)

properties = {
    "name": dataset1.name,
    "length": total_count,
}

num_benign = len(df2[df2['Label'] == 0])
num_attack = len(df2[df2['Label'] == 1])

properties["num_benign"] = num_benign

properties["percentage_of_benign_records"] = ((num_benign * 100)/total_count)

properties["num_attack"] = num_attack
properties["percentage_of_attack_records"] = ((num_attack * 100)/total_count)

properties["attacks"] = list(df2["Attack"].unique())  # .to_list()


G = nx.from_pandas_edgelist(
    df2,
    source=dataset1.src_ip_col,
    target=dataset1.dst_ip_col,
    create_using=nx.DiGraph()
)

properties["number_of_nodes"] = G.number_of_nodes() 
properties["number_of_edges"] = G.number_of_edges()

with open(folder_path + '/df2_properties.txt', 'w') as f:
    json.dump(properties, f)

properties


{'name': 'cic_ton_iot',
 'length': 1681923,
 'num_benign': 1132955,
 'percentage_of_benign_records': 67.36069368217213,
 'num_attack': 548968,
 'percentage_of_attack_records': 32.63930631782787,
 'attacks': ['Benign',
  'DoS Hulk',
  'PortScan',
  'SSH-Patator',
  'ddos',
  'DoS Slowhttptest',
  'bruteforce',
  'FTP-Patator',
  'DoS GoldenEye',
  'DoS slowloris',
  'Bot',
  'xss',
  'Infiltration',
  'Web Attack � Sql Injection',
  'Heartbleed'],
 'number_of_nodes': 18364,
 'number_of_edges': 93050}

# Splitting into Clients

### creating main training and testing splits

test parts will be concatenated to create the main testing df

train parts will be further splitted into clients

In [20]:
train1, test1 = train_test_split(df1, test_size=0.1, shuffle= True, random_state=1, stratify=df1[dataset1.class_col])
train2, test2 = train_test_split(df2, test_size=0.1, shuffle= True, random_state=1, stratify=df2[dataset2.class_col])


if with_sort_timestamp:
    # train1[dataset1.timestamp_col] = pd.to_datetime(train1[dataset1.timestamp_col].str.strip(), format=dataset1.timestamp_format)
    train1.sort_values(dataset1.timestamp_col, inplace= True)
    train2.sort_values(dataset2.timestamp_col, inplace= True)


### Computing graph-level measures (to apply GDLC)

In [21]:
# split dfs into clients
client_data = np.array_split(train1, 5) + np.array_split(train2, 3)

graphs_properties_path = os.path.join(output_folder, 'graphs_properties')

for cid, data_partition in enumerate(client_data):
    data_partition.to_parquet(
        folder_path + "client_{}.parquet".format(cid))

    G = nx.from_pandas_edgelist(
        data_partition, source=dataset1.src_ip_col, target=dataset1.dst_ip_col, create_using=nx.Graph())
    properties = compute_graph_properties(G, f"client_{cid}", graphs_properties_path)
    print(f"Computed properties for client_{cid}: {properties}")


    
test = pd.concat([test1, test2])
test.to_parquet(folder_path + "test.parquet")

G_test = nx.from_pandas_edgelist(
    test, source=dataset1.src_ip_col, target=dataset1.dst_ip_col, create_using=nx.Graph())
test_properties = compute_graph_properties(G_test, "test", graphs_properties_path)

print(f"Computed properties for test dataset: {test_properties}")

  return bound(*args, **kwds)


Computed properties for client_0: {'name': 'client_0', 'number_of_nodes': 51328, 'number_of_edges': 85513, 'max_degree': 25835, 'avg_degree': 3.3320215087281797, 'transitivity': 9.279595167204398e-05, 'density': 6.49175192146079e-05, 'mixing_parameter': 0.40161145089050787}
Computed properties for client_1: {'name': 'client_1', 'number_of_nodes': 200, 'number_of_edges': 256, 'max_degree': 112, 'avg_degree': 2.56, 'transitivity': 0.016811003565970453, 'density': 0.012864321608040201, 'mixing_parameter': 0.1796875}
Computed properties for client_2: {'name': 'client_2', 'number_of_nodes': 118, 'number_of_edges': 142, 'max_degree': 59, 'avg_degree': 2.406779661016949, 'transitivity': 0.025817555938037865, 'density': 0.020570766333478197, 'mixing_parameter': 0.19014084507042253}
Computed properties for client_3: {'name': 'client_3', 'number_of_nodes': 143, 'number_of_edges': 170, 'max_degree': 71, 'avg_degree': 2.3776223776223775, 'transitivity': 0.01805157593123209, 'density': 0.0167438195

### Adding Centralities

Specifying the centralities to add, according to which branch (type) from the four branches of GDLC

In [22]:
cn_measures_types = [
    cn_measures_type_1,
    cn_measures_type_3,
    cn_measures_type_3,
    cn_measures_type_3,
    cn_measures_type_1,
    cn_measures_type_2,
    cn_measures_type_1,
    cn_measures_type_1,
    cn_measures_type_2,
]

network_features_types = [
    network_features_type_1,
    network_features_type_3,
    network_features_type_3,
    network_features_type_3,
    network_features_type_1,
    network_features_type_2,
    network_features_type_1,
    network_features_type_1,
    network_features_type_2,
]

# homogeneous clients, using same centralities 
# cn_measures_type_0 = ["betweenness", "degree", "pagerank"]
# network_features_type_0 = ['src_betweenness', 'dst_betweenness', 'src_degree', 'dst_degree', 'src_pagerank', 'dst_pagerank']


# cn_measures_types = [
#     cn_measures_type_0,
#     cn_measures_type_0,
#     cn_measures_type_0,
#     cn_measures_type_0,
#     cn_measures_type_0,
#     cn_measures_type_0,
#     cn_measures_type_0,
#     cn_measures_type_0,
#     cn_measures_type_0,
# ]

# network_features_types = [
#     network_features_type_0,
#     network_features_type_0,
#     network_features_type_0,
#     network_features_type_0,
#     network_features_type_0,
#     network_features_type_0,
#     network_features_type_0,
#     network_features_type_0,
#     network_features_type_0,
# ]

In [23]:
client_filenames = [
    "client_0.parquet",
    "client_1.parquet",
    "client_2.parquet",
    "client_3.parquet",
    "client_4.parquet",
    "client_5.parquet",
    "client_6.parquet",
    "client_7.parquet",
    "test.parquet"
]
clients_paths = [os.path.join(folder_path, name) for name in client_filenames]

In [24]:
centralities_columns = []
def process_dataset(name, path, dataset, cn_measures, network_features):
    print("Processing dataset: {}".format(name))
    new_path = os.path.join(folder_path_prep, "{}.parquet".format(name))
    graph_path = os.path.join(folder_path_prep,"graphs","graph_{}.gexf".format(name))
    os.makedirs(os.path.dirname(new_path), exist_ok=True)
    os.makedirs(os.path.dirname(graph_path), exist_ok=True)
    
    df = pd.read_parquet(path)
    # df.replace([np.inf, -np.inf], np.nan, inplace=True)
    # df.dropna(axis=0, how='any', inplace=True)
    # df.drop_duplicates(subset=list(set(df.columns) - set([dataset.timestamp_col, dataset.flow_id_col])), keep="first", inplace=True)
    
    columns = add_centralities(df, new_path, graph_path, dataset, cn_measures, network_features)
    centralities_columns.append(columns)

In [25]:
process_dataset("client_0", clients_paths[0], datasets[0], cn_measures_types[0], network_features_types[0])
process_dataset("client_1", clients_paths[1], datasets[0], cn_measures_types[1], network_features_types[1])
process_dataset("client_2", clients_paths[2], datasets[0], cn_measures_types[2], network_features_types[2])
process_dataset("client_3", clients_paths[3], datasets[0], cn_measures_types[3], network_features_types[3])
process_dataset("client_4", clients_paths[4], datasets[0], cn_measures_types[4], network_features_types[4])
process_dataset("client_5", clients_paths[5], datasets[1], cn_measures_types[5], network_features_types[5])
process_dataset("client_6", clients_paths[6], datasets[1], cn_measures_types[6], network_features_types[6])
process_dataset("client_7", clients_paths[7], datasets[1], cn_measures_types[7], network_features_types[7])
process_dataset("test", clients_paths[8], datasets[1], cn_measures_types[8], network_features_types[8])

Processing dataset: client_0
calculated betweenness
calculated local_betweenness
calculated degree
calculated local_degree
calculated eigenvector
calculated closeness
calculated pagerank
calculated local_pagerank
calculated k_core
calculated k_truss
calculated Comm
==>> features_dicts: ('betweenness', 51328)
==>> features_dicts: ('local_betweenness', 51328)
==>> features_dicts: ('degree', 51328)
==>> features_dicts: ('local_degree', 51328)
==>> features_dicts: ('eigenvector', 51328)
==>> features_dicts: ('closeness', 51328)
==>> features_dicts: ('pagerank', 51328)
==>> features_dicts: ('local_pagerank', 51328)
==>> features_dicts: ('k_core', 51328)
==>> features_dicts: ('k_truss', 51328)
==>> features_dicts: ('Comm', 51328)
DataFrame written to temp/preprocessed/client_0.parquet
Processing dataset: client_1
calculated betweenness
calculated local_betweenness
calculated pagerank
calculated local_pagerank
calculated k_core
calculated k_truss
calculated Comm
==>> features_dicts: ('between

### Adding PCA columns

In [26]:
from collections import defaultdict
import shutil

clients_paths = [
    folder_path_prep + "client_0.parquet",
    folder_path_prep + "client_1.parquet",
    folder_path_prep + "client_2.parquet",
    folder_path_prep + "client_3.parquet",
    folder_path_prep + "client_4.parquet",
    folder_path_prep + "client_5.parquet",
    folder_path_prep + "client_6.parquet",
    folder_path_prep + "client_7.parquet",
    folder_path_prep + "test.parquet",
]

client_features = {}

for client_path in clients_paths:
    df = pd.read_parquet(client_path)
    features = set(df.columns)
    client_features[client_path] = features

common_features = set.intersection(*client_features.values())
print(f"Common Features Across All Clients: {common_features}\n")

unique_features = {client_path: features - common_features for client_path, features in client_features.items()}

feature_groups = defaultdict(list)

for client_path, features in unique_features.items():
    feature_groups[frozenset(features)].append(client_path)

for i, (unique_feature_set, clients) in enumerate(feature_groups.items(), 1):
    print(f"Unique Feature Set Group {i}:")
    print(f"Unique Features: {set(unique_feature_set)}")
    print(f"Clients: {clients}")
    print("----------")


pca_results, pca_columns = process_clients_with_grouped_pca(feature_groups, output_folder, n_components=2)


Common Features Across All Clients: {'Pkt Len Std', 'Fwd IAT Mean', 'Fwd IAT Min', 'Bwd Pkt Len Mean', 'Pkt Len Mean', 'src_betweenness', 'Subflow Fwd Byts', 'Subflow Bwd Pkts', 'Flow Byts/s', 'Label', 'Bwd Seg Size Avg', 'Down/Up Ratio', 'SYN Flag Cnt', 'Fwd URG Flags', 'CWE Flag Count', 'Tot Bwd Pkts', 'Fwd Byts/b Avg', 'Flow ID', 'Active Std', 'Fwd Seg Size Min', 'URG Flag Cnt', 'Bwd Pkt Len Std', 'Bwd IAT Std', 'Dst IP', 'Flow Pkts/s', 'Src IP', 'Flow IAT Max', 'Idle Mean', 'Bwd Blk Rate Avg', 'RST Flag Cnt', 'FIN Flag Cnt', 'Active Max', 'Fwd Seg Size Avg', 'Bwd IAT Max', 'Idle Max', 'Bwd IAT Tot', 'src_k_core', 'Flow IAT Min', 'Init Bwd Win Byts', 'Bwd IAT Mean', 'Bwd Header Len', 'dst_k_core', 'Pkt Len Max', 'Pkt Len Min', 'Class', 'Fwd Pkt Len Std', 'Bwd URG Flags', 'Flow IAT Mean', 'src_pagerank', 'TotLen Bwd Pkts', 'Fwd Pkt Len Min', 'Dst Port', 'dst_pagerank', 'Fwd Pkts/s', 'src_k_truss', 'Subflow Bwd Byts', 'Fwd Pkt Len Mean', 'Attack', 'Fwd IAT Tot', 'Fwd Pkts/b Avg', 'Idl

In [27]:
shutil.copy(os.path.join(folder_path, 'labels_names.pkl'), os.path.join(output_folder, '/labels_names.pkl'))
with open(output_folder + '/added_columns.pkl', 'wb') as f:
    pickle.dump([centralities_columns, pca_columns], f)

In [28]:
# clients_paths = [
#     output_folder + "/client_0.parquet",
#     output_folder + "/client_1.parquet",
#     output_folder + "/client_2.parquet",
#     output_folder + "/client_3.parquet",
#     output_folder + "/client_4.parquet",
#     output_folder + "/client_5.parquet",
#     output_folder + "/client_6.parquet",
#     output_folder + "/client_7.parquet",
#     output_folder + "/test.parquet"

# ]

# evaluate_pca_results(clients_paths)