Importing Libraries

In [1]:
import os
import pandas as pd
import networkx as nx
import numpy as np
import pickle
from src.network.network_features import cal_betweenness_centrality
from sklearn.model_selection import train_test_split
# datasets is a list of available datasets descriptions containing: path, key columns names, and suitable complex network features
from src.data.dataset_info import datasets



with_sort_timestamp = False
undersample_classes = True
folder_path = "fl_from_2_datasets/"

if not os.path.isdir(folder_path):
    os.mkdir(folder_path)

## 1. Data loading and cleaning

In [2]:
dataset1 = datasets[0]
print(f"==>> dataset1.name: {dataset1.name}")
df1 = pd.read_parquet("./dataset/cic_ton_iot.parquet")
# converting all infinity values into nan then dropping all records containing nan values
df1.replace([np.inf, -np.inf], np.nan, inplace=True)
df1.dropna(axis=0, how='any', inplace=True)

df1.drop_duplicates(subset=list(set(df1.columns) - set([dataset1.timestamp_col, dataset1.flow_id_col])), keep="first", inplace=True)

if dataset1.low_classes:
    df1 = df1[~df1[dataset1.class_col].isin(dataset1.low_classes)]

==>> dataset1.name: cic_ton_iot


FileNotFoundError: [Errno 2] No such file or directory: './dataset/cic_ton_iot.parquet'

In [4]:
classes1 = df1[dataset1.class_col].unique()
print(classes1)

['Benign' 'mitm' 'scanning' 'dos' 'ddos' 'injection' 'password' 'backdoor'
 'ransomware' 'xss']


In [5]:
dataset2 = datasets[1]
print(f"==>> dataset2.name: {dataset2.name}")
df2 = pd.read_parquet("./dataset/cic_ids_2017.parquet")
# converting all infinity values into nan then dropping all records containing nan values
df2.replace([np.inf, -np.inf], np.nan, inplace=True)
df2.dropna(axis=0, how='any', inplace=True)

df2.drop_duplicates(subset=list(set(df2.columns) - set([dataset2.timestamp_col, dataset2.flow_id_col])), keep="first", inplace=True)

if dataset2.low_classes:
    df2 = df2[~df2[dataset2.class_col].isin(dataset2.low_classes)]

==>> dataset2.name: cic_ids_2017


In [6]:
classes2 = df2[dataset2.class_col].unique()
print(f"==>> classes2: {classes2}")

==>> classes2: ['BENIGN' 'DDoS' 'PortScan' 'Bot' 'Web Attack � Brute Force'
 'Web Attack � XSS' 'FTP-Patator' 'SSH-Patator' 'DoS slowloris'
 'DoS Slowhttptest' 'DoS Hulk' 'DoS GoldenEye']


In [7]:
df2[dataset2.class_col] = df2[dataset2.class_col].replace({"BENIGN": "Benign",
                                                            "DDoS": "ddos",
                                                            "Web Attack � Brute Force": "bruteforce",
                                                            "Web Attack � XSS": "xss"})

In [8]:
classes2 = df2[dataset2.class_col].unique()
print(f"==>> classes2: {classes2}")

==>> classes2: ['Benign' 'ddos' 'PortScan' 'Bot' 'bruteforce' 'xss' 'FTP-Patator'
 'SSH-Patator' 'DoS slowloris' 'DoS Slowhttptest' 'DoS Hulk'
 'DoS GoldenEye']


In [9]:
classes = set(np.concatenate([classes2,classes1]))
print(f"==>> classes: {classes}")

==>> classes: {'ransomware', 'DoS slowloris', 'backdoor', 'FTP-Patator', 'DoS Hulk', 'SSH-Patator', 'dos', 'injection', 'scanning', 'DoS GoldenEye', 'ddos', 'mitm', 'PortScan', 'bruteforce', 'DoS Slowhttptest', 'Bot', 'Benign', 'password', 'xss'}


In [10]:
from sklearn.preprocessing import LabelEncoder

# df1.replace([np.inf, -np.inf], np.nan, inplace=True)
# df2.dropna(axis=0, how='any', inplace=True)

# df1.replace([np.inf, -np.inf], np.nan, inplace=True)
# df2.dropna(axis=0, how='any', inplace=True)

if with_sort_timestamp:
    df1[dataset1.timestamp_col] = pd.to_datetime(df1[dataset1.timestamp_col].str.strip(), format=dataset1.timestamp_format)
    df1.sort_values(dataset1.timestamp_col, inplace= True)

if with_sort_timestamp:
    df2[dataset2.timestamp_col] = pd.to_datetime(df2[dataset2.timestamp_col].str.strip(), format=dataset2.timestamp_format)
    df2.sort_values(dataset2.timestamp_col, inplace= True)

label_encoder = LabelEncoder()
label_encoder.fit(list(classes))

df1[dataset1.class_num_col] = label_encoder.transform(df1[dataset1.class_col])
df2[dataset2.class_num_col] = label_encoder.transform(df2[dataset2.class_col])
labels_names = dict(zip(label_encoder.transform(label_encoder.classes_), label_encoder.classes_))

print(f"==>> labels_names: {labels_names}")

==>> labels_names: {0: 'Benign', 1: 'Bot', 2: 'DoS GoldenEye', 3: 'DoS Hulk', 4: 'DoS Slowhttptest', 5: 'DoS slowloris', 6: 'FTP-Patator', 7: 'PortScan', 8: 'SSH-Patator', 9: 'backdoor', 10: 'bruteforce', 11: 'ddos', 12: 'dos', 13: 'injection', 14: 'mitm', 15: 'password', 16: 'ransomware', 17: 'scanning', 18: 'xss'}


In [11]:
if undersample_classes:
    # Group by the class column and get the count of records in each class
    class_counts = df1.groupby(dataset1.class_col).size()

    # Sort the counts in descending order
    class_counts_sorted = class_counts.sort_values(ascending=False)
    print(f"==>> class_counts_sorted: {class_counts_sorted}")

==>> class_counts_sorted: Attack
Benign        2514059
xss           2149308
password       340208
injection      277696
scanning        36205
backdoor        27145
ransomware       5098
mitm              517
ddos              202
dos               145
dtype: int64


In [12]:
if undersample_classes:
    # Get the classes with the highest number of records (you can choose how many to undersample)
    classes_to_undersample = class_counts_sorted.index[:2]

    # Undersample the classes with the highest number of records
    dfs = []
    for class_label in class_counts_sorted.index:
        print(f"==>> class_label: {class_label}")
        if class_label in classes_to_undersample:
            class_df = df1[df1[dataset1.class_col] == class_label]
            undersampled_df = class_df.sample(frac=0.5)  # Specify the fraction of samples to keep
            dfs.append(undersampled_df)
        else:
            dfs.append(df1[df1[dataset1.class_col] == class_label])

    df1 = []
    # Optional: shuffle the undersampled DataFrame
    df1 = pd.concat(dfs).sample(frac=1).reset_index(drop=True)


==>> class_label: Benign
==>> class_label: xss
==>> class_label: password
==>> class_label: injection
==>> class_label: scanning
==>> class_label: backdoor
==>> class_label: ransomware
==>> class_label: mitm
==>> class_label: ddos
==>> class_label: dos


In [13]:
if undersample_classes:
    # Group by the class column and get the count of records in each class
    class_counts = df1.groupby(dataset1.class_col).size()

    # Sort the counts in descending order
    class_counts_sorted = class_counts.sort_values(ascending=False)
    print(f"==>> class_counts_sorted: {class_counts_sorted}")

==>> class_counts_sorted: Attack
Benign        1257030
xss           1074654
password       340208
injection      277696
scanning        36205
backdoor        27145
ransomware       5098
mitm              517
ddos              202
dos               145
dtype: int64


In [14]:
if undersample_classes:
    # Group by the class column and get the count of records in each class
    class_counts = df2.groupby(dataset2.class_col).size()

    # Sort the counts in descending order
    class_counts_sorted = class_counts.sort_values(ascending=False)
    print(f"==>> class_counts_sorted: {class_counts_sorted}")

==>> class_counts_sorted: Attack
Benign              2265910
DoS Hulk             222563
PortScan             158804
ddos                 128025
DoS GoldenEye         10293
FTP-Patator            7935
SSH-Patator            5897
DoS slowloris          5769
DoS Slowhttptest       5499
Bot                    1956
bruteforce             1507
xss                     652
dtype: int64


In [15]:
if undersample_classes:
    # Get the classes with the highest number of records (you can choose how many to undersample)
    classes_to_undersample = class_counts_sorted.index[:1]

    # Undersample the classes with the highest number of records
    dfs = []
    for class_label in class_counts_sorted.index:
        if class_label in classes_to_undersample:
            class_df = df2[df2[dataset2.class_col] == class_label]
            undersampled_df = class_df.sample(frac=0.5)  # Specify the fraction of samples to keep
            dfs.append(undersampled_df)
        else:
            dfs.append(df2[df2[dataset2.class_col] == class_label])

    df2 = []
    # Optional: shuffle the undersampled DataFrame
    df2 = pd.concat(dfs).sample(frac=1).reset_index(drop=True)


In [16]:
if undersample_classes:
    # Group by the class column and get the count of records in each class
    class_counts = df2.groupby(dataset2.class_col).size()

    # Sort the counts in descending order
    class_counts_sorted = class_counts.sort_values(ascending=False)
    print(f"==>> class_counts_sorted: {class_counts_sorted}")

==>> class_counts_sorted: Attack
Benign              1132955
DoS Hulk             222563
PortScan             158804
ddos                 128025
DoS GoldenEye         10293
FTP-Patator            7935
SSH-Patator            5897
DoS slowloris          5769
DoS Slowhttptest       5499
Bot                    1956
bruteforce             1507
xss                     652
dtype: int64


In [17]:
with open(folder_path + '/labels_names.pkl', 'wb') as f:
    pickle.dump([labels_names, classes], f)

In [18]:
total_count = len(df1)

properties = {
    "name": dataset1.name,
    "length": total_count,
}

num_benign = len(df1[df1['Label'] == 0])
num_attack = len(df1[df1['Label'] == 1])

properties["num_benign"] = num_benign
properties["percentage_of_benign_records"] = ((num_benign * 100)/total_count)

properties["num_attack"] = num_attack
properties["percentage_of_attack_records"] = ((num_attack * 100)/total_count)

properties["attacks"] = list(df1["Attack"].unique())  # .to_list()


G = nx.from_pandas_edgelist(
    df1,
    source=dataset1.src_ip_col,
    target=dataset1.dst_ip_col,
    create_using=nx.DiGraph()
)

print(f"==>> number_of_nodes: {G.number_of_nodes()}")
print(f"==>> number_of_edges: {G.number_of_edges()}")

properties


==>> number_of_nodes: 109592
==>> number_of_edges: 210210


{'name': 'cic_ton_iot',
 'length': 3018900,
 'num_benign': 1257030,
 'percentage_of_benign_records': 41.6386763390639,
 'num_attack': 1761870,
 'percentage_of_attack_records': 58.3613236609361,
 'attacks': ['Benign',
  'xss',
  'injection',
  'password',
  'backdoor',
  'scanning',
  'ransomware',
  'ddos',
  'mitm',
  'dos']}

In [19]:
total_count = len(df2)

properties = {
    "name": dataset1.name,
    "length": total_count,
}

num_benign = len(df2[df2['Label'] == 0])
num_attack = len(df2[df2['Label'] == 1])

properties["num_benign"] = num_benign
properties["percentage_of_benign_records"] = ((num_benign * 100)/total_count)

properties["num_attack"] = num_attack
properties["percentage_of_attack_records"] = ((num_attack * 100)/total_count)

properties["attacks"] = list(df2["Attack"].unique())  # .to_list()


G = nx.from_pandas_edgelist(
    df2,
    source=dataset1.src_ip_col,
    target=dataset1.dst_ip_col,
    create_using=nx.DiGraph()
)

print(f"==>> number_of_nodes: {G.number_of_nodes()}")
print(f"==>> number_of_edges: {G.number_of_edges()}")

properties


==>> number_of_nodes: 18345
==>> number_of_edges: 93155


{'name': 'cic_ton_iot',
 'length': 1681855,
 'num_benign': 1132955,
 'percentage_of_benign_records': 67.36341717924553,
 'num_attack': 548900,
 'percentage_of_attack_records': 32.63658282075446,
 'attacks': ['Benign',
  'ddos',
  'DoS Hulk',
  'FTP-Patator',
  'PortScan',
  'DoS slowloris',
  'DoS GoldenEye',
  'SSH-Patator',
  'Bot',
  'DoS Slowhttptest',
  'xss',
  'bruteforce']}

In [20]:
train1, test1 = train_test_split(df1, test_size=0.1, shuffle= True, random_state=1, stratify=df1[dataset1.class_col])
train2, test2 = train_test_split(df2, test_size=0.1, shuffle= True, random_state=1, stratify=df2[dataset2.class_col])

In [21]:
def add_centralities(df):
    
    G = nx.from_pandas_edgelist(
        df,
        source=dataset1.src_ip_col,
        target=dataset1.dst_ip_col,
        create_using=nx.MultiDiGraph()
    )

    print(f"===============")
    print(f"==>> number_of_nodes: {G.number_of_nodes()}")
    
    print(f"==>> number_of_edges: {G.number_of_edges()}")
    print(f"===============")

    degrees = nx.degree_centrality(G)
    betwe = cal_betweenness_centrality(G)
    pagerank = nx.pagerank(G, alpha=0.85)

    df["src_degree"] = df.apply(
                lambda row: degrees.get(row[dataset1.src_ip_col], -1), axis=1)
    df["dst_degree"] = df.apply(
                lambda row: degrees.get(row[dataset1.dst_ip_col], -1), axis=1)
    
    df["src_betweenness"] = df.apply(
                lambda row: betwe.get(row[dataset1.src_ip_col], -1), axis=1)
    df["dst_betweenness"] = df.apply(
                lambda row: betwe.get(row[dataset1.dst_ip_col], -1), axis=1)
    
    df["src_pagerank"] = df.apply(
                lambda row: pagerank.get(row[dataset1.src_ip_col], -1), axis=1)
    df["dst_pagerank"] = df.apply(
                lambda row: pagerank.get(row[dataset1.dst_ip_col], -1), axis=1)
    
    return df

def add_centralities_multidigraph(df):
    
    G = nx.from_pandas_edgelist(
        df,
        source=dataset1.src_ip_col,
        target=dataset1.dst_ip_col,
        create_using=nx.MultiDiGraph()
    )

    print(f"===============")
    print(f"==>> number_of_nodes: {G.number_of_nodes()}")
    print(f"==>> number_of_edges: {G.number_of_edges()}")
    print(f"===============")

    degrees = nx.degree_centrality(G)
    betwe = cal_betweenness_centrality(G)
    pagerank = nx.pagerank(G, alpha=0.85)

    df["src_multidigraph_degree"] = df.apply(
                lambda row: degrees.get(row[dataset1.src_ip_col], -1), axis=1)
    df["dst_multidigraph_degree"] = df.apply(
                lambda row: degrees.get(row[dataset1.dst_ip_col], -1), axis=1)
    
    df["src_multidigraph_betweenness"] = df.apply(
                lambda row: betwe.get(row[dataset1.src_ip_col], -1), axis=1)
    df["dst_multidigraph_betweenness"] = df.apply(
                lambda row: betwe.get(row[dataset1.dst_ip_col], -1), axis=1)
    
    df["src_multidigraph_pagerank"] = df.apply(
                lambda row: pagerank.get(row[dataset1.src_ip_col], -1), axis=1)
    df["dst_multidigraph_pagerank"] = df.apply(
                lambda row: pagerank.get(row[dataset1.dst_ip_col], -1), axis=1)
    
    return df

In [22]:
test = pd.concat([test1, test2])
test = add_centralities(test)
test = add_centralities_multidigraph(test)
test.to_parquet(folder_path + "test.parquet")

==>> number_of_nodes: 33432
==>> number_of_edges: 470076
==>> number_of_nodes: 33432
==>> number_of_edges: 470076


In [23]:
client_data = np.array_split(train1, 5) + np.array_split(train2, 3)

for cid, data_partition in enumerate(client_data):
    
    #data_partition = add_centralities(data_partition)
    #data_partition = add_centralities_multidigraph(data_partition)

    data_partition.to_parquet(folder_path + "client_{}.parquet".format(cid))

  return bound(*args, **kwds)


In [24]:
import igraph as ig

clients_paths = [
    folder_path + "client_0.parquet",
    folder_path + "client_1.parquet",
    folder_path + "client_2.parquet",
    folder_path + "client_3.parquet",
    folder_path + "client_4.parquet",
    folder_path + "client_5.parquet",
    folder_path + "client_6.parquet",
    folder_path + "client_7.parquet"
]

In [25]:
def compute_graph_properties(G, name):
    G.remove_nodes_from(list(nx.isolates(G)))
    for node in G.nodes():
        G.nodes[node]['label'] = node
    G1 = ig.Graph.from_networkx(G)
    
    labels = [G.nodes[node]['label'] for node in G.nodes()]
    G1.vs['label'] = labels
    
    part = G1.community_infomap()
    
    communities = []
    for com in part:
        communities.append([G1.vs[node_index]['label'] for node_index in com])
    
    properties = {}
    #properties["number_of_com"] = len(communities)
    properties["number_of_nodes"] = G.number_of_nodes()
    properties["number_of_edges"] = G.number_of_edges()
    
    degrees = [degree for _, degree in G.degree()]
    properties["max_degree"] = max(degrees)
    properties["avg_degree"] = sum(degrees) / len(degrees)
    properties["transitivity"] = nx.transitivity(G)
    properties["density"] = nx.density(G)
    
    node_to_community = {}
    for community_index, community in enumerate(communities):
        for node in community:
            node_to_community[node] = community_index
    
    inter_cluster_edges = 0
    for u, v in G.edges():
        if node_to_community[u] != node_to_community[v]:
            inter_cluster_edges += 1
    
    properties["mixing_parameter"] = inter_cluster_edges / G.number_of_edges()
    
    filename = os.path.join('fl_from_2_datasets', 'datasets_properties', f"graph_{name}.json")
    with open(filename, 'w') as outfile:
        json.dump(properties, outfile)
    
    return properties

In [26]:
import json

for idx, client_path in enumerate(clients_paths):
    df_client = pd.read_parquet(client_path)
    src_ip_col = 'Src IP'
    dst_ip_col = 'Dst IP'
    G_client = nx.from_pandas_edgelist(
        df_client,
        source=src_ip_col,
        target=dst_ip_col,
        create_using=nx.DiGraph()
    )
    dataset_name = f"dataset_{idx}"
    properties = compute_graph_properties(G_client, dataset_name)
    print(f"Properties for {dataset_name}: {properties}")
    graph_path = "./fl_from_2_datasets/fl_client_graphs/graph_client_{}.gexf".format(idx)
    
    nx.write_gexf(G_client, graph_path)
    
    print(f"Graph for client {idx} saved to {graph_path}")

print("All graphs have been created and saved.")

Properties for dataset_0: {'number_of_nodes': 37781, 'number_of_edges': 44590, 'max_degree': 9876, 'avg_degree': 2.3604457266880177, 'transitivity': 0.008497341929645968, 'density': 3.1239355832292454e-05, 'mixing_parameter': 0.1317335725498991}
Graph for client 0 saved to ./fl_from_2_datasets/fl_client_graphs/graph_client_0.gexf
Properties for dataset_1: {'number_of_nodes': 38177, 'number_of_edges': 45024, 'max_degree': 10147, 'avg_degree': 2.3586976451790345, 'transitivity': 0.004805040211363703, 'density': 3.089241467386623e-05, 'mixing_parameter': 0.10963041933191187}
Graph for client 1 saved to ./fl_from_2_datasets/fl_client_graphs/graph_client_1.gexf
Properties for dataset_2: {'number_of_nodes': 37646, 'number_of_edges': 44370, 'max_degree': 9748, 'avg_degree': 2.357222546884131, 'transitivity': 0.005274043433298862, 'density': 3.130857413845306e-05, 'mixing_parameter': 0.13838178949740815}
Graph for client 2 saved to ./fl_from_2_datasets/fl_client_graphs/graph_client_2.gexf
Prop

In [61]:
#0.44.20.15
#198.55.220.44
last_client_path = clients_paths[-1]
#8.0.6.4
#8.6.0.1

df_client_last = pd.read_parquet(last_client_path)
ip_address = '8.6.0.1'
filtered_df = df_client[(df_client['Src IP'] == ip_address) | (df_client['Dst IP'] == ip_address)]
filtered_df["Label"].unique()

array([0], dtype=int64)

In [62]:
df_client_last.columns


Index(['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Protocol',
       'Timestamp', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts',
       'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
       'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
       'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean',
       'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot',
       'Fwd IAT Mean', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Tot',
       'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min',
       'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags',
       'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s', 'Bwd Pkts/s',
       'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean', 'Pkt Len Std',
       'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt', 'RST Flag Cnt',
       'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt', 'CWE Flag Count',
       'ECE Flag Cnt', 'Dow

In [82]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
centrality_columns = [
    'src_degree', 'dst_degree', 'src_betweenness', 'dst_betweenness',
    'src_pagerank', 'dst_pagerank', 'src_multidigraph_degree',
    'dst_multidigraph_degree', 'src_multidigraph_betweenness',
    'dst_multidigraph_betweenness', 'src_multidigraph_pagerank',
    'dst_multidigraph_pagerank'
]

centrality_data = df_client_last[centrality_columns]

imputer = SimpleImputer(strategy='mean')
centrality_data_imputed = imputer.fit_transform(centrality_data)

if pd.isnull(centrality_data_imputed).any():
    raise ValueError("There are still NaN values after imputation.")

scaler = StandardScaler()
scaled_centrality_data = scaler.fit_transform(centrality_data_imputed)

if pd.isnull(scaled_centrality_data).any():
    raise ValueError("There are NaN values after standardization.")

pca = PCA(n_components=1)  
principal_components = pca.fit_transform(scaled_centrality_data)

if pd.isnull(principal_components).any():
    raise ValueError("PCA output contains NaN values. Check the input data or PCA parameters.")

df_pca = pd.DataFrame(data=principal_components, columns=['PC1'])

df_new = pd.concat([df_client_last, df_pca], axis=1)


print("Updated DataFrame with PCA Component:")
print(df_new.head())

print(df_new['PC1'].head(100))

Updated DataFrame with PCA Component:
                                          Flow ID          Src IP  Src Port  \
990242   172.217.10.225-192.168.10.12-443-51948-6  172.217.10.225     443.0   
1526483       172.16.0.1-192.168.10.50-45442-80-6      172.16.0.1   45442.0   
775084      192.168.10.3-192.168.10.16-53-9253-17   192.168.10.16    9253.0   
83479     192.168.10.9-54.210.193.125-10222-443-6    192.168.10.9   10222.0   
844875     192.168.10.15-217.196.33.13-52607-80-6   192.168.10.15   52607.0   

                 Dst IP  Dst Port  Protocol            Timestamp  \
990242    192.168.10.12   51948.0       6.0  03/07/2017 04:10:14   
1526483   192.168.10.50      80.0       6.0       5/7/2017 10:49   
775084     192.168.10.3      53.0      17.0        7/7/2017 9:55   
83479    54.210.193.125     443.0       6.0        4/7/2017 4:30   
844875    217.196.33.13      80.0       6.0  03/07/2017 01:58:38   

         Flow Duration  Tot Fwd Pkts  Tot Bwd Pkts  ...  dst_betweenness  \
99

In [79]:
df_new["PC1"].head(100)

990242          NaN
1526483         NaN
775084          NaN
83479     -0.264189
844875          NaN
             ...   
150326     1.023150
390864    -2.877478
533755          NaN
394422    -2.877478
1214685         NaN
Name: PC1, Length: 100, dtype: float64