**Load Libraries**

In [1]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import plotly.express as px
from umap import UMAP
from difflib import SequenceMatcher
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN
from sklearn.preprocessing import StandardScaler

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
warnings.filterwarnings('ignore')

**Data Loading and Initial Exploration**

In [2]:
df = pd.read_csv('pseudoProcess.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3662194 entries, 0 to 3662193
Data columns (total 9 columns):
 #   Column             Dtype  
---  ------             -----  
 0   iOSVersion         object 
 1   iOSModel           object 
 2   pid                float64
 3   readableTimestamp  object 
 4   timestamp          int64  
 5   id                 int64  
 6   device             object 
 7   scan               object 
 8   procName           object 
dtypes: float64(1), int64(2), object(6)
memory usage: 251.5+ MB


In [4]:
df.head()

Unnamed: 0,iOSVersion,iOSModel,pid,readableTimestamp,timestamp,id,device,scan,procName
0,17.4.1,"iPhone13,2",610.0,2024-04-17 11:34:21+00,1713353661,4105458,Device55,Scan1,ckdiscretionaryd
1,17.4.1,"iPhone13,2",605.0,2024-04-17 11:34:21+00,1713353661,4105459,Device55,Scan1,SiriSuggestionsLightHousePlugin
2,17.4.1,"iPhone13,2",598.0,2024-04-17 11:34:21+00,1713353661,4105460,Device55,Scan1,dprivacyd
3,17.4.1,"iPhone13,2",595.0,2024-04-17 11:34:21+00,1713353661,4105461,Device55,Scan1,activityawardsd
4,17.4.1,"iPhone13,2",713.0,2024-04-17 13:51:13+00,1713361873,4105462,Device55,Scan1,DASDelegateService


In [5]:
# Check for missing values
print(df.isnull().sum())

iOSVersion               0
iOSModel                 0
pid                  13074
readableTimestamp        0
timestamp                0
id                       0
device                   0
scan                     0
procName                 0
dtype: int64


In [6]:
## pid null?
df[df['pid'].isna()].sample(10)

Unnamed: 0,iOSVersion,iOSModel,pid,readableTimestamp,timestamp,id,device,scan,procName
1409959,17.3.1,"iPhone16,1",,2024-02-16 20:32:32+00,1708119152,15644018,Device36,Scan154,GraphicsServices
1410208,17.3.1,"iPhone16,1",,2024-02-15 10:25:39+00,1707996339,15644279,Device36,Scan154,dyld
3176954,17.3.1,"iPhone13,2",,2024-03-18 09:53:22+00,1710755602,1559653,Device47,Scan888,libsystem_pthread.dylib
520617,17.3.1,"iPad13,10",,2024-01-30 14:21:52+00,1706628112,13486836,Device20,Scan64,Organizations
2386967,17.4.1,"iPhone16,1",,2024-03-27 20:46:34+00,1711557994,17449458,Device37,Scan721,ExtensionFoundation
1393047,17.3.1,"iPhone16,1",,2024-02-18 18:57:13+00,1708286233,15627162,Device36,Scan154,libxpc.dylib
519495,17.3.1,"iPad13,10",,2024-02-25 08:12:22+00,1708852342,13483038,Device20,Scan64,libc++abi.dylib
2697752,17.3.1,"iPhone13,2",,2024-03-30 16:46:52+00,1711817212,306212,Device47,Scan900,Outlook-iOS
1503974,17.4.1,"iPhone16,1",,2024-04-04 11:53:55+00,1712238835,15778724,Device38,Scan172,BoardServices
3169108,17.3.1,"iPhone13,2",,2024-03-19 10:18:58+00,1710843538,1551838,Device47,Scan888,IvyCommon


In [7]:
# Convert timestamp to datetime
df['readableTimestamp'] = pd.to_datetime(df['readableTimestamp'])
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')


In [8]:
print(df['readableTimestamp'].min())
print(df['timestamp'].min())

1970-01-25 10:59:24+00:00
1970-01-25 10:59:24


Check for these timestamps

In [9]:
df['readableTimestamp'].dt.year.value_counts()

2024    2578827
2023     653614
2022     411049
2021      11136
1970       7568
Name: readableTimestamp, dtype: int64

Remove dates before 2024 to create smaller dataset for quick exploration and showing code

In [10]:
# Remove date before 2024
df = df[df['readableTimestamp'].dt.year >= 2024]

In [11]:
# Ensure PID is an integer
df['pid'] = pd.to_numeric(df['pid'], errors='coerce')

# Handle missing or incorrect data (here we drop missing)
df= df.dropna()

print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2565753 entries, 0 to 3662193
Data columns (total 9 columns):
 #   Column             Dtype              
---  ------             -----              
 0   iOSVersion         object             
 1   iOSModel           object             
 2   pid                float64            
 3   readableTimestamp  datetime64[ns, UTC]
 4   timestamp          datetime64[ns]     
 5   id                 int64              
 6   device             object             
 7   scan               object             
 8   procName           object             
dtypes: datetime64[ns, UTC](1), datetime64[ns](1), float64(1), int64(1), object(5)
memory usage: 195.8+ MB
None


In [12]:
# Basic info about the dataset
print(f"Total rows: {len(df)}")
print(f"Number of unique devices: {df['device'].nunique()}")
print(f"Number of unique scans: {df['scan'].nunique()}")
print(f"Number of unique iOS versions: {df['iOSVersion'].nunique()}")
print(f"Number of unique pid: {df['pid'].nunique()}")
print(f"Number of unique process names: {df['procName'].nunique()}")


Total rows: 2565753
Number of unique devices: 25
Number of unique scans: 910
Number of unique iOS versions: 14
Number of unique pid: 23071
Number of unique process names: 1519


In [13]:

# Devices and scans
device_scan_counts = df.groupby('device')['scan'].nunique()
print(f"\nAverage scans per device: {device_scan_counts.mean():.2f}")
print(f"Max scans for a device: {device_scan_counts.max()}")
print(f"Min scans for a device: {device_scan_counts.min()}")



Average scans per device: 36.40
Max scans for a device: 256
Min scans for a device: 1


In [14]:

# iOS versions
ios_versions = df['iOSVersion'].value_counts()
print("\nTop 5 iOS versions:")
print(ios_versions.head())



Top 5 iOS versions:
17.4.1    1222395
17.3.1     428493
15.5       259380
15.5       196403
16.7.7     164414
Name: iOSVersion, dtype: int64


procName

In [15]:

# Analyze process names
process_counts = df['procName'].value_counts()
print(f"\nTotal unique process names: {len(process_counts)}")
print("\nTop 10 most common processes:")
print(process_counts.head(10))


Total unique process names: 1519

Top 10 most common processes:
MTLCompilerService             65263
com.apple.WebKit.WebContent    22421
extensionkitservice            20020
CommCenter                     16281
com.apple.datamigrator         15747
backupd                        15718
kernel                         15479
CategoriesService              15049
duetexpertd                    12586
filecoordinationd              12065
Name: procName, dtype: int64


In [16]:
# Analyze processes per scan
processes_per_scan = df.groupby(['device', 'scan'])['procName'].count()
print("\nSummary of processes per scan:")
print(processes_per_scan.describe())




Summary of processes per scan:
count       910.000000
mean       2819.508791
std       16792.266809
min         193.000000
25%         385.000000
50%         508.000000
75%         983.500000
max      422302.000000
Name: procName, dtype: float64


In [17]:

# Check for scans with only one process
single_process_scans = processes_per_scan[processes_per_scan == 1]
print(f"\nNumber of scans with only one process: {len(single_process_scans)}")
if len(single_process_scans) > 0:
    print("Scans with only one process:")
    print(single_process_scans)



Number of scans with only one process: 0


In [18]:
# Check for multiple process lists within a scan
def check_multiple_lists(group):
    timestamps = group['readableTimestamp'].nunique()
    return timestamps > 1

multiple_lists = df.groupby(['device', 'scan']).apply(check_multiple_lists)
print(f"\nNumber of scans with multiple process lists: {multiple_lists.sum()}")
if multiple_lists.sum() > 0:
    print("Scans with multiple process lists:")
    print(multiple_lists[multiple_lists].head())


Number of scans with multiple process lists: 203
Scans with multiple process lists:
device   scan   
Device1  Scan423    True
         Scan432    True
         Scan435    True
         Scan442    True
         Scan447    True
dtype: bool


In [19]:
# Identify rare processes
rare_processes = process_counts[process_counts == 1].index.tolist()
print(f"\nNumber of rare processes (appearing only once): {len(rare_processes)}")
print("Sample of rare processes:", rare_processes[:10])


Number of rare processes (appearing only once): 67
Sample of rare processes: ['NumbersSpotlightExtension', 'RTLplusPushNotification', 'TooGoodToGo', 'com.apple.BarcodeSupport.ParsingService', 'Core', 'de-lieferando-notification-serv', 'zooplus', 'com.apple.DataDetectorsUI.Actio', 'WaterSort', 'Minesweeper']


**Time-based analysis**

In [20]:
df['date'] = df['timestamp'].dt.date
# Number of processes over time
plt.figure(figsize=(14,7))
temp_df = df.groupby('date').size().reset_index(name='count')

fig = px.line(temp_df, x='date', y='count', 
              title='Number of Processes per Day',
              labels={'Date': 'Date', 'Number of Processes': 'Number of Processes'},
              template='plotly_white')

# Update layout to follow good data visualization practices
fig.update_layout(
    title={'text': 'Number of Processes per Day', 'x': 0.5, 'xanchor': 'center'},
    xaxis_title='Date',
    yaxis_title='Number of Processes',
    xaxis=dict(
        rangeselector=dict(
            buttons=list([
                dict(count=1, label="1m", step="month", stepmode="backward"),
                dict(count=6, label="6m", step="month", stepmode="backward"),
                dict(step="all")
            ])
        ),
        rangeslider=dict(visible=True),
        type="date"
    ),
    yaxis=dict(tickformat=","),
    margin=dict(l=50, r=50, t=50, b=50),
    width=1000,
    height=600
)

# Show the plot
fig.show()

# Save the plot
fig.write_html("number_processes.html")


<Figure size 1400x700 with 0 Axes>

In [21]:
# Calculate the number of unique processes per day
daily_process_counts = df.groupby('date')['procName'].nunique().reset_index()
daily_process_counts.columns = ['Date', 'UniqueProcesses']

# Create the bar chart using Plotly Express with color and style options
fig = px.bar(daily_process_counts, x='Date', y='UniqueProcesses', 
             title='Unique Processes per Day',
             labels={'Date': 'Date', 'UniqueProcesses': 'Number of Unique Processes'},
             text='UniqueProcesses',
             color='UniqueProcesses',  # Color bars by the count of unique processes
             color_continuous_scale='Viridis')  # Choose a color scale

# Customize layout for a more colorful appearance
fig.update_layout(
    xaxis_title='Date',
    yaxis_title='Number of Unique Processes',
    plot_bgcolor='white',  # Background color of the plotting area
    paper_bgcolor='white',     # Background color of the entire figure
    title_font=dict(size=20, color='darkblue'),
    xaxis=dict(
        tickangle=-90,            # Angle of x-axis tick labels for better readability
        title_font=dict(size=16),
        tickfont=dict(size=14)
    ),
    yaxis=dict(
        title_font=dict(size=16),
        tickfont=dict(size=14)
    ),
    showlegend=False            # Hide legend if not necessary
)

# Display the bar chart
fig.show()

# 
fig.write_html("unique_processes.html")

In [22]:
# Analyze PID distribution
plt.figure(figsize=(12, 6))
plt.hist(df['pid'], bins=50)
plt.title('Distribution of Process IDs')
plt.xlabel('PID')
plt.ylabel('Frequency')
#plt.show()
plt.savefig('pid_distribution.png')
plt.close()


In [23]:
# Treat PIDs as categorical
pid_frequency = df['pid'].value_counts()
print("\nMost common PIDs:")
print(pid_frequency.head())

# Visualize PID frequency
plt.figure(figsize=(12, 6))
pid_frequency.head(20).plot(kind='bar')
plt.title('Frequency of Top 20 PIDs')
plt.xlabel('PID')
plt.ylabel('Frequency')
plt.tight_layout()
#plt.show()
plt.savefig('pid_frequency.png')
plt.close()

# Analyze PID uniqueness within scans
def check_pid_uniqueness(group):
    return group['pid'].nunique() == len(group)

pid_uniqueness = df.groupby(['device', 'scan']).apply(check_pid_uniqueness)
print(f"\nPercentage of scans with all unique PIDs: {pid_uniqueness.mean()*100:.2f}%")


Most common PIDs:
0.0      17639
94.0      7642
100.0     6513
98.0      6385
99.0      6369
Name: pid, dtype: int64

Percentage of scans with all unique PIDs: 63.41%


In [24]:
# Analyze PID assignment patterns
def check_pid_linearity(group):
    pids = group['pid'].sort_values()
    diff = pids.diff()
    return (diff != 1).sum() / len(diff)

pid_nonlinearity = df.groupby(['device', 'scan']).apply(check_pid_linearity)
pid_nonlinearity = pid_nonlinearity.reset_index(name = 'non_linearity_score')

# Create a new column to highlight scores above 0.90
pid_nonlinearity['highlight'] = pid_nonlinearity['non_linearity_score'] > 0.90

# Create the plot
fig = px.scatter(pid_nonlinearity, x='scan', y='non_linearity_score', color='device',
                 hover_data=['device', 'scan', 'non_linearity_score'],
                 labels={'non_linearity_score': 'Non-linearity Score',
                         'scan': 'Scan',
                         'device': 'Device'},
                 title='Non-linearity Scores by Device and Scan')

# Highlight points with score > 0.90
fig.add_trace(px.scatter(pid_nonlinearity[pid_nonlinearity['highlight']], x='scan', y='non_linearity_score',
                         color_discrete_sequence=['red'],
                         hover_data=['device', 'scan', 'non_linearity_score']).data[0])

# Update layout for better readability
fig.update_layout(legend_title_text='Device')
fig.update_traces(marker=dict(size=8))

# Show the plot
fig.show()

fig.write_html("pid_nonlinearity.html")

### Process Name

In [25]:
# Analyze process name patterns
def extract_prefix(name):
    return name.split('.')[0] if '.' in name else name

process_prefixes = df['procName'].apply(extract_prefix)
prefix_counts = process_prefixes.value_counts()

print("\nTop 10 process name prefixes:")
print(prefix_counts.head(10))


Top 10 process name prefixes:
com                          176814
MTLCompilerService            65263
extensionkitservice           20020
CommCenter                    16281
backupd                       15718
kernel                        15479
CategoriesService             15049
duetexpertd                   12586
filecoordinationd             12065
WiFiCloudAssetsXPCService     11786
Name: procName, dtype: int64


**Process Persistence:**

We analyzed which processes appear in a high percentage of scans, as persistent processes could be either essential system processes or potentially malicious software.

In [26]:
# Process persistence
process_persistence = df.groupby('procName')['scan'].nunique() / df['scan'].nunique()

persistent_processes = process_persistence[process_persistence > 0.95]
print("\nProcesses present in over 95% of scans:")
print(persistent_processes.head(10))

# Convert to DataFrame for Plotly Express
persistent_processes_df = persistent_processes.reset_index()
persistent_processes_df.columns = ['ProcessName', 'Persistence']

# Create the bar chart using Plotly Express
fig = px.bar(persistent_processes_df, x='ProcessName', y='Persistence',
             title='Processes Present in Over 95% of Scans',
             labels={'ProcessName': 'Process Name', 'Persistence': 'Persistence'},
             text='Persistence')

# Display the bar chart
fig.show()

fig.write_html("persistent_processes.html")


Processes present in over 95% of scans:
procName
AppleCredentialManagerDaemon     0.995604
AssetCacheLocatorService         0.961538
CMFSyncAgent                     0.992308
CacheDeleteAppContainerCaches    0.985714
CalendarWidgetExtension          0.984615
CommCenter                       1.000000
ContextService                   1.000000
HeuristicInterpreter             0.995604
IMDPersistenceAgent              0.998901
MTLCompilerService               0.987912
Name: scan, dtype: float64


### Anomaly Detection
1. String Similarity in Process Names: Detect process names that are similar to benign processes but have slight deviations.

In [27]:
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

# Example to find similar process names
def find_similar_processes(data):
    process_names = data['procName'].unique()
    similar_process_pairs = []

    for i, name1 in enumerate(process_names):
        for name2 in process_names[i+1:]:
            if name1 != name2 and similar(name1, name2) > 0.8:
                similar_process_pairs.append((name1, name2))

    return similar_process_pairs

similar_processes = find_similar_processes(df)
print("Similar Process Pairs:", similar_processes)


Similar Process Pairs: [('dprivacyd', 'adprivacyd'), ('backupd', '(backupd)'), ('com.apple.SafariServices.Conten', 'com.apple.SafariServices.ContentBlockerLoader'), ('com.apple.SafariServices.Conten', 'com.apple.SafariServices.Content'), ('PFLHRPeriodPredCK', 'PFLHRPeriodPredMLH'), ('healthappd', 'healthd'), ('CoreSpotlightImportExtension1_i', 'CoreSpotlightImportExtension1_iOS'), ('CoreSpotlightImportExtension1_i', 'CoreSpotlightImportExtension1_iO'), ('EAUpdaterService', 'UARPUpdaterServiceHID'), ('EAUpdaterService', 'UARPUpdaterServiceAFU'), ('adattributiond', 'spaceattributiond'), ('adattributiond', 'attributionkitd'), ('tvremoted', 'remoted'), ('MTLCompilerService', 'ANECompilerService'), ('ProactiveShareSheetLighthouseBa', 'ProactiveShareSheetLighthouseBackgroundPlugin'), ('ProactiveShareSheetLighthouseBa', 'ProactiveShareSheetLighthouseBac'), ('MetricsExtension', 'ServiceExtension'), ('UARPUpdaterServiceHID', 'UARPUpdaterServiceUSBPD'), ('UARPUpdaterServiceHID', 'UARPUpdaterServ

In [28]:
# Extract unique process names
proc_names = df['procName'].unique()

# Load a pre-trained sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for process names
embeddings = model.encode(proc_names)

# Normalize the embeddings
scaler = StandardScaler()
normalized_embeddings = scaler.fit_transform(embeddings)

# Perform clustering using HDBSCAN
clusterer = HDBSCAN(min_cluster_size=30, min_samples=1)
cluster_labels = clusterer.fit_predict(normalized_embeddings)

# Create a DataFrame with results
results = pd.DataFrame({
    'procName': proc_names,
    'cluster': cluster_labels,
    'outlier_score': clusterer.outlier_scores_
})

# Sort by outlier score in descending order
results = results.sort_values('outlier_score', ascending=False)

# Identify outliers (you can adjust the threshold as needed)
outlier_threshold = results['outlier_score'].mean() + 2 * results['outlier_score'].std()
outliers = results[results['outlier_score'] > outlier_threshold]

# Apply UMAP for dimensionality reduction
umap = UMAP(n_neighbors=20, n_components=2, min_dist=0.1, metric='cosine')
umap_embeddings = umap.fit_transform(normalized_embeddings)
# Convert proc_names to a list for indexing
proc_names_list = proc_names.tolist()

# Create the plot
plt.figure(figsize=(30, 15))

# Scatter plot with transparency
scatter = plt.scatter(umap_embeddings[:, 0], umap_embeddings[:, 1], c=cluster_labels, cmap='viridis', alpha=0.7)

# Add color bar
cbar = plt.colorbar(scatter)
cbar.set_label('Cluster Labels')

# Display only a subset of process names to reduce clutter
for i, txt in enumerate(proc_names_list):
    if np.random.rand() > 0.99:  # Display approximately 2% of labels
        plt.annotate(txt, (umap_embeddings[i, 0], umap_embeddings[i, 1]), fontsize=8, alpha=0.7)

# Highlight outliers
outlier_indices = [proc_names_list.index(name) for name in outliers['procName'] if name in proc_names_list]
plt.scatter(umap_embeddings[outlier_indices, 0], umap_embeddings[outlier_indices, 1], 
            c='red', edgecolor='k', s=100, label='Outliers')

# Annotate some of the outliers
for i in outlier_indices:
    plt.annotate(proc_names_list[i], (umap_embeddings[i, 0], umap_embeddings[i, 1]), fontsize=8, color='red', alpha=0.7)

# Add labels and title
plt.title('Process Name Clustering with Outliers Highlighted')
plt.xlabel('UMAP Dimension 1')
plt.ylabel('UMAP Dimension 2')

# Show legend
plt.legend()

# Display the plot
#plt.show()

plt.savefig('procname_outliers.png')
plt.close()


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


In [29]:
# Apply UMAP for dimensionality reduction
umap = UMAP(n_neighbors=20, n_components=2, min_dist=0.1, metric='cosine')
umap_embeddings = umap.fit_transform(normalized_embeddings)
# Convert proc_names to a list for indexing
proc_names_list = proc_names.tolist()

# Create a DataFrame for the embeddings
df_embeddings = pd.DataFrame(umap_embeddings, columns=['UMAP_1', 'UMAP_2'])
df_embeddings['procName'] = proc_names_list
df_embeddings['cluster'] = cluster_labels

# Mark outliers in the DataFrame
df_embeddings['outlier'] = df_embeddings['procName'].isin(outliers['procName'])
df_embeddings['outlier_score'] = df_embeddings.apply(
    lambda row: outliers[outliers['procName'] == row['procName']]['outlier_score'].values[0] 
    if row['procName'] in outliers['procName'].values 
    else None, axis=1)

# Create the interactive plot
fig = px.scatter(
    df_embeddings, x='UMAP_1', y='UMAP_2', color='cluster', 
    hover_data=['procName', 'outlier_score'],
    title='Process Name Clustering with Outliers Highlighted',
    labels={'UMAP_1': 'UMAP Dimension 1', 'UMAP_2': 'UMAP Dimension 2'},
    opacity=0.7
)

# Highlight outliers
outliers_data = df_embeddings[df_embeddings['outlier']]
fig.add_scatter(
    x=outliers_data['UMAP_1'], y=outliers_data['UMAP_2'],
    mode='markers', marker=dict(color='red', size=10, line=dict(width=2, color='DarkSlateGrey')),
    name='Outliers', text=outliers_data['procName']
)

# Update layout
fig.update_layout(
    legend=dict(title='Clusters'),
    xaxis_title='UMAP Dimension 1',
    yaxis_title='UMAP Dimension 2'
)

# Show the plot
fig.show()

# 
fig.write_html("procName_outliers.html")

## Future Work
### PID resets

In [30]:
from scipy.stats import zscore

# Function to detect PID resets
def detect_pid_resets(pids):
    return np.where(np.diff(pids) < 0)[0] + 1

# Function to calculate PID growth rate
def pid_growth_rate(pids, timestamps):
    return np.diff(pids) / np.diff(timestamps.astype(int) / 1e9)

# Detect PID resets
reset_indices = detect_pid_resets(df['pid'].values)
reset_timestamps = df['readableTimestamp'].iloc[reset_indices]

# Calculate PID growth rate
growth_rate = pid_growth_rate(df['pid'].values, df['readableTimestamp'].values)


# Calculate z-score of growth rate
growth_rate_zscore = zscore(growth_rate)

# Identify anomalies (excluding reset points)
anomaly_threshold = 3  # z-score threshold for anomalies
anomalies = np.abs(growth_rate_zscore) > anomaly_threshold
anomalies = np.insert(anomalies, 0, False)  # Align with original dataframe

# Remove anomalies at reset points
for idx in reset_indices:
    if idx > 0:
        anomalies[idx-1:idx+1] = False

# Analyze anomalies
anomaly_df = df[anomalies].copy()
anomaly_df['growth_rate'] = np.insert(growth_rate, 0, 0)[anomalies]
anomaly_df['growth_rate_zscore'] = np.insert(growth_rate_zscore, 0, 0)[anomalies]

print("Top 10 PID anomalies (excluding resets):")
print(anomaly_df.sort_values('growth_rate_zscore', key=abs, ascending=False)[['readableTimestamp', 'pid', 'procName', 'growth_rate', 'growth_rate_zscore']].head(10))

# Calculate statistics
total_processes = len(df)
total_anomalies = anomalies.sum()
anomaly_percentage = (total_anomalies / total_processes) * 100

print(f"\nTotal processes analyzed: {total_processes}")
print(f"Number of PID resets detected: {len(reset_timestamps)}")
print(f"Number of anomalies detected: {total_anomalies}")
print(f"Percentage of processes flagged as anomalous: {anomaly_percentage:.2f}%")

# Analyze processes around reset points
reset_window = 5  # Number of processes to check before and after reset
processes_around_resets = []

for idx in reset_indices:
    start = max(0, idx - reset_window)
    end = min(len(df), idx + reset_window)
    processes_around_resets.extend(df['procName'].iloc[start:end].tolist())

print("\nTop 10 processes observed around PID resets:")
print(pd.Series(processes_around_resets).value_counts().head(10))

Top 10 PID anomalies (excluding resets):
Empty DataFrame
Columns: [readableTimestamp, pid, procName, growth_rate, growth_rate_zscore]
Index: []

Total processes analyzed: 2565753
Number of PID resets detected: 1198527
Number of anomalies detected: 0
Percentage of processes flagged as anomalous: 0.00%

Top 10 processes observed around PID resets:
MTLCompilerService             262120
kernel                         109476
com.apple.WebKit.WebContent    107201
CommCenter                     103943
filecoordinationd               79532
duetexpertd                     79429
extensionkitservice             78572
terminusd                       74885
installcoordinationd            74601
backupd                         68726
dtype: int64


### Anomalies

In [31]:
from pyod.models.knn import KNN
from pyod.models.iforest import IForest
from pyod.models.lof import LOF
from sklearn.preprocessing import StandardScaler

# Prepare the data for anomaly detection
# We'll use PID and readableTimestamp as features

outliers_df = pd.read_csv('pseudoProcess.csv')
outliers_df['readableTimestamp'] = pd.to_datetime(outliers_df['readableTimestamp'])
# Remove date before 2024
outliers_df = outliers_df[outliers_df['readableTimestamp'].dt.year >= 2024]

# Handle missing or incorrect data (here we drop missing)
outliers_df= outliers_df.dropna()


#df['readableTimestamp'] = pd.to_numeric(df['readableTimestamp'])
# Prepare the data for anomaly detection
# We'll use PID and timestamp as features
X = outliers_df[['pid', 'timestamp']].values

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Initialize and fit the anomaly detection models
knn = KNN()
iforest = IForest(random_state=42)
lof = LOF()

knn.fit(X_scaled)
iforest.fit(X_scaled)
lof.fit(X_scaled)

# Get the anomaly scores
knn_scores = knn.decision_function(X_scaled)
iforest_scores = iforest.decision_function(X_scaled)
lof_scores = lof.decision_function(X_scaled)

# Add the anomaly scores to the dataframe
outliers_df['knn_score'] = knn_scores
outliers_df['iforest_score'] = iforest_scores
outliers_df['lof_score'] = lof_scores

# Function to get top anomalies
def get_top_anomalies(df, score_column, n=10):
    return df.nlargest(n, score_column)

# Get top anomalies for each method
top_knn = get_top_anomalies(outliers_df, 'knn_score')
top_iforest = get_top_anomalies(outliers_df, 'iforest_score')
top_lof = get_top_anomalies(outliers_df, 'lof_score')

# Print top anomalies
print("Top KNN Anomalies:")
print(top_knn[['readableTimestamp', 'pid', 'procName', 'knn_score']])

print("\nTop Isolation Forest Anomalies:")
print(top_iforest[['readableTimestamp', 'pid', 'procName', 'iforest_score']])

print("\nTop LOF Anomalies:")
print(top_lof[['readableTimestamp', 'pid', 'procName', 'lof_score']])


### NEED MORE WORK

# # Define a function to plot anomalies
# def plot_anomalies(data, score_column, title, color_label):
#     plt.figure(figsize=(12, 8))
#     plt.scatter(data['readableTimestamp'], data['pid'], c=data[score_column], cmap='viridis')
#     plt.colorbar(label=color_label)
#     plt.title(title)
#     plt.xlabel('Timestamp')
#     plt.ylabel('PID')
#     plt.show()
#     # Uncomment the following lines to save the plots as images
#     # plt.savefig(f'{score_column}_anomalies.png')
#     # plt.close()

# # Plot KNN anomalies
# plot_anomalies(outliers_df, 'knn_score', 'Process Anomalies (KNN)', 'Anomaly Score (KNN)')

# # Plot Isolation Forest anomalies
# plot_anomalies(outliers_df, 'iforest_score', 'Process Anomalies (Isolation Forest)', 'Anomaly Score (Isolation Forest)')

# # Plot LOF anomalies
# plot_anomalies(outliers_df, 'lof_score', 'Process Anomalies (LOF)', 'Anomaly Score (LOF)')

# Analyze agreement between methods
def get_top_n_indices(scores, n=10):
    return set(scores.argsort()[-n:])

top_knn_indices = get_top_n_indices(knn_scores)
top_iforest_indices = get_top_n_indices(iforest_scores)
top_lof_indices = get_top_n_indices(lof_scores)

agreement = top_knn_indices & top_iforest_indices & top_lof_indices

print("\nAnomalies detected by all three methods:")
print(outliers_df.iloc[list(agreement)][['readableTimestamp', 'pid', 'procName']])

# Analyze anomalies by process name
process_anomaly_counts = outliers_df.groupby('procName').agg({
    'knn_score': 'mean',
    'iforest_score': 'mean',
    'lof_score': 'mean'
}).sort_values('knn_score', ascending=False)

print("\nTop potentially anomalous processes (by average KNN score):")
print(process_anomaly_counts.head(10))

Top KNN Anomalies:
                readableTimestamp      pid           procName  knn_score
1379012 2024-02-01 17:01:11+00:00  19525.0  filecoordinationd   0.309034
1379025 2024-02-01 17:01:11+00:00  19525.0  filecoordinationd   0.309034
2471939 2024-03-11 09:33:16+00:00  12051.0        duetexpertd   0.249218
2471950 2024-03-11 09:33:16+00:00  12051.0        duetexpertd   0.249218
2471962 2024-03-11 09:33:16+00:00  12051.0        duetexpertd   0.249218
1385191 2024-02-20 14:22:33+00:00  19525.0  filecoordinationd   0.139527
1385204 2024-02-20 14:22:33+00:00  19525.0  filecoordinationd   0.139527
1375346 2024-02-20 14:26:18+00:00  19525.0  filecoordinationd   0.139401
1375359 2024-02-20 14:26:18+00:00  19525.0  filecoordinationd   0.139401
2471831 2024-03-11 09:33:16+00:00   5972.0       searchpartyd   0.131080

Top Isolation Forest Anomalies:
                readableTimestamp      pid                           procName  \
1928259 2024-04-22 18:09:46+00:00  86414.0                      