In [None]:
%pip install numpy
%pip install pandas
%pip install scikit-learn
%pip install matplotlib

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score
import pickle
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN

In [None]:

def read_node_data(csv_file_path):
    """Load node readings from a CSV file.

    Args:
        csv_file_path: Path of the CSV file (anything accepted by
            ``pandas.read_csv`` works).

    Returns:
        pandas.DataFrame: The parsed contents of the file.
    """
    return pd.read_csv(csv_file_path)

# CSV file with the node readings; the relative path is resolved against the
# current working directory.
csv_file_path = 'node_data.csv'
# Working DataFrame used by the rest of the notebook.
df = read_node_data(csv_file_path)
# Quick preview of the first rows.
print(df.head())

In [None]:


# Number of consecutive readings averaged per node to obtain the trend.
window_size = 50

# Rolling means are computed per node.  The first ``window_size - 1`` rows of
# every node are NaN because the window is not yet full.
readings_by_node = df.groupby('node_id')
ph_rolling_mean = readings_by_node['ph'].rolling(window=window_size).mean()
tds_rolling_mean = readings_by_node['tds'].rolling(window=window_size).mean()

# Drop the ``node_id`` index level added by the groupby so the values align
# with df's own index on assignment.
df['ph_trend'] = ph_rolling_mean.reset_index(level=0, drop=True)
df['tds_trend'] = tds_rolling_mean.reset_index(level=0, drop=True)

def isolation_forest_scorer(estimator, X, y=None):
    """Score a fitted outlier detector by the samples it flags as anomalous.

    The function follows scikit-learn's scorer-callable protocol
    ``scorer(estimator, X, y)`` so it can be handed directly to ``scoring=``.
    ``y`` exists only for that protocol and is ignored.

    NOTE(review): no ground-truth labels are used.  Every sample is treated as
    a true anomaly, so the returned F1 grows monotonically with the share of
    samples predicted as ``-1``; it measures how much is flagged, not how
    well.  Confirm this is the intended selection criterion.

    Args:
        estimator: Fitted estimator whose ``predict`` labels outliers ``-1``.
        X: Samples to score.
        y: Ignored; accepted for scorer-protocol compatibility.

    Returns:
        float: F1 score of "flagged as outlier" against an all-positive target.
    """
    y_pred = estimator.predict(X)
    # Pseudo ground truth: every sample is assumed to be an anomaly.
    y_true = np.ones(len(X), dtype=int)
    # Compare ints with ints instead of mixing float labels with booleans.
    flagged = (y_pred == -1).astype(int)
    return f1_score(y_true, flagged)

# Candidate hyper-parameters for the Isolation Forest (3 * 3 * 3 * 3 = 81
# combinations, each evaluated on 3 folds).
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_samples': ['auto', 0.5, 0.75],
    'contamination': [0.05, 0.1, 0.15],
    'max_features': [1.0, 0.75, 0.5]
}

iso_forest = IsolationForest(random_state=42)

# ``isolation_forest_scorer`` is itself a scorer callable (it receives the
# fitted estimator and the data), so it is passed to ``scoring`` directly.
# ``make_scorer`` is meant for metric functions of the form
# ``metric(y_true, y_pred)``; wrapping the scorer in it makes every evaluation
# fail when no ``y`` is given to ``fit``, leaving all candidates with a NaN score.
# NOTE(review): the scorer rewards flagging more samples, so the search will
# likely favour the largest ``contamination`` - confirm this is intended.
grid_search = GridSearchCV(
    estimator=iso_forest,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    scoring=isolation_forest_scorer,
    verbose=2
)

# Unsupervised search: only the feature columns are supplied, no target.
grid_search.fit(df[['ph', 'tds']])

best_params = grid_search.best_params_

print("Best parameters found: ", best_params)

# Features the detector is trained on and later applied to.
sensor_readings = df[['ph', 'tds']]

# Train the final detector on all readings with the winning hyper-parameters
# (best_params holds exactly the keys of the search grid).
optimized_clf = IsolationForest(random_state=42, **best_params)
optimized_clf.fit(sensor_readings)

# Persist the model, then read it back so predictions come from the stored
# artefact.  pickle.load must only be used on files this notebook wrote itself.
model_path = 'optimized_clf.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(optimized_clf, model_file)
with open(model_path, 'rb') as model_file:
    loaded_clf = pickle.load(model_file)

# IsolationForest.predict labels outliers -1 and inliers 1.
df['anomaly_optimized'] = loaded_clf.predict(sensor_readings)

# Rows flagged as outliers by the Isolation Forest.
anomalous_nodes_optimized = df.loc[df['anomaly_optimized'] == -1]

# Density clustering on the same two features; DBSCAN labels noise points -1.
db = DBSCAN(eps=1.5, min_samples=10)
db.fit(df[['ph', 'tds']])
df['cluster_optimized'] = db.labels_

# Nodes owning at least one anomalous row.
core_nodes_optimized = anomalous_nodes_optimized['node_id'].unique()

# affected node -> list of core nodes sharing its cluster.
affected_by = {}
affected_candidates = set()

for node_id in core_nodes_optimized:
    # The cluster of a node is taken from its first row in df.
    node_cluster = df.loc[df['node_id'] == node_id, 'cluster_optimized'].iloc[0]
    if node_cluster == -1:
        # Noise points have no cluster to propagate through.
        continue
    cluster_members = df.loc[df['cluster_optimized'] == node_cluster, 'node_id'].unique()
    affected_nodes = [node for node in cluster_members if node != node_id]
    affected_candidates.update(affected_nodes)
    for affected_node in affected_nodes:
        affected_by.setdefault(affected_node, []).append(node_id)

# A node that is itself anomalous is reported as a core node, not as affected.
affected_nodes_optimized = [
    node for node in affected_candidates if node not in core_nodes_optimized
]
# Trend levels above which a node is considered at risk.
risk_threshold_ph = 8.5
risk_threshold_tds = 400

# Rows whose rolling trend exceeds either threshold (NaN trends never match).
ph_trend_too_high = df['ph_trend'].gt(risk_threshold_ph)
tds_trend_too_high = df['tds_trend'].gt(risk_threshold_tds)
at_risk_nodes = df[ph_trend_too_high | tds_trend_too_high]
at_risk_node_ids = at_risk_nodes['node_id'].unique()

# Filter each row group once instead of once per plotted axis.
normal_rows = df[df['anomaly_optimized'] == 1]
at_risk_rows = df[df['node_id'].isin(at_risk_node_ids)]

plt.figure(figsize=(10, 8))

# Layers are drawn in this order, so later ones cover earlier ones.
scatter_layers = [
    (normal_rows, 'blue', 'Các node bình thường'),
    (anomalous_nodes_optimized, 'red', 'Các node bị ô nhiễm'),
    (at_risk_rows, 'purple', 'Các node có nguy cơ ô nhiễm'),
]
for layer_rows, layer_color, layer_label in scatter_layers:
    plt.scatter(layer_rows['ph'], layer_rows['tds'], color=layer_color, label=layer_label)

plt.xlabel('pH')
plt.ylabel('TDS')
plt.title('Phân bố các node trong mạng IOT')
plt.legend()
plt.grid(True)
plt.show()

result_optimized = []

# Build the membership sets once: the original recomputed ``unique()`` for
# every node and scanned arrays/lists linearly on each membership test.
polluted_node_ids = set(anomalous_nodes_optimized['node_id'].unique())
affected_node_ids = set(affected_nodes_optimized)
at_risk_node_id_set = set(at_risk_node_ids)

for node_id in df['node_id'].unique():
    node_data = df[df['node_id'] == node_id]
    node_type = 'normal'
    extra_info = {}
    # Precedence: polluted > effected > risk > normal.
    if node_id in polluted_node_ids:
        node_type = 'polluted'
    elif node_id in affected_node_ids:
        # NOTE(review): 'effected' (sic) is kept as-is; consumers of the
        # result may match on this exact label - confirm before renaming.
        node_type = 'effected'
    elif node_id in at_risk_node_id_set:
        node_type = 'risk'
        # For at-risk nodes the reported means are those of the rolling trend;
        # they replace the raw means set below via result.update().
        extra_info = {
            'meanPh': node_data['ph_trend'].mean(),
            'meanTDS': node_data['tds_trend'].mean()
        }
    result = {
        'node_id': node_id,
        'node_type': node_type,
        'meanPh': node_data['ph'].mean(),
        'meanTDS': node_data['tds'].mean()
    }
    result.update(extra_info)
    result_optimized.append(result)

# One row per node: id, classification and mean pH / TDS.
result_optimized_df = pd.DataFrame(result_optimized)
print(result_optimized_df)
