In [1]:
import pandas as pd
from src.StreamPort.machine_learning.analyses import MachineLearningAnalyses

train_data = pd.read_csv("dev/train_features.csv")
train_metadata = pd.read_csv("dev/train_metadata.csv")
ana = MachineLearningAnalyses(variables = train_data, metadata = train_metadata)
print(ana)
ana.plot_data()


MachineLearningAnalyses 
  variables: 20 rows, 13 columns
  metadata: 20 rows, 14 columns



In [2]:
import numpy as np
from src.StreamPort.machine_learning.methods import MachineLearningMethodIsolationForestSklearn
from src.StreamPort.machine_learning.methods import MachineLearningScaleFeaturesScalerSklearn

scl = MachineLearningScaleFeaturesScalerSklearn(scaler_type="MaxAbsScaler")
iso = MachineLearningMethodIsolationForestSklearn(random_state = 30, contamination = "auto")

#scales and fits the model and upgrades analyses class to IsolationForestAnalyses
ana = scl.run(ana)
ana = iso.run(ana)
print(ana.__class__)

ana.plot_data()

<class 'src.StreamPort.machine_learning.analyses.IsolationForestAnalyses'>


In [3]:
fig_train_scores=ana.plot_scores()
fig_train_scores.write_image("dev/figures/fig_train_scores.png", width=1100, height= 350, scale = 3)
fig_train_scores.show()

In [4]:
test_data = pd.read_csv("dev/test_features.csv")
test_metadata = pd.read_csv("dev/test_metadata.csv")

ana.predict(test_data, test_metadata)
outliers_test = ana.test_prediction_outliers()
print(outliers_test)

fig_test_scores = ana.plot_scores()
fig_test_scores.write_image("dev/figures/fig_test_scores.png", width=1100, height= 350, scale = 3)
fig_test_scores.show()

outliers_test["outlier"] = outliers_test["outlier"].map({True: "outlier", False: "normal"})
outliers_test["class"] = outliers_test["outlier"]
outliers_test = outliers_test.drop(columns=["outlier"])
if "dev/test_metadata_classified.csv":
    test_metadata = pd.read_csv("dev/test_metadata_classified.csv")  
else:
    test_metadata = pd.concat([test_metadata, outliers_test], axis=1)
test_metadata.to_csv("dev/test_metadata_classified.csv", index=False)

    outlier     score
0      True -0.165777
1     False -0.050186
2     False -0.011833
3     False -0.050641
4     False -0.025044
5     False -0.035669
6     False -0.026595
7     False -0.076644
8     False -0.047640
9     False -0.041304
10    False -0.065239
11    False -0.032293
12    False -0.005843
13    False -0.021890
14    False -0.037300
15    False -0.037234
16    False -0.066698


# First addition:
of new training data based on previous prediction results

In [5]:
ana.add_prediction()
print(ana)


IsolationForestAnalyses 
  variables: 36 rows, 13 columns
  metadata: 36 rows, 14 columns



In [6]:
fig_test_features = ana.plot_data()
fig_test_features.write_image("dev/figures/fig_test_features.png", width=1100, height= 350, scale = 3)
fig_test_features.show()

In [7]:
test_data = pd.read_csv("dev/test3_features.csv")
test_metadata = pd.read_csv("dev/test3_metadata.csv")
ana.predict(test_data, test_metadata)
outliers_test = ana.test_prediction_outliers()
print(outliers_test)
ana.plot_scores()

   outlier     score
0     True -0.174050
1     True -0.081416
2     True -0.155085
3     True -0.133496
4     True -0.152018


In [8]:
ana.add_prediction()
print(ana)


IsolationForestAnalyses 
  variables: 36 rows, 13 columns
  metadata: 36 rows, 14 columns



In [9]:
test2_data = pd.read_csv("dev/test2_features.csv")
test2_metadata = pd.read_csv("dev/test2_metadata.csv")
ana.predict(test2_data, test2_metadata)
outliers_test2 = ana.test_prediction_outliers()
print(outliers_test2)
fig_test2_scores = ana.plot_scores()
fig_test2_scores.write_image("dev/figures/fig_test2_scores.png", width=1100, height= 350, scale = 3)
fig_test2_scores.show()

   outlier     score
0     True -0.127485
1     True -0.139783
2     True -0.133496
3     True -0.174050
4     True -0.155085


In [10]:
ana.add_prediction()
print(ana)


IsolationForestAnalyses 
  variables: 36 rows, 13 columns
  metadata: 36 rows, 14 columns



In [11]:
fig_test2_features = ana.plot_data()
fig_test2_features.write_image("dev/figures/fig_test2_features.png", width=1100, height= 350, scale = 3)
fig_test2_features.show()

In [12]:
test3_data = pd.read_csv("dev/test3_features.csv")
test3_metadata = pd.read_csv("dev/test3_metadata.csv")
ana.predict(test3_data, test3_metadata)
outliers_test3 = ana.test_prediction_outliers()
print(outliers_test3)
fig_test3_scores = ana.plot_scores()
fig_test3_scores.write_image("dev/figures/fig_test3_scores.png", width=1100, height= 350, scale = 3)
fig_test3_scores.show()

   outlier     score
0     True -0.174050
1     True -0.081416
2     True -0.155085
3     True -0.133496
4     True -0.152018


In [13]:
ana.add_prediction()
print(ana)
#ana.plot_scores()


IsolationForestAnalyses 
  variables: 36 rows, 13 columns
  metadata: 36 rows, 14 columns



In [14]:
fig_test3_features = ana.plot_data()
fig_test3_features.show()

In [15]:
threshold_record = pd.read_csv("dev/threshold_record.csv", index_col=0) if "dev/threshold_record.csv" else None
import plotly.graph_objects as go

fig = go.Figure()
if threshold_record is not None:
    fig.add_trace(
        go.Scatter(
            x=threshold_record["training set"],
            y=threshold_record["threshold"],
            mode="lines+markers",
            name="Threshold",
            text=threshold_record.index,
            hovertemplate=["<br>Trained on: " + threshold_record.index[i]
                            + "<br>Threshold: " + str(threshold_record["threshold"][i]) 
                            + "<br>Outliers in test set: " + str(threshold_record["outliers"][i]) 
                            + "<br>Outliers %: " + str(threshold_record["outliers %"][i]) + "%" for i in range(len(threshold_record))
                            ],
            line=dict(color="red", width=2, dash='dash'),
            marker=dict(size=8, symbol="circle")
        )
    )
fig.update_layout(
    title="Threshold Increase Over Size of Training Set",
    xaxis_title="Number of Training Curves",
    yaxis_title="Threshold",
    template="plotly_white"
)


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`

