## MachineLearningEngine Class

The MachineLearningEngine class builds on the CoreEngine class. The CoreEngine class serves as a parent class for engines that focus on data, while the MachineLearningEngine class is for engines that focus on learning from data.

In [None]:
from src.StreamPort.ml.MachineLearningEngine import MachineLearningEngine

# Create an empty MachineLearningEngine object and print it
engine = MachineLearningEngine()
engine.print()

## MachineLearningAnalysis Class

The MachineLearningAnalysis class builds on the Analysis class. The Analysis class is used to perform analysis on the data.

In [None]:
from src.StreamPort.ml.MachineLearningAnalysis import MachineLearningAnalysis

# Create an empty MachineLearningAnalysis object and print it
analysis = MachineLearningAnalysis()
analysis.print()

#### Load the CSV File  

This section loads the dataset from a CSV file and creates a list of analysis objects. It then uses the data to build a matrix labeled with the analysis names and visualizes the results using a scatter plot.

In [None]:
from src.StreamPort.ml.MachineLearningEngine import MachineLearningEngine
from sklearn.decomposition import PCA 
import matplotlib.pyplot as plt

# Create a MachineLearningEngine and add the analyses read from the CSV file
path = 'feature_list.csv'
engine = MachineLearningEngine()
engine.add_analyses_from_csv(path)

# Print the engine overview
engine.print()

# Print the name and every data entry of each analysis held by the engine
# NOTE(review): this reads the non-public `_analyses` attribute directly
print("Create a list of analysis object and prints it" )
for analysis in engine._analyses:
    print(f"Analysis: {analysis.name}")
    for key, value in analysis.data.items():
        print(f"{key}: {value}")
    print("\n")

# Analysis names, used below as the row labels of the data matrix
rownames = engine.get_analyses_names()
print("Analysename: ", rownames)

# Data returned by the engine, with its index replaced by the analysis names
mat = engine.get_data()
mat.index = rownames
print("Matrix: \n", mat)


#### Make a Principal Component Analysis (PCA)

This section uses the machine learning engine to perform PCA on the dataset and visualize the results. ProcessingSettings is the parent class of MakePCA. ProcessingSettings are used to assemble data processing workflows within each engine. The subclass MakePCASKL of MakePCA uses the scikit-learn algorithm to perform the PCA.

In [None]:
from src.StreamPort.ml.MachineLearningEngine import MachineLearningEngine
from src.StreamPort.ml.MachineLearningProcessingSettings import  MakeModelPCASKL
import webbrowser

from pathlib import Path  # local import: used to build file:// URIs for the browser

# Create a MachineLearningEngine and add the analyses read from the CSV file
path = 'feature_list.csv'
engine = MachineLearningEngine()
engine.add_analyses_from_csv(path)

# Add the classes from the metadata CSV file
class_path = 'feature_metadata.csv'
engine.add_classes_from_csv(class_path)

engine.print()
#print(engine.get_classes())

# General data plot:
#   x axis: index of the features (i.e., column names)
#   y axis: value for each analysis
#   colour legend: one entry per analysis
engine.plot_data()
# webbrowser.open() is only portable for URLs, not bare file names, so the
# report is opened through an absolute file:// URI.
# NOTE(review): presumably plot_data() writes 'general_data_plot.html' into the
# current working directory - confirm against the engine implementation.
webbrowser.open(Path('general_data_plot.html').resolve().as_uri())

# Add the PCA ProcessingSettings to the engine via add_settings
pca_model = MakeModelPCASKL(n_components = 2, center_data= True)
engine.add_settings(pca_model)
engine.print()

# Run the workflow; per the original notes the results are collected in the
# engine's _results attribute (not visible from this cell).
engine.run_workflow()

# Plot the PCA results and classes, then open the scores plot and the loadings plot
# NOTE(review): presumably plot_pca() writes both HTML files opened below - confirm.
engine.plot_pca()
webbrowser.open(Path('pca_scores_plot.html').resolve().as_uri())
webbrowser.open(Path('pca_loadings_plot.html').resolve().as_uri())


#### Make a Density-Based Spatial Clustering of Applications with Noise (DBSCAN)



In [30]:
from src.StreamPort.ml.MachineLearningEngine import MachineLearningEngine
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
import plotly.express as px

# NOTE(review): with these parameters the recorded run of this cell labelled
# all 45 rows as noise (0 clusters); the values likely need tuning or the data
# needs scaling / dimensionality reduction first.
eps = 1.5  # Radius around a point within which other points count as neighbours
min_samples = 5  # Minimum number of neighbours a point needs in order not to be a noise point

path = 'feature_list.csv'  # Path to the CSV file
engine = MachineLearningEngine()  # Instantiate the MachineLearningEngine
engine.add_analyses_from_csv(path)  # Load the data into the engine
engine.print()  # Print information about the loaded data

# The clustering below works on the raw CSV table, not on the engine's data.
df = pd.read_csv(path)
# Drop the 'name' column only if it exists (errors='ignore' avoids a KeyError otherwise)
data = df.drop(columns=['name'], errors='ignore')

dbscan = DBSCAN(eps=eps, min_samples=min_samples)
labels = dbscan.fit_predict(data)  # Cluster labels (noise = -1)

n_clusters = len(set(labels)) - (1 if -1 in labels else 0)  # Number of clusters, noise excluded
n_noise = list(labels).count(-1)  # Number of noise points
print(f"Anzahl der geschätzten Cluster: {n_clusters}")
print(f"Anzahl der Rauschpunkte: {n_noise}")

data['Cluster'] = labels  # Attach the cluster labels to the data
data['Cluster'] = data['Cluster'].astype(str)  # Strings so that clusters are shown as categories

# Scatter the first two columns, coloured by cluster.
# `labels` must be keyed by column name to rename the legend title; a continuous
# colour scale is not passed because the colour column is categorical.
fig = px.scatter(data, x=data.columns[0], y=data.columns[1], color='Cluster',
                 title="DBSCAN Clustering Results",
                 labels={'Cluster': 'Cluster ID'})

fig.show()  # Display the plot

Structure of the CSV file: {'number_of_rows': 45, 'number_of_columns': 4445}

MachineLearningEngine 
  name: None 
  author: None 
  path: None 
  date: 2024-10-29 17:12:24.115177 
  analyses: 45 
  settings: 0 

Anzahl der geschätzten Cluster: 0
Anzahl der Rauschpunkte: 45


### Uniform Manifold Approximation and Projection (UMAP)

In [None]:
from src.StreamPort.ml.MachineLearningEngine import MachineLearningEngine
from src.StreamPort.ml.MachineLearningProcessingSettings import  MakeModelUMAP

# Create a MachineLearningEngine and add the analyses read from the CSV file
path = 'feature_list.csv'
engine = MachineLearningEngine()
engine.add_analyses_from_csv(path)

# Add the classes from the metadata CSV file and print the engine
class_path = 'feature_metadata.csv'
engine.add_classes_from_csv(class_path)
engine.print()

# Add the UMAP settings to the engine, run the workflow and plot the result
umap_model = MakeModelUMAP(n_neighbors=15, min_dist=0.1, n_components=2,random_state=42)
engine.add_settings(umap_model)
engine.print()
engine.run_workflow()
engine.plot_umap()


In [None]:
from src.StreamPort.ml.MachineLearningEngine import MachineLearningEngine
from src.StreamPort.ml.MachineLearningProcessingSettings import  MakeModelUMAP

# Create a MachineLearningEngine and add the analyses read from the CSV file
path = 'feature_list.csv'
engine = MachineLearningEngine()
engine.add_analyses_from_csv(path)

class_path = 'new_metadata.csv'

# March: call month_march with the metadata file, then add UMAP settings,
# run the workflow and plot
engine.month_march(class_path)
engine.print()
umap_model_march = MakeModelUMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)
engine.add_settings(umap_model_march)
engine.run_workflow()
engine.plot_umap()

# Same steps with month_april, on the same engine object (original note: "for both")
# NOTE(review): the March settings were already added to this engine above and
# are presumably still registered when the workflow runs again - confirm that
# this is the intended way to plot both months.
engine.month_april(class_path)
engine.print()
umap_model_april = MakeModelUMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)
engine.add_settings(umap_model_april)
engine.run_workflow()
engine.plot_umap()

In [None]:
from src.StreamPort.ml.MachineLearningEngine import MachineLearningEngine
from src.StreamPort.ml.MachineLearningProcessingSettings import  MakeModelUMAP

# Create a fresh MachineLearningEngine and add the analyses read from the CSV file
path = 'feature_list.csv'
engine = MachineLearningEngine()
engine.add_analyses_from_csv(path)

class_path = 'new_metadata.csv'

# April only: call month_april with the metadata file, then add UMAP settings,
# run the workflow and plot
engine.month_april(class_path)
engine.print()
umap_model_april = MakeModelUMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)
engine.add_settings(umap_model_april)
engine.run_workflow()
engine.plot_umap()

### Random Forest Classifier

In [None]:
import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder


# Load the metadata table; the 'class' column is the prediction target.
df = pd.read_csv('new_metadata.csv')
target_column = 'class'

# Encode every object-typed column as integers and keep the fitted encoders,
# so the original class names can be recovered for the plots below.
label_encoders = {}
for column in df.columns:
    if df[column].dtype == 'object':
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        label_encoders[column] = le

# Separate the features from the target.
X = df.drop(columns=[target_column])
y = df[target_column]

# Split features and target in a single call so that rows and labels stay
# aligned by construction instead of relying on two calls sharing a seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the random forest model.
rfc = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=42)
rfc.fit(X_train, y_train)

# Predict on the held-out rows and report the scores.
y_pred = rfc.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("accuracy:", accuracy)
class_report = classification_report(y_test,y_pred)
print("classificatonReport:\n", class_report)

# Resolve the full list of class ids and display names. If the target column
# was already numeric it has no encoder, so fall back to its sorted values.
if target_column in label_encoders:
    class_names = label_encoders[target_column].classes_
    class_ids = list(range(len(class_names)))
else:
    class_ids = sorted(y.unique())
    class_names = [str(class_id) for class_id in class_ids]

# Pass the full label list so the matrix always has one row/column per class,
# even when a class is missing from the test split; otherwise its shape would
# not match `class_names` and building the DataFrame would fail.
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_ids)
print("confusionMatrix:\n", conf_matrix)

conf_matrix_df = pd.DataFrame(conf_matrix,
                              index=class_names,
                              columns=class_names)

# Visualize the confusion matrix as a heat map.
fig_conf_matrix = px.imshow(conf_matrix_df,
                            labels=dict(x="Predicted", y="Actual", color="Count"),
                            x=class_names,
                            y=class_names,
                            title="Confusion Matrix")

fig_conf_matrix.update_layout(coloraxis_showscale=True)
fig_conf_matrix.show()

# Plot the importance of each feature with Plotly.
feature_importances = rfc.feature_importances_
features = X.columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
fig = px.bar(importance_df, x='Feature', y='Importance', title='Random Forest Classifier')
fig.show()


In [None]:
import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# Load the class table; the 'class' column is the prediction target.
df = pd.read_csv('groups_classes.csv')
target_column = 'class'

# Encode every object-typed column as integers and keep the fitted encoders,
# so the original class names can be recovered for the plots below.
label_encoders = {}
for column in df.columns:
    if df[column].dtype == 'object':
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        label_encoders[column] = le

# Separate the features from the target.
X = df.drop(columns=[target_column])
y = df[target_column]

# Split features and target in a single call so that rows and labels stay
# aligned by construction instead of relying on two calls sharing a seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the random forest model.
rfc = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=42)
rfc.fit(X_train, y_train)

# Predict on the held-out rows and report the accuracy.
y_pred = rfc.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("accuracy:", accuracy)
# class_report = classification_report(y_test,y_pred)
# print("classificatonReport:\n", class_report)

# Resolve the full list of class ids and display names. If the target column
# was already numeric it has no encoder, so fall back to its sorted values.
if target_column in label_encoders:
    class_names = label_encoders[target_column].classes_
    class_ids = list(range(len(class_names)))
else:
    class_ids = sorted(y.unique())
    class_names = [str(class_id) for class_id in class_ids]

# Pass the full label list so the matrix always has one row/column per class,
# even when a class is missing from the test split; otherwise its shape would
# not match `class_names` and building the DataFrame would fail.
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_ids)
print("confusionMatrix:\n", conf_matrix)

conf_matrix_df = pd.DataFrame(conf_matrix,
                              index=class_names,
                              columns=class_names)

# Visualize the confusion matrix as a heat map.
fig_conf_matrix = px.imshow(conf_matrix_df,
                            labels=dict(x="Predicted", y="Actual", color="Count"),
                            x=class_names,
                            y=class_names,
                            title="Confusion Matrix")

fig_conf_matrix.update_layout(coloraxis_showscale=True)
fig_conf_matrix.show()

# Plot the importance of each feature with Plotly.
feature_importances = rfc.feature_importances_
features = X.columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
fig = px.bar(importance_df, x='Feature', y='Importance', title='Random Forest Classifier')
fig.show()


### Hierarchical Density-Based Spatial Clustering of Applications with Noise (HDBSCAN)

In [37]:
import pandas as pd
import plotly.express as px
from sklearn.cluster import HDBSCAN
from sklearn.preprocessing import LabelEncoder

# Read the feature table and keep every column except 'name'.
path = 'feature_list.csv'
df = pd.read_csv(path)
data = df.drop(columns=['name'])

# Replace each object-typed column by integer codes; the fitted encoder of
# every converted column is remembered under the column name.
label_encoders = {}
for column in data.columns:
    if data[column].dtype != 'object':
        continue  # non-object columns are left untouched
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Cluster with HDBSCAN (default parameters) and store the label of each row.
clusterer = HDBSCAN()
data['cluster'] = clusterer.fit_predict(data)

# Scatter plot of the first two columns, coloured by the cluster label.
fig = px.scatter(
    data,
    x=data.columns[0],
    y=data.columns[1],
    color='cluster',
    title='HDBSCAN Clusters',
)
fig.show()


### K-Means Clustering

In [None]:
from src.StreamPort.ml.MachineLearningEngine import MachineLearningEngine

import pandas as pd
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder

# Build an engine from the CSV file and print it. The clustering further down
# works on the table read with pandas, not on the engine.
path = 'feature_list.csv'
engine = MachineLearningEngine()
engine.add_analyses_from_csv(path)
engine.print()

# Read the table and keep every column except 'name'.
df = pd.read_csv(path)
data = df.drop(columns=['name'])

# Replace each object-typed column by integer codes; the fitted encoder of
# every converted column is remembered under the column name.
label_encoders = {}
for column in data.columns:
    if data[column].dtype != 'object':
        continue  # non-object columns are left untouched
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Two principal components, used only for the 2D visualisation.
pca = PCA(n_components=2)
data_pca = pca.fit_transform(data)

# K-Means is fitted on the full (non-reduced) table; adjust n_clusters as needed.
kmeans = KMeans(n_clusters=5, random_state=42)
cluster_labels = kmeans.fit_predict(data)

# PCA coordinates plus the cluster label as a string (categorical colouring).
pca_df = pd.DataFrame(data_pca, columns=['PCA1', 'PCA2'])
pca_df['Cluster'] = cluster_labels.astype(str)

# Scatter plot of the PCA coordinates, coloured by cluster.
fig = px.scatter(
    pca_df,
    x='PCA1',
    y='PCA2',
    color='Cluster',
    title='KMeans Clustering Results',
    labels={'PCA1': 'Principal Component 1', 'PCA2': 'Principal Component 2'},
)

fig.show()

### NEW DATA ###

Plot the 'neg' and 'pos' classes with UMAP.

In [None]:
import pandas as pd
import numpy as np
from src.StreamPort.ml.MachineLearningEngine import MachineLearningEngine, MachineLearningAnalysis
from src.StreamPort.ml.MachineLearningProcessingSettings import MakeModelUMAP

path = 'groups_ints.csv'
class_path = 'groups_classes.csv'
df = pd.read_csv(class_path)

# Split the class table by polarity.
df_pos = df[df['polarity'] == 'positive']
df_neg = df[df['polarity'] == 'negative']

# Both polarities go through exactly the same steps, so a single loop handles
# them (negative first, then positive) instead of two copy-pasted sections.
for banner, subset in (("plot neg class", df_neg), ("plot pos classes", df_pos)):
    print(banner)

    # Fresh engine for this polarity.
    # NOTE(review): the analyses are loaded from the same file for both
    # polarities, with no filtering visible here - confirm that the classes
    # added below line up with the loaded analyses.
    engine = MachineLearningEngine()
    engine.add_analyses_from_csv(path)

    # The column names (first column skipped) are identical for every row,
    # so the list is built once per subset rather than once per row.
    feature_names = subset.columns.tolist()[1:]

    # Add the class of every row whose analysis object passes validation.
    for index, row in subset.iterrows():
        row_value = row.tolist()[1:]
        class_name = row['class']
        # `ana` only serves as a validation gate; the class name is what gets added.
        ana = MachineLearningAnalysis(name=str(class_name), data={"x": np.array(feature_names), "y": np.array(row_value)})
        if ana.validate():
            engine.add_classes(class_name)
        else:
            print(f"Analysis {class_name} did not pass validation.")

    # Add the UMAP settings, run the workflow and plot the result.
    umap_model = MakeModelUMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)
    engine.add_settings(umap_model)
    engine.run_workflow()
    engine.plot_umap()


PCA plot for the new data.

In [None]:
from src.StreamPort.ml.MachineLearningEngine import MachineLearningEngine
from src.StreamPort.ml.MachineLearningProcessingSettings import  MakeModelPCASKL
import webbrowser

from pathlib import Path  # local import: used to build file:// URIs for the browser

# Create a MachineLearningEngine and add the analyses read from the CSV file
path = 'groups_ints.csv'
engine = MachineLearningEngine()
engine.add_analyses_from_csv(path)

# Add the classes from the class CSV file, print the engine and plot the data
class_path = 'groups_classes.csv'
engine.add_classes_from_csv(class_path)
engine.print()
engine.plot_data()
# webbrowser.open() is only portable for URLs, not bare file names, so the
# report is opened through an absolute file:// URI.
# NOTE(review): presumably plot_data() writes 'general_data_plot.html' into the
# current working directory - confirm against the engine implementation.
webbrowser.open(Path('general_data_plot.html').resolve().as_uri())

# Add the PCA settings, run the workflow, plot, and open both PCA reports
# NOTE(review): presumably plot_pca() writes both HTML files opened below - confirm.
pca_model = MakeModelPCASKL(n_components = 2, center_data= True)
engine.add_settings(pca_model)
engine.print()
engine.run_workflow()
engine.plot_pca()
webbrowser.open(Path('pca_scores_plot.html').resolve().as_uri())
webbrowser.open(Path('pca_loadings_plot.html').resolve().as_uri())

In [None]:
from src.StreamPort.ml.MachineLearningEngine import MachineLearningEngine, MachineLearningAnalysis
from src.StreamPort.ml.MachineLearningProcessingSettings import  MakeModelPCASKL
import pandas as pd
import numpy as np

path = 'groups_ints.csv'
class_path = 'groups_classes.csv'
df = pd.read_csv(class_path)

# Split the class table by polarity.
df_pos = df[df['polarity'] == 'positive']
df_neg = df[df['polarity'] == 'negative']

# Both polarities go through exactly the same steps, so a single loop handles
# them (negative first, then positive) instead of two copy-pasted sections.
for banner, subset in (("plot neg class", df_neg), ("plot pos classes", df_pos)):
    print(banner)

    # Fresh engine for this polarity.
    # NOTE(review): the analyses are loaded from the same file for both
    # polarities, with no filtering visible here - confirm that the classes
    # added below line up with the loaded analyses.
    engine = MachineLearningEngine()
    engine.add_analyses_from_csv(path)

    # The column names (first column skipped) are identical for every row,
    # so the list is built once per subset rather than once per row.
    feature_names = subset.columns.tolist()[1:]

    # Add the class of every row whose analysis object passes validation.
    for index, row in subset.iterrows():
        row_value = row.tolist()[1:]
        class_name = row['class']
        # `ana` only serves as a validation gate; the class name is what gets added.
        ana = MachineLearningAnalysis(name=str(class_name), data={"x": np.array(feature_names), "y": np.array(row_value)})
        if ana.validate():
            engine.add_classes(class_name)
        else:
            print(f"Analysis {class_name} did not pass validation.")

    # Add the PCA settings, run the workflow and plot the result.
    pca_model = MakeModelPCASKL(n_components = 2, center_data= True)
    engine.add_settings(pca_model)
    engine.run_workflow()
    engine.plot_pca()


DBSCAN for the new dataset.

In [10]:
from src.StreamPort.ml.MachineLearningEngine import MachineLearningEngine
from src.StreamPort.ml.MachineLearningProcessingSettings import  MakeModelDBSCANSKL

import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
import plotly.express as px

# Create a MachineLearningEngine and add the analyses read from the CSV file
# NOTE(review): the section heading mentions the new dataset, but this cell
# reads 'feature_list.csv' / 'feature_metadata.csv' - confirm the intended input.
path = 'feature_list.csv'
engine = MachineLearningEngine()
engine.add_analyses_from_csv(path)

# Add the classes from the metadata CSV file
class_path = 'feature_metadata.csv'
engine.add_classes_from_csv(class_path)

engine.print()

# Take the engine's data and subtract the per-column mean (axis=0)
data = engine.get_data()
mean = np.mean(data, axis=0)
data = data - mean

# Reduce the centered data to two principal components
pca = PCA(n_components=2)
data_2d = pca.fit_transform(data)
print("Reduced data shape:", data_2d.shape)

# Experiment with DBSCAN parameters
eps = 1.5E6  # Adjust based on the k-distance plot (that plot is not produced in this cell)
min_samples = 3  # Adjust based on your data

# Cluster the 2D PCA coordinates
dbscan = DBSCAN(eps=eps, min_samples=min_samples)
dbscan.fit(data_2d)

labels = dbscan.labels_

# Analyze the clusters: label -1 is counted as noise and excluded from the cluster count
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)

# Create a DataFrame for Plotly
df = pd.DataFrame(data_2d, columns=['PC1', 'PC2'])
df['Cluster'] = labels.astype(str)  # Convert to string for categorical coloring

# Plot the results using Plotly
fig = px.scatter(df, x='PC1', y='PC2', color='Cluster', title='DBSCAN Clustering Results',
                 labels={'PC1': 'Principal Component 1', 'PC2': 'Principal Component 2'})

fig.show()

Structure of the CSV file: {'number_of_rows': 45, 'number_of_columns': 4445}
Structure of the CSV file: {'number_of_rows': 45, 'number_of_columns': 2}

MachineLearningEngine 
  name: None 
  author: None 
  path: None 
  date: 2024-10-29 16:01:52.081486 
  analyses: 45 
  settings: 0 

Reduced data shape: (45, 2)
Estimated number of clusters: 3
Estimated number of noise points: 0


HDBSCAN plot for the new data.


In [None]:
from src.StreamPort.ml.MachineLearningEngine import MachineLearningEngine

import pandas as pd
import plotly.express as px
from sklearn.cluster import HDBSCAN

# Create a MachineLearningEngine, add the analyses from the CSV file and print it
# NOTE(review): the section heading mentions the new data, but this cell reads
# 'feature_list.csv' - confirm the intended input.
path = 'feature_list.csv'
engine = MachineLearningEngine()
engine.add_analyses_from_csv(path)
engine.print()

# Load the data with pandas and drop the 'name' column; the clustering below
# works on this table, not on the engine
df = pd.read_csv(path)
data = df.drop(columns=['name'])

# Perform clustering
clusterer = HDBSCAN(min_cluster_size=3).fit(data)
labels = clusterer.labels_
probabilities = clusterer.probabilities_  # read here but not used further in this cell
print(labels)

# Create scatter plot of the first two columns with color based on labels
fig = px.scatter(data, x=data.columns[0], y=data.columns[1], color=labels)  # NOTE: opacity is not set here; `probabilities` is not applied to the plot
fig.show()


K-Means clustering for the new data.

In [None]:
import pandas as pd
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder

# Read the table and keep every column except 'name'.
df = pd.read_csv('groups_ints.csv')
data = df.drop(columns=['name'])

# Replace each object-typed column by integer codes; the fitted encoder of
# every converted column is remembered under the column name.
label_encoders = {}
for column in data.columns:
    if data[column].dtype != 'object':
        continue  # non-object columns are left untouched
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Two principal components, used only for the 2D visualisation.
pca = PCA(n_components=2)
data_pca = pca.fit_transform(data)

# K-Means is fitted on the full (non-reduced) table; adjust n_clusters as needed.
kmeans = KMeans(n_clusters=5, random_state=42)
cluster_labels = kmeans.fit_predict(data)

# PCA coordinates plus the cluster label as a string (categorical colouring).
pca_df = pd.DataFrame(data_pca, columns=['PCA1', 'PCA2'])
pca_df['Cluster'] = cluster_labels.astype(str)

# Scatter plot of the PCA coordinates, coloured by cluster.
fig = px.scatter(
    pca_df,
    x='PCA1',
    y='PCA2',
    color='Cluster',
    title='KMeans Clustering Results',
    labels={'PCA1': 'Principal Component 1', 'PCA2': 'Principal Component 2'},
)

fig.show()
