In [1]:
import pandas as pd
import scipy.stats as stats

## ANOVA (Analysis of Variance)  (Dataset)


ANOVA (Analysis of Variance) is a statistical test used to compare the means of two or more groups to determine if there are any statistically significant differences between them. It is commonly used when you have multiple groups and want to understand whether the variation between the group means is greater than the variation within the groups.

In [2]:
# Example measurements, one list per group. To compare more groups, define
# further lists and pass them to the test call below.
group1 = [10, 12, 14, 16, 18]
group2 = [20, 22, 24, 26, 28]
group3 = [30, 32, 34, 36, 38]

# Run the one-way ANOVA and keep both the F statistic and the p-value.
anova_result = stats.f_oneway(group1, group2, group3)
f_statistic = anova_result.statistic
p_value = anova_result.pvalue

In [3]:
alpha = 0.05  # significance level used for the decision below

# Choose the conclusion first, then report it once.
if p_value < alpha:
    verdict = "Reject null hypothesis: There is a significant difference between the group means."
else:
    verdict = "Fail to reject null hypothesis: There is no significant difference between the group means."
print(verdict)

Reject null hypothesis: There is a significant difference between the group means.


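In this example, if the p-value is less than the significance level (usually 0.05), you can reject the null hypothesis and conclude that at least one group mean differs significantly from the others. ANOVA on its own does not tell you which groups differ; a post-hoc test (for example, Tukey's HSD) is needed to identify the specific pairs.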

Keep in mind that ANOVA assumes the data within each group is approximately normally distributed and that the groups have equal variances. If your data violates these assumptions, consider transforming the data before applying ANOVA, or using a non-parametric alternative (such as the Kruskal-Wallis test) instead.

In [4]:
import seaborn as sns
import matplotlib.pyplot as plt

# Sample data in long form: one row per observation, holding its group label
# ('group') and its measured value ('metric'). It is defined here so the cell
# runs on a fresh kernel, and under its own name so it cannot be confused
# with any other frame in the notebook.
anova_df = pd.DataFrame({
    'group': ['A'] * 5 + ['B'] * 5 + ['C'] * 5,
    'metric': [10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38],
})

# Draw the same data three ways (bar, box, violin). Each figure is shown
# before the next one is started, so the plots do not overlap.
for plot_func, plot_title in (
    (sns.barplot, 'Bar Plot'),
    (sns.boxplot, 'Box Plot'),
    (sns.violinplot, 'Violin Plot'),
):
    plot_func(x='group', y='metric', data=anova_df)
    plt.title(plot_title)
    plt.show()

ModuleNotFoundError: No module named 'seaborn'

In [None]:
import mlflow

# Point the MLflow client at the tracking server; only needed when the
# default tracking URI is not the one you want.
mlflow.set_tracking_uri("your_mlflow_tracking_uri")

# Retrieve the runs recorded under one experiment, capped at 1000 results.
experiment_id = "your_experiment_id"
runs = mlflow.search_runs(
    experiment_ids=[experiment_id],
    filter_string="",
    max_results=1000,
)

# `runs` is a DataFrame with one row for each run


In [None]:
import pandas as pd

# Keep only the columns needed for the comparison. This assumes every run
# logged "model_name" and "dataset_name" as params and "accuracy" as a metric.
comparison_columns = ["params.model_name", "params.dataset_name", "metrics.accuracy"]
df = runs[comparison_columns]

# Gather the accuracies of each (model, dataset) pair into a list, then
# transpose so datasets form the index and model names form the columns.
df_pivot = df.pivot_table(
    index="params.model_name",
    columns="params.dataset_name",
    values="metrics.accuracy",
    aggfunc=list,
).transpose()
 

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Plot from the long-form frame `df` (one row per run) rather than `df_pivot`:
# every df_pivot cell is a list of accuracies (aggfunc=list), not a number,
# so it is not usable as wide-form input for a box plot.
# Each box pools one model's accuracies over all datasets and runs.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=df, x="params.model_name", y="metrics.accuracy", ax=ax)
ax.set_title("Model Comparison across Datasets")
ax.set_ylabel("Accuracy")
ax.set_xlabel("Model Name")
ax.tick_params(axis="x", labelrotation=45)  # angle the model names so long labels stay readable
plt.show()
