# Exploratory Data Analysis

This notebook performs exploratory data analysis for the Tobamovirus classification project. It includes visualizations and statistical summaries of the data used for model training and evaluation.

In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the training input table; the path is relative to the notebook's
# working directory.
TRAINING_INPUT_PATH = '../results/training/training_input.csv'
training_data = pd.read_csv(TRAINING_INPUT_PATH)

# Preview the leading rows as a quick sanity check of the loaded columns.
training_preview = training_data.head()
training_preview

In [None]:
# Descriptive statistics of the training data, as computed by
# DataFrame.describe(); the table is displayed as the cell output.
summary_statistics = training_data.describe()
summary_statistics

In [None]:
# Histogram (30 bins, with a KDE overlay) of a single column.
# NOTE(review): 'feature_name' looks like a placeholder -- confirm it matches
# a real column in the training input before running this cell.
hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
sns.histplot(training_data['feature_name'], bins=30, kde=True, ax=hist_ax)
hist_ax.set_title('Distribution of Feature Name')
hist_ax.set_xlabel('Feature Name')
hist_ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Correlation heatmap
# Correlation is only defined for numeric data, so restrict the frame to
# numeric/boolean columns before calling corr(). Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# instead, so any string columns (e.g. identifiers or labels, if present)
# would otherwise break this cell. select_dtypes() works on old and new pandas.
numeric_columns = training_data.select_dtypes(include=['number', 'bool'])
correlation_matrix = numeric_columns.corr()

# Annotate each cell with the coefficient rounded to two decimals.
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()