In [1]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load datasets
original_data = pd.read_csv('/content/chikungunya.csv')
merged_data = pd.read_csv('/content/merged_chikungunya_dataset.csv')

# Fix arthritis column for consistency
original_data['arthritis'] = original_data['arthritis'].replace({'yes': 1, 'no': 0}).astype(int)
merged_data['arthritis'] = merged_data['arthritis'].replace({'yes': 1, 'no': 0}).astype(int)

# Calculate sample sizes
sample_sizes = {
    'Dataset': ['Original', 'Merged'],
    'Sample Size': [len(original_data), len(merged_data)]
}

# Calculate class distributions (proportions)
original_dist = original_data['arthritis'].value_counts(normalize=True) * 100
merged_dist = merged_data['arthritis'].value_counts(normalize=True) * 100

# Prepare data for class distribution
class_dist_data = {
    'Dataset': ['Original', 'Original', 'Merged', 'Merged'],
    'Class': ['No Arthritis (0)', 'Arthritis (1)', 'No Arthritis (0)', 'Arthritis (1)'],
    'Proportion (%)': [
        original_dist.get(0, 0),
        original_dist.get(1, 0),
        merged_dist.get(0, 0),
        merged_dist.get(1, 0)
    ]
}

# Create DataFrames
sample_df = pd.DataFrame(sample_sizes)
class_dist_df = pd.DataFrame(class_dist_data)

# Set up figure with two subplots
plt.figure(figsize=(10, 5), dpi=300)

# Subplot 1: Sample Sizes
plt.subplot(1, 2, 1)
sns.barplot(x='Dataset', y='Sample Size', data=sample_df, palette='Blues')
plt.title('Sample Size Comparison', fontsize=12)
plt.ylabel('Number of Samples', fontsize=10)
plt.xlabel('Dataset', fontsize=10)
for i, v in enumerate(sample_df['Sample Size']):
    plt.text(i, v + 50, str(v), ha='center', fontsize=10)

# Subplot 2: Class Distribution
plt.subplot(1, 2, 2)
sns.barplot(x='Dataset', y='Proportion (%)', hue='Class', data=class_dist_df, palette='Set2')
plt.title('Class Distribution Comparison', fontsize=12)
plt.ylabel('Proportion (%)', fontsize=10)
plt.xlabel('Dataset', fontsize=10)
plt.legend(title='Class', fontsize=8)
for p in plt.gca().patches:
    plt.text(p.get_x() + p.get_width()/2, p.get_height() + 1, f'{p.get_height():.1f}%',
             ha='center', fontsize=8)

# Adjust layout and save
plt.tight_layout()
plt.savefig('/content/sample_class_comparison.png', dpi=300, bbox_inches='tight')
plt.close()


  original_data['arthritis'] = original_data['arthritis'].replace({'yes': 1, 'no': 0}).astype(int)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Dataset', y='Sample Size', data=sample_df, palette='Blues')


In [2]:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt



# Fix data types for consistency
merged_data['arthritis'] = merged_data['arthritis'].replace({'yes': 1, 'no': 0}).astype(int)
merged_data['sex'] = merged_data['sex'].replace({'male': 1, 'female': 0}).astype(int)
binary_cols = ['fever', 'cold', 'joint pains', 'myalgia', 'headache', 'fatigue', 'vomitting',
               'Conjuctivitis', 'Nausea', 'Maculopapular rash', 'Eye Pain', 'Chills', 'Swelling']
for col in binary_cols:
    merged_data[col] = merged_data[col].replace({'yes': 1, 'no': 0, '1': 1, '0': 0}).astype(int)

# Encode categorical features
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
categorical_cols = ['arthralgia_type', 'pain_score', 'duration_of_pain_in_first_two_weeks']
for col in categorical_cols:
    merged_data[col] = le.fit_transform(merged_data[col].astype(str))

# Recalculate engineered features
merged_data['key_symptom_count'] = merged_data[['Swelling', 'vomitting', 'joint pains']].sum(axis=1).astype(int)
merged_data['Swelling_joint_pains'] = merged_data['Swelling'] * merged_data['joint pains']
merged_data['joint_stiffness_pain_score'] = merged_data['joint_stiffness'] * merged_data['pain_score']
merged_data['pain_score_duration'] = merged_data['pain_score'] * merged_data['duration_of_pain_in_first_two_weeks']
merged_data['joint_stiffness_duration'] = merged_data['joint_stiffness'] * merged_data['duration_of_pain_in_first_two_weeks']

# Drop redundant columns
drop_columns = ['Severe Chikungunya', 'Unnamed: 16', 'Unnamed: 17']
merged_data = merged_data.drop(columns=[col for col in drop_columns if col in merged_data.columns])

# Compute correlation matrix
corr_matrix = merged_data.corr()

# Create heatmap
plt.figure(figsize=(10, 8), dpi=300)
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f', square=True,
            cbar_kws={'label': 'Correlation Coefficient'})
plt.title('Feature Correlation Matrix (Merged Dataset)', fontsize=12)
plt.xticks(rotation=45, ha='right', fontsize=10)
plt.yticks(fontsize=10)
plt.tight_layout()

# Save heatmap
plt.savefig('/content/correlation_heatmap_merged.png', dpi=300, bbox_inches='tight')
plt.close()


  merged_data[col] = merged_data[col].replace({'yes': 1, 'no': 0, '1': 1, '0': 0}).astype(int)
  merged_data[col] = merged_data[col].replace({'yes': 1, 'no': 0, '1': 1, '0': 0}).astype(int)
  merged_data[col] = merged_data[col].replace({'yes': 1, 'no': 0, '1': 1, '0': 0}).astype(int)
  merged_data[col] = merged_data[col].replace({'yes': 1, 'no': 0, '1': 1, '0': 0}).astype(int)
  merged_data[col] = merged_data[col].replace({'yes': 1, 'no': 0, '1': 1, '0': 0}).astype(int)
  merged_data[col] = merged_data[col].replace({'yes': 1, 'no': 0, '1': 1, '0': 0}).astype(int)
  merged_data[col] = merged_data[col].replace({'yes': 1, 'no': 0, '1': 1, '0': 0}).astype(int)
  merged_data[col] = merged_data[col].replace({'yes': 1, 'no': 0, '1': 1, '0': 0}).astype(int)
  merged_data[col] = merged_data[col].replace({'yes': 1, 'no': 0, '1': 1, '0': 0}).astype(int)
  merged_data[col] = merged_data[col].replace({'yes': 1, 'no': 0, '1': 1, '0': 0}).astype(int)
  merged_data[col] = merged_data[col].replace({'ye

In [3]:

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Define confusion matrices from notebook outputs
cm_original = np.array([[29, 53], [39, 80]])  # Original: Refined Hybrid Ensemble
cm_merged = np.array([[200, 68], [43, 115]])  # Merged: Hybrid Ensemble with Polynomial Features

# Set up figure with two subplots
plt.figure(figsize=(10, 4), dpi=300)

# Subplot 1: Original Dataset Confusion Matrix
plt.subplot(1, 2, 1)
sns.heatmap(cm_original, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Predicted 0', 'Predicted 1'], yticklabels=['Actual 0', 'Actual 1'])
plt.title('Original Dataset (1001 samples)\nMacro F1: 0.51', fontsize=10)
plt.xlabel('Predicted', fontsize=9)
plt.ylabel('Actual', fontsize=9)

# Subplot 2: Merged Dataset Confusion Matrix
plt.subplot(1, 2, 2)
sns.heatmap(cm_merged, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Predicted 0', 'Predicted 1'], yticklabels=['Actual 0', 'Actual 1'])
plt.title('Merged Dataset (2130 samples)\nMacro F1: 0.73', fontsize=10)
plt.xlabel('Predicted', fontsize=9)
plt.ylabel('Actual', fontsize=9)

# Adjust layout and save
plt.tight_layout()
plt.savefig('/content/confusion_matrix_comparison.png', dpi=300, bbox_inches='tight')
plt.close()
