In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import warnings
warnings.filterwarnings('ignore')

def _print_section(banner, payload):
    """Print a banner line, then the payload on the following line(s)."""
    print(banner)
    print(payload)


# Read the CSV from the notebook's working directory into `df`.
print("Loading dataset...")
df = pd.read_csv('Cancer2025exam.csv')

# Structural overview: dimensions and the full list of column labels.
print("=== DATASET OVERVIEW ===")
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

# One banner plus one pandas summary per section.
_print_section("=== DATA TYPES ===", df.dtypes)
_print_section("=== FIRST 5 ROWS ===", df.head())
_print_section("=== BASIC STATISTICS ===", df.describe())
_print_section("=== MISSING VALUES ===", df.isnull().sum())

# A column is reported here when it is object-typed or has fewer than 20
# distinct values; at most ten of its distinct values are shown.
print("=== UNIQUE VALUES IN CATEGORICAL COLUMNS ===")
for col in df.columns:
    column = df[col]
    distinct_count = column.nunique()
    if column.dtype == 'object' or distinct_count < 20:
        print(f"{col}: {distinct_count} unique values - {column.unique()[:10]}")

print("=== CLASS DISTRIBUTION (if applicable) ===")
# Probe a handful of conventional label-column names. For every name that is
# actually a column of `df`, print the absolute counts and then the
# percentage share of each value.
possible_targets = ['diagnosis', 'Diagnosis', 'target', 'Target', 'class', 'Class', 'label', 'Label']
present_targets = [name for name in possible_targets if name in df.columns]
for label_column in present_targets:
    print(f"{label_column} distribution:")
    print(df[label_column].value_counts())
    print("Percentage distribution:")
    print(df[label_column].value_counts(normalize=True) * 100)

Loading dataset...
=== DATASET OVERVIEW ===
Shape: (1715, 202)
Columns: ['V1', 'V102', 'V103', 'V104', 'V105', 'V106', 'V107', 'V108', 'V109', 'V110', 'V111', 'V112', 'V113', 'V114', 'V115', 'V116', 'V117', 'V118', 'V119', 'V120', 'V121', 'V122', 'V123', 'V124', 'V125', 'V126', 'V127', 'V128', 'V129', 'V130', 'V131', 'V132', 'V133', 'V134', 'V135', 'V136', 'V137', 'V138', 'V139', 'V140', 'V141', 'V142', 'V143', 'V144', 'V145', 'V146', 'V147', 'V148', 'V149', 'V150', 'V151', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V65', 'V66', 'V67', 'V68', 'V69', 'V70', 'V71', 'V72', 'V73', 'V74', 'V75', 'V76', 'V77

In [4]:
# V1 is the label column; every other column is treated as a feature.
target = 'V1'
features = df.columns[df.columns != target].tolist()

print("=== TARGET VARIABLE ANALYSIS ===")
print(f"Target variable: {target}")
print(f"Number of classes: {df[target].nunique()}")

# Samples per class, ordered by class label rather than by frequency.
print("Class distribution:")
class_counts = df[target].value_counts().sort_index()
print(class_counts)

# The same counts expressed as a percentage of all rows, to two decimals.
print("Class percentages:")
print(class_counts.div(len(df)).mul(100).round(2))

# Create the output directory for the figures saved below.
# exist_ok=True makes the call safe to re-run when the directory is already
# there, and raises FileExistsError right away if the path is a regular file
# instead of letting a later savefig fail.
import os
os.makedirs('visualizations', exist_ok=True)

# 1. Target distribution plot
# Bar chart of the number of samples per class, saved as a PNG.
bar_fig, ax = plt.subplots(figsize=(10, 6))
class_counts.plot(kind='bar', color='steelblue', alpha=0.8, ax=ax)
ax.set_title('Distribution of Cancer Types', fontsize=16, fontweight='bold')
ax.set_xlabel('Cancer Type (Class)', fontsize=12)
ax.set_ylabel('Number of Samples', fontsize=12)
# Keep the class labels horizontal.
plt.setp(ax.get_xticklabels(), rotation=0)

# Write each bar's count just above it (5 data units above the bar top).
for position, count in enumerate(class_counts.to_numpy()):
    ax.text(position, count + 5, str(count), ha='center', va='bottom', fontweight='bold')

bar_fig.tight_layout()
bar_fig.savefig('visualizations/target_distribution.png', dpi=300, bbox_inches='tight')
plt.close(bar_fig)

# 2. Feature correlation analysis
print("=== FEATURE ANALYSIS ===")
print(f"Number of features: {len(features)}")

# Correlate only the first 30 features; the full set is too large to read
# in a single heatmap.
subset_features = features[:30]
correlation_matrix = df[subset_features].corr()

# True on and above the diagonal, so only the lower triangle is drawn.
mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))

heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    mask=mask,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
    ax=heatmap_ax,
)
heatmap_ax.set_title('Feature Correlation Matrix (First 30 Features)', fontsize=16, fontweight='bold')
heatmap_fig.tight_layout()
heatmap_fig.savefig('visualizations/correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.close(heatmap_fig)

# 3. Class imbalance check
# Ratio of the largest class count to the smallest one.
print("=== CLASS IMBALANCE ANALYSIS ===")
min_class_size, max_class_size = class_counts.min(), class_counts.max()
imbalance_ratio = max_class_size / min_class_size
print(
    f"Smallest class size: {min_class_size}",
    f"Largest class size: {max_class_size}",
    f"Imbalance ratio: {imbalance_ratio:.2f}",
    sep="\n",
)

# 4. Feature statistics by class
print("=== FEATURE STATISTICS BY CLASS ===")
# Per-class mean (3 decimals) of the first five features only.
sample_features = features[:5]
rows_by_class = df.groupby(target)
for feature_name in sample_features:
    print(f"{feature_name} mean by class:")
    print(rows_by_class[feature_name].mean().round(3))

# 5. Dimensionality reduction for visualization
print("=== DIMENSIONALITY REDUCTION ===")
print("Performing PCA...")

# Prepare data: feature matrix and label vector as NumPy arrays.
X = df[features].values
y = df[target].values

# Fit PCA with six components and plot them in consecutive pairs
# (1st vs 2nd, 3rd vs 4th, 5th vs 6th), one saved figure per pair.
# NOTE(review): X goes into PCA as-is (no scaling step in this cell) --
# confirm that unscaled input is intended.
n_pca_components = 6
pca = PCA(n_components=n_pca_components, random_state=42)
X_pca = pca.fit_transform(X)

# The number of figures follows from the component count instead of being
# hard-coded separately.
for i in range(n_pca_components // 2):
    x_idx, y_idx = 2 * i, 2 * i + 1  # zero-based columns of X_pca
    plt.figure(figsize=(10, 8))
    scatter = plt.scatter(X_pca[:, x_idx], X_pca[:, y_idx], c=y, cmap='viridis', alpha=0.6, edgecolors='black', linewidth=0.5)
    plt.colorbar(scatter, label='Cancer Type')
    # Axis labels use 1-based component numbers (PC1 is the first component),
    # so the array index is shifted by one for display only.
    plt.xlabel(f'PC{x_idx + 1} ({pca.explained_variance_ratio_[x_idx]:.2%} variance)', fontsize=12)
    plt.ylabel(f'PC{y_idx + 1} ({pca.explained_variance_ratio_[y_idx]:.2%} variance)', fontsize=12)
    plt.title('PCA Visualization of Cancer Types', fontsize=16, fontweight='bold')
    plt.tight_layout()
    # File names keep the zero-based pair index: pca_visualization0/1/2.png.
    plt.savefig(f'visualizations/pca_visualization{i}.png', dpi=300, bbox_inches='tight')
    plt.close()

# 6. Box plots for feature distribution
# A 2x3 grid: one panel per feature for the first six features, each panel
# showing that feature's values grouped by the target class.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

for panel, feature in zip(axes, features[:6]):
    df.boxplot(column=feature, by=target, ax=panel)
    panel.set_title(f'{feature} by Cancer Type')
    panel.set_xlabel('Cancer Type')
    panel.set_ylabel(feature)

fig.suptitle('Feature Distribution by Cancer Type', fontsize=16, fontweight='bold')
fig.tight_layout()
fig.savefig('visualizations/feature_boxplots.png', dpi=300, bbox_inches='tight')
plt.close(fig)

# List the figure files written by this cell. The PCA step writes one file
# per component pair (suffixes 0, 1, 2), so all three names are listed.
print("=== VISUALIZATIONS SAVED ===")
print("1. target_distribution.png - Distribution of cancer types")
print("2. correlation_heatmap.png - Feature correlation matrix")
print("3. pca_visualization0.png, pca_visualization1.png, pca_visualization2.png - PCA visualization of samples")
print("4. feature_boxplots.png - Feature distribution by cancer type")

# Collect the headline figures of the analysis in a dict and print them.
# "High variance" here means a column whose standard deviation exceeds 50.
high_spread_features = [name for name in features if df[name].std() > 50]

summary_stats = dict(
    n_samples=len(df),
    n_features=len(features),
    n_classes=df[target].nunique(),
    class_distribution=class_counts.to_dict(),
    imbalance_ratio=imbalance_ratio,
    features_with_high_variance=high_spread_features,
)

print("=== SUMMARY ===")
for stat_name, stat_value in summary_stats.items():
    print(f"{stat_name}: {stat_value}")


=== TARGET VARIABLE ANALYSIS ===
Target variable: V1
Number of classes: 7
Class distribution:
V1
1    363
2    285
3    266
4    195
5    231
6    170
7    205
Name: count, dtype: int64
Class percentages:
V1
1    21.17
2    16.62
3    15.51
4    11.37
5    13.47
6     9.91
7    11.95
Name: count, dtype: float64
=== FEATURE ANALYSIS ===
Number of features: 201
=== CLASS IMBALANCE ANALYSIS ===
Smallest class size: 170
Largest class size: 363
Imbalance ratio: 2.14
=== FEATURE STATISTICS BY CLASS ===
V102 mean by class:
V1
1    0.016
2    0.048
3   -0.148
4   -0.039
5   -0.052
6   -0.075
7   -0.067
Name: V102, dtype: float64
V103 mean by class:
V1
1   -0.176
2    0.129
3    1.714
4   -1.953
5   -0.082
6   -5.809
7    2.114
Name: V103, dtype: float64
V104 mean by class:
V1
1   -4.229
2    8.570
3   -1.670
4    8.290
5    3.766
6   -1.897
7    3.376
Name: V104, dtype: float64
V105 mean by class:
V1
1   -4.583
2    1.939
3    7.581
4   -1.854
5   -7.151
6    3.793
7   -5.499
Name: V105, dtype