In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from ucimlrepo import fetch_ucirepo

In [None]:
# Load the UCI dermatology dataset through ucimlrepo.
dermatology = fetch_ucirepo(name="dermatology")

X = dermatology.data.features
y = dermatology.data.targets

# The split cell reads the label from a column named 'Class', so attach the
# (single) target column to the feature frame under that exact name instead of
# building `df` from the features alone.
target = y.iloc[:, 0].rename('Class')

# SVC refuses NaN inputs. The UCI documentation reports missing values for this
# dataset (in `age` -- TODO confirm), so incomplete rows are dropped up front;
# this is a no-op when nothing is missing.
df = pd.concat([X, target], axis=1).dropna().reset_index(drop=True)

df.head()

In [None]:
# Separate the label column from the features, then hold out 20% for testing.
y = df['Class']
X = df.drop(columns='Class')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardize the features and train a Support Vector Machine (SVM) baseline.
# The scaler learns its statistics from the training split only and is then
# reused unchanged on the test split.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# kernel types: linear, poly, rbf, etc.
svm_model = SVC(kernel='linear', random_state=123).fit(X_train_scaled, y_train)

y_pred = svm_model.predict(X_test_scaled)

In [None]:
# Evaluate the baseline SVM on the held-out test split.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.4f}')
print('Classification Report:')
print(report)

The confusion matrix should be analyzed both before and after hyperparameter tuning. If you have already visualized it for the baseline model at an earlier stage, re-plot it after tuning and compare the two to assess whether tuning has actually reduced the misclassifications.

In [None]:
# Confusion matrix for the baseline SVM.
# Fix the label order once and pass it to confusion_matrix, so the matrix
# rows/columns and the heatmap tick labels always refer to the same classes,
# even if some class happens to be absent from the test split.
class_labels = np.unique(y)
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)

print("Confusion Matrix:")
print(conf_matrix)

plt.figure(figsize=(10, 7))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title('Confusion Matrix for SVM Model')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

In [None]:
# Per-class precision / recall / F1 plus overall accuracy.
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

class_names = ['psoriasis', 'seboreic dermatitis', 'lichen planus', 'pityriasis rosea', 'cronic dermatitis', 'pityriasis rubra pilaris']

# Every class code present in the full target, in sorted order.
# NOTE(review): assumes the sorted codes line up with `class_names` in this
# order (the original index-based lookup made the same assumption) -- confirm
# against the dataset documentation.
class_labels = sorted(y.unique())
assert len(class_labels) == len(class_names), "class_names does not match the classes found in y"

# Passing `labels` guarantees one score per class in a fixed order, so a class
# that is missing from the test split cannot shift or shorten the arrays.
# zero_division=0 reports 0.0 for such a class instead of emitting a warning.
precision = precision_score(y_test, y_pred, labels=class_labels, average=None, zero_division=0)
recall = recall_score(y_test, y_pred, labels=class_labels, average=None, zero_division=0)
f1 = f1_score(y_test, y_pred, labels=class_labels, average=None, zero_division=0)

# Calculate overall accuracy
accuracy = accuracy_score(y_test, y_pred)

# Print detailed per-class performance metrics
for i, class_name in enumerate(class_names):
    print(f'{class_name}:')
    print(f'  Precision: {precision[i]:.4f}')
    print(f'  Recall: {recall[i]:.4f}')
    print(f'  F1-score: {f1[i]:.4f}')
    print()

# Print overall accuracy
print(f'Overall Accuracy: {accuracy:.4f}')


In [None]:
# k-fold cross-validation of the linear SVM on the training split.
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

k = 5

# Scaling lives inside the pipeline so it is re-fit on each training fold.
model = make_pipeline(
    StandardScaler(),
    SVC(kernel='linear', random_state=123),
)

cv_scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=k,
)

print(f'Cross-validation scores for each fold: {cv_scores}')
print(f'Mean cross-validation accuracy: {cv_scores.mean():.4f}')
print(f'Standard deviation of cross-validation accuracy: {cv_scores.std():.4f}')

In [None]:
# Hyperparameter tuning with an exhaustive, cross-validated grid search.
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


# Scaling is part of the pipeline so each CV fold is standardized on its own
# training portion only.
model = make_pipeline(StandardScaler(), SVC(random_state=123))

c_values = [0.1, 1, 10, 100]
gamma_values = ['scale', 'auto']

# One grid per kernel, each listing only the parameters that kernel uses.
# A single flat grid crossed `degree` (poly-only) and `gamma` (unused by the
# linear kernel) with every kernel, which refit identical models many times and
# padded the results table with duplicate scores. The `poly` kernel is searched
# explicitly here so that the `degree` values actually take effect.
param_grid = [
    {'svc__kernel': ['linear'], 'svc__C': c_values},
    {'svc__kernel': ['rbf'], 'svc__C': c_values, 'svc__gamma': gamma_values},
    {'svc__kernel': ['poly'], 'svc__C': c_values, 'svc__gamma': gamma_values, 'svc__degree': [3, 5]},
]

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

grid_search.fit(X_train, y_train)

print("Best hyperparameters found: ", grid_search.best_params_)
print("Best cross-validation accuracy: {:.4f}".format(grid_search.best_score_))

# Mean CV accuracy for every candidate that was tried.
results = grid_search.cv_results_
print("\nAll grid search results:")
for mean_score, params in zip(results["mean_test_score"], results["params"]):
    print(f"Score: {mean_score:.4f} - Parameters: {params}")

In [None]:
# evaluate after hyperparameter tuning and cross-validation (TODO: not implemented yet)


In [None]:
# Analyze feature importance using a random forest.
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import numpy as np

# Random Forest model (parameters can be adjusted based on tuning), fitted on
# the unscaled training split.
rf_model = RandomForestClassifier(n_estimators=100, random_state=123)
rf_model.fit(X_train, y_train)

# Impurity-based importances, ordered from most to least important.
feature_importances = rf_model.feature_importances_
indices = np.argsort(feature_importances)[::-1]

n_features = X_train.shape[1]
bar_positions = range(n_features)
ordered_names = [X_train.columns[i] for i in indices]

# Horizontal bar chart: one bar per feature, labelled with its column name.
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title('Feature Importance from Random Forest')
ax.barh(bar_positions, feature_importances[indices], align='center')
ax.set_yticks(bar_positions)
ax.set_yticklabels(ordered_names)
ax.set_xlabel('Relative Importance')
plt.show()

In [None]:
# SHAP for model interpretability
import shap
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

# Refit the Random Forest here so this cell does not depend on the
# feature-importance cell having been run.
rf_model = RandomForestClassifier(n_estimators=100, random_state=123)
rf_model.fit(X_train, y_train)

# Initialize SHAP explainer with the training split as background data
explainer = shap.Explainer(rf_model, X_train)

# Calculate SHAP values for the test set
shap_values = explainer(X_test)

# The forest is multi-class, so the explanation holds one slab of values per
# class. NOTE(review): expects `shap_values.values` to be shaped
# (samples, features, classes) -- confirm for the installed shap version.
n_classes = shap_values.values.shape[2]
per_class_values = [shap_values.values[:, :, k] for k in range(n_classes)]

# Summary plot for global feature importance (a list of per-class matrices is
# the multi-class input form of the bar summary plot).
shap.summary_plot(
    per_class_values,
    X_test,
    plot_type="bar",
    class_names=[str(c) for c in rf_model.classes_],
)

# SHAP force plot for an individual prediction (first test sample).
# A force plot explains a single output, so pick the class the model predicts
# for that sample and use that class's base value and SHAP vector. The
# Explanation attribute is `base_values` (plural).
shap.initjs()
first_sample_probs = rf_model.predict_proba(X_test.iloc[[0]])[0]
class_idx = int(first_sample_probs.argmax())
shap.force_plot(
    shap_values.base_values[0, class_idx],
    shap_values.values[0, :, class_idx],
    X_test.iloc[0],
)

In [None]:
# Investigate the relationship between significant features and the target class
# 1 - Boxplot of features vs. target class
import seaborn as sns
import matplotlib.pyplot as plt

# Significant features identified in the previous analysis.
# NOTE(review): confirm these names match the DataFrame's column names exactly.
significant_features = ['erythema', 'scaling', 'fibrosis of the papillary dermis']

# y_train carries the name of the label column it was sliced from, so it is
# renamed to 'target_class' here; otherwise x='target_class' below refers to a
# column that does not exist. The frame is also built once, outside the loop.
boxplot_data = pd.concat(
    [X_train[significant_features], y_train.rename('target_class')],
    axis=1,
)

# Create boxplots for each significant feature with respect to the target class
plt.figure(figsize=(15, 8))
for i, feature in enumerate(significant_features, 1):
    plt.subplot(1, len(significant_features), i)
    sns.boxplot(x='target_class', y=feature, data=boxplot_data)
    plt.title(f'Boxplot of {feature} vs. Target Class')
    plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Investigate the relationship between significant features and the target class
# 2 - Correlation matrix
# Put the selected features and the training labels side by side, then compute
# their pairwise correlations.
features_with_target = pd.concat(
    [X_train[significant_features], y_train],
    axis=1,
)
corr_matrix = features_with_target.corr()

# Annotated heatmap with the colour scale centred on zero.
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', center=0, ax=ax)
ax.set_title('Correlation Matrix of Significant Features and Target Class')
plt.show()