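This notebook first explores a passenger-survival dataset (`train.csv`) and tunes several scikit-learn classifiers on it. The later cells demonstrate how to load the COCO dataset using torchvision in PyTorch and access its elements (images and annotations) for further processing or visualization.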

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Read the raw training table from the CSV that sits next to the notebook.
train_df = pd.read_csv('train.csv')

# Peek at the leading rows.
train_df.head()

# Column dtypes, non-null counts and memory footprint.
train_df.info()

# Class balance of the target column 'Survived'.
survival_ax = sns.countplot(data=train_df, x='Survived')
survival_ax.set_title('Distribution of Survival')
plt.show()

# Count missing values per column and keep only the columns that actually
# have gaps, worst offenders first.
missing_values = train_df.isnull().sum()
missing_values = missing_values[missing_values > 0].sort_values(ascending=False)

# Summary statistics of the numerical features.
statistics_summary = train_df.describe()

# Summary statistics (count / unique / top / freq) of the text features.
categorical_summary = train_df.describe(include=['object'])

# A bare expression is only rendered when it is the last statement of a cell,
# and the two summaries were previously never shown at all. `display` (provided
# by the IPython kernel) renders each table regardless of its position.
display(missing_values)
display(statistics_summary)
display(categorical_summary)

# Age histogram (30 bins) with a kernel-density overlay.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(train_df['Age'], bins=30, kde=True, color='skyblue', ax=ax)
ax.set_title('Distribution of Age')
ax.set_xlabel('Age')
ax.set_ylabel('Count')
plt.show()

# Survival counts split by gender, then by passenger class. Both figures share
# the same layout, so they are produced from one (hue column, title) table.
survival_breakdowns = (
    ('Sex', 'Survival Distribution by Gender'),
    ('Pclass', 'Survival Distribution by Passenger Class'),
)
for hue_column, plot_title in survival_breakdowns:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.countplot(x='Survived', hue=hue_column, data=train_df, ax=ax)
    ax.set_title(plot_title)
    ax.set_xlabel('Survived')
    ax.set_ylabel('Count')
    ax.legend(title=hue_column)
    plt.show()

# Check for missing values before handling
missing_values_before = train_df.isnull().sum()
missing_values_before = missing_values_before[missing_values_before > 0].sort_values(ascending=False)

# Display missing values before handling
missing_values_before

# Impute missing values for 'Age' using the median.
# Assign the result back instead of calling fillna(..., inplace=True) on the
# column selection: that chained in-place form warns on pandas 2.x and leaves
# train_df untouched once Copy-on-Write is active.
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].median())

# Impute missing values for 'Embarked' using the mode (most frequent value)
train_df['Embarked'] = train_df['Embarked'].fillna(train_df['Embarked'].mode()[0])

# Drop 'Cabin' (large number of missing values) together with the features
# treated as irrelevant here: 'PassengerId', 'Name', 'Ticket'.
train_df = train_df.drop(columns=['Cabin', 'PassengerId', 'Name', 'Ticket'])


# Check for missing values after handling
missing_values_after = train_df.isnull().sum()
missing_values_after = missing_values_after[missing_values_after > 0].sort_values(ascending=False)

# Display missing values after handling
missing_values_after

train_df.head()


# 'FamilySize' = siblings/spouses + parents/children + the passenger themself.
family_size = train_df['SibSp'] + train_df['Parch'] + 1
train_df['FamilySize'] = family_size

# 'IsAlone' is 1 when the passenger has no relatives aboard (family of one), else 0.
train_df['IsAlone'] = family_size.eq(1).astype('int')

# Show the table with the two engineered columns.
train_df.head()


# One-hot encode the categorical features.
# 'Pclass' is numeric but represents a category, so it is always encoded.
# Any column that is still non-numeric at this point (e.g. text columns such as
# 'Sex' / 'Embarked') is encoded as well: scikit-learn estimators cannot be
# fitted on string-valued columns, so leaving them in the feature matrix makes
# every later `.fit(X, y)` fail.
non_numeric_columns = train_df.select_dtypes(exclude=['number', 'bool']).columns.tolist()
categorical_columns = ['Pclass'] + [col for col in non_numeric_columns if col != 'Pclass']

# get_dummies(columns=...) drops each source column and appends its indicator
# columns (named '<column>_<value>') at the end of the frame.
train_df = pd.get_dummies(train_df, columns=categorical_columns)

# Display the dataset with one-hot encoded vectors
train_df.head()


from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score
import matplotlib.pyplot as plt



# Feature matrix: every column except the target; target vector: 'Survived'.
X = train_df.drop(columns='Survived')
y = train_df['Survived']

# Untuned SVM used as the base estimator of the search.
svm_classifier = SVC()

# Search space. Only gamma='scale' is tried ('auto' is not part of the grid).
param_grid = {
    'C': [0.1, 1, 10, 100, 500, 1000],
    'kernel': ['linear', 'poly', 'rbf'],
    'gamma': ['scale'],
}

# Exhaustive search with 3-fold cross-validation, scored by accuracy, using
# all available cores.
grid_search = GridSearchCV(
    estimator=svm_classifier,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X, y)

# Report the winning combination.
best_params = grid_search.best_params_
print(f"Best Hyperparameters: {best_params}")

# Build a fresh SVM from the winning C / kernel / gamma and fit it on the
# complete dataset.
best_svm_classifier = SVC(**best_params)
best_svm_classifier.fit(X, y)

# Full parameter set of the fitted model.
print("Parameters of the trained SVM classifier:")
print(best_svm_classifier.get_params())

# One row per evaluated parameter combination.
results = pd.DataFrame(grid_search.cv_results_)

# Mean CV accuracy against C (log axis), one line per kernel.
svm_fig, svm_ax = plt.subplots(figsize=(20, 10))
sns.lineplot(data=results, x='param_C', y='mean_test_score', hue='param_kernel', marker='o', ax=svm_ax)
svm_ax.set_xscale('log')
svm_ax.set_title('Hyperparameter Tuning for SVM')
svm_ax.set_xlabel('C (Regularization Parameter)')
svm_ax.set_ylabel('Mean Test Score (Accuracy)')
plt.show()




import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Uses the feature matrix X and target y built for the SVM section.

# Create a KNN classifier
knn_classifier = KNeighborsClassifier()

# Define the parameter grid for hyperparameter tuning
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'metric': ['euclidean', 'manhattan']
}

# Exhaustive search with 8-fold cross-validation, scored by accuracy
grid_search_knn = GridSearchCV(knn_classifier, param_grid_knn, cv=8, scoring='accuracy', verbose=1, n_jobs=-1)

# Perform hyperparameter tuning
grid_search_knn.fit(X, y)

# One row per evaluated parameter combination
results_knn = pd.DataFrame(grid_search_knn.cv_results_)

# Display the best hyperparameters for KNN
best_params_knn = grid_search_knn.best_params_
print(f"Best Hyperparameters for KNN: {best_params_knn}")

# Build a fresh KNN classifier from the best hyperparameters
# (the grid keys are exactly n_neighbors / weights / algorithm / metric)
best_knn_classifier = KNeighborsClassifier(**best_params_knn)

# Train the classifier on the entire training dataset
best_knn_classifier.fit(X, y)

# Print the parameters of the trained KNN classifier
# (the label previously said "SVM", a copy-paste slip from the SVM section)
print("Parameters of the trained KNN classifier:")
print(best_knn_classifier.get_params())


# Visualize the process of hyperparameter tuning: mean CV accuracy against the
# number of neighbours, broken down by one other hyperparameter per figure.
# Each entry: (hue column, figure title, x-axis label, legend title).
knn_plot_specs = [
    ('param_metric', 'Hyperparameter Tuning for KNN - Metric', 'K', 'distance'),
    ('param_weights', 'Hyperparameter Tuning for KNN - Number of Neighbors',
     'Number of Neighbors (n_neighbors)', 'Weights'),
    ('param_algorithm', 'Hyperparameter Tuning for KNN - Algorithm', 'K', 'algorithms'),
]
for hue_column, plot_title, x_label, legend_title in knn_plot_specs:
    plt.figure(figsize=(20, 10))
    sns.lineplot(x='param_n_neighbors', y='mean_test_score', hue=hue_column, data=results_knn, marker='o')
    plt.title(plot_title)
    plt.xlabel(x_label)
    plt.ylabel('Mean Test Score (Accuracy)')
    plt.legend(title=legend_title)
    plt.show()



import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV



# Uses the feature matrix X and target y built for the SVM section.

# Create a Gaussian Naive Bayes classifier
bayes_classifier = GaussianNB()

# Define the parameter grid for hyperparameter tuning: var_smoothing from
# 1e-1 down to 1e-9, one value per decade
param_grid_bayes = {
    'var_smoothing': [0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001, 0.0000001, 0.00000001, 0.000000001]
}

# Exhaustive search with 5-fold cross-validation, scored by accuracy
grid_search_bayes = GridSearchCV(bayes_classifier, param_grid_bayes, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

# Perform hyperparameter tuning
grid_search_bayes.fit(X, y)

# Display the best hyperparameters for Bayesian classifier
best_params_bayes = grid_search_bayes.best_params_
print(f"Best Hyperparameters for Bayesian Classifier: {best_params_bayes}")

# Update the Bayesian classifier with the best hyperparameters
best_bayes_classifier = GaussianNB(var_smoothing=best_params_bayes['var_smoothing'])

# Train the classifier on the entire training dataset
best_bayes_classifier.fit(X, y)

# One row per evaluated var_smoothing value
results_bayes = pd.DataFrame(grid_search_bayes.cv_results_)

# Print the parameters of the trained Naive Bayes classifier
# (the label previously said "SVM", a copy-paste slip from the SVM section)
print("Parameters of the trained Naive Bayes classifier:")
print(best_bayes_classifier.get_params())

# Plot the mean test score for each var_smoothing value.
# The grid spans eight orders of magnitude, so a log x-axis is used (as in the
# SVM plot); on a linear axis all but the largest values collapse onto zero.
plt.figure(figsize=(20, 10))
sns.lineplot(x='param_var_smoothing', y='mean_test_score', data=results_bayes, marker='o')
plt.xscale('log')
plt.title('Hyperparameter Tuning for Bayesian Classifier')
plt.xlabel('Variance Smoothing (var_smoothing)')
plt.ylabel('Mean Test Score (Accuracy)')
plt.show()

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import clone
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

# NOTE: the Decision Tree and MLP sections now run BEFORE the model comparison.
# The comparison needs `best_tree_classifier` and `best_mlp_classifier`, which
# were previously defined only after it, so a top-to-bottom run raised
# NameError. All three sections use X and y built for the SVM section.

# ---------------------------------------------------------------------------
# Decision Tree: hyperparameter search
# ---------------------------------------------------------------------------

# Create a Decision Tree classifier
tree_classifier = DecisionTreeClassifier()

# Define the parameter grid for hyperparameter tuning
param_grid_tree = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': [None, 'sqrt', 'log2', 0.2, 0.5, 0.8],
    'max_leaf_nodes': [None, 5, 10, 20, 50],
    'min_impurity_decrease': [0.0, 0.1, 0.2, 0.5]
}

# Exhaustive search with 5-fold cross-validation, scored by accuracy
grid_search_tree = GridSearchCV(tree_classifier, param_grid_tree, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

# Perform hyperparameter tuning
grid_search_tree.fit(X, y)

# Display the best hyperparameters for Decision Tree
best_params_tree = grid_search_tree.best_params_
print(f"Best Hyperparameters for Decision Tree: {best_params_tree}")

# Build a fresh Decision Tree from the best hyperparameters (the grid keys are
# exactly the constructor arguments that were tuned)
best_tree_classifier = DecisionTreeClassifier(**best_params_tree)

# Train the classifier on the entire training dataset
best_tree_classifier.fit(X, y)

# Print the parameters of the trained Decision Tree classifier
# (the label previously said "SVM", a copy-paste slip)
print("Parameters of the trained Decision Tree classifier:")
print(best_tree_classifier.get_params())

# One row per evaluated parameter combination
results_tree = pd.DataFrame(grid_search_tree.cv_results_)

# Convert the plotted parameters to strings so that None becomes a plottable
# category label
for tree_param_column in ('param_max_depth', 'param_min_samples_split',
                          'param_max_features', 'param_max_leaf_nodes'):
    results_tree[tree_param_column] = results_tree[tree_param_column].astype(str)

# 2x2 overview of mean CV accuracy per hyperparameter, split by criterion.
# Each entry: (seaborn plotting function, x column, x-axis label, extra kwargs).
tree_panels = [
    (sns.barplot, 'param_min_samples_split', 'Min Samples Split (min_samples_split)', {}),
    (sns.pointplot, 'param_max_features', 'Max Features (max_features)',
     {'dodge': True, 'markers': ['o', 's']}),
    (sns.boxplot, 'param_max_leaf_nodes', 'Max Leaf Nodes (max_leaf_nodes)', {}),
    (sns.lineplot, 'param_max_depth', 'Max Depth (max_depth)', {'marker': 'o'}),
]
tree_fig, tree_axes = plt.subplots(2, 2, figsize=(20, 10))
for panel_ax, (plot_fn, x_column, x_label, extra_kwargs) in zip(tree_axes.flat, tree_panels):
    plot_fn(x=x_column, y='mean_test_score', hue='param_criterion',
            data=results_tree, ax=panel_ax, **extra_kwargs)
    panel_ax.set_title('Hyperparameter Tuning for Decision Tree')
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('Mean Test Score (Accuracy)')
    panel_ax.legend(title='Criterion')

# tight_layout must run before show(); previously it ran after the figure had
# already been shown, so it only affected a new, empty figure.
tree_fig.tight_layout()
plt.show()

# ---------------------------------------------------------------------------
# MLP: hyperparameter search
# ---------------------------------------------------------------------------

# Create an MLP (Multi-Layer Perceptron) classifier
mlp_classifier = MLPClassifier()

# Define the parameter grid for hyperparameter tuning
param_grid_mlp = {
    'hidden_layer_sizes': [(100,), (50, 50)],
    'activation': ['relu', 'tanh', 'logistic'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'max_iter': [100, 200, 300],
    'learning_rate_init': [0.001, 0.01, 0.1],
}

# Exhaustive search with 2-fold cross-validation, scored by accuracy
grid_search_mlp = GridSearchCV(mlp_classifier, param_grid_mlp, cv=2, scoring='accuracy', verbose=1, n_jobs=-1)

# Perform hyperparameter tuning
grid_search_mlp.fit(X, y)

# Display the best hyperparameters for MLP
best_params_mlp = grid_search_mlp.best_params_
print(f"Best Hyperparameters for MLP: {best_params_mlp}")

# Build a fresh MLP from the best hyperparameters
best_mlp_classifier = MLPClassifier(**best_params_mlp)

# Train the classifier on the entire training dataset
best_mlp_classifier.fit(X, y)

# Print the parameters of the trained MLP classifier
# (the label previously said "SVM", a copy-paste slip)
print("Parameters of the trained MLP classifier:")
print(best_mlp_classifier.get_params())

# One row per evaluated parameter combination
results_mlp = pd.DataFrame(grid_search_mlp.cv_results_)

# Print unique values in 'param_hidden_layer_sizes' column
unique_values = results_mlp['param_hidden_layer_sizes'].unique()
print("Unique values in 'param_hidden_layer_sizes':", unique_values)

# Use the full tuple as a category label on the x-axis. Keeping only the first
# element (as before) plotted the two-layer (50, 50) network as a plain "50".
results_mlp['param_hidden_layer_sizes'] = results_mlp['param_hidden_layer_sizes'].astype(str)

# Mean CV accuracy against one hyperparameter, split by another.
# Each entry: (x column, hue column, x-axis label, legend title).
mlp_plot_specs = [
    ('param_hidden_layer_sizes', 'param_activation',
     'Hidden Layer Sizes (hidden_layer_sizes)', 'Activation Function'),
    # This figure previously labelled the y-axis 'learning rate' and the legend
    # 'Activation Function', although y is the mean test score and the hue is
    # the learning-rate schedule.
    ('param_alpha', 'param_learning_rate', 'alpha', 'Learning Rate'),
    ('param_alpha', 'param_activation', 'Alpha (param_alpha)', 'Activation Function'),
    ('param_learning_rate', 'param_activation',
     'Learning Rate (param_learning_rate)', 'Activation Function'),
    ('param_hidden_layer_sizes', 'param_alpha',
     'Hidden Layer Sizes (param_hidden_layer_sizes)', 'Alpha'),
    ('param_learning_rate_init', 'param_activation',
     'Learning Rate Init (param_learning_rate_init)', 'Activation Function'),
    ('param_max_iter', 'param_alpha', 'Max Iterations (param_max_iter)', 'Alpha'),
    ('param_alpha', 'param_learning_rate_init', 'Alpha (param_alpha)', 'Learning Rate init'),
]
for x_column, hue_column, x_label, legend_title in mlp_plot_specs:
    plt.figure(figsize=(16, 8))
    sns.lineplot(x=x_column, y='mean_test_score', hue=hue_column, data=results_mlp, marker='o')
    plt.title('Hyperparameter Tuning for MLP')
    plt.xlabel(x_label)
    plt.ylabel('Mean Test Score (Accuracy)')
    plt.legend(title=legend_title)
    plt.show()

# ---------------------------------------------------------------------------
# Model comparison on a held-out split
# ---------------------------------------------------------------------------

# Tuned models produced by the SVM, KNN, Naive Bayes, Decision Tree and MLP
# sections.
classifiers = {
    'SVM': best_svm_classifier,
    'KNN': best_knn_classifier,
    'Naive Bayes': best_bayes_classifier,
    'Decision Trees': best_tree_classifier,
    'MLP': best_mlp_classifier
}

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize dictionaries to store metrics
accuracy_scores = {}
precision_scores = {}
recall_scores = {}
f1_scores = {}
roc_auc_scores = {}
roc_curves = {}

# Evaluate each classifier
for name, classifier in classifiers.items():
    # The tuned models above were fitted on ALL of X, which contains X_test;
    # scoring them directly on X_test would report leaked, optimistic metrics.
    # clone() gives an unfitted copy with the same hyperparameters, which is
    # then trained on the training split only. The original fitted models are
    # left untouched.
    model = clone(classifier)
    model.fit(X_train, y_train)

    # Predictions on the test set
    y_pred = model.predict(X_test)

    # Calculate metrics
    accuracy_scores[name] = accuracy_score(y_test, y_pred)
    precision_scores[name] = precision_score(y_test, y_pred)
    recall_scores[name] = recall_score(y_test, y_pred)
    f1_scores[name] = f1_score(y_test, y_pred)

    # ROC/AUC needs a continuous score: prefer decision_function, otherwise
    # fall back to the predicted probability of the positive class.
    if hasattr(model, "decision_function"):
        y_scores = model.decision_function(X_test)
    elif hasattr(model, "predict_proba"):
        y_scores = model.predict_proba(X_test)[:, 1]
    else:
        raise ValueError("Classifier does not have decision_function or predict_proba.")

    roc_auc_scores[name] = roc_auc_score(y_test, y_scores)

    # ROC curve
    fpr, tpr, _ = roc_curve(y_test, y_scores)
    roc_curves[name] = {'fpr': fpr, 'tpr': tpr}

# Print the metrics
print("Accuracy Scores:")
print(accuracy_scores)

print("\nPrecision Scores:")
print(precision_scores)

print("\nRecall Scores:")
print(recall_scores)

print("\nF1 Scores:")
print(f1_scores)

print("\nROC AUC Scores:")
print(roc_auc_scores)

# Visualize ROC curves, with the diagonal as the random-guess baseline
plt.figure(figsize=(10, 6))
for name, curve in roc_curves.items():
    plt.plot(curve['fpr'], curve['tpr'], label=f'{name} (AUC = {roc_auc_scores[name]:.2f})')

plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Random')
plt.title('ROC Curves')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [11]:
! pip install torch torchvision matplotlib




[notice] A new release of pip is available: 23.1.2 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [12]:
import torchvision.transforms as transforms
from torchvision.datasets import CocoDetection
import matplotlib.pyplot as plt
