## Algorithms Used for Classification
1. CART (Classification and Regression Trees)
2. Gaussian Naive Bayes / Naive Bayes
3. Gradient Boosting Machines (AdaBoost)
4. K-Nearest Neighbors (K-NN)
5. Logistic Regression
6. Multi-Layer Perceptron (MLP)
7. Perceptron
8. Random Forest
9. Support Vector Machines (SVM)

### 1. CART (Classification and Regression Trees) - DecisionTree Classifier
- Sampling Technique - Repeated Random Train/Test Split (65:35)
- Classification Metrics - Logarithmic Loss

In [None]:
from pandas import read_csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import log_loss
import numpy as np

# Load the dataset
filename = 'D:/MSU_IIT/4th Year/ITD105/ML Using Different Algorithms/Case Studies/Case Study 1/seattle-weather.csv'
dataframe = read_csv(filename)

# Use LabelEncoder to encode the 'weather' column (string labels -> integer codes)
label_encoder = LabelEncoder()
dataframe['weather'] = label_encoder.fit_transform(dataframe['weather'])

# Extract features (X) and target variable (Y)
X = dataframe.drop(columns=['weather', 'date'])  # Assuming 'date' is not a useful feature
Y = dataframe['weather']

# Set the number of splitting iterations and the test size
n_splits = 10
test_size = 0.35
seed = 7

# Do Repeated Random Train-Test Splits by initializing ShuffleSplit
shuffle_split = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=seed)

log_losses = []  # To store log losses for each iteration

# Train and evaluate a fresh model on every random split.
# Everything below must stay inside the loop: otherwise only the last split
# is evaluated and the "average" is the mean of a single value.
for train_index, test_index in shuffle_split.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]

    # Create a DecisionTree Classifier
    model = DecisionTreeClassifier(
        max_depth=5,
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=seed,
    )

    # Train the model on the training data
    model.fit(X_train, Y_train)

    # Predict class probabilities for the test set
    predictions = model.predict_proba(X_test)

    # Calculate and store the logarithmic loss for this split.
    # labels=model.classes_ keeps the probability columns aligned with the
    # classes seen during training even if a rare class is absent from Y_test.
    log_loss_value = log_loss(Y_test, predictions, labels=model.classes_)
    log_losses.append(log_loss_value)

# Print the average log loss over all iterations
print("Logarithmic Loss: %.3f" % np.mean(log_losses))


### 2. Gaussian Naive Bayes
- Sampling Technique - Repeated Random Train/Test Split (65:35)
- Classification Metrics - Logarithmic Loss

In [None]:
from pandas import read_csv
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import log_loss
import numpy as np

# Load the dataset
filename = 'D:/MSU_IIT/4th Year/ITD105/ML Using Different Algorithms/Case Studies/Case Study 1/seattle-weather.csv'
dataframe = read_csv(filename)

# Use LabelEncoder to encode the 'weather' column (string labels -> integer codes)
label_encoder = LabelEncoder()
dataframe['weather'] = label_encoder.fit_transform(dataframe['weather'])

# Extract features (X) and target variable (Y)
X = dataframe.drop(columns=['weather', 'date'])  # Assuming 'date' is not a useful feature
Y = dataframe['weather']

# Set the number of splitting iterations and the test size
n_splits = 10
test_size = 0.35
seed = 7

# Do Repeated Random Train-Test Splits by initializing ShuffleSplit
shuffle_split = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=seed)

log_losses = []  # To store log losses for each iteration

# Train and evaluate a fresh model on every random split.
# Everything below must stay inside the loop: otherwise only the last split
# is evaluated and the "average" is the mean of a single value.
for train_index, test_index in shuffle_split.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]

    # Create a Gaussian Naive Bayes classifier
    # Hyperparameters:
    # - priors: You can specify class prior probabilities if you have prior knowledge.
    # - var_smoothing: A smoothing parameter for avoiding zero variances.
    model = GaussianNB(priors=None, var_smoothing=1e-9)

    # Train the model on the training data
    model.fit(X_train, Y_train)

    # Predict class probabilities for the test set
    predictions = model.predict_proba(X_test)

    # Calculate and store the logarithmic loss for this split.
    # labels=model.classes_ keeps the probability columns aligned with the
    # classes seen during training even if a rare class is absent from Y_test.
    log_loss_value = log_loss(Y_test, predictions, labels=model.classes_)
    log_losses.append(log_loss_value)

# Print the average log loss over all iterations
print("Logarithmic Loss: %.3f" % np.mean(log_losses))

### 3. Gradient Boosting Machines (AdaBoost)
- Sampling Technique - Repeated Random Train/Test Split (65:35)
- Classification Metrics - Logarithmic Loss

In [None]:
from pandas import read_csv
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import log_loss
import numpy as np

# Load the dataset
filename = 'D:/MSU_IIT/4th Year/ITD105/ML Using Different Algorithms/Case Studies/Case Study 1/seattle-weather.csv'
dataframe = read_csv(filename)

# Use LabelEncoder to encode the 'weather' column (string labels -> integer codes)
label_encoder = LabelEncoder()
dataframe['weather'] = label_encoder.fit_transform(dataframe['weather'])

# Extract features (X) and target variable (Y)
X = dataframe.drop(columns=['weather', 'date'])  # Assuming 'date' is not a useful feature
Y = dataframe['weather']

# Set the number of splitting iterations and the test size
n_splits = 10
test_size = 0.35
seed = 7

# Do Repeated Random Train-Test Splits by initializing ShuffleSplit
shuffle_split = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=seed)

log_losses = []  # To store log losses for each iteration

# Train and evaluate a fresh model on every random split.
# Everything below must stay inside the loop: otherwise only the last split
# is evaluated and the "average" is the mean of a single value.
for train_index, test_index in shuffle_split.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]

    # Create an AdaBoost classifier
    # Hyperparameters:
    # - n_estimators: The number of weak classifiers (base estimators) to train. You can adjust this to control the complexity of the ensemble.
    # - random_state: The random seed for reproducibility. You can set this to a specific value if you want consistent results.
    model = AdaBoostClassifier(n_estimators=50, random_state=seed)

    # Train the model on the training data
    model.fit(X_train, Y_train)

    # Predict class probabilities for the test set
    predictions = model.predict_proba(X_test)

    # Calculate and store the logarithmic loss for this split.
    # labels=model.classes_ keeps the probability columns aligned with the
    # classes seen during training even if a rare class is absent from Y_test.
    log_loss_value = log_loss(Y_test, predictions, labels=model.classes_)
    log_losses.append(log_loss_value)

# Print the average log loss over all iterations
print("Logarithmic Loss: %.3f" % np.mean(log_losses))


### 4. K-Nearest Neighbors (K-NN)
- Sampling Technique - Repeated Random Train/Test Split (65:35)
- Classification Metrics - Logarithmic Loss

In [None]:
from pandas import read_csv
from sklearn.model_selection import ShuffleSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
import numpy as np

# Load the dataset
filename = 'D:/MSU_IIT/4th Year/ITD105/ML Using Different Algorithms/Case Studies/Case Study 1/seattle-weather.csv'
dataframe = read_csv(filename)

# Use LabelEncoder to encode the 'weather' column (string labels -> integer codes)
label_encoder = LabelEncoder()
dataframe['weather'] = label_encoder.fit_transform(dataframe['weather'])

# Extract features (X) and target variable (Y)
X = dataframe.drop(columns=['weather', 'date'])  # Assuming 'date' is not a useful feature
Y = dataframe['weather']

# Set the number of splitting iterations and the test size
n_splits = 10
test_size = 0.35
seed = 7

# Do Repeated Random Train-Test Splits by initializing ShuffleSplit
shuffle_split = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=seed)

log_losses = []  # To store log losses for each iteration

# Train and evaluate a fresh model on every random split.
# Everything below must stay inside the loop: otherwise only the last split
# is evaluated and the "average" is the mean of a single value.
for train_index, test_index in shuffle_split.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]

    # Create a K-Nearest Neighbors (K-NN) classifier
    # Hyperparameters:
    # - n_neighbors: The number of nearest neighbors to consider when making predictions. You can adjust this to control the model's sensitivity to local patterns.
    # - weights: Determines how the neighbors' contributions are weighted (e.g., 'uniform' or 'distance'). You can choose the appropriate weighting strategy.
    # - algorithm: The algorithm used to compute the nearest neighbors ('auto', 'ball_tree', 'kd_tree', or 'brute'). You can choose the most suitable algorithm based on your data size and structure.
    model = KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto')

    # Train the model on the training data
    model.fit(X_train, Y_train)

    # Predict class probabilities for the test set
    predictions = model.predict_proba(X_test)

    # Calculate and store the logarithmic loss for this split.
    # labels=model.classes_ keeps the probability columns aligned with the
    # classes seen during training even if a rare class is absent from Y_test.
    log_loss_value = log_loss(Y_test, predictions, labels=model.classes_)
    log_losses.append(log_loss_value)

# Print the average log loss over all iterations
print("Logarithmic Loss: %.3f" % np.mean(log_losses))


### 5. Logistic Regression
- Sampling Technique - Repeated Random Train/Test Split (65:35)
- Classification Metrics - Logarithmic Loss

In [None]:
from pandas import read_csv
from sklearn.model_selection import ShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
import numpy as np

# Load the dataset
filename = 'D:/MSU_IIT/4th Year/ITD105/ML Using Different Algorithms/Case Studies/Case Study 1/seattle-weather.csv'
dataframe = read_csv(filename)

# Use LabelEncoder to encode the 'weather' column (string labels -> integer codes)
label_encoder = LabelEncoder()
dataframe['weather'] = label_encoder.fit_transform(dataframe['weather'])

# Extract features (X) and target variable (Y)
X = dataframe.drop(columns=['weather', 'date'])  # Assuming 'date' is not a useful feature
Y = dataframe['weather']

# Set the number of splitting iterations and the test size
n_splits = 10
test_size = 0.35
seed = 7

# Do Repeated Random Train-Test Splits by initializing ShuffleSplit
shuffle_split = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=seed)

log_losses = []  # To store log losses for each iteration

# Train and evaluate a fresh model on every random split.
# Everything below must stay inside the loop: otherwise only the last split
# is evaluated and the "average" is the mean of a single value.
for train_index, test_index in shuffle_split.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]

    # Create a Logistic Regression model
    # Hyperparameters:
    # - max_iter: The maximum number of iterations for the solver to converge. You can adjust this if the model does not converge.
    # - solver: The algorithm to use for optimization ('lbfgs', 'liblinear', etc.). Choose an appropriate solver for your data and problem.
    # - C: Inverse of regularization strength. Smaller values increase regularization. You can adjust this to control the trade-off between fitting the data and preventing overfitting.
    model = LogisticRegression(max_iter=700, solver='lbfgs', C=1.0, random_state=seed)

    # Train the model on the training data
    model.fit(X_train, Y_train)

    # Predict class probabilities for the test set
    predictions = model.predict_proba(X_test)

    # Calculate and store the logarithmic loss for this split.
    # labels=model.classes_ keeps the probability columns aligned with the
    # classes seen during training even if a rare class is absent from Y_test.
    log_loss_value = log_loss(Y_test, predictions, labels=model.classes_)
    log_losses.append(log_loss_value)

# Print the average log loss over all iterations
print("Logarithmic Loss: %.3f" % np.mean(log_losses))


### 6. Multi-Layer Perceptron (MLP)
- Sampling Technique - Repeated Random Train/Test Split (65:35)
- Classification Metrics - Logarithmic Loss

In [None]:
from pandas import read_csv
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import log_loss
import numpy as np

# Load the dataset
filename = 'D:/MSU_IIT/4th Year/ITD105/ML Using Different Algorithms/Case Studies/Case Study 1/seattle-weather.csv'
dataframe = read_csv(filename)

# Use LabelEncoder to encode the 'weather' column (string labels -> integer codes)
label_encoder = LabelEncoder()
dataframe['weather'] = label_encoder.fit_transform(dataframe['weather'])

# Extract features (X) and target variable (Y)
X = dataframe.drop(columns=['weather', 'date'])  # Assuming 'date' is not a useful feature
Y = dataframe['weather']

# Set the number of splitting iterations and the test size
n_splits = 10
test_size = 0.35
seed = 7

# Do Repeated Random Train-Test Splits by initializing ShuffleSplit
shuffle_split = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=seed)

log_losses = []  # To store log losses for each iteration

# Train and evaluate a fresh model on every random split.
# Everything below must stay inside the loop: otherwise only the last split
# is evaluated and the "average" is the mean of a single value.
for train_index, test_index in shuffle_split.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]

    # Create an MLP-based model
    # Hyperparameters:
    # - hidden_layer_sizes: The number of neurons in each hidden layer. You can customize the architecture by adjusting this parameter.
    # - activation: The activation function used in the hidden layers ('relu', 'tanh', etc.). Choose the appropriate one for your problem.
    # - solver: The algorithm for weight optimization ('adam', 'lbfgs', etc.). Select the one that works best for your data.
    # - max_iter: The maximum number of iterations for the solver to converge. You can adjust this if the model does not converge.
    # - random_state: The random seed for reproducibility. Set this to a specific value for consistent results.
    model = MLPClassifier(hidden_layer_sizes=(65, 32), activation='relu', solver='adam', max_iter=700, random_state=seed)

    # Train the model on the training data
    model.fit(X_train, Y_train)

    # Predict class probabilities for the test set
    predictions = model.predict_proba(X_test)

    # Calculate and store the logarithmic loss for this split.
    # labels=model.classes_ keeps the probability columns aligned with the
    # classes seen during training even if a rare class is absent from Y_test.
    log_loss_value = log_loss(Y_test, predictions, labels=model.classes_)
    log_losses.append(log_loss_value)

# Print the average log loss over all iterations
print("Logarithmic Loss: %.3f" % np.mean(log_losses))


### 7. Perceptron
- Sampling Technique - Repeated Random Train/Test Split (65:35)
- Classification Metrics - N/A

In [None]:
from pandas import read_csv
from sklearn.linear_model import Perceptron
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import log_loss
import numpy as np

# Read the Seattle weather CSV into a DataFrame
filename = 'D:/MSU_IIT/4th Year/ITD105/ML Using Different Algorithms/Case Studies/Case Study 1/seattle-weather.csv'
dataframe = read_csv(filename)

# Replace the string labels in the 'weather' column with integer codes
label_encoder = LabelEncoder()
dataframe['weather'] = label_encoder.fit_transform(dataframe['weather'])

# Target is the encoded 'weather' column; features are every column except 'weather' and 'date'
Y = dataframe['weather']
X = dataframe.drop(columns=['weather', 'date'])

# Configuration for the repeated random train/test splits (65:35)
seed = 7
test_size = 0.35
n_splits = 10

shuffle_split = ShuffleSplit(
    n_splits=n_splits,
    test_size=test_size,
    random_state=seed,
)

log_losses = []  # Declared for parity with the other cells; never filled in this cell

# Create a Perceptron classifier
# Hyperparameters:
# - max_iter: The maximum number of passes over the training data.
# - random_state: The random seed for reproducibility.
# - eta0: The constant by which weight updates are multiplied.
# - tol: The stopping-criterion tolerance.
model = Perceptron(max_iter=700, random_state=seed, eta0=1.0, tol=1e-3)

# NOTE: the loop body only slices the data, so once the loop finishes the
# train/test variables hold the LAST of the n_splits splits. The single
# fit/predict below therefore uses that last split only.
for train_index, test_index in shuffle_split.split(X):
    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]
    Y_train = Y.iloc[train_index]
    Y_test = Y.iloc[test_index]

# Fit on the (last) training split
model.fit(X_train, Y_train)

# Hard class predictions for the (last) test split; no metric is reported here
predictions = model.predict(X_test)

# Disabled reporting code; confusion_matrix and classification_report would
# first have to be imported from sklearn.metrics before re-enabling it.
# Print confusion matrix
#print("\nConfusion Matrix:\n", confusion_matrix(Y_test, predictions))

# Print classification report for more detailed metrics
#print("\nClassification Report:\n", classification_report(Y_test, predictions, zero_division=1))


### 8. Random Forest
- Sampling Technique - Repeated Random Train/Test Split (65:35)
- Classification Metrics - Logarithmic Loss

In [None]:
from pandas import read_csv
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import log_loss
import numpy as np

# Load the dataset
filename = 'D:/MSU_IIT/4th Year/ITD105/ML Using Different Algorithms/Case Studies/Case Study 1/seattle-weather.csv'
dataframe = read_csv(filename)

# Use LabelEncoder to encode the 'weather' column (string labels -> integer codes)
label_encoder = LabelEncoder()
dataframe['weather'] = label_encoder.fit_transform(dataframe['weather'])

# Extract features (X) and target variable (Y)
X = dataframe.drop(columns=['weather', 'date'])  # Assuming 'date' is not a useful feature
Y = dataframe['weather']

# Set the number of splitting iterations and the test size
n_splits = 10
test_size = 0.35
seed = 7

# Do Repeated Random Train-Test Splits by initializing ShuffleSplit
shuffle_split = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=seed)

log_losses = []  # To store log losses for each iteration

# Train and evaluate a fresh model on every random split.
# Everything below must stay inside the loop: otherwise only the last split
# is evaluated and the "average" is the mean of a single value.
for train_index, test_index in shuffle_split.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]

    # Create a Random Forest classifier
    # Hyperparameters:
    # - n_estimators: The number of decision trees in the random forest. Adjust this to control the ensemble size.
    # - random_state: The random seed for reproducibility. Set this to a specific value for consistent results.
    # - max_depth: The maximum depth of the decision trees. You can limit tree depth to prevent overfitting.
    # - min_samples_split: The minimum number of samples required to split a node. Adjust this to control tree node splitting.
    # - min_samples_leaf: The minimum number of samples required in a leaf node. You can adjust this to control tree leaf size.
    model = RandomForestClassifier(
        n_estimators=100,
        random_state=seed,
        max_depth=20,
        min_samples_split=10,
        min_samples_leaf=1,
    )

    # Train the model on the training data
    model.fit(X_train, Y_train)

    # Predict class probabilities for the test set
    predictions = model.predict_proba(X_test)

    # Calculate and store the logarithmic loss for this split.
    # labels=model.classes_ keeps the probability columns aligned with the
    # classes seen during training even if a rare class is absent from Y_test.
    log_loss_value = log_loss(Y_test, predictions, labels=model.classes_)
    log_losses.append(log_loss_value)

# Print the average log loss over all iterations
print("Logarithmic Loss: %.3f" % np.mean(log_losses))


### 9. Support Vector Machines (SVM)
- Sampling Technique - Repeated Random Train/Test Split (65:35)
- Classification Metrics - Logarithmic Loss

In [None]:
from pandas import read_csv
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import log_loss
import numpy as np

# Load the dataset
filename = 'D:/MSU_IIT/4th Year/ITD105/ML Using Different Algorithms/Case Studies/Case Study 1/seattle-weather.csv'
dataframe = read_csv(filename)

# Use LabelEncoder to encode the 'weather' column (string labels -> integer codes)
label_encoder = LabelEncoder()
dataframe['weather'] = label_encoder.fit_transform(dataframe['weather'])

# Extract features (X) and target variable (Y)
X = dataframe.drop(columns=['weather', 'date'])  # Assuming 'date' is not a useful feature
Y = dataframe['weather']

# Set the number of splitting iterations and the test size
n_splits = 10
test_size = 0.35
seed = 7

# Do Repeated Random Train-Test Splits by initializing ShuffleSplit
shuffle_split = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=seed)

log_losses = []  # To store log losses for each iteration

# Train and evaluate a fresh model on every random split.
# Everything below must stay inside the loop: otherwise only the last split
# is evaluated and the "average" is the mean of a single value.
for train_index, test_index in shuffle_split.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]

    # Create an SVM classifier
    # Hyperparameters:
    # - kernel: The type of kernel to use ('linear', 'poly', 'rbf', etc.). Choose the appropriate kernel for your problem.
    # - C: The regularization parameter. Smaller values increase regularization. You can adjust this to control the trade-off between fitting the data and preventing overfitting.
    # - random_state: The random seed for reproducibility. Set this to a specific value for consistent results.
    # - probability: Must be True so that predict_proba (needed for log loss) is available.
    model = SVC(kernel='linear', C=1.0, random_state=seed, probability=True)

    # Train the model on the training data
    model.fit(X_train, Y_train)

    # Predict class probabilities for the test set
    predictions = model.predict_proba(X_test)

    # Calculate and store the logarithmic loss for this split.
    # labels=model.classes_ keeps the probability columns aligned with the
    # classes seen during training even if a rare class is absent from Y_test.
    log_loss_value = log_loss(Y_test, predictions, labels=model.classes_)
    log_losses.append(log_loss_value)

# Print the average log loss over all iterations
print("Logarithmic Loss: %.3f" % np.mean(log_losses))
