## Algorithms Used for Classification
1. CART (Classification and Regression Trees)
2. Gaussian Naive Bayes / Naive Bayes
3. Gradient Boosting Machines (AdaBoost)
4. K-Nearest Neighbors (K-NN)
5. Logistic Regression
6. Multi-Layer Perceptron (MLP)
7. Perceptron
8. Random Forest
9. Support Vector Machines (SVM)

### 1. CART (Classification and Regression Trees) - DecisionTree Classifier
- Sampling Technique - Train/Test Split (80:20)
- Classification Metrics - Classification Report and Confusion Matrix

In [None]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report

# Hold-out settings: 20% of the rows form the test set; the fixed seed is
# passed to both the split and the tree so repeated runs give the same output.
test_size = 0.20
random_seed = 7

# Read the weather CSV into a DataFrame
filename = 'D:/MSU_IIT/4th Year/ITD105/ML Using Different Algorithms/Case Studies/Case Study 1/seattle-weather.csv'
dataframe = read_csv(filename)

# Replace the labels in 'weather' with integer class codes
label_encoder = LabelEncoder()
label_encoder.fit(dataframe['weather'])
dataframe['weather'] = label_encoder.transform(dataframe['weather'])

# Target (Y) is the encoded 'weather' column; features (X) are every other
# column except 'date', which is assumed not to be a useful feature.
Y = dataframe['weather']
X = dataframe.drop(['weather', 'date'], axis=1)

# 80:20 train/test split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=random_seed,
)

# Decision tree (CART) settings: depth capped at 5, split/leaf minimums of 2/1
tree_params = {
    'max_depth': 5,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'random_state': random_seed,
}
model = DecisionTreeClassifier(**tree_params)

# Fit on the training rows, then predict the held-out rows
predictions = model.fit(X_train, Y_train).predict(X_test)

# Confusion matrix of true vs. predicted class codes
matrix = confusion_matrix(Y_test, predictions)
print("\nConfusion Matrix:\n", matrix)

# Per-class metrics; zero_division=1 reports 1 for any metric whose
# denominator is zero.
report = classification_report(Y_test, predictions, zero_division=1)
print("\nClassification Report:\n", report)


### 2. Gaussian Naive Bayes
- Sampling Technique - Train/Test Split (80:20)
- Classification Metrics - Classification Report and Confusion Matrix

In [None]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report

# Hold-out settings: 20% of the rows form the test set; the seed fixes the split
test_size = 0.20
seed = 7

# Read the weather CSV into a DataFrame
filename = 'D:/MSU_IIT/4th Year/ITD105/ML Using Different Algorithms/Case Studies/Case Study 1/seattle-weather.csv'
dataframe = read_csv(filename)

# Replace the labels in 'weather' with integer class codes
label_encoder = LabelEncoder()
label_encoder.fit(dataframe['weather'])
dataframe['weather'] = label_encoder.transform(dataframe['weather'])

# Target (Y) is the encoded 'weather' column; features (X) are every other
# column except 'date', which is assumed not to be a useful feature.
Y = dataframe['weather']
X = dataframe.drop(['weather', 'date'], axis=1)

# 80:20 train/test split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

# Gaussian Naive Bayes settings:
# - priors=None: no class prior probabilities are supplied by hand.
# - var_smoothing: smoothing term added to the variances for numerical stability.
nb_params = {
    'priors': None,
    'var_smoothing': 1e-9,
}
model = GaussianNB(**nb_params)

# Fit on the training rows, then predict the held-out rows
predictions = model.fit(X_train, Y_train).predict(X_test)

# Confusion matrix of true vs. predicted class codes
matrix = confusion_matrix(Y_test, predictions)
print("\nConfusion Matrix:\n", matrix)

# Per-class metrics; zero_division=1 reports 1 for any metric whose
# denominator is zero.
report = classification_report(Y_test, predictions, zero_division=1)
print("\nClassification Report:\n", report)

### 3. Gradient Boosting Machines (AdaBoost)
- Sampling Technique - Train/Test Split (80:20)
- Classification Metrics - Classification Report and Confusion Matrix

In [None]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report

# Hold-out settings: 20% of the rows form the test set; the seed is shared by
# the split and the ensemble so repeated runs give the same output.
test_size = 0.20
seed = 7

# Read the weather CSV into a DataFrame
filename = 'D:/MSU_IIT/4th Year/ITD105/ML Using Different Algorithms/Case Studies/Case Study 1/seattle-weather.csv'
dataframe = read_csv(filename)

# Replace the labels in 'weather' with integer class codes
label_encoder = LabelEncoder()
label_encoder.fit(dataframe['weather'])
dataframe['weather'] = label_encoder.transform(dataframe['weather'])

# Target (Y) is the encoded 'weather' column; features (X) are every other
# column except 'date', which is assumed not to be a useful feature.
Y = dataframe['weather']
X = dataframe.drop(['weather', 'date'], axis=1)

# 80:20 train/test split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

# AdaBoost settings:
# - n_estimators: upper limit on the number of weak learners that are boosted.
# - random_state: seed for reproducible results.
model = AdaBoostClassifier(
    n_estimators=50,
    random_state=seed,
)

# Fit on the training rows, then predict the held-out rows
predictions = model.fit(X_train, Y_train).predict(X_test)

# Confusion matrix of true vs. predicted class codes
matrix = confusion_matrix(Y_test, predictions)
print("\nConfusion Matrix:\n", matrix)

# Per-class metrics; zero_division=1 reports 1 for any metric whose
# denominator is zero.
report = classification_report(Y_test, predictions, zero_division=1)
print("\nClassification Report:\n", report)


### 4. K-Nearest Neighbors (K-NN)
- Sampling Technique - Train/Test Split (80:20)
- Classification Metrics - Classification Report and Confusion Matrix

In [None]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report

# Hold-out settings: 20% of the rows form the test set; the seed fixes the split
test_size = 0.20
seed = 7

# Read the weather CSV into a DataFrame
filename = 'D:/MSU_IIT/4th Year/ITD105/ML Using Different Algorithms/Case Studies/Case Study 1/seattle-weather.csv'
dataframe = read_csv(filename)

# Replace the labels in 'weather' with integer class codes
label_encoder = LabelEncoder()
label_encoder.fit(dataframe['weather'])
dataframe['weather'] = label_encoder.transform(dataframe['weather'])

# Target (Y) is the encoded 'weather' column; features (X) are every other
# column except 'date', which is assumed not to be a useful feature.
Y = dataframe['weather']
X = dataframe.drop(['weather', 'date'], axis=1)

# 80:20 train/test split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

# K-NN settings:
# - n_neighbors: how many nearest training points vote on each prediction.
# - weights: 'uniform' gives every neighbor an equal vote ('distance' is the
#   alternative weighting).
# - algorithm: neighbor-search method ('auto', 'ball_tree', 'kd_tree', 'brute').
knn_params = {
    'n_neighbors': 5,
    'weights': 'uniform',
    'algorithm': 'auto',
}
model = KNeighborsClassifier(**knn_params)

# Fit on the training rows, then predict the held-out rows
predictions = model.fit(X_train, Y_train).predict(X_test)

# Confusion matrix of true vs. predicted class codes
matrix = confusion_matrix(Y_test, predictions)
print("\nConfusion Matrix:\n", matrix)

# Per-class metrics; zero_division=1 reports 1 for any metric whose
# denominator is zero.
report = classification_report(Y_test, predictions, zero_division=1)
print("\nClassification Report:\n", report)


### 5. Logistic Regression
- Sampling Technique - Train/Test Split (80:20)
- Classification Metrics - Classification Report and Confusion Matrix

In [None]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report

# Hold-out settings: 20% of the rows form the test set; the seed fixes the split
test_size = 0.20
seed = 7

# Read the weather CSV into a DataFrame
filename = 'D:/MSU_IIT/4th Year/ITD105/ML Using Different Algorithms/Case Studies/Case Study 1/seattle-weather.csv'
dataframe = read_csv(filename)

# Replace the labels in 'weather' with integer class codes
label_encoder = LabelEncoder()
label_encoder.fit(dataframe['weather'])
dataframe['weather'] = label_encoder.transform(dataframe['weather'])

# Target (Y) is the encoded 'weather' column; features (X) are every other
# column except 'date', which is assumed not to be a useful feature.
Y = dataframe['weather']
X = dataframe.drop(['weather', 'date'], axis=1)

# 80:20 train/test split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

# Logistic regression settings:
# - solver: optimization algorithm ('lbfgs', 'liblinear', ...).
# - max_iter: iteration cap for the solver; raise it if the fit does not converge.
# - C: inverse of regularization strength (smaller C = stronger regularization).
model = LogisticRegression(
    solver='lbfgs',
    max_iter=700,
    C=1.0,
)

# Fit on the training rows, then predict the held-out rows
predictions = model.fit(X_train, Y_train).predict(X_test)

# Confusion matrix of true vs. predicted class codes
matrix = confusion_matrix(Y_test, predictions)
print("\nConfusion Matrix:\n", matrix)

# Per-class metrics; zero_division=1 reports 1 for any metric whose
# denominator is zero.
report = classification_report(Y_test, predictions, zero_division=1)
print("\nClassification Report:\n", report)


### 6. Multi-Layer Perceptron (MLP)
- Sampling Technique - Train/Test Split (80:20)
- Classification Metrics - Classification Report and Confusion Matrix

In [None]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report

# Hold-out settings: 20% of the rows form the test set; the seed is shared by
# the split and the network so repeated runs give the same output.
test_size = 0.20
seed = 7

# Read the weather CSV into a DataFrame
filename = 'D:/MSU_IIT/4th Year/ITD105/ML Using Different Algorithms/Case Studies/Case Study 1/seattle-weather.csv'
dataframe = read_csv(filename)

# Replace the labels in 'weather' with integer class codes
label_encoder = LabelEncoder()
label_encoder.fit(dataframe['weather'])
dataframe['weather'] = label_encoder.transform(dataframe['weather'])

# Target (Y) is the encoded 'weather' column; features (X) are every other
# column except 'date', which is assumed not to be a useful feature.
Y = dataframe['weather']
X = dataframe.drop(['weather', 'date'], axis=1)

# 80:20 train/test split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

# MLP settings:
# - hidden_layer_sizes: two hidden layers with 65 and 32 neurons.
# - activation: hidden-layer activation function ('relu', 'tanh', ...).
# - solver: weight optimizer ('adam', 'lbfgs', ...).
# - max_iter: iteration cap; raise it if training does not converge.
# - random_state: seed for reproducible results.
mlp_params = {
    'hidden_layer_sizes': (65, 32),
    'activation': 'relu',
    'solver': 'adam',
    'max_iter': 700,
    'random_state': seed,
}
model = MLPClassifier(**mlp_params)

# Fit on the training rows, then predict the held-out rows
predictions = model.fit(X_train, Y_train).predict(X_test)

# Confusion matrix of true vs. predicted class codes
matrix = confusion_matrix(Y_test, predictions)
print("\nConfusion Matrix:\n", matrix)

# Per-class metrics; zero_division=1 reports 1 for any metric whose
# denominator is zero.
report = classification_report(Y_test, predictions, zero_division=1)
print("\nClassification Report:\n", report)


### 7. Perceptron
- Sampling Technique - Train/Test Split (80:20)
- Classification Metrics - Classification Report and Confusion Matrix

In [None]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report

# Hold-out settings: 20% of the rows form the test set; the seed is shared by
# the split and the model so repeated runs give the same output.
test_size = 0.20
seed = 7

# Read the weather CSV into a DataFrame
filename = 'D:/MSU_IIT/4th Year/ITD105/ML Using Different Algorithms/Case Studies/Case Study 1/seattle-weather.csv'
dataframe = read_csv(filename)

# Replace the labels in 'weather' with integer class codes
label_encoder = LabelEncoder()
label_encoder.fit(dataframe['weather'])
dataframe['weather'] = label_encoder.transform(dataframe['weather'])

# Target (Y) is the encoded 'weather' column; features (X) are every other
# column except 'date', which is assumed not to be a useful feature.
Y = dataframe['weather']
X = dataframe.drop(['weather', 'date'], axis=1)

# 80:20 train/test split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

# Perceptron settings:
# - max_iter: cap on the number of passes over the training data.
# - tol: stopping tolerance; training ends early once the loss stops improving
#   by at least this much.
# - eta0: constant that scales each weight update (the learning rate).
# - random_state: seed for reproducible results.
model = Perceptron(
    max_iter=700,
    tol=1e-3,
    eta0=1.0,
    random_state=seed,
)

# Fit on the training rows, then predict the held-out rows
predictions = model.fit(X_train, Y_train).predict(X_test)

# Confusion matrix of true vs. predicted class codes
matrix = confusion_matrix(Y_test, predictions)
print("\nConfusion Matrix:\n", matrix)

# Per-class metrics; zero_division=1 reports 1 for any metric whose
# denominator is zero.
report = classification_report(Y_test, predictions, zero_division=1)
print("\nClassification Report:\n", report)


### 8. Random Forest
- Sampling Technique - Train/Test Split (80:20)
- Classification Metrics - Classification Report and Confusion Matrix

In [None]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report

# Hold-out settings: 20% of the rows form the test set; the seed is shared by
# the split and the forest so repeated runs give the same output.
test_size = 0.20
seed = 7

# Read the weather CSV into a DataFrame
filename = 'D:/MSU_IIT/4th Year/ITD105/ML Using Different Algorithms/Case Studies/Case Study 1/seattle-weather.csv'
dataframe = read_csv(filename)

# Replace the labels in 'weather' with integer class codes
label_encoder = LabelEncoder()
label_encoder.fit(dataframe['weather'])
dataframe['weather'] = label_encoder.transform(dataframe['weather'])

# Target (Y) is the encoded 'weather' column; features (X) are every other
# column except 'date', which is assumed not to be a useful feature.
Y = dataframe['weather']
X = dataframe.drop(['weather', 'date'], axis=1)

# 80:20 train/test split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

# Random forest settings:
# - n_estimators: number of trees in the forest.
# - max_depth: None leaves tree depth unlimited; set a number to curb overfitting.
# - min_samples_split: fewest samples a node needs before it may be split.
# - min_samples_leaf: fewest samples allowed in a leaf.
# - random_state: seed for reproducible results.
forest_params = {
    'n_estimators': 100,
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'random_state': seed,
}
model = RandomForestClassifier(**forest_params)

# Fit on the training rows, then predict the held-out rows
predictions = model.fit(X_train, Y_train).predict(X_test)

# Confusion matrix of true vs. predicted class codes
matrix = confusion_matrix(Y_test, predictions)
print("\nConfusion Matrix:\n", matrix)

# Per-class metrics; zero_division=1 reports 1 for any metric whose
# denominator is zero.
report = classification_report(Y_test, predictions, zero_division=1)
print("\nClassification Report:\n", report)


### 9. Support Vector Machines (SVM)
- Sampling Technique - Train/Test Split (80:20)
- Classification Metrics - Classification Report and Confusion Matrix

In [None]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report

# Hold-out settings: 20% of the rows form the test set; the seed fixes the split
test_size = 0.20
seed = 7

# Read the weather CSV into a DataFrame
filename = 'D:/MSU_IIT/4th Year/ITD105/ML Using Different Algorithms/Case Studies/Case Study 1/seattle-weather.csv'
dataframe = read_csv(filename)

# Replace the labels in 'weather' with integer class codes
label_encoder = LabelEncoder()
label_encoder.fit(dataframe['weather'])
dataframe['weather'] = label_encoder.transform(dataframe['weather'])

# Target (Y) is the encoded 'weather' column; features (X) are every other
# column except 'date', which is assumed not to be a useful feature.
Y = dataframe['weather']
X = dataframe.drop(['weather', 'date'], axis=1)

# 80:20 train/test split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

# SVM settings:
# - kernel: kernel type ('linear', 'poly', 'rbf', ...).
# - C: regularization parameter (smaller C = stronger regularization).
# - random_state: passed for consistency with the other models.
#   NOTE(review): per the scikit-learn docs SVC only uses it when
#   probability=True, so it likely has no effect here - confirm.
model = SVC(
    kernel='linear',
    C=1.0,
    random_state=seed,
)

# Fit on the training rows, then predict the held-out rows
predictions = model.fit(X_train, Y_train).predict(X_test)

# Confusion matrix of true vs. predicted class codes
matrix = confusion_matrix(Y_test, predictions)
print("\nConfusion Matrix:\n", matrix)

# Per-class metrics; zero_division=1 reports 1 for any metric whose
# denominator is zero.
report = classification_report(Y_test, predictions, zero_division=1)
print("\nClassification Report:\n", report)


## Algorithms Used for Regression
1. CART (Classification and Regression Trees)
2. Elastic Net
3. Gradient Boosting Machines (AdaBoost)
4. K-Nearest Neighbors (K-NN)
5. Lasso and Ridge Regression
6. Linear Regression
7. Multi-Layer Perceptron (MLP)
8. Random Forest
9. Support Vector Machines (SVM)

**When comparing models, a lower MAE (mean absolute error) is generally better.**

### 1. CART (Classification and Regression Trees) - DecisionTree Regressor
- Sampling Technique = Train/Test Split (80:20)
- Regression Metrics = MAE

In [None]:
from pandas import read_csv
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Hold-out settings: 20% of the rows form the test set; the seed is shared by
# the split and the tree so repeated runs give the same output.
test_size = 0.20
seed = 7

# Read the rainfall CSV and discard the 'Day' column
filename = 'D:/MSU_IIT/4th Year/ITD105/ML Using Different Algorithms/Case Studies/Case Study 1/Rainfall_data.csv'
dataframe = read_csv(filename)
dataframe = dataframe.drop('Day', axis=1)

# Target (Y) is 'Precipitation'; every remaining column is a feature (X)
Y = dataframe['Precipitation']
X = dataframe.drop('Precipitation', axis=1)

# 80:20 train/test split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

# Decision tree settings:
# - max_depth: None leaves the depth unlimited; set a number to curb overfitting.
# - min_samples_split: fewest samples an internal node needs before it may split.
# - min_samples_leaf: fewest samples allowed in a leaf.
# - random_state: seed for reproducible results.
tree_params = {
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'random_state': seed,
}
model = DecisionTreeRegressor(**tree_params)

# Fit on the training rows, then predict the held-out rows
predictions = model.fit(X_train, Y_train).predict(X_test)

# Mean absolute error between the true and predicted test targets
mae = mean_absolute_error(Y_test, predictions)
print("MAE on Test Set: %.3f" % (mae,))

### 2. Elastic Net
- Sampling Technique = Train/Test Split (80:20)
- Regression Metrics = MAE

In [None]:
from pandas import read_csv
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Hold-out settings: 20% of the rows form the test set; the seed fixes the split
test_size = 0.20
seed = 7

# Read the rainfall CSV and discard the 'Day' column
filename = 'D:/MSU_IIT/4th Year/ITD105/ML Using Different Algorithms/Case Studies/Case Study 1/Rainfall_data.csv'
dataframe = read_csv(filename)
dataframe = dataframe.drop('Day', axis=1)

# Target (Y) is 'Precipitation'; every remaining column is a feature (X)
Y = dataframe['Precipitation']
X = dataframe.drop('Precipitation', axis=1)

# 80:20 train/test split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

# Elastic Net settings:
# - alpha: overall regularization strength (multiplies the combined penalty).
# - l1_ratio: mix between the penalties; 0 is pure L2 (Ridge), 1 is pure L1
#   (Lasso), values in between blend the two.
# - max_iter: iteration cap for the solver; raise it if the fit does not converge.
# - random_state: seed passed to the estimator.
enet_params = {
    'alpha': 1.0,
    'l1_ratio': 0.5,
    'max_iter': 1000,
    'random_state': seed,
}
model = ElasticNet(**enet_params)

# Fit on the training rows, then predict the held-out rows
predictions = model.fit(X_train, Y_train).predict(X_test)

# Mean absolute error between the true and predicted test targets
mae = mean_absolute_error(Y_test, predictions)
print("MAE on Test Set: %.3f" % (mae,))


### 3. Gradient Boosting Machines (AdaBoost)
- Sampling Technique = Train/Test Split (80:20)
- Regression Metrics = MAE

In [None]:
from pandas import read_csv
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Hold-out settings: 20% of the rows form the test set; the seed is shared by
# the split and the ensemble so repeated runs give the same output.
test_size = 0.20
seed = 7

# Read the rainfall CSV and discard the 'Day' column
filename = 'D:/MSU_IIT/4th Year/ITD105/ML Using Different Algorithms/Case Studies/Case Study 1/Rainfall_data.csv'
dataframe = read_csv(filename)
dataframe = dataframe.drop('Day', axis=1)

# Target (Y) is 'Precipitation'; every remaining column is a feature (X)
Y = dataframe['Precipitation']
X = dataframe.drop('Precipitation', axis=1)

# 80:20 train/test split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

# AdaBoost settings:
# - n_estimators: upper limit on the number of weak regressors that are boosted.
# - learning_rate: weight given to each weak regressor's contribution.
# - random_state: seed for reproducible results.
model = AdaBoostRegressor(
    n_estimators=50,
    learning_rate=1.0,
    random_state=seed,
)

# Fit on the training rows, then predict the held-out rows
predictions = model.fit(X_train, Y_train).predict(X_test)

# Mean absolute error between the true and predicted test targets
mae = mean_absolute_error(Y_test, predictions)
print("MAE on Test Set: %.3f" % (mae,))


### 4. K-Nearest Neighbors (K-NN)
- Sampling Technique = Train/Test Split (80:20)
- Regression Metrics = MAE

In [None]:
from pandas import read_csv
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Hold-out settings: 20% of the rows form the test set; the seed fixes the split
test_size = 0.20
seed = 7

# Read the rainfall CSV and discard the 'Day' column
filename = 'D:/MSU_IIT/4th Year/ITD105/ML Using Different Algorithms/Case Studies/Case Study 1/Rainfall_data.csv'
dataframe = read_csv(filename)
dataframe = dataframe.drop('Day', axis=1)

# Target (Y) is 'Precipitation'; every remaining column is a feature (X)
Y = dataframe['Precipitation']
X = dataframe.drop('Precipitation', axis=1)

# 80:20 train/test split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

# K-NN settings:
# - n_neighbors: how many nearest training points are averaged per prediction.
# - weights: 'uniform' weights every neighbor equally ('distance' is the
#   alternative weighting).
# - algorithm: neighbor-search method ('auto', 'ball_tree', 'kd_tree', 'brute').
knn_params = {
    'n_neighbors': 5,
    'weights': 'uniform',
    'algorithm': 'auto',
}
model = KNeighborsRegressor(**knn_params)

# Fit on the training rows, then predict the held-out rows
predictions = model.fit(X_train, Y_train).predict(X_test)

# Mean absolute error between the true and predicted test targets
mae = mean_absolute_error(Y_test, predictions)
print("MAE on Test Set: %.3f" % (mae,))


### 5. Lasso and Ridge Regression
- Sampling Technique = Train/Test Split (80:20)
- Regression Metrics = MAE

In [None]:
from pandas import read_csv
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

# Hold-out settings: 20% of the rows form the test set; the seed fixes the split
test_size = 0.20
seed = 7

# Read the rainfall CSV and discard the 'Day' column
filename = 'D:/MSU_IIT/4th Year/ITD105/ML Using Different Algorithms/Case Studies/Case Study 1/Rainfall_data.csv'
dataframe = read_csv(filename)
dataframe = dataframe.drop('Day', axis=1)

# Target (Y) is 'Precipitation'; every remaining column is a feature (X)
Y = dataframe['Precipitation']
X = dataframe.drop('Precipitation', axis=1)

# 80:20 train/test split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

# Standardize the features. The scaler learns its statistics from the training
# rows only and then applies them to both sets, so the test set does not
# influence the scaling.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Settings shared by both penalized models:
# - alpha: regularization strength (L1 penalty for Lasso, L2 penalty for Ridge).
# - max_iter: iteration cap for the solver; raise it if the fit does not converge.
# - random_state: seed passed to the estimator.

# --- Lasso (L1 penalty) ---
lasso_model = Lasso(
    alpha=1.0,
    max_iter=1000,
    random_state=seed,
)
lasso_model.fit(X_train_scaled, Y_train)

# Predict the held-out rows and score them with MAE
lasso_predictions = lasso_model.predict(X_test_scaled)
lasso_mae = mean_absolute_error(Y_test, lasso_predictions)
print("Lasso MAE on Test Set: %.3f" % (lasso_mae,))

# --- Ridge (L2 penalty) ---
ridge_model = Ridge(
    alpha=1.0,
    max_iter=1000,
    random_state=seed,
)
ridge_model.fit(X_train_scaled, Y_train)

# Predict the held-out rows and score them with MAE
ridge_predictions = ridge_model.predict(X_test_scaled)
ridge_mae = mean_absolute_error(Y_test, ridge_predictions)
print("Ridge MAE on Test Set: %.3f" % (ridge_mae,))


### 6. Linear Regression
- Sampling Technique = Train/Test Split (80:20)
- Regression Metrics = MAE

In [None]:
from pandas import read_csv
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Hold-out settings: 20% of the rows form the test set; the seed fixes the split
test_size = 0.20
seed = 7

# Read the rainfall CSV and discard the 'Day' column
filename = 'D:/MSU_IIT/4th Year/ITD105/ML Using Different Algorithms/Case Studies/Case Study 1/Rainfall_data.csv'
dataframe = read_csv(filename)
dataframe = dataframe.drop('Day', axis=1)

# Target (Y) is 'Precipitation'; every remaining column is a feature (X)
Y = dataframe['Precipitation']
X = dataframe.drop('Precipitation', axis=1)

# 80:20 train/test split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

# Linear regression with the library's default settings
model = LinearRegression()

# Fit on the training rows, then predict the held-out rows
predictions = model.fit(X_train, Y_train).predict(X_test)

# Mean absolute error between the true and predicted test targets
mae = mean_absolute_error(Y_test, predictions)
print("MAE on Test Set: %.3f" % (mae,))


### 7. Multi-Layer Perceptron (MLP)
- Sampling Technique = Train/Test Split (80:20)
- Regression Metrics = MAE

In [None]:
from pandas import read_csv
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Hold-out settings: 20% of the rows form the test set; the seed fixes the split
test_size = 0.20
seed = 7

# Read the rainfall CSV and discard the 'Day' column
filename = 'D:/MSU_IIT/4th Year/ITD105/ML Using Different Algorithms/Case Studies/Case Study 1/Rainfall_data.csv'
dataframe = read_csv(filename)
dataframe = dataframe.drop('Day', axis=1)

# Target (Y) is 'Precipitation'; every remaining column is a feature (X)
Y = dataframe['Precipitation']
X = dataframe.drop('Precipitation', axis=1)

# 80:20 train/test split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

# MLP settings:
# - hidden_layer_sizes: two hidden layers with 100 and 50 neurons.
# - activation: hidden-layer activation ('identity', 'logistic', 'tanh', 'relu').
# - solver: weight optimizer ('adam', 'lbfgs', 'sgd').
# - learning_rate: learning-rate schedule ('constant', 'invscaling', 'adaptive').
# - max_iter: iteration cap for training.
# - random_state: the network uses its own seed (50), not the split seed above.
mlp_params = {
    'hidden_layer_sizes': (100, 50),
    'activation': 'relu',
    'solver': 'adam',
    'learning_rate': 'constant',
    'max_iter': 1000,
    'random_state': 50,
}
model = MLPRegressor(**mlp_params)

# Fit on the training rows, then predict the held-out rows
predictions = model.fit(X_train, Y_train).predict(X_test)

# Mean absolute error between the true and predicted test targets
mae = mean_absolute_error(Y_test, predictions)
print("MAE on Test Set: %.3f" % (mae,))


### 8. Random Forest
- Sampling Technique = Train/Test Split (80:20)
- Regression Metrics = MAE

In [None]:
from pandas import read_csv
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Read the rainfall data and discard the 'Day' column in one step
filename = 'D:/MSU_IIT/4th Year/ITD105/ML Using Different Algorithms/Case Studies/Case Study 1/Rainfall_data.csv'
dataframe = read_csv(filename).drop(columns=['Day'])

# Target is 'Precipitation'; every remaining column is a feature
Y = dataframe['Precipitation']
X = dataframe.drop(columns=['Precipitation'])

# Hold out 20% of the rows for testing; a single seed drives the split and the forest
seed = 42
test_size = 0.20  # Hyperparameter: Fraction of the dataset to use for testing
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

# Random Forest hyperparameters, gathered in one place for easy tuning
rf_params = {
    'n_estimators': 100,      # Number of trees in the forest (ensemble size)
    'max_depth': None,        # Maximum depth of each tree; None lets trees grow until leaves are pure
    'min_samples_split': 2,   # Minimum number of samples required to split an internal node
    'min_samples_leaf': 1,    # Minimum number of samples required in a leaf node
    'random_state': seed,     # Random seed for reproducibility
}

# Build the regressor and fit it on the training split (fit returns the estimator itself)
model = RandomForestRegressor(**rf_params).fit(X_train, Y_train)

# Score the held-out split with mean absolute error
predictions = model.predict(X_test)
mae = mean_absolute_error(Y_test, predictions)
print("MAE on Test Set: %.3f" % (mae,))


### 9. Support Vector Machines (SVM)
- Sampling Technique = Train/Test Split (80:20)
- Regression Metrics = MAE

In [None]:
from pandas import read_csv
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Load the dataset
filename = 'D:/MSU_IIT/4th Year/ITD105/ML Using Different Algorithms/Case Studies/Case Study 1/Rainfall_data.csv'
dataframe = read_csv(filename)

# Drop the 'Day' column
dataframe = dataframe.drop(columns=['Day'])

# Extract features (X) and target variable (Y)
X = dataframe.drop(columns=['Precipitation'])  # Features excluding 'Precipitation'
Y = dataframe['Precipitation']  # Target variable

# Set the test size and the seed used for the train/test split
test_size = 0.20  # Hyperparameter: Fraction of the dataset to use for testing
seed = 7

# Split the dataset into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

# Build a Support Vector Regressor (SVR) behind a StandardScaler.
# SVMs are not scale-invariant: with an RBF kernel, the feature with the largest
# numeric range dominates the distance computation. Standardising inside a
# pipeline fits the scaler on the training split only and re-applies the same
# transformation to the test split at prediction time.
model = make_pipeline(
    StandardScaler(),
    SVR(
        kernel='rbf',   # Hyperparameter: The kernel function to use ('linear', 'poly', 'rbf', 'sigmoid').
        C=1.0,          # Hyperparameter: Regularization parameter; larger values penalise training errors more strongly.
        epsilon=0.1,    # Hyperparameter: Width of the epsilon-tube (in target units) within which errors are not penalised.
    ),
)

# Train with train set (fits the scaler, then the SVR)
model.fit(X_train, Y_train)

# Make predictions on the test set (test features are scaled with training statistics)
predictions = model.predict(X_test)

# Calculate the mean absolute error on the test set
mae = mean_absolute_error(Y_test, predictions)
print("MAE on Test Set: %.3f" % mae)
