##This Code is for the Question 1 of the Exercise 2

In [15]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
from imblearn.over_sampling import RandomOverSampler

# Load the Excel file into a pandas DataFrame
file_path = 'HW2_AUT_MLPR_4021-2-Email-SPAM (3).xlsx'  # The actual path to the Excel file
df = pd.read_excel(file_path)

# Extract the text and labels from the DataFrame
texts = df['text'].tolist()
labels = df['spam'].tolist()
labels[:5]
# Create a CountVectorizer to convert text data into a feature vector
vectorizer = CountVectorizer(max_features=100)  # Limit to the top 100 features
X = vectorizer.fit_transform(texts)

# Choose a base classifier for Bagging (RandomForest)
bagging_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Choose a base classifier for Boosting (AdaBoost)
boosting_classifier = AdaBoostClassifier(n_estimators=50, random_state=42)

# Function to evaluate and report metrics
def evaluate_model(classifier, X, y):
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    y_pred = cross_val_predict(classifier, X, y, cv=skf)

    # Confusion Matrix
    conf_matrix = confusion_matrix(y, y_pred)
    print("Confusion Matrix:\n", conf_matrix)

    # Precision, Recall, F-measure
    precision = precision_score(y, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y, y_pred, average='weighted', zero_division=0)
    print("Precision: {:.2%}".format(precision))
    print("Recall: {:.2%}".format(recall))
    print("F-measure: {:.2%}".format(f1))

    # Accuracy and Error
    accuracy = accuracy_score(y, y_pred)
    error = 1 - accuracy
    print("Error: {:.2%}".format(error))

# Fit the classifiers
bagging_classifier.fit(X, labels)
boosting_classifier.fit(X, labels)

# Evaluate and report metrics for Bagging
print("Bagging Metrics:")
evaluate_model(bagging_classifier, X, labels)
print("\n" + "="*30 + "\n")

# Evaluate and report metrics for Boosting
print("Boosting Metrics:")
evaluate_model(boosting_classifier, X, labels)

Bagging Metrics:
Confusion Matrix:
 [[4295   67]
 [  93 1275]]
Precision: 97.19%
Recall: 97.21%
F-measure: 97.20%
Error: 2.79%


Boosting Metrics:
Confusion Matrix:
 [[4204  158]
 [  82 1286]]
Precision: 95.93%
Recall: 95.81%
F-measure: 95.85%
Error: 4.19%


##This Code is for the Question 2 of the Exercise 2

###Installing the required package

In [None]:
pip install jdatetime

Collecting jdatetime
  Downloading jdatetime-4.1.1-py3-none-any.whl (13 kB)
Installing collected packages: jdatetime
Successfully installed jdatetime-4.1.1


In [None]:
from jdatetime import datetime, date

###Converting the Persian calendar datetime to Gregorian calendar datetime (for example 14010801 to 10/23/2022) and reverse the order of the rows in all columns for Stock_Index

In [None]:
# Read the original Excel file into a pandas DataFrame
df = pd.read_excel('Stock_Index_13940101_14010801.xlsx')

# Convert "dateissue" column from Persian to Gregorian and format it as MM/DD/YYYY
df['dateissue'] = df['dateissue'].apply(lambda x: datetime.strptime(str(x), '%Y%m%d').togregorian().strftime('%m/%d/%Y'))

# Reverse the order of rows in the DataFrame
df = df[::-1]

# Save the updated DataFrame to the new Excel file
df.to_excel('Stock_Index_03-25-2015_10-23-2022.xlsx', index=False)

###Converting date format of the Crude Oil data (for example 2-Dec-22 to 12/02/2022)

In [None]:
# Read the original Excel file into a pandas DataFrame
df = pd.read_excel('Crude_Oil_Price-Jan_2015_Dec_2022.xlsx')

# Assuming the date column is named "date_column"
df['Date'] = pd.to_datetime(df['Date'], format='%d-%b-%y').dt.strftime('%m/%d/%Y')

# Reverse the order of rows in the DataFrame
df = df[::-1]

# Save the updated DataFrame to the new Excel file
df.to_excel('Crude_Oil_Price_01-02-2015_12-02-2022.xlsx', index=False)

###Converting date format of the Gold data (for example 2-Dec-22 to 12/02/2022)

In [None]:
# Read the original Excel file into a pandas DataFrame
df = pd.read_excel('Gold_Price-Jan_2015_Dec_2022.xlsx')

# Assuming the date column is named "date_column"
df['Date'] = pd.to_datetime(df['Date'], format='%d-%b-%y').dt.strftime('%m/%d/%Y')

# Reverse the order of rows in the DataFrame
df = df[::-1]

# Save the updated DataFrame to the new Excel file
df.to_excel('Gold_Price_01-02-2015_12-02-2022.xlsx', index=False)

 ### Now, We select only Close column of every table and merge them on Date Column and save them as "Dataset.xlsx".

In [None]:
# List of file paths for the six Excel files
file_paths = [
    'USD-IRR-Nov-2011-December-2022.xlsx',
    'AED-USD-Jan_2015_Dec_2022.xlsx',
    'EUR-USD-Jan_2015_Dec_2022.xlsx',
    'Crude_Oil_Price_01-02-2015_12-02-2022.xlsx',
    'Gold_Price_01-02-2015_12-02-2022.xlsx',
    'Stock_Index_03-25-2015_10-23-2022.xlsx',
]

# Specify the common column for merging
merge_column = 'Date'

# Read each Excel file into a DataFrame, selecting only the "Date" and "Close" columns
dfs = [pd.read_excel(file, usecols=[merge_column, 'Close']) for file in file_paths]

# Convert the "Date" column to a consistent datetime type
for i in range(len(dfs)):
    dfs[i][merge_column] = pd.to_datetime(dfs[i][merge_column], errors='coerce')

# Merge DataFrames based on the "Date" column
merged_df = pd.merge(dfs[0], dfs[1], on=merge_column, how='outer', suffixes=('_file1', '_file2'))

# Continue merging with the rest of the DataFrames
for i in range(2, len(dfs)):
    merged_df = pd.merge(merged_df, dfs[i], on=merge_column, how='outer', suffixes=('', f'_file{i + 1}'))

# Save the merged DataFrame to a new Excel file
merged_df.to_excel('Dataset.xlsx', index=False)

###Now, We change the name of each column in the Dataset Excel file. Also, we unify all the date formats to be alike. We do this manually in Microsoft Excel easily and save it as "Dataset-Final.xlsx". Now we have our final dataset called "Dataset-Final.xlsx" but still, it has null data. To address this issue, First, we sort the records by Date column.

####Sortig the Dataset on Date column

In [None]:
# Read the merged Excel file into a DataFrame
merged_df = pd.read_excel('Dataset-Final.xlsx')

# Sort the DataFrame by the "Date" column
merged_df.sort_values(by='Date', inplace=True)

# Save the updated and sorted DataFrame to a new Excel file
merged_df.to_excel('Dataset-Final-2.xlsx', index=False)

 ### Then, we use the dataset of 03-25-2015 to 10-23-2022 period and save it as "Dataset-Final-3.xlsx". By doing this, we've handled the outer null values but still, we need to handle the inner null values. To do so, we should interpolate for inner null values.

####Interpolate the null values for every column of price data

In [None]:
import numpy as np

# Read the merged Excel file into a DataFrame
merged_df = pd.read_excel('Dataset-Final-3.xlsx', index_col='Date', parse_dates=True)

# Replace non-numeric values (e.g., "-") with NaN
merged_df.replace('-', np.nan, inplace=True)

# Convert columns to numeric (if not already) to prepare for interpolation
merged_df = merged_df.apply(pd.to_numeric, errors='coerce')

# Forward fill and backward fill
for idx in range(0, 6):
    merged_df.iloc[:, idx].fillna(method='ffill', inplace=True)
    merged_df.iloc[:, idx].fillna(method='bfill', inplace=True)

# Interpolate using time-based method
for idx in range(0, 6):
    merged_df.iloc[:, idx] = merged_df.iloc[:, idx].interpolate(method='time')

# Save the updated and sorted DataFrame to a new Excel file
merged_df.to_excel('Dataset-Final-4.xlsx')

###Now we add a new column called "Next Day Date" and fill it with the next day date data.


In [None]:
# Read the merged Excel file into a DataFrame
merged_df = pd.read_excel('Dataset-Final-4.xlsx', index_col='Date', parse_dates=True)

# Add a new column "Next Day Date" and fill it with the Date from the next row
merged_df['Next Day Date'] = merged_df.index.shift(1, freq='D')

# Save the updated DataFrame to a new Excel file
merged_df.to_excel('Dataset-Final-5.xlsx')

###Now we add a new column called "Gap" and fill it with the date distance between every two consequent rows.


In [None]:
# Read the merged Excel file into a DataFrame
merged_df = pd.read_excel('Dataset-Final-5.xlsx', index_col='Date', parse_dates=True)

# Add a new column "Gap" and fill it with the date gap between "Date" and "Next Day Date"
merged_df['Gap'] = (merged_df['Next Day Date'] - merged_df.index).dt.days

# Save the updated DataFrame to a new Excel file
merged_df.to_excel('Dataset-Final-6.xlsx')

###Now we add a new column called "Forecast" and fill it with the -1, 0, 1 by comparing each USD-IRR column value of each row with the next corresponding record.

In [None]:
# Read the merged Excel file into a DataFrame
merged_df = pd.read_excel('Dataset-Final-6.xlsx', index_col='Date', parse_dates=True)

# Add a new column "Forecast" and fill it based on the corrected conditions
merged_df['Forecast'] = (merged_df['USD-IRR'].shift(-1) - merged_df['USD-IRR']).apply(lambda x: -1 if x < 0 else (1 if x > 0 else 0))

# Save the updated DataFrame to a new Excel file
merged_df.to_excel('Dataset-Final-7.xlsx')

###Move the first column place to the penultimate column next to the forecast for more convenience

In [None]:
# Read the merged Excel file into a DataFrame
merged_df = pd.read_excel('Dataset-Final-7.xlsx', index_col='Date', parse_dates=True)

# Move the first column to the penultimate position
first_column = merged_df.pop(merged_df.columns[0])
merged_df.insert(len(merged_df.columns) - 1, first_column.name, first_column)

# Save the updated DataFrame to a new Excel file
merged_df.to_excel('Dataset-Final-8.xlsx')

##Now that the dataset is ready, We proceed to the simulations and evaluations part for Classification Methods.

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

# Read the dataset
df = pd.read_excel('Dataset-Final-8.xlsx', index_col='Date', parse_dates=True)

# Drop unnecessary columns
df = df.drop(['Next Day Date'], axis=1)

# Define features (X) and target (y)
X = df.drop('Forecast', axis=1)
y = df['Forecast']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# SVM (Support Vector Machine)
svm_model = SVC()
svm_model.fit(X_train_scaled, y_train)
svm_predictions = svm_model.predict(X_test_scaled)

# Naive Bayes
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
nb_predictions = nb_model.predict(X_test)

# Decision Tree
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)
dt_predictions = dt_model.predict(X_test)

# KNN (K-Nearest Neighbors)
knn_model = KNeighborsClassifier(n_neighbors=100)
knn_model.fit(X_train_scaled, y_train)
knn_predictions = knn_model.predict(X_test_scaled)

# Evaluation Metrics
def evaluate_model(predictions, y_test):
    precision = round(precision_score(y_test, predictions, average='weighted', zero_division=1) * 100, 2)
    recall = round(recall_score(y_test, predictions, average='weighted', zero_division=1) * 100, 2)
    accuracy = round(accuracy_score(y_test, predictions) * 100, 2)
    f1 = round(f1_score(y_test, predictions, average='weighted', zero_division=1) * 100, 2)

    return precision, recall, accuracy, f1

# Evaluate SVM
svm_metrics = evaluate_model(svm_predictions, y_test)
print("SVM Metrics:")
print("Precision:", svm_metrics[0])
print("Recall:", svm_metrics[1])
print("Accuracy:", svm_metrics[2])
print("F1 Score:", svm_metrics[3])

# Evaluate Naive Bayes
nb_metrics = evaluate_model(nb_predictions, y_test)
print("\nNaive Bayes Metrics:")
print("Precision:", nb_metrics[0])
print("Recall:", nb_metrics[1])
print("Accuracy:", nb_metrics[2])
print("F1 Score:", nb_metrics[3])

# Evaluate Decision Tree
dt_metrics = evaluate_model(dt_predictions, y_test)
print("\nDecision Tree Metrics:")
print("Precision:", dt_metrics[0])
print("Recall:", dt_metrics[1])
print("Accuracy:", dt_metrics[2])
print("F1 Score:", dt_metrics[3])

# Evaluate KNN
knn_metrics = evaluate_model(knn_predictions, y_test)
print("\nKNN Metrics:")
print("Precision:", knn_metrics[0])
print("Recall:", knn_metrics[1])
print("Accuracy:", knn_metrics[2])
print("F1 Score:", knn_metrics[3])

SVM Metrics:
Precision: 48.64
Recall: 37.59
Accuracy: 37.59
F1 Score: 33.7

Naive Bayes Metrics:
Precision: 58.73
Recall: 37.04
Accuracy: 37.04
F1 Score: 28.04

Decision Tree Metrics:
Precision: 42.17
Recall: 41.97
Accuracy: 41.97
F1 Score: 41.93

KNN Metrics:
Precision: 46.64
Recall: 40.15
Accuracy: 40.15
F1 Score: 40.31


##Now, We proceed to the simulations and evaluations part for Regression Methods. We use Linear Regression, Ridge Regression, Decision Tree Regressor, and K-Nearest Neighbors Regressor.

In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, r2_score

# Read the dataset
df = pd.read_excel('Dataset-Final-8.xlsx', index_col='Date', parse_dates=True)

# Drop unnecessary columns (if needed)
df = df.drop(['Next Day Date'], axis=1)

# Define features (X) and target (y)
X = df.drop('USD-IRR', axis=1)
y = df['USD-IRR']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Linear Regression
linear_model = LinearRegression()
linear_model.fit(X_train_scaled, y_train)
linear_predictions = linear_model.predict(X_test_scaled)

# Ridge Regression
ridge_model = Ridge(alpha=1.0)  # You can experiment with different alpha values
ridge_model.fit(X_train_scaled, y_train)
ridge_predictions = ridge_model.predict(X_test_scaled)

# Decision Tree Regressor
dt_model = DecisionTreeRegressor()
dt_model.fit(X_train, y_train)
dt_predictions = dt_model.predict(X_test)

# K-Nearest Neighbors Regressor
knn_model = KNeighborsRegressor()
knn_model.fit(X_train_scaled, y_train)
knn_predictions = knn_model.predict(X_test_scaled)

# Evaluation Metrics
def evaluate_regression_model(predictions, y_test):
    mae = round(mean_absolute_error(y_test, predictions), 2)
    mape = round(mean_absolute_percentage_error(y_test, predictions) * 100, 2)
    mse = round(mean_squared_error(y_test, predictions), 2)
    rmse = round(mean_squared_error(y_test, predictions, squared=False), 2)
    r2 = round(r2_score(y_test, predictions) * 100, 2)

    return mae, mape, mse, rmse, r2

# Evaluate Linear Regression
linear_metrics = evaluate_regression_model(linear_predictions, y_test)
print("Linear Regression Metrics:")
print("Mean Absolute Error:", linear_metrics[0])
print("Mean Absolute Percentage Error:", linear_metrics[1], "%")
print("Mean Squared Error:", linear_metrics[2])
print("Root Mean Squared Error:", linear_metrics[3])
print("R2 Score:", linear_metrics[4], "%")

# Evaluate Ridge Regression
ridge_metrics = evaluate_regression_model(ridge_predictions, y_test)
print("\nRidge Regression Metrics:")
print("Mean Absolute Error:", ridge_metrics[0])
print("Mean Absolute Percentage Error:", ridge_metrics[1], "%")
print("Mean Squared Error:", ridge_metrics[2])
print("Root Mean Squared Error:", ridge_metrics[3])
print("R2 Score:", ridge_metrics[4], "%")

# Evaluate Decision Tree Regressor
dt_metrics = evaluate_regression_model(dt_predictions, y_test)
print("\nDecision Tree Regressor Metrics:")
print("Mean Absolute Error:", dt_metrics[0])
print("Mean Absolute Percentage Error:", dt_metrics[1], "%")
print("Mean Squared Error:", dt_metrics[2])
print("Root Mean Squared Error:", dt_metrics[3])
print("R2 Score:", dt_metrics[4], "%")

# Evaluate K-Nearest Neighbors Regressor
knn_metrics = evaluate_regression_model(knn_predictions, y_test)
print("\nK-Nearest Neighbors Regressor Metrics:")
print("Mean Absolute Error:", knn_metrics[0])
print("Mean Absolute Percentage Error:", knn_metrics[1], "%")
print("Mean Squared Error:", knn_metrics[2])
print("Root Mean Squared Error:", knn_metrics[3])
print("R2 Score:", knn_metrics[4], "%")

Linear Regression Metrics:
Mean Absolute Error: 23462.28
Mean Absolute Percentage Error: 30.4 %
Mean Squared Error: 833833695.86
Root Mean Squared Error: 28876.18
R2 Score: 90.7 %

Ridge Regression Metrics:
Mean Absolute Error: 23467.1
Mean Absolute Percentage Error: 30.42 %
Mean Squared Error: 833989808.59
Root Mean Squared Error: 28878.88
R2 Score: 90.7 %

Decision Tree Regressor Metrics:
Mean Absolute Error: 2041.64
Mean Absolute Percentage Error: 1.36 %
Mean Squared Error: 35544536.13
Root Mean Squared Error: 5961.92
R2 Score: 99.6 %

K-Nearest Neighbors Regressor Metrics:
Mean Absolute Error: 5377.18
Mean Absolute Percentage Error: 5.16 %
Mean Squared Error: 123328436.93
Root Mean Squared Error: 11105.33
R2 Score: 98.62 %
