In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score,explained_variance_score, median_absolute_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import time
import psutil

# Load the dataset (replace with your path)
df = pd.read_csv('E:/assignment/et/report/work500.csv')

# Data cleaning (drop irrelevant columns)
irrelevant_columns = ['TimePosition', 'SourcePosition', 'Latitude', 'Longitude', 'Course', 'NavStatus',
                      'TimeVoyage', 'SourceVoyage', 'IMO', 'Name', 'Callsign', 'Flag', 'DimA', 'DimB',
                      'DimC', 'DimD', 'Destination', 'TimeETA']

columns_to_drop = [col for col in irrelevant_columns if col in df.columns]
df = df.drop(columns=columns_to_drop)

# Drop rows with missing values
df = df.dropna()

# Remove outliers (for example, speeds or draughts above 99th percentile)
df = df[(df['Speed'] <= df['Speed'].quantile(0.99)) & 
        (df['Draught'] <= df['Draught'].quantile(0.99))]

# Create a synthetic target column 'CO2_Emissions' for demonstration (highly non-linear relationship)
df['CO2_Emissions'] = 0.1 * (df['Speed'] ** 2) + 0.3 * np.sin(df['Draught']) + 0.2 * (df['Length'] ** 3) + 0.05 * np.cos(df['Width']) + np.random.randn(len(df))

# Create interaction terms between features
df['Speed_Draught'] = df['Speed'] * df['Draught']
df['Length_Width'] = df['Length'] * df['Width']

# Introduce non-linear terms (squared and cubic terms)
df['Speed_squared'] = df['Speed'] ** 2
df['Draught_squared'] = df['Draught'] ** 2
df['Length_cubed'] = df['Length'] ** 3

# Trigonometric transformations (to create non-linearity)
df['sin_Length'] = np.sin(df['Length'])
df['cos_Width'] = np.cos(df['Width'])

# Introduce exponential terms
df['exp_Speed'] = np.exp(df['Speed'] / 100)  # Keep the value reasonable by dividing

# Label encoding for categorical feature 'VesselType'
le = LabelEncoder()
df['VesselType'] = le.fit_transform(df['VesselType'])  # Encode 'VesselType'

# Apply KMeans clustering to add a new feature representing cluster memberships
X = df.drop(columns='CO2_Emissions')  # Exclude the target variable
kmeans = KMeans(n_clusters=5, random_state=42)
df['Cluster'] = kmeans.fit_predict(X)  # Adding cluster labels as a feature

# Apply PCA to reduce dimensionality but retain the complexity
pca = PCA(n_components=5)
X_pca = pca.fit_transform(X)

# Adding PCA components to the original dataset
df['PCA1'] = X_pca[:, 0]
df['PCA2'] = X_pca[:, 1]

# Example of domain-based feature engineering
df['Vessel_Volume'] = df['Length'] * df['Width'] * df['Draught']

# Generate polynomial features (quadratic in this example)
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)

# Split data into features (X) and target (y)
X = df.drop(columns='CO2_Emissions')
y = df['CO2_Emissions']

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the dataset
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize Linear SVM model
linear_svm = SVR(kernel='linear')

# Initialize Polynomial SVM model with hyperparameter tuning using GridSearchCV
param_grid = {
    'C': [0.1, 1, 10],           # Regularization strength
    'epsilon': [0.01, 0.1, 0.5],  # Margin of error for SVR
    'degree': [2, 3, 4]           # Polynomial degree
}

poly_svm = SVR(kernel='poly')

# Use GridSearchCV to find the best hyperparameters for Polynomial SVM
grid_search = GridSearchCV(estimator=poly_svm, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
# Function to calculate Root Mean Squared Error (RMSE)
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Function to calculate Mean Bias Deviation (MBD)
def mean_bias_deviation(y_true, y_pred):
    return np.mean(y_true - y_pred)
    
# Track memory usage
def memory_usage():
    process = psutil.Process()
    x =round(process.memory_info().rss / (1024 ** 2) ,2) # Convert to MB
    return x
    

# Train Linear SVM
start_time_linear = time.time()
linear_svm.fit(X_train_scaled, y_train)
linear_time = time.time() - start_time_linear
linear_memory = memory_usage()

# Train Polynomial SVM with GridSearchCV
start_time_poly = time.time()
grid_search.fit(X_train_scaled, y_train)
poly_time = time.time() - start_time_poly
poly_memory = memory_usage()

# Output best parameters for Polynomial SVM
best_params = grid_search.best_params_
print(f"Best Parameters for Polynomial SVM: {best_params}")

# Train the best Polynomial SVM model
best_poly_svm = grid_search.best_estimator_

# Predict on test data using both models
y_pred_linear = linear_svm.predict(X_test_scaled)
y_pred_poly = best_poly_svm.predict(X_test_scaled)

# Evaluate the models (accuracy)
linear_mse = mean_squared_error(y_test, y_pred_linear)
linear_mae = mean_absolute_error(y_test, y_pred_linear)
linear_r2 = r2_score(y_test, y_pred_linear)

poly_mse = mean_squared_error(y_test, y_pred_poly)
poly_mae = mean_absolute_error(y_test, y_pred_poly)
poly_r2 = r2_score(y_test, y_pred_poly)


# Evaluate the models using additional metrics
linear_rmse = rmse(y_test, y_pred_linear)
linear_evs = explained_variance_score(y_test, y_pred_linear)
linear_mbd = mean_bias_deviation(y_test, y_pred_linear)
linear_medae = median_absolute_error(y_test, y_pred_linear)

poly_rmse = rmse(y_test, y_pred_poly)
poly_evs = explained_variance_score(y_test, y_pred_poly)
poly_mbd = mean_bias_deviation(y_test, y_pred_poly)
poly_medae = median_absolute_error(y_test, y_pred_poly)

# Output the additional metrics for Linear SVM


# Output the additional metrics for Polynomial SVM


# Output the results for Linear SVM
print("Linear SVM Results:")
print(f"Mean Squared Error: {linear_mse}")
print(f"Mean Absolute Error: {linear_mae}")
print(f"R2 Score: {linear_r2}")
print(f"Training Time: {linear_time} seconds")
print(f"Memory Usage: {linear_memory} MB")
print("\nAdditional Linear SVM Results:")
print(f"Root Mean Squared Error (RMSE): {linear_rmse}")
print(f"Explained Variance Score (EVS): {linear_evs}")
print(f"Mean Bias Deviation (MBD): {linear_mbd}")
print(f"Median Absolute Error (MedAE): {linear_medae}")

# Output the results for Polynomial SVM
print("\nPolynomial SVM Results:")
print(f"Mean Squared Error: {poly_mse}")
print(f"Mean Absolute Error: {poly_mae}")
print(f"R2 Score: {poly_r2}")
print(f"Training Time: {poly_time} seconds")
print(f"Memory Usage: {poly_memory} MB")
print("\nAdditional Polynomial SVM Results:")
print(f"Root Mean Squared Error (RMSE): {poly_rmse}")
print(f"Explained Variance Score (EVS): {poly_evs}")
print(f"Mean Bias Deviation (MBD): {poly_mbd}")
print(f"Median Absolute Error (MedAE): {poly_medae}")




Best Parameters for Polynomial SVM: {'C': 10, 'degree': 3, 'epsilon': 0.01}
Linear SVM Results:
Mean Squared Error: 364785818602.37164
Mean Absolute Error: 428945.22011687467
R2 Score: -0.3776129249610687
Training Time: 0.004000425338745117 seconds
Memory Usage: 225.22265625 MB

Additional Linear SVM Results:
Root Mean Squared Error (RMSE): 603975.014882546
Explained Variance Score (EVS): 0.010282180316335099
Mean Bias Deviation (MBD): 320488.55022175325
Median Absolute Error (MedAE): 194720.56726778147

Polynomial SVM Results:
Mean Squared Error: 367514867625.2616
Mean Absolute Error: 430671.04800859885
R2 Score: -0.38791917321707237
Training Time: 0.5316920280456543 seconds
Memory Usage: 225.23046875 MB

Additional Polynomial SVM Results:
Root Mean Squared Error (RMSE): 606230.0451357237
Explained Variance Score (EVS): 0.004047803492014612
Mean Bias Deviation (MBD): 322166.2992862817
Median Absolute Error (MedAE): 196649.6518449306
