# In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd

# Load and preprocess the data
df = pd.read_csv('postgres_data_15000_rv_mean.csv')

# Define the features and target
features = ['rv', 'ae']
X = df[features].astype(float).values
y = df['ae_cl']

# Keep only the rows where the target AND every feature are present.
# Filtering on the target alone would let a NaN feature reach the network,
# where it propagates through every layer and turns the loss into NaN.
mask = y.notna().to_numpy() & ~np.isnan(X).any(axis=1)

# Apply the mask to both X and y so they stay row-aligned
X = X[mask]
y = y[mask]

# Check the count of each class in y
print(np.unique(y, return_counts=True))

# X is already 2-D (one column per entry in `features`); only the target
# needs to be turned into a flat NumPy array.
y = y.values.reshape(-1)

# Convert the data to tensors with explicit dtypes: float32 inputs for the
# linear layers and int64 class indices, which is what nn.CrossEntropyLoss
# expects as targets.
# NOTE(review): assumes the labels in 'ae_cl' are integer-valued class ids
# (stored as float only because of the NaNs) — confirm against the data.
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.long)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=42)

# Define the neural network model
class NeuralNetwork(nn.Module):
    """Small fully connected classifier: Linear -> ReLU -> Linear.

    ``forward`` returns raw, unnormalised class scores (logits).
    ``nn.CrossEntropyLoss`` applies log-softmax internally and expects
    logits as input, so applying a softmax inside ``forward`` would
    normalise the scores twice. ``argmax`` over logits selects the same
    class as ``argmax`` over probabilities; call ``predict_proba`` when
    actual probabilities are needed.

    Args:
        in_features: Number of input features per sample.
        hidden_units: Number of units in the hidden layer.
        num_classes: Number of output classes.
    """

    def __init__(self, in_features=2, hidden_units=10, num_classes=3):
        super().__init__()
        # Input layer: in_features inputs -> hidden_units hidden units
        self.fc1 = nn.Linear(in_features, hidden_units)
        # Output layer: hidden_units hidden units -> num_classes scores
        self.fc2 = nn.Linear(hidden_units, num_classes)
        # Not applied in forward(); used by predict_proba() only
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return the logits for a batch ``x`` of shape (batch, in_features)."""
        hidden = torch.relu(self.fc1(x))  # ReLU activation on the hidden layer
        return self.fc2(hidden)  # Raw class scores, one column per class

    def predict_proba(self, x):
        """Return per-class probabilities (each row sums to 1) for ``x``."""
        return self.softmax(self.forward(x))

# Create an instance of the neural network model
model = NeuralNetwork()

# Define the loss function and optimizer.
# NOTE: nn.CrossEntropyLoss applies log-softmax itself, so it is meant to
# receive unnormalised scores (logits) from the model.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)

# Train the model (full batch: one parameter update per epoch)
num_epochs = 300
model.train()
for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = model(X_train)
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}")


# Evaluate the model (no gradients needed; switch to inference mode)
model.eval()
with torch.no_grad():
    train_outputs = model(X_train)
    train_predictions = torch.argmax(train_outputs, dim=1)
    train_accuracy = (train_predictions == y_train).float().mean()

    test_outputs = model(X_test)
    test_predictions = torch.argmax(test_outputs, dim=1)
    test_accuracy = (test_predictions == y_test).float().mean()


print("Training Accuracy: {:.2f}%".format(train_accuracy * 100))
print("Testing Accuracy: {:.2f}%".format(test_accuracy * 100))


# Create and print new df with actual and predicted value for test set.
# The tensors are converted to NumPy arrays first so the columns hold plain
# integers rather than depending on how pandas coerces torch tensors.
df = pd.DataFrame({
    'Actual': y_test.flatten().cpu().numpy(),
    'Predicted': test_predictions.flatten().cpu().numpy(),
})
print(df)

# Save the dataframe to a CSV file (written before 'Difference' is added,
# so the file contains only the Actual and Predicted columns)
df.to_csv('predictions_cl.csv', index=False)


# Calculate difference between actual and predicted values
df['Difference'] = (df['Actual'] - df['Predicted'])

# Calculate accuracy: a difference of zero means the prediction was correct
count = (df['Difference'] == 0.00).sum()
print("Correct predictions:", count)
print("Total predictions:", len(df))
print("Accuracy:", count / len(df))

# Check the count of each class in df['Predicted']
print(np.unique(df['Predicted'], return_counts=True))

# NB: the model ALWAYS predicts class 2, given the high frequency of its occurrence in the training set

