#### Import Libraries

Import `torch`, `pandas`, and any other libraries you might need.

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

#### Import the Dataset

Import the `student_performances_encoded.csv` dataset.

In [None]:
# Load the encoded student dataset and print a summary of its columns.
df = pd.read_csv('student_performances_encoded.csv')
df.info()

# Show how many rows fall into each Total_Salary class.
print(df['Total_Salary'].value_counts())

#### Explore the Dataset

Explore the different features in the dataset. If needed, refer to `student_performances_raw.csv` to see the unencoded fields.

Are there any classification problems you would like to try out? Pick a target label to move forward with!

#### Create Tensors

Create a tensor with input features for your model, and another tensor with just the target labels. Remember that the target labels are expected to be integers starting at `0`.

In [None]:
# Target: the Total_Salary label, collapsed from five levels (0-4) into three
# because not every original level shows up in the test split of this small
# dataset.
#   Salary_Outcome: 0 = Below Average, 1 = Average, 2 = Above Average
# To predict the raw five-level Total_Salary instead, build `y` from that
# column and use 5 outputs in the model's final layer.
salary_outcome_by_level = {0: 0, 1: 1, 2: 1, 3: 2, 4: 2}
df['Salary_Outcome'] = df['Total_Salary'].replace(salary_outcome_by_level)

# Everything except the identifier and the two label columns is a feature.
cols_not_features = ['Student_ID', 'Total_Salary', 'Salary_Outcome']
is_feature = ~df.columns.isin(cols_not_features)
train_features = df.columns[is_feature].tolist()

# Float inputs for the network; integer class indices for the targets.
X = torch.tensor(df[train_features].to_numpy(), dtype=torch.float)
y = torch.tensor(df['Salary_Outcome'].to_numpy(), dtype=torch.long)

#### Train-test-split

Use scikit-learn to create a training dataset and a testing dataset.

In [None]:
# Use 80% of the rows for training and the remainder for testing.
# The split is stratified on the labels so each class keeps roughly the same
# share in both subsets; with imbalanced classes an unstratified split can
# leave a rare class out of the test set entirely.
# NOTE: stratification requires at least two samples of every class in `y`.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=49, stratify=y
)

#### Build and Train the Model

Create a model using `Sequential`. Select appropriate loss and optimizer functions. Create and run a training loop.

In [None]:
# Seed PyTorch so weight initialisation (and therefore training) is repeatable.
torch.manual_seed(49)

# Layer widths. The input width is read from the training tensor so the model
# keeps working if the set of feature columns changes. The output width must
# equal the number of target classes: 3 for Salary_Outcome (use 5 when
# predicting the raw Total_Salary label).
num_features = X_train.shape[1]
num_classes = 3

model = nn.Sequential(
    nn.Linear(num_features, 224),
    nn.ReLU(),
    nn.Linear(224, 112),
    nn.ReLU(),
    nn.Linear(112, num_classes)
)

# CrossEntropyLoss takes the raw logits produced by the last Linear layer
# together with integer class indices.
loss = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)

num_epochs = 5000

# Full-batch gradient descent: every epoch is one forward/backward pass over
# the whole training set.
for i in range(num_epochs):
    preds = model(X_train)
    CEloss = loss(preds, y_train)
    CEloss.backward()
    optimizer.step()
    # Clear gradients so they do not accumulate into the next epoch.
    optimizer.zero_grad()

    # Report progress every 100 epochs.
    if (i + 1) % 100 == 0:
        predicted_labels = torch.argmax(preds, dim=1)
        accuracy = accuracy_score(y_train, predicted_labels)
        # .item() extracts the Python float; formatting the tensor directly
        # would print `tensor(..., grad_fn=...)` in the log line.
        print(f'Epoch {i+ 1}/{num_epochs}, CELoss: {CEloss.item():.4f}, accuracy score: {accuracy:.4f}')

#### Evaluate the model

Test the trained model on the testing dataset. How did you do? Feel free to go back and iterate on your decisions to try to improve your model performance!

In [None]:
# Switch to inference mode and disable gradient tracking for evaluation.
model.eval()
with torch.no_grad():
    predictions = model(X_test)
    # The predicted class is the index of the largest logit in each row.
    predicted_labels = torch.argmax(predictions, dim=1)
    # scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
    # symmetric, but swapping the arguments to classification_report would
    # exchange precision with recall and count support over the predictions
    # instead of the true labels.
    accuracy = accuracy_score(y_test, predicted_labels)
    report = classification_report(y_test, predicted_labels)

print(f"Accuracy: {accuracy}")
print(report)

In [None]:
# Show how many rows fall into each of the Salary_Outcome classes.
print(df['Salary_Outcome'].value_counts())
# Conclusion: the classes were highly imbalanced and there was not enough data. Moving on.