# The template for Assignment 6

In [5]:
from re import X
import os
import pandas as pd

def read_data_from_csv(path):
    """Load datasets from CSV files.
    Args:
        path (str): Path to the CSV file.
    Returns:
        X (np.ndarray): Features of samples.
        y (np.ndarray): Labels of samples, only provided in the public datasets.
    """
    assert os.path.exists(path), f'File not found: {path}!'
    assert os.path.splitext(path)[
        -1] == '.csv', f'Unsupported file type {os.path.splitext(path)[-1]}!'

    data = pd.read_csv(path)
    column_list = data.columns.values.tolist()

    if 'Label' in column_list:
        # for the public dataset, label column is provided.
        column_list.remove('Label')
        X = data[column_list].values
        y = data['Label'].astype('int').values
        return X, y
    else:
        # for the private dataset, label column is not provided.
        X = data[column_list].values
        return X

X_public, y_public = read_data_from_csv('assignment_6_public.csv')
print('Shape of X_public:', X_public.shape)  # n_sample, m_feature (20000, 23)
print('Shape of y_public:', y_public.shape)  # n_sample (20000,)

'''
CODE HERE!
'''
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

X=pd.DataFrame(X_public)
y=pd.DataFrame(y_public)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X.values, dtype=torch.float32)
        self.y = torch.tensor(y.values, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = CustomDataset(X_train, y_train)
test_dataset = CustomDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

class NeuralNetwork(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        return x

input_size = len(X.columns)
hidden_size = 64
output_size = 1

model = NeuralNetwork(input_size, hidden_size, output_size)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 50

for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels.unsqueeze(1))
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

with torch.no_grad():
    total_loss = 0
    for inputs, labels in test_loader:
        outputs = model(inputs)
        loss = criterion(outputs, labels.unsqueeze(1))
        total_loss += loss.item() * len(inputs)

    mean_loss = total_loss / len(test_dataset)
    print(f'Test Loss: {mean_loss:.4f}')

#cal Accuracy on tarin dataset
with torch.no_grad():
    train_outputs = model(torch.tensor(X_train.values, dtype=torch.float32))
    train_predictions = (train_outputs.squeeze(1) > 0.5).long()
    train_accuracy = accuracy_score(y_train.values, train_predictions)

    test_outputs = model(torch.tensor(X_test.values, dtype=torch.float32))
    test_predictions = (test_outputs.squeeze(1) > 0.5).long()
    test_accuracy = accuracy_score(y_test.values, test_predictions)

print(f'Training Accuracy: {train_accuracy:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')

X_private = read_data_from_csv('assignment_6_private.csv')
print('Shape of X_private:', X_private.shape)  # k_sample, m_feature (5000, 23)

import numpy as np

# remove and make your own predictions.
preds = np.full(len(X_private), -1,
                dtype=int)
'''
CODE HERE!
e.g.,
preds = np.full(len(X_private), -1, dtype=int)
'''

private_dataset = CustomDataset(pd.DataFrame(X_private), pd.DataFrame(np.zeros((len(X_private), 1))))

private_loader = DataLoader(private_dataset, batch_size=32, shuffle=False)

preds = []
with torch.no_grad():
    for inputs, _ in private_loader:
        outputs = model(inputs)
        preds.extend((outputs.squeeze(1).cpu().numpy() > 0.5).astype(int))

submission = pd.DataFrame({'Label': preds})
submission.to_csv('assignment_6.csv', index=True, index_label='Id')

Shape of X_public: (20000, 23)
Shape of y_public: (20000,)


  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [1/50], Loss: 19949.4766
Epoch [2/50], Loss: 19851.5000
Epoch [3/50], Loss: 225766.3281
Epoch [4/50], Loss: 1459.2627
Epoch [5/50], Loss: 4899.0742
Epoch [6/50], Loss: 1067.6129
Epoch [7/50], Loss: 4534.8828
Epoch [8/50], Loss: 2799.4709
Epoch [9/50], Loss: 382.9598
Epoch [10/50], Loss: 2740.9229
Epoch [11/50], Loss: 266.8012
Epoch [12/50], Loss: 922.0833
Epoch [13/50], Loss: 861.3950
Epoch [14/50], Loss: 54777.7578
Epoch [15/50], Loss: 8490.9648
Epoch [16/50], Loss: 211.1637
Epoch [17/50], Loss: 253.2463
Epoch [18/50], Loss: 74.4108
Epoch [19/50], Loss: 100.2764
Epoch [20/50], Loss: 136.3648
Epoch [21/50], Loss: 53.6631
Epoch [22/50], Loss: 17.1428
Epoch [23/50], Loss: 198.9242
Epoch [24/50], Loss: 699.1090
Epoch [25/50], Loss: 38.4783
Epoch [26/50], Loss: 12.6618
Epoch [27/50], Loss: 188.2642
Epoch [28/50], Loss: 473.3173
Epoch [29/50], Loss: 23.6366
Epoch [30/50], Loss: 8.1553
Epoch [31/50], Loss: 29.1762
Epoch [32/50], Loss: 3.4332
Epoch [33/50], Loss: 1.7914
Epoch [34/50], L

In [None]:
pd.DataFrame(X_public)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,20000,1,2,1,24,1,4,3,2,2,...,17473,16900,17690,18255,1000,0,0,1376,1000,649
1,130000,2,2,2,24,2,2,-1,0,0,...,41,1062,-201,-201,0,41,1021,0,0,0
2,120000,1,1,2,27,0,0,0,0,0,...,38083,36728,36952,35475,2415,1816,1381,1264,1228,1217
3,300000,1,3,2,51,-1,-1,-1,-1,-1,...,6246,3872,11875,5290,19854,6279,3883,11883,5305,9998
4,130000,2,3,1,43,0,0,0,0,0,...,129385,96775,98071,97743,6739,5540,4403,3814,3562,4021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,170000,2,2,2,35,0,0,0,0,0,...,118111,92592,96114,99628,5000,5000,10000,5000,5000,5000
19996,70000,2,1,2,31,2,2,7,7,7,...,2400,2400,2400,2400,0,0,0,0,0,0
19997,280000,2,1,2,52,-1,-1,-1,-1,-2,...,930,0,0,0,0,930,0,0,0,0
19998,20000,1,1,2,23,1,2,3,2,0,...,19753,19160,19661,19816,2300,0,0,788,558,198


In [None]:
pd.DataFrame(X_private)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,400000,1,2,1,34,0,0,0,0,0,...,398634,210628,203102,85374,28000,14547,8078,6500,5000,3012
1,390000,2,2,2,36,0,0,0,0,0,...,91761,76756,65520,52904,3669,3277,3220,2236,2007,1600
2,180000,2,2,1,43,-1,-1,-1,-1,-1,...,316,396,396,3306,316,316,396,396,3306,3306
3,70000,1,2,2,25,0,0,0,0,0,...,25059,25559,26094,26612,1400,1415,915,947,952,986
4,360000,1,2,2,31,0,0,0,0,0,...,265549,244053,239088,193401,14128,11208,6554,7248,7122,6144
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,90000,2,2,1,28,0,0,0,0,0,...,60630,58919,47850,48034,2852,15678,7763,1686,1735,1518
4996,50000,1,3,1,45,1,-2,-2,-2,-2,...,0,0,0,0,0,0,0,0,0,0
4997,50000,2,2,2,23,0,0,0,0,0,...,20669,4510,2893,2000,1750,1150,1000,300,400,800
4998,150000,2,2,3,48,0,0,0,0,0,...,6158,4626,0,0,2000,1523,2000,0,0,0
