# Prepare data for the VFC method

In [None]:
# Mount Google Drive at /content/drive; the CSV inputs and the model
# checkpoints used below are read from / written to paths under that mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import os
from sklearn.metrics import f1_score,accuracy_score
import random

In [None]:
# Load the labelled training pairs and discard the 'id' column in one step.
df = pd.read_csv('/content/drive/MyDrive/DSAAA_2023/data/train.csv').drop(columns=['id'])
df.head()

Unnamed: 0,id1,id2,label
0,9202,9202,1
1,410411,460254,0
2,211858,312074,1
3,253901,504325,0
4,415071,63239,0


In [None]:
# Load the unlabelled test pairs and discard the 'id' column in one step.
dft = pd.read_csv('/content/drive/MyDrive/DSAAA_2023/data/test.csv').drop(columns=['id'])
dft.head()

Unnamed: 0,id1,id2
0,253077,253077
1,235274,65408
2,172772,677546
3,378856,175720
4,825250,35839


In [None]:
# Build a lookup from every raw id seen in either split (in order of first
# appearance) to a dense 0-based 'new_id'.
df_id_map = (
    pd.concat([df['id1'], df['id2'], dft['id1'], dft['id2']], axis=0)
    .drop_duplicates()
    .reset_index(drop=True)   # positions 0..n-1 become the new ids
    .reset_index()            # expose those positions as a column
    .rename(columns={'index': 'new_id', 0: 'id'})
)
df_id_map.head()

Unnamed: 0,new_id,id
0,0,9202
1,1,410411
2,2,211858
3,3,253901
4,4,415071


In [None]:
# Sanity check: the largest new id should equal the number of rows minus one.
print(df_id_map.shape, df_id_map['new_id'].max(), sep='\n')

(831454, 2)
831453


In [None]:
# Replace the raw ids in the training frame with their dense ids: look up id1,
# then id2, drop the raw/join-key columns and restore the original column names.
df = (
    df.merge(df_id_map, left_on='id1', right_on='id', how='left')
    .rename(columns={'new_id': 'new_id1'})
    .merge(df_id_map, left_on='id2', right_on='id', how='left')
    .rename(columns={'new_id': 'new_id2'})
    .drop(columns=['id1', 'id2', 'id_x', 'id_y'])
    .rename(columns={'new_id1': 'id1', 'new_id2': 'id2'})
)
df.head()

Unnamed: 0,label,id1,id2
0,1,0,0
1,0,1,310353
2,1,2,435879
3,0,3,274109
4,0,4,435880


In [None]:
# Replace the raw ids in the test frame with their dense ids: look up id1,
# then id2, drop the raw/join-key columns and restore the original column names.
dft = (
    dft.merge(df_id_map, left_on='id1', right_on='id', how='left')
    .rename(columns={'new_id': 'new_id1'})
    .merge(df_id_map, left_on='id2', right_on='id', how='left')
    .rename(columns={'new_id': 'new_id2'})
    .drop(columns=['id1', 'id2', 'id_x', 'id_y'])
    .rename(columns={'new_id1': 'id1', 'new_id2': 'id2'})
)
dft.head()

Unnamed: 0,id1,id2
0,460724,460724
1,46,440647
2,65,461529
3,83,440127
4,677245,496798


# Define the neural network model

In [None]:
#this model only use id
import torch.nn.init as init
class VFC_Model(nn.Module):
    """Binary classifier over a pair of integer ids sharing one embedding table.

    Each id is embedded with the same ``nn.Embedding`` and passed through its
    own sub-network (``seq1`` for the first id, ``seq2`` for the second). The
    two outputs are concatenated and fed to ``decoder``, which ends in a
    sigmoid, so the model returns one value in ``[0, 1]`` per pair.

    Args:
        input_size: number of rows in the embedding table; every id must lie
            in ``[0, input_size)``.
        hidden_size: embedding width. Each branch outputs ``hidden_size // 2``
            features, so the concatenation only matches the decoder's
            ``hidden_size`` input when ``hidden_size`` is even.
        dropout_rate: dropout probability applied in front of the first
            linear layer of each sub-network.
    """

    def __init__(self, input_size, hidden_size, dropout_rate):
        super().__init__()
        half = hidden_size // 2

        self.embedding = nn.Embedding(input_size, hidden_size)
        # Xavier initialization
        init.xavier_uniform_(self.embedding.weight)

        # Sub-module names ('1', 'relu1', 'linear1', ...) are part of the
        # state_dict keys, so they must stay exactly as they are for existing
        # checkpoints to keep loading.
        self.seq1 = self._build_head(hidden_size, half, half, dropout_rate, '1')
        self.seq2 = self._build_head(hidden_size, half, half, dropout_rate, '2')
        self.decoder = self._build_head(hidden_size, half, 1, dropout_rate, '3')
        self.decoder.add_module('sigmoid', nn.Sigmoid())

    @staticmethod
    def _build_head(in_features, mid_features, out_features, dropout_rate, suffix):
        """Return Dropout -> Linear (He-initialised) -> ReLU -> Linear as a Sequential."""
        head = nn.Sequential(nn.Dropout(dropout_rate), nn.Linear(in_features, mid_features))
        # He initialization for the layer that feeds the ReLU
        init.kaiming_uniform_(head[1].weight, mode='fan_in', nonlinearity='relu')
        head.add_module(f'relu{suffix}', nn.ReLU())
        head.add_module(f'linear{suffix}', nn.Linear(mid_features, out_features))
        return head

    def forward(self, x1, x2):
        """Score a batch of id pairs.

        Args:
            x1: 1-D integer tensor of first ids, shape ``(batch,)``.
            x2: 1-D integer tensor of second ids, shape ``(batch,)``.

        Returns:
            Tensor of shape ``(batch,)`` with sigmoid outputs.
        """
        x1 = self.seq1(self.embedding(x1))
        x2 = self.seq2(self.embedding(x2))
        x = torch.cat((x1, x2), dim=1)
        # Drop only the trailing feature axis. A bare squeeze() would also
        # remove the batch axis when the batch holds a single pair, producing a
        # 0-d tensor that no longer lines up with the labels.
        return self.decoder(x).squeeze(-1)


# Define the datasets and the train/validation split

In [None]:
class MyDataset(Dataset):
    """Map-style dataset yielding ``(id1, id2, label)`` for one row of ``df``.

    The three columns are extracted as NumPy arrays once, in ``__init__``, so
    ``__getitem__`` is a plain array lookup instead of building a pandas row
    Series three times per sample. Reading column-wise also keeps each
    column's own dtype: ``df.iloc[idx]`` upcasts the whole row to one common
    dtype, which silently turns integer ids into floats as soon as any column
    of the frame is float.

    Args:
        df: frame with ``id1``, ``id2`` and ``label`` columns.
    """

    def __init__(self, df):
        self.df = df  # kept so existing code that inspects the frame still works
        self._id1 = df['id1'].to_numpy()
        self._id2 = df['id2'].to_numpy()
        self._label = df['label'].to_numpy()

    def __len__(self):
        return len(self._label)

    def __getitem__(self, idx):
        """Return the ``(id1, id2, label)`` scalars stored at position ``idx``."""
        return self._id1[idx], self._id2[idx], self._label[idx]


def split_data(df):
    """Split ``df`` 80/20 into train and validation frames with a fixed seed.

    The number of input rows is printed, the two parts are written to
    ``./train.csv`` and ``./dev.csv`` (without the index), and both frames are
    returned as ``(train, val)``. Validation rows are those whose index label
    does not occur in the sampled training part.
    """
    train = df.sample(frac=0.8, random_state=np.random.RandomState(1234))
    print(df.shape[0])
    held_out = ~df.index.isin(train.index)
    val = df.loc[held_out]
    for part, path in ((train, './train.csv'), (val, './dev.csv')):
        part.to_csv(path, index=False)
    return train, val


In [None]:
class MyTestDataset(Dataset):
    """Map-style dataset yielding ``(id1, id2)`` for one row of ``df``.

    Both columns are extracted as NumPy arrays once, in ``__init__``, so
    ``__getitem__`` is a plain array lookup instead of building a pandas row
    Series twice per sample. Reading column-wise also keeps each column's own
    dtype: ``df.iloc[idx]`` upcasts the whole row to one common dtype, which
    silently turns integer ids into floats if the frame has any float column.

    Args:
        df: frame with ``id1`` and ``id2`` columns.
    """

    def __init__(self, df):
        self.df = df  # kept so existing code that inspects the frame still works
        self._id1 = df['id1'].to_numpy()
        self._id2 = df['id2'].to_numpy()

    def __len__(self):
        return len(self._id1)

    def __getitem__(self, idx):
        """Return the ``(id1, id2)`` scalars stored at position ``idx``."""
        return self._id1[idx], self._id2[idx]


# Define the task

In [None]:
class Task:
    """Train, checkpoint and run inference for ``VFC_Model`` on id pairs.

    Args:
        df: labelled frame with ``id1``, ``id2`` and ``label`` columns; it is
            split into train/validation parts by ``split_data``.
        dft: unlabelled frame with ``id1`` and ``id2`` columns, scored by
            ``evaluate``.
    """

    def __init__(self,df,dft):
        self.num_epochs = 100
        self.patience = 5
        self.learning_rate = 0.001
        self.train_batch=1024
        self.valid_batch=1024
        self.save_path = '/content/drive/MyDrive/save_dsaa_only_id'
        self.best_metric= 'f1'
        # The embedding table must cover every id that can be looked up, in
        # training or at inference time. The row count is kept as a lower
        # bound so checkpoints saved with the previous row-count sizing still
        # load; the id-based term prevents out-of-range lookups when there are
        # more distinct ids than training rows.
        highest_id = int(max(df[['id1', 'id2']].to_numpy().max(),
                             dft[['id1', 'id2']].to_numpy().max()))
        self.input_size = max(df.shape[0], highest_id + 1)
        self.train,self.val=split_data(df)
        self.test=dft
        self.hidden_dim=256
        self.drop=0.2
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.base_model=VFC_Model(self.input_size,self.hidden_dim,self.drop).to(self.device)
        self.optimizer = optim.Adam(self.base_model.parameters(), lr=self.learning_rate)
        self.criterion = nn.BCELoss()

    def _checkpoint_path(self, filename):
        """Return the full path of ``filename`` inside ``self.save_path``."""
        return os.path.join(self.save_path, filename)

    def _save_checkpoint(self, filename, epoch, score):
        """Write model/optimizer state plus ``epoch`` and ``score`` to ``filename``."""
        torch.save({
            'epoch': epoch,
            'model_state_dict': self.base_model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'score': score}, self._checkpoint_path(filename))

    def _validate(self, loader):
        """Return ``(mean batch loss, accuracy, F1)`` over ``loader``.

        The model is switched to eval mode so dropout is disabled. Accuracy
        and F1 are computed once over all predictions, not averaged per
        batch, because a mean of per-batch F1 scores is not the F1 of the set.
        """
        self.base_model.eval()
        loss_sum = 0.
        pred_chunks, label_chunks = [], []
        with torch.no_grad():
            for id1, id2, labels in loader:
                id1, id2, labels = id1.to(self.device), id2.to(self.device), labels.to(self.device)
                # reshape(-1) keeps a 1-D output even if the model squeezes a
                # batch of one down to a scalar.
                output = self.base_model(id1, id2).reshape(-1)
                loss_sum += self.criterion(output, labels.float()).item()
                pred_chunks.append(torch.round(output).long().cpu())
                label_chunks.append(labels.long().cpu())
        if not pred_chunks:
            return 0., 0., 0.
        preds = torch.cat(pred_chunks).numpy()
        truth = torch.cat(label_chunks).numpy()
        return loss_sum / len(loader), accuracy_score(truth, preds), f1_score(truth, preds)

    def training(self):
        """Train with early stopping, resuming from ``last_model.pth`` if present.

        After every epoch the model is validated and ``last_model.pth`` is
        overwritten; ``best_model.pth`` is overwritten whenever the selected
        metric (``self.best_metric``) beats the best score so far. Training
        stops once the metric has not improved for ``self.patience``
        consecutive epochs.

        Raises:
            ValueError: if ``self.best_metric`` is neither ``'accuracy'`` nor ``'f1'``.
        """
        if self.best_metric not in ('accuracy', 'f1'):
            raise ValueError(f"unsupported best_metric: {self.best_metric!r}")

        os.makedirs(self.save_path, exist_ok=True)

        train = DataLoader(MyDataset(self.train), batch_size=self.train_batch, shuffle=True)
        # Validation metrics do not depend on sample order, so no shuffling.
        valid = DataLoader(MyDataset(self.val), batch_size=self.valid_batch, shuffle=False)

        last_path = self._checkpoint_path('last_model.pth')
        if os.path.exists(last_path):
            # map_location lets a checkpoint written on GPU resume on CPU and vice versa.
            checkpoint = torch.load(last_path, map_location=self.device)
            self.base_model.load_state_dict(checkpoint['model_state_dict'])
            self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            print('loaded the last saved model!!!')
            initial_epoch = checkpoint['epoch'] + 1
            print(f"continue training from epoch {initial_epoch}")
        else:
            initial_epoch = 0
            print("first time training!!!")

        best_path = self._checkpoint_path('best_model.pth')
        if os.path.exists(best_path):
            best_score = torch.load(best_path, map_location=self.device)['score']
        else:
            best_score = 0.

        stale_epochs = 0  # consecutive epochs without improvement
        for epoch in range(initial_epoch, self.num_epochs + initial_epoch):
            # Re-enable dropout every epoch: validation (and evaluate()) leave
            # the model in eval mode.
            self.base_model.train()
            loss_sum = 0.
            for id1, id2, labels in train:
                id1, id2, labels = id1.to(self.device), id2.to(self.device), labels.to(self.device)
                self.optimizer.zero_grad()
                output = self.base_model(id1, id2).reshape(-1)
                loss = self.criterion(output, labels.float())
                loss.backward()
                self.optimizer.step()
                # .item() accumulates a Python float instead of a device tensor.
                loss_sum += loss.item()
            train_loss = loss_sum / max(len(train), 1)
            print(f"epoch {epoch + 1}/{self.num_epochs + initial_epoch}")
            print(f"train loss: {train_loss:.10f}")

            valid_loss, valid_acc, valid_f1 = self._validate(valid)
            print(f"valid loss: {valid_loss:.10f} valid acc: {valid_acc:.10f} valid f1: {valid_f1:.10f}")

            # Stored as a plain float so the checkpoint holds no NumPy scalar.
            score = float(valid_acc if self.best_metric == 'accuracy' else valid_f1)

            # save the last model
            self._save_checkpoint('last_model.pth', epoch, score)

            # save the best model
            if score > best_score:
                best_score = score
                stale_epochs = 0
                self._save_checkpoint('best_model.pth', epoch, score)
                print(f"saved the best model with {self.best_metric} of {score:.10f}")
            elif epoch > 0:
                stale_epochs += 1
            else:
                stale_epochs = 0

            # early stopping
            if stale_epochs >= self.patience:
                print(f"early stopping after epoch {epoch + 1}")
                break

    def evaluate(self):
        """Predict labels for ``self.test`` with the best checkpoint.

        Loads ``best_model.pth``, rounds the model outputs to 0/1, attaches
        them as a ``label`` column to the original test CSV (minus ``id1`` and
        ``id2``) and writes ``./submission1.csv``. Prints a message and
        returns without writing anything when no best checkpoint exists.
        """
        best_path = self._checkpoint_path('best_model.pth')
        if not os.path.exists(best_path):
            print('model has not been trained')
            return

        test_data = DataLoader(MyTestDataset(self.test), batch_size=1024, shuffle=False)
        checkpoint = torch.load(best_path, map_location=self.device)
        self.base_model.load_state_dict(checkpoint['model_state_dict'])
        self.base_model.eval()
        pred_labels = []
        with torch.no_grad():
            for id1, id2 in test_data:
                id1, id2 = id1.to(self.device), id2.to(self.device)
                # reshape(-1) keeps the predictions iterable for a final batch of one.
                output = self.base_model(id1, id2).reshape(-1)
                pred_labels.extend(torch.round(output).cpu().numpy())

        dfss = pd.read_csv('/content/drive/MyDrive/DSAAA_2023/data/test.csv')
        dfss.drop(['id1', 'id2'], axis=1, inplace=True)
        dfss['label'] = pred_labels
        dfss['label'] = dfss['label'].astype(int)
        dfss.to_csv('./submission1.csv', index=False)
        print("task done!!!")


# Training

In [None]:
# Build the task from the remapped train/test frames (this also splits df into
# train/validation parts and constructs the model), then run the training loop.
task=Task(df,dft)
task.training()

# Predict

In [None]:
# Score the test pairs with the best saved checkpoint and write ./submission1.csv.
task.evaluate()

task done!!!


# XGBoost model

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score, classification_report, f1_score,confusion_matrix
import time

In [None]:
# Class balance of the training labels.
df['label'].value_counts()

0    512389
1    435843
Name: label, dtype: int64

In [None]:
# Fit an XGBoost classifier on the two remapped id columns and report
# validation metrics and timings.
# NOTE(review): recent XGBoost releases deprecate tree_method='gpu_hist' in
# favour of tree_method='hist' with device='cuda' - confirm against the
# installed version before changing it.
xgb = XGBClassifier(n_estimators=100,tree_method='gpu_hist')
training_start = time.perf_counter()

# NOTE(review): test_size=0.96 leaves only 4% of the rows for fitting and
# validates on the other 96%; confirm this is intended rather than 0.04.
X_train, X_val, y_train, y_val = train_test_split(df.drop('label',axis=1), df['label'], test_size=0.96, random_state=1)

# Oversample the minority class on the training split only. Previously the
# resampled data was computed from the full frame and then never used;
# resampling before the split would also duplicate rows across train and
# validation and inflate the validation scores.
oversampler = RandomOverSampler(random_state=1)
X_oversampled, y_oversampled = oversampler.fit_resample(X_train, y_train)
X_train, y_train = X_oversampled, y_oversampled

# The scaler is fitted on training data only and reused for validation; the
# same fitted scaler must be applied to any data passed to xgb.predict later.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

xgb.fit(X_train, y_train)
training_end = time.perf_counter()
prediction_start = time.perf_counter()


preds = xgb.predict(X_val)
prediction_end = time.perf_counter()
acc_xgb = accuracy_score(y_val, preds)
f1_xgb = f1_score(y_val, preds)

xgb_train_time = training_end - training_start
xgb_prediction_time = prediction_end - prediction_start

print("XGBoost's prediction accuracy is: ", acc_xgb)
print("XGBoost's F1 score is: ", f1_xgb)
print("XGBoost's confusion matrix is:\n", confusion_matrix(y_val,preds))
print(classification_report(y_val, preds))
print("Time consumed for training: %4.3f" % xgb_train_time)
print("Time consumed for prediction: %6.5f seconds" % xgb_prediction_time)

XGBoost's prediction accuracy is:  0.9966725365070751
XGBoost's F1 score is:  0.9963819054660821
XGBoost's confusion matrix is:
 [[490198   1768]
 [  1261 417076]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    491966
           1       1.00      1.00      1.00    418337

    accuracy                           1.00    910303
   macro avg       1.00      1.00      1.00    910303
weighted avg       1.00      1.00      1.00    910303

Time consumed for training: 1.647
Time consumed for prediction: 1.26528 seconds


In [None]:
# The classifier was fitted on StandardScaler output, so the test ids must go
# through the same fitted scaler first; predicting on the raw ids would feed
# the trees values on a completely different scale from the one they split on.
# Columns are selected explicitly so their order matches the training features.
preds = xgb.predict(scaler.transform(dft[['id1', 'id2']]))
dfss = pd.read_csv('/content/drive/MyDrive/DSAAA_2023/data/test.csv')
dfss.drop(['id1', 'id2'], axis=1, inplace=True)
dfss['label'] = preds
dfss['label'] = dfss['label'].astype(int)
# NOTE(review): this is the same file name Task.evaluate() writes, so the
# neural-network submission is overwritten - confirm that is intended.
dfss.to_csv('./submission1.csv', index=False)
print("task done!!!")