<a href="https://colab.research.google.com/github/noahcho124/AI-deepfake-audio-detction/blob/main/Loan_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import f1_score, accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from imblearn.over_sampling import SMOTE, ADASYN

def feature_tuning(train):
    """Engineer model features from a raw loan DataFrame.

    Steps, in order:
      1. Drop the loan-purpose ('대출목적') and employment-length ('근로기간')
         columns.
      2. Add 'DTI' = annual income / (loan amount / number of months) for
         rows whose loan term is ' 36 months' or ' 60 months'; rows with any
         other term are left without a value.
      3. Add '이자율' = (total interest repaid + 10) / (total principal
         repaid + 10), then overwrite it with 0.4 where the total overdue
         amount is below 30000.
      4. Replace annual income with log(income + 0.1).
      5. Turn the total overdue amount into a 0.0/1.0 flag (1.0 when > 0).
      6. One-hot encode loan term and home-ownership status; ownership
         values seen fewer than 100 times get no dummy column.
      7. Index the result by 'ID'.

    Args:
        train: DataFrame holding the raw columns named above plus 'ID'.
            It is not modified.

    Returns:
        A new DataFrame of engineered features indexed by 'ID'.
    """
    # `drop` without `inplace` returns a new frame, so every assignment below
    # touches the copy and the caller's DataFrame stays intact.
    train = train.drop(columns=['대출목적', '근로기간'])

    # Annual income relative to the per-month share of the loan amount.
    for term_label, months in ((' 36 months', 36), (' 60 months', 60)):
        has_term = train['대출기간'] == term_label
        train.loc[has_term, 'DTI'] = train['연간소득'] / (train['대출금액'] / months)

    # The +10 offsets keep the ratio finite when no principal has been repaid.
    train['이자율'] = (train['총상환이자'] + 10) / (train['총상환원금'] + 10)
    # NOTE(review): this overrides the ratio for every row whose overdue
    # amount is below 30000 — confirm the column and threshold are intended.
    train.loc[train['총연체금액'] < 30000, '이자율'] = 0.4

    train['연간소득'] = np.log(train['연간소득'] + 0.1)

    # Build the flag in one step with an explicit dtype instead of writing
    # True/False into a numeric column, whose result dtype depends on the
    # pandas version. Non-positive and missing amounts become 0.0.
    train['총연체금액'] = (train['총연체금액'] > 0).astype(float)

    # Blank out rare home-ownership values so get_dummies (which skips
    # missing values) creates no column for them. Assigning the column back
    # avoids a chained in-place `replace(..., None)`, which may not update the
    # frame and whose meaning for value=None has differed across pandas
    # versions.
    ownership_counts = train['주택소유상태'].value_counts()
    rare_values = ownership_counts[ownership_counts < 100].index
    is_rare = train['주택소유상태'].isin(rare_values)
    train['주택소유상태'] = train['주택소유상태'].where(~is_rare)

    # The dummy-variable setup may still change (e.g. drop_first=True).
    train = pd.get_dummies(train, columns=['대출기간', '주택소유상태'])
    return train.set_index('ID')

# Load the training data and split the loan grade ('대출등급') off as the label.
train = pd.read_csv('/content/drive/MyDrive/train.csv')
target = train['대출등급']
train.drop('대출등급', axis = 1, inplace=True)
train = feature_tuning(train)

imb = ADASYN()
scaler = RobustScaler()
label_encoder = LabelEncoder()
# The dense-output flag is `sparse_output` on current scikit-learn and
# `sparse` on older releases; try the current spelling first.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)

# Grades -> integer codes -> one-hot rows (one column per grade).
target_encoded = label_encoder.fit_transform(target)
target_encoded = onehot_encoder.fit_transform(target_encoded.reshape(-1,1))

# Split the data into training and validation sets
train_data, valid_data, train_target, valid_target = train_test_split(train, target_encoded, test_size=0.2, stratify=target_encoded)

# Fit the scaler on the training split only (no validation leakage) and apply
# it to both splits. The inference cell calls `scaler.transform`, so the
# scaler has to be fitted here and the model has to train on scaled features.
# The outputs are NumPy arrays, which is also what torch.tensor needs below.
train_data = scaler.fit_transform(train_data)
valid_data = scaler.transform(valid_data)

# Oversample the minority grades in the training split only. The recorded run
# of this cell failed inside ADASYN with "ValueError: No samples will be
# generated with the provided ratio settings.", so fall back to SMOTE when
# ADASYN refuses the data.
try:
    train_data, train_target = imb.fit_resample(train_data, train_target)
except ValueError as err:
    print(f'ADASYN could not resample the data ({err}); falling back to SMOTE.')
    imb = SMOTE()
    train_data, train_target = imb.fit_resample(train_data, train_target)

# Convert NumPy arrays to PyTorch tensors (targets stay one-hot, stored as long)
train_data_tensor = torch.tensor(train_data, dtype=torch.float32)
train_target_tensor = torch.tensor(train_target, dtype=torch.long)
valid_data_tensor = torch.tensor(valid_data, dtype=torch.float32)
valid_target_tensor = torch.tensor(valid_target, dtype=torch.long)

# Create DataLoader for training and validation
train_dataset = TensorDataset(train_data_tensor, train_target_tensor)
valid_dataset = TensorDataset(valid_data_tensor, valid_target_tensor)

train_loader = DataLoader(train_dataset, batch_size=200, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=200, shuffle=False)

# Define the neural network model
class MLP(nn.Module):
    """Fully connected classifier: input -> 20 -> 16 -> 10 -> output logits.

    ReLU is applied after each hidden layer; the output layer returns raw
    logits (no softmax).

    Args:
        input_size: Number of input features per sample.
        output_size: Number of output logits. When omitted, the module-level
            ``num_classes`` variable is used, which keeps existing
            ``MLP(input_size)`` calls working.
    """

    def __init__(self, input_size, output_size=None):
        super().__init__()
        if output_size is None:
            # Backward-compatible default: read the class count from the
            # notebook-level global, as the original implementation did.
            output_size = num_classes

        self.fc1 = nn.Linear(input_size, 20)
        self.fc2 = nn.Linear(20, 16)
        self.fc3 = nn.Linear(16, 10)
        self.output_layer = nn.Linear(10, output_size)

        self.activation = nn.ReLU()

    def forward(self, x):
        """Return the logits for a batch ``x`` of shape (batch, input_size)."""
        for hidden in (self.fc1, self.fc2, self.fc3):
            x = self.activation(hidden(x))
        return self.output_layer(x)

# Instantiate the model
# Take the class count from the fitted label encoder instead of `target`, so
# the value stays correct when this cell is re-run.
num_classes = len(label_encoder.classes_)
input_size = train_data.shape[1]
model = MLP(input_size)

# Move the model to GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.02)

# The validation tensors never change, so move them to the device once and
# precompute the true class index of each one-hot target row.
valid_data_tensor = valid_data_tensor.to(device)
valid_target_tensor = valid_target_tensor.to(device)
valid_labels = valid_target_tensor.argmax(dim=1).cpu().numpy()

# Training loop
num_epochs = 1000
for epoch in range(num_epochs):
    model.train()
    # Batch variables get their own names so the notebook-level `target`
    # (the label Series) is not overwritten.
    for batch_data, batch_target in train_loader:
        batch_data = batch_data.to(device)
        # Targets are stored as one-hot long tensors; CrossEntropyLoss takes
        # them as float class probabilities.
        batch_target = batch_target.to(device).float()
        optimizer.zero_grad()
        outputs = model(batch_data)
        loss = criterion(outputs, batch_target)
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    with torch.no_grad():
        outputs = model(valid_data_tensor)
        predicted_cpu = outputs.argmax(dim=1).cpu().numpy()
        accuracy = accuracy_score(valid_labels, predicted_cpu)
        print(f'Epoch {epoch+1}/{num_epochs}, Validation Accuracy: {accuracy:.4f}')

# Evaluate on the (resampled) training set — this is not held-out data.
model.eval()
with torch.no_grad():
    train_data_tensor = train_data_tensor.to(device)
    train_target_tensor = train_target_tensor.to(device)
    outputs = model(train_data_tensor)
    predicted_cpu = outputs.argmax(dim=1).cpu().numpy()
    train_labels = train_target_tensor.argmax(dim=1).cpu().numpy()
    accuracy = accuracy_score(train_labels, predicted_cpu)
    # Keep `predicted` as a one-hot matrix: the classification-report cell
    # compares it against the one-hot `train_target_tensor`.
    predicted = onehot_encoder.transform(predicted_cpu.reshape(-1,1))
    print(f'Training Accuracy: {accuracy:.4f}')



ValueError: No samples will be generated with the provided ratio settings.

In [None]:
from sklearn.metrics import classification_report

# `train_target_tensor` and `predicted` are one-hot matrices for the resampled
# training set. Collapse each row to its class index so the report is a plain
# multiclass report (with an accuracy row) rather than a multilabel one.
train_true_labels = train_target_tensor.cpu().numpy().argmax(axis=1)
train_pred_labels = np.asarray(predicted).argmax(axis=1)
# These are training-set results, so the header must not call them test results.
print('Results on the training set:')
print(classification_report(train_true_labels, train_pred_labels))

Results on the test set:
              precision    recall  f1-score   support

           0       0.87      0.93      0.90     23054
           1       0.88      0.79      0.83     23054
           2       0.81      0.87      0.84     23054
           3       0.85      0.84      0.84     23054
           4       0.82      0.84      0.83     23054
           5       0.84      0.89      0.86     23054
           6       0.97      0.87      0.92     23054

   micro avg       0.86      0.86      0.86    161378
   macro avg       0.86      0.86      0.86    161378
weighted avg       0.86      0.86      0.86    161378
 samples avg       0.86      0.86      0.86    161378



In [None]:
# Predict loan grades for the competition test file.
test = pd.read_csv('/content/drive/MyDrive/test.csv')
test = feature_tuning(test)
# The dummy columns produced for the test file can differ from the training
# ones (a category may be rare or absent here), so force the training column
# set and order; dummies missing from the test file are filled with 0.
test = test.reindex(columns=train.columns, fill_value=0)
# `scaler` must already have been fitted on the training features.
test = scaler.transform(test)
test = torch.tensor(test, dtype=torch.float32)
# Use the device chosen at training time instead of hard-coding 'cuda', so the
# cell also runs on CPU-only machines.
test = test.to(device)
model.to(device)
model.eval()
with torch.no_grad():  # inference only: no autograd graph needed
    prediction = model(test)
predicted_cpu = prediction.argmax(dim=1).cpu().numpy()
# Map class indices back to grade letters with the encoder fitted on the
# training labels, rather than a hand-written index -> letter table.
prediction = label_encoder.inverse_transform(predicted_cpu)

In [None]:
# Fill the submission template's loan-grade column ('대출등급') with the
# predictions and write the result to disk without the row index.
submission_template_path = '/content/drive/MyDrive/sample_submission.csv'
submission_output_path = 'submission_smote.csv'

sample_submission = pd.read_csv(submission_template_path)
sample_submission['대출등급'] = prediction
sample_submission.to_csv(submission_output_path, index=False)

In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from imblearn.over_sampling import SMOTE

def feature_tuning(train):
    """Engineer model features from a raw loan DataFrame.

    Steps, in order:
      1. Drop the loan-purpose ('대출목적') and employment-length ('근로기간')
         columns.
      2. Add 'DTI' = annual income / (loan amount / number of months) for
         rows whose loan term is ' 36 months' or ' 60 months'; rows with any
         other term are left without a value.
      3. Add '이자율' = (total interest repaid + 10) / (total principal
         repaid + 10), then overwrite it with 0.4 where the total overdue
         amount is below 30000.
      4. Replace annual income with log(income + 0.1).
      5. Turn the total overdue amount into a 0.0/1.0 flag (1.0 when > 0).
      6. One-hot encode loan term and home-ownership status; ownership
         values seen fewer than 100 times get no dummy column.
      7. Index the result by 'ID'.

    Args:
        train: DataFrame holding the raw columns named above plus 'ID'.
            It is not modified.

    Returns:
        A new DataFrame of engineered features indexed by 'ID'.
    """
    # `drop` without `inplace` returns a new frame, so every assignment below
    # touches the copy and the caller's DataFrame stays intact.
    train = train.drop(columns=['대출목적', '근로기간'])

    # Annual income relative to the per-month share of the loan amount.
    for term_label, months in ((' 36 months', 36), (' 60 months', 60)):
        has_term = train['대출기간'] == term_label
        train.loc[has_term, 'DTI'] = train['연간소득'] / (train['대출금액'] / months)

    # The +10 offsets keep the ratio finite when no principal has been repaid.
    train['이자율'] = (train['총상환이자'] + 10) / (train['총상환원금'] + 10)
    # NOTE(review): this overrides the ratio for every row whose overdue
    # amount is below 30000 — confirm the column and threshold are intended.
    train.loc[train['총연체금액'] < 30000, '이자율'] = 0.4

    train['연간소득'] = np.log(train['연간소득'] + 0.1)

    # Build the flag in one step with an explicit dtype instead of writing
    # True/False into a numeric column, whose result dtype depends on the
    # pandas version. Non-positive and missing amounts become 0.0.
    train['총연체금액'] = (train['총연체금액'] > 0).astype(float)

    # Blank out rare home-ownership values so get_dummies (which skips
    # missing values) creates no column for them. Assigning the column back
    # avoids a chained in-place `replace(..., None)`, which may not update the
    # frame and whose meaning for value=None has differed across pandas
    # versions.
    ownership_counts = train['주택소유상태'].value_counts()
    rare_values = ownership_counts[ownership_counts < 100].index
    is_rare = train['주택소유상태'].isin(rare_values)
    train['주택소유상태'] = train['주택소유상태'].where(~is_rare)

    # The dummy-variable setup may still change (e.g. drop_first=True).
    train = pd.get_dummies(train, columns=['대출기간', '주택소유상태'])
    return train.set_index('ID')

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Load the training data and split the loan grade ('대출등급') off as the label.
train = pd.read_csv('/content/drive/MyDrive/train.csv')
target = train['대출등급']
train.drop('대출등급', axis = 1, inplace=True)
train = feature_tuning(train)

# Split into training and validation sets
x_train , x_valid, y_train, y_valid = train_test_split(train, target, test_size=0.2, shuffle=True, stratify = target, random_state=42)
sm = SMOTE()
scaler = StandardScaler()
# Oversample and fit the scaler on the training split only. Using the full
# data here would put (copies of) the validation rows into the training set.
train_x_resampled, train_y_resampled = sm.fit_resample(x_train, y_train)
train_x_scaled = scaler.fit_transform(train_x_resampled)
# The model is trained on scaled features, so the validation features must go
# through the same fitted scaler before prediction.
valid_x_scaled = scaler.transform(x_valid)

lr = LogisticRegression()
lr.fit(train_x_scaled, train_y_resampled)
prediction = lr.predict(valid_x_scaled)
print('Results on the test set:')
print(classification_report(y_valid, prediction))

Results on the test set:
              precision    recall  f1-score   support

           A       0.22      1.00      0.36      3354
           B       0.00      0.00      0.00      5763
           C       0.00      0.00      0.00      5525
           D       0.33      0.00      0.00      2671
           E       0.26      0.68      0.38      1471
           F       0.58      0.19      0.28       391
           G       0.00      0.00      0.00        84

    accuracy                           0.23     19259
   macro avg       0.20      0.27      0.15     19259
weighted avg       0.12      0.23      0.10     19259

