In [102]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset,DataLoader, TensorDataset
import math
import matplotlib.pyplot as plt
import numpy as np
import ast
import shutil
import seaborn as sns
import os
from PIL import Image
import cv2
from sklearn.model_selection import train_test_split
import pickle
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, precision_score, recall_score, roc_auc_score

In [103]:
data = pd.read_csv("fraud.csv")
data = data.dropna()
data.reset_index(drop=True)
half = len(data)//2
data = data[:half]


In [104]:
X = data['Email Info']
y = data['Class']

In [105]:
x_train, x_test_valid, y_train, y_test_valid = train_test_split(X, y, test_size = 0.20)
x_test, X_valid, y_test, y_valid = train_test_split(x_test_valid, y_test_valid, test_size=0.5, random_state=42)

In [106]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
cv.fit(data['Email Info'])
x_train = cv.transform(x_train)
x_test = cv.transform(x_test)
x_valid = cv.transform(X_valid)
print(x_test.shape)
x_train.shape

(894, 90954)


(7152, 90954)

In [107]:
#https://medium.com/analytics-vidhya/part-1-sentiment-analysis-in-pytorch-82b35edb40b8
X_train_tensor = torch.from_numpy(x_train.todense()).float()
X_test_tensor = torch.from_numpy(x_test.todense()).float()
X_valid_tensor = torch.from_numpy(x_valid.todense()).float()
Y_train_tensor = torch.from_numpy(np.array(y_train))
Y_test_tensor = torch.from_numpy(np.array(y_test))
Y_valid_tensor = torch.from_numpy(np.array(y_valid))

In [108]:
train_data = TensorDataset(X_train_tensor, Y_train_tensor)
test_data = TensorDataset(X_test_tensor, Y_test_tensor)
valid_data = TensorDataset(X_valid_tensor, Y_valid_tensor)

In [109]:
train_loader = DataLoader(train_data,batch_size=16, shuffle=True,drop_last=True, num_workers=0)
valid_loader = DataLoader(train_data,batch_size=16, shuffle=True,drop_last=True, num_workers=0)
test_loader = DataLoader(train_data,batch_size=16, shuffle=True,drop_last=True, num_workers=0)

In [129]:
vocab_size = 90954
hidden_units = 3
out_classes = 1
class model(torch.nn.Module):
    def __init__(self,vocab_size = vocab_size,hidden_units = hidden_units,num_classes = out_classes): 
      super(model, self).__init__()
      #First fully connected layer
      self.fc1 = torch.nn.Linear(vocab_size,hidden_units)
      #Second fully connected layer
      self.fc2 = torch.nn.Linear(hidden_units,num_classes)
      #Final output of sigmoid function      
      self.output = torch.nn.Sigmoid()
    
    def forward(self,x):
          fc1 = self.fc1(x)
          fc2 = self.fc2(fc1)
          # y_pred = torch.nn.sigmoid(fc1)
          # return y_pred
          output = self.output(fc2)
          return output[:, -1]

model = model()

In [130]:
model.train()

model(
  (fc1): Linear(in_features=90954, out_features=3, bias=True)
  (fc2): Linear(in_features=3, out_features=1, bias=True)
  (output): Sigmoid()
)

In [131]:
num_epochs = 20

LEARNING_RATE = 0.001


In [132]:
optimizer =torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
criterion = torch.nn.BCELoss()


In [133]:
for i in range(num_epochs):
    for x_batch,y_batch in train_loader:
       # model.train()
       y_pred = model(x_batch)
       loss = criterion(y_pred,y_batch.float())
       loss.backward()
       optimizer.step()
       optimizer.zero_grad()
    print('After {} epoch training loss is {}'.format(i,loss.item()))

After 0 epoch training loss is 0.5422660708427429
After 1 epoch training loss is 0.39276814460754395
After 2 epoch training loss is 0.3573758900165558
After 3 epoch training loss is 0.4203060269355774
After 4 epoch training loss is 0.22761639952659607
After 5 epoch training loss is 0.22459010779857635
After 6 epoch training loss is 0.261107474565506
After 7 epoch training loss is 0.20015831291675568
After 8 epoch training loss is 0.33845293521881104
After 9 epoch training loss is 0.1582053154706955
After 10 epoch training loss is 0.2240990251302719
After 11 epoch training loss is 0.12805670499801636
After 12 epoch training loss is 0.290839821100235
After 13 epoch training loss is 0.2037290334701538
After 14 epoch training loss is 0.11044930666685104
After 15 epoch training loss is 0.18104593455791473
After 16 epoch training loss is 0.10290972888469696
After 17 epoch training loss is 0.1418287754058838
After 18 epoch training loss is 0.09765950590372086
After 19 epoch training loss is 0

In [137]:
#https://github.com/vedaant-varshney/ImageClassifierCNN/blob/master/Image%20Classifier.ipynb
# os.chdir('/Users/king.botti/Documents/CS488/models')
n_total_steps = len(train_loader)
# tracks validation loss change after each epoch
minimum_validation_loss = np.inf 
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        
        train_loss = 0
        valid_loss = 0

        images = images
        labels = labels
        
        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels.float())

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # train_loss += loss.item()*data.size(0)
    model.eval()
    for batch_index, (data, target) in enumerate(valid_loader):
        # moves tensors to GPU
        data, target = data, target
        # forward pass
        output = model(data)
        # loss in batch
        loss = criterion(output, target.float())
        # update validation loss
        valid_loss += loss.item()*data.size(0)

    train_loss = train_loss/len(train_loader.sampler)
    valid_loss = valid_loss/len(valid_loader.sampler)

    if (i+1) % 2000 == 0:
        print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')

    if valid_loss <= minimum_validation_loss:
        print(f'Validation loss decreased from {round(minimum_validation_loss, 6)} to {round(valid_loss, 6)}')
        torch.save(model.state_dict(), 'trained_model_age2.pt')
        minimum_validation_loss = valid_loss
        print('Saving New Model')

Validation loss decreased from inf to 0.144236
Saving New Model
Validation loss decreased from 0.144236 to 0.14102
Saving New Model
Validation loss decreased from 0.14102 to 0.138065
Saving New Model
Validation loss decreased from 0.138065 to 0.135281
Saving New Model
Validation loss decreased from 0.135281 to 0.132693
Saving New Model
Validation loss decreased from 0.132693 to 0.13027
Saving New Model
Validation loss decreased from 0.13027 to 0.12799
Saving New Model
Validation loss decreased from 0.12799 to 0.125842
Saving New Model
Validation loss decreased from 0.125842 to 0.123801
Saving New Model
Validation loss decreased from 0.123801 to 0.12186
Saving New Model
Validation loss decreased from 0.12186 to 0.120015
Saving New Model
Validation loss decreased from 0.120015 to 0.118258
Saving New Model
Validation loss decreased from 0.118258 to 0.116559
Saving New Model
Validation loss decreased from 0.116559 to 0.114942
Saving New Model
Validation loss decreased from 0.114942 to 0.11

In [144]:
batch_size = 16
with torch.no_grad():
    y_pred = []
    y_test = []
    n_correct = 0
    n_samples = 0
    n_class_correct = [0 for i in range(2)]
    n_class_samples = [0 for i in range(2)]
    for images, labels in test_loader:
        images = images
        labels = labels
        outputs = model(images)
        # max returns (value ,index)
        print(outputs)
        print(labels)
        _, predicted = torch.max(outputs, 0)
        # print(predicted)
        n_samples += labels.size(0)
        n_correct += (predicted == labels).sum().item()
        
        for i in range(batch_size):
            label = labels[i]
            pred = predicted[i]
            y_pred.append(int(pred))
            y_test.append(int(label))
            if (label == pred):
                n_class_correct[99] += 1
            n_class_samples[99] += 1
            if (label == pred):
                n_class_correct[label] += 1
            n_class_samples[label] += 1

    acc = 100.0 * n_correct / n_samples
    print(f'Accuracy of the network: {acc} %')

    for i in range(2):
        acc = 100.0 * n_class_correct[i] / n_class_samples[i]
        print(f'Accuracy equals: {acc} %')

tensor([0.0298, 0.0869, 0.0540, 0.0514, 1.0000, 0.9846, 0.9805, 0.0606, 0.9994,
        1.0000, 0.9995, 0.0761, 1.0000, 0.5879, 0.0754, 0.0792])
tensor([0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0])


IndexError: invalid index of a 0-dim tensor. Use `tensor.item()` in Python or `tensor.item<T>()` in C++ to convert a 0-dim tensor to a number