In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np


class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        # 1 input image channel, 6 output channels, 5x5 square convolution
        # kernel
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(16 * 5 * 5, 120)  # 5*5 from image dimension
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the size is a square, you can specify with a single number
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = torch.flatten(x, 1) # flatten all dimensions except the batch dimension
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


#net = Net()
#print(net)

In [2]:
import pandas as pd

df = pd.read_csv('train.csv')
dfTest = pd.read_csv('test.csv')
dataset = pd.read_csv('train.csv')


In [3]:
assert dataset.isnull().sum().sum() == 0

num_col_names = list(dataset.select_dtypes(include='number').columns)
cat_col_names = list(set(dataset.columns) - set(num_col_names))

# 70:15:15 stratified split
from sklearn.model_selection import train_test_split
train, tmp = train_test_split(dataset, test_size=0.3, stratify = dataset["Attrition"], random_state=42)
val, test  = train_test_split(tmp,     test_size=0.5, stratify =     tmp["Attrition"], random_state=42)


#!pip install -q pytorch_tabular[extra]



In [4]:
#!pip install -q pytorch_tabular[extra]

In [5]:
cat_col_names

['Gender',
 'Department',
 'MaritalStatus',
 'Over18',
 'OverTime',
 'BusinessTravel',
 'JobRole',
 'EducationField']

Help from chatgpt:

You can convert a pandas dataframe with categorical data fields to a PyTorch tensor using the torch.from_numpy() function and the .values attribute of the dataframe.

First, you will need to convert the categorical data fields to numerical values using the .cat.codes attribute of the dataframe column, or using pandas' get_dummies() function.

Then you can use the .values attribute to extract the numpy array from the dataframe and pass it to the torch.from_numpy() function to convert it to a PyTorch tensor.

Example:

import pandas as pd
import torch

df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': [1, 2, 3]})
df = pd.get_dummies(df, columns=['A'])
tensor = torch.from_numpy(df.values)

Note that in the above example we use the pd.get_dummies function to convert the categorical column 'A' to multiple columns, one for each category in the column.

Thanks A.I.

You're welcome! Let me know if you have any other questions.

In [4]:

df = pd.get_dummies(df, columns=cat_col_names)
tensor = torch.from_numpy(df.values)

In [9]:
df

Unnamed: 0,id,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,EducationField_Human Resources,EducationField_Life Sciences,EducationField_Marketing,EducationField_Medical,EducationField_Other,EducationField_Technical Degree
0,0,36,599,24,3,1,4,42,3,1,...,0,0,0,0,0,0,0,1,0,0
1,1,35,921,8,3,1,1,46,3,1,...,0,0,0,1,0,0,0,0,1,0
2,2,32,718,26,3,1,3,80,3,2,...,0,0,1,0,0,0,1,0,0,0
3,3,38,1488,2,3,1,3,40,3,2,...,0,0,0,0,0,0,0,1,0,0
4,4,50,1017,5,4,1,2,37,3,5,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1672,1672,30,945,1,3,1,4,73,3,3,...,0,0,1,0,0,1,0,0,0,0
1673,1673,32,1303,2,3,1,1,48,3,1,...,0,1,0,0,0,1,0,0,0,0
1674,1674,29,1184,24,3,1,2,36,2,1,...,0,0,0,0,1,0,0,0,0,0
1675,1675,36,441,9,2,1,2,48,4,2,...,0,0,1,0,0,0,1,0,0,0


In [14]:
df["Gender_Female"]

0       0
1       0
2       0
3       1
4       1
       ..
1672    1
1673    0
1674    0
1675    0
1676    1
Name: Gender_Female, Length: 1677, dtype: uint8

In [5]:
tensor.shape

torch.Size([1677, 56])

In [6]:
tensor

tensor([[   0,   36,  599,  ...,    1,    0,    1],
        [   1,   35,  921,  ...,    1,    1,    0],
        [   2,   32,  718,  ...,    1,    1,    0],
        ...,
        [1674,   29, 1184,  ...,    1,    0,    1],
        [1675,   36,  441,  ...,    1,    1,    0],
        [1676,   36, 1141,  ...,    0,    1,    0]])

how to split dataset into train and test set in pytorch. and then how to use dataloader with the result?

In PyTorch, you can split a dataset into a training set and a test set using the torch.utils.data.random_split() function. This function takes in a dataset and a list of lengths, and splits the dataset into multiple subsets with the specified lengths.

For example, to split a dataset into a training set and a test set, you can use the following code:

from torch.utils.data import random_split

dataset = ... # your dataset
train_ratio = 0.8 # ratio of data to be used for training

train_len = int(train_ratio * len(dataset))
test_len = len(dataset) - train_len

train_set, test_set = random_split(dataset, [train_len, test_len])

Once you have the training and test sets, you can use the DataLoader class from torch.utils.data to load the data in batches.

from torch.utils.data import DataLoader

batch_size = 32

train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False)

In this example, we pass the train_set and test_set as input to the DataLoader, also we set the batch size to 32, and shuffle to True for the training set, and False for the test set.

You can then use these data loaders to iterate through the training and test sets in batches. For example, you can use a for loop to iterate through the training data in batches:

for data, labels in train_loader:
    # do something with the data and labels

This way, the data is loaded in batches of size "batch_size" and can be used for training your model.

In [8]:
#df
labels = df.pop("Attrition")


tensor = torch.from_numpy(df.values)
labeltensor = torch.from_numpy(labels.values)

dataset = TensorDataset(tensor, labeltensor)

from torch.utils.data import random_split
from torch.utils.data import TensorDataset

train_ratio = 0.8 # ratio of data to be used for training

train_len = int(train_ratio * len(dataset))
test_len = len(dataset) - train_len

train_set, test_set = random_split(dataset, [train_len, test_len])

KeyError: 'Attrition'

In [28]:
train_set

<torch.utils.data.dataset.Subset at 0x212420dee20>

In [21]:
from torch.utils.data import Dataset, DataLoader

loader = iter(DataLoader((), batch_size=32, shuffle=True))

In [23]:
class MihaNetForSwag(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        # 1 input image channel, 6 output channels, 5x5 square convolution
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(56, 256)  # 5*5 from image dimension
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 1)

    def forward(self, x):
        # Max pooling over a (2, 2) window
        # x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the size is a square, you can specify with a single number
        # x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        # x = torch.flatten(x, 1) # flatten all dimensions except the batch dimension
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.sigmoid(self.fc4(x))
        x = self.fc4(x)
        return x


In [None]:

## VARIABLES

# Epochs to train for
epochs = 50
# batch size 
batch_size = 128

# Length of sequence
seq_len = 100

# for printing report purposes
# always start at 0
tracker = 0

# number of characters in text
# num_char = max(encoded_text)+1

model.train()



# Check to see if using GPU
if model.use_gpu:
    model.cuda()

for i in range(epochs):
    
    hidden = model.hidden_state(batch_size)
    
    
    for x,y in generate_batches(train_data,batch_size,seq_len):
        
        tracker += 1
        
        # One Hot Encode incoming data
        x = loader.next() # one_hot_encoder(x,num_char)
        
        # Convert Numpy Arrays to Tensor
        
        inputs = torch.from_numpy(x)
        targets = torch.from_numpy(y)
        
        # Adjust for GPU if necessary
        
        if model.use_gpu:
            
            inputs = inputs.cuda()
            targets = targets.cuda()
            
        # Reset Hidden State
        # If we dont' reset we would backpropagate through all training history
        hidden = tuple([state.data for state in hidden])
        
        model.zero_grad()
        
        lstm_output, hidden = model.forward(inputs,hidden)
        loss = criterion(lstm_output,targets.view(batch_size*seq_len).long())
        
        loss.backward()
        
        # POSSIBLE EXPLODING GRADIENT PROBLEM!
        # LET"S CLIP JUST IN CASE
        nn.utils.clip_grad_norm_(model.parameters(),max_norm=5)
        
        optimizer.step()
        
        
        
        ###################################
        ### CHECK ON VALIDATION SET ######
        #################################
        
        if tracker % 25 == 0:
            
            val_hidden = model.hidden_state(batch_size)
            val_losses = []
            model.eval()
            
            for x,y in generate_batches(val_data,batch_size,seq_len):
                
                # One Hot Encode incoming data
                x = one_hot_encoder(x,num_char)
                

                # Convert Numpy Arrays to Tensor

                inputs = torch.from_numpy(x)
                targets = torch.from_numpy(y)

                # Adjust for GPU if necessary

                if model.use_gpu:

                    inputs = inputs.cuda()
                    targets = targets.cuda()
                    
                # Reset Hidden State
                # If we dont' reset we would backpropagate through 
                # all training history
                val_hidden = tuple([state.data for state in val_hidden])
                
                lstm_output, val_hidden = model.forward(inputs,val_hidden)
                val_loss = criterion(lstm_output,targets.view(batch_size*seq_len).long())
        
                val_losses.append(val_loss.item())
            
            # Reset to training model after val for loop
            model.train()
            
            print(f"Epoch: {i} Step: {tracker} Val Loss: {val_loss.item()}")