In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np


class Net(nn.Module):

    
    def __init__(self):
        super(Net, self).__init__()
        # 1 input image channel, 6 output channels, 5x5 square convolution
        # kernel
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(16 * 5 * 5, 120)  # 5*5 from image dimension
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)
        self.predprob = 0

    def forward(self, x):
        # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the size is a square, you can specify with a single number
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = torch.flatten(x, 1) # flatten all dimensions except the batch dimension
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        self.predprob = x
        return (x > 0.5).type()


#net = Net()
#print(net)

In [2]:
import pandas as pd

df = pd.read_csv('train.csv')
dfTest = pd.read_csv('test.csv')
dataset = pd.read_csv('train.csv')


In [3]:
assert dataset.isnull().sum().sum() == 0

num_col_names = list(dataset.select_dtypes(include='number').columns)
cat_col_names = list(set(dataset.columns) - set(num_col_names))

# 70:15:15 stratified split
from sklearn.model_selection import train_test_split
train, tmp = train_test_split(dataset, test_size=0.3, stratify = dataset["Attrition"], random_state=42)
val, test  = train_test_split(tmp,     test_size=0.5, stratify =     tmp["Attrition"], random_state=42)


#!pip install -q pytorch_tabular[extra]



In [4]:
#!pip install -q pytorch_tabular[extra]

In [5]:
cat_col_names

['Over18',
 'Department',
 'MaritalStatus',
 'Gender',
 'EducationField',
 'JobRole',
 'OverTime',
 'BusinessTravel']

In [6]:
df["Attrition"]

0       0
1       0
2       0
3       0
4       1
       ..
1672    0
1673    0
1674    1
1675    0
1676    0
Name: Attrition, Length: 1677, dtype: int64

Help from chatgpt:

You can convert a pandas dataframe with categorical data fields to a PyTorch tensor using the torch.from_numpy() function and the .values attribute of the dataframe.

First, you will need to convert the categorical data fields to numerical values using the .cat.codes attribute of the dataframe column, or using pandas' get_dummies() function.

Then you can use the .values attribute to extract the numpy array from the dataframe and pass it to the torch.from_numpy() function to convert it to a PyTorch tensor.

Example:

import pandas as pd
import torch

df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': [1, 2, 3]})
df = pd.get_dummies(df, columns=['A'])
tensor = torch.from_numpy(df.values)

Note that in the above example we use the pd.get_dummies function to convert the categorical column 'A' to multiple columns, one for each category in the column.

Thanks A.I.

You're welcome! Let me know if you have any other questions.

In [7]:

df = pd.get_dummies(df, columns=cat_col_names)
tensor = torch.from_numpy(df.values)

In [8]:
df["Gender_Female"] 

0       0
1       0
2       0
3       1
4       1
       ..
1672    1
1673    0
1674    0
1675    0
1676    1
Name: Gender_Female, Length: 1677, dtype: uint8

In [9]:
df.pop("id")

0          0
1          1
2          2
3          3
4          4
        ... 
1672    1672
1673    1673
1674    1674
1675    1675
1676    1676
Name: id, Length: 1677, dtype: int64

In [10]:
tensor.shape

torch.Size([1677, 56])

In [11]:
tensor

tensor([[   0,   36,  599,  ...,    0,    1,    0],
        [   1,   35,  921,  ...,    0,    0,    1],
        [   2,   32,  718,  ...,    0,    0,    1],
        ...,
        [1674,   29, 1184,  ...,    0,    1,    0],
        [1675,   36,  441,  ...,    0,    0,    1],
        [1676,   36, 1141,  ...,    0,    0,    1]])

how to split dataset into train and test set in pytorch. and then how to use dataloader with the result?

In PyTorch, you can split a dataset into a training set and a test set using the torch.utils.data.random_split() function. This function takes in a dataset and a list of lengths, and splits the dataset into multiple subsets with the specified lengths.

For example, to split a dataset into a training set and a test set, you can use the following code:

from torch.utils.data import random_split

dataset = ... # your dataset
train_ratio = 0.8 # ratio of data to be used for training

train_len = int(train_ratio * len(dataset))
test_len = len(dataset) - train_len

train_set, test_set = random_split(dataset, [train_len, test_len])

Once you have the training and test sets, you can use the DataLoader class from torch.utils.data to load the data in batches.

from torch.utils.data import DataLoader

batch_size = 32

train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False)

In this example, we pass the train_set and test_set as input to the DataLoader, also we set the batch size to 32, and shuffle to True for the training set, and False for the test set.

You can then use these data loaders to iterate through the training and test sets in batches. For example, you can use a for loop to iterate through the training data in batches:

for data, labels in train_loader:
    # do something with the data and labels

This way, the data is loaded in batches of size "batch_size" and can be used for training your model.

In [12]:
#df
labels = df.pop("Attrition")

In [13]:
from torch.utils.data import random_split
from torch.utils.data import TensorDataset


from sklearn.preprocessing import StandardScaler

sca = StandardScaler()

df = sca.fit_transform(df)
# X_test = sca.fit_transform(X_test)
# test = sca.fit_transform(test)


tensor = torch.from_numpy(df)
labeltensor = torch.from_numpy(labels.values)

dataset = TensorDataset(tensor, labeltensor)



train_ratio = 0.85 # ratio of data to be used for training

train_len = int(train_ratio * len(dataset))
test_len = len(dataset) - train_len

train_set, test_set = random_split(dataset, [train_len, test_len])

In [14]:
train_set

<torch.utils.data.dataset.Subset at 0x220b6f2e7f0>

In [15]:
# loader.next()

In [16]:
from torch.utils.data import Dataset, DataLoader

loader = DataLoader(train_set, batch_size=32, shuffle=True)
loaderVal = DataLoader(test_set, batch_size=32, shuffle=True)

In [17]:
class MihaNetForSwag(nn.Module):

    def __init__(self):
        super(MihaNetForSwag, self).__init__()
        # 1 input image channel, 6 output channels, 5x5 square convolution
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(54, 256)  # 5*5 from image dimension
        self.fc2 = nn.Linear(256, 64) #128)
        #self.fc3 = nn.Linear(128, 128)
        #self.fc4 = nn.Linear(128, 128)
        #self.fc5 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, x):
        # Max pooling over a (2, 2) window
        # x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the size is a square, you can specify with a single number
        # x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        # x = torch.flatten(x, 1) # flatten all dimensions except the batch dimension
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        #x = F.relu(self.fc3(x))
        #x = F.relu(self.fc4(x))
        #x = F.relu(self.fc5(x))
        x = torch.sigmoid(self.fc3(x))
        # x = torch.sigmoid(self.fc6(x))
        #print(x.shape)
        #print(x)
        # x = self.fc4(x)
        return x
    

class MihaNetForSwager(nn.Module):

    def __init__(self):
        super(MihaNetForSwag, self).__init__()
        # 1 input image channel, 6 output channels, 5x5 square convolution
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(55, 256)  # 5*5 from image dimension
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 1)

    def forward(self, x):
        # Max pooling over a (2, 2) window
        # x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the size is a square, you can specify with a single number
        # x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        # x = torch.flatten(x, 1) # flatten all dimensions except the batch dimension
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = torch.sigmoid(self.fc4(x))
        #print(x.shape)
        #print(x)
        # x = self.fc4(x)
        return x
    


## VARIABLES

# Epochs to train for
epochs = 50
# batch size 
batch_size = 128

# Length of sequence
seq_len = 100

# for printing report purposes
# always start at 0
tracker = 0

# number of characters in text
# num_char = max(encoded_text)+1



model = MihaNetForSwag()
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
criterion = nn.MSELoss()



model.train()



# Check to see if using GPU
#if model.use_gpu:
#    model.cuda()

for i in range(epochs):
    
    # hidden = model.hidden_state(batch_size)
    
    # print(i)
    loader = iter(DataLoader(train_set, batch_size=batch_size, shuffle=True))
    
    for x,y in loader:
        
        # print(x)
        
        tracker += 1
        
        # One Hot Encode incoming data
        # x = loader.next() # one_hot_encoder(x,num_char)
        
        # Convert Numpy Arrays to Tensor
        
        inputs = x.type(torch.FloatTensor)
        #torch.from_numpy(x)
        targets = y.type(torch.FloatTensor) #torch.from_numpy(y)
        
        # Adjust for GPU if necessary
        
        # if model.use_gpu:
        #    inputs = inputs.cuda()
        #    targets = targets.cuda()
            
        # Reset Hidden State
        # If we dont' reset we would backpropagate through all training history
        # hidden = tuple([state.data for state in hidden])
        
        model.zero_grad()
        
        output = model.forward(inputs)
        loss = criterion(output, targets)
        
        loss.backward()
        
        # POSSIBLE EXPLODING GRADIENT PROBLEM!
        # LET"S CLIP JUST IN CASE
        nn.utils.clip_grad_norm_(model.parameters(),max_norm=5)
        
        optimizer.step()
        
        
        
        ###################################
        ### CHECK ON VALIDATION SET ######
        #################################
        
        
        
        if tracker % 2 == 0:
            loaderVal = iter(DataLoader(test_set, batch_size=batch_size, shuffle=True))
            # val_hidden = model.hidden_state(batch_size)
            val_losses = []
            model.eval()
            
            for x,y in loaderVal:
                
                # One Hot Encode incoming data
                # x = one_hot_encoder(x,num_char)
                

                # Convert Numpy Arrays to Tensor

                inputs = x.type(torch.FloatTensor) #torch.from_numpy(x)
                targets = y.type(torch.FloatTensor) #torch.from_numpy(y)

                # Adjust for GPU if necessary

                #if model.use_gpu:

                #    inputs = inputs.cuda()
                #    targets = targets.cuda()
                    
                # Reset Hidden State
                # If we dont' reset we would backpropagate through 
                # all training history
                # val_hidden = tuple([state.data for state in val_hidden])
                
                output = model.forward(inputs)
                val_loss = criterion(output, targets)
        
                val_losses.append(val_loss.item())
            
            
            # Reset to training model after val for loop
            model.train()
            
            print(f"Epoch: {i} Step: {tracker} Val Loss: {val_loss.item()}")

In [18]:
# set the number of training iterations (epochs)
num_epochs = 60
#torch.set_grad_enabled(True) 

model = MihaNetForSwag()
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
criterion = nn.BCELoss()

# start the training loop
for epoch in range(num_epochs):
    # loop over the training data in batches
    for data, labels in loader:
        # pass the data through the model
        output = model(data.type(torch.FloatTensor))
        # calculate the loss
        #print(output)
        #print(labels)
        
        #size = 32
        #print(output.shape)
        #print(labels.shape)
        #print("$$$")
        
        #print(output)
        #print((output > 0.5).type(torch.FloatTensor))
        #print(labels)
        
        #print(output)
        #print(labels.reshape(labels.size()[0],1))
        # loss = criterion(output.reshape(output.size()[0]), labels.type(torch.FloatTensor))
        loss = criterion(output, labels.reshape(-1,1).type(torch.FloatTensor))
        # zero the gradients
        optimizer.zero_grad()
        # perform backpropagation
        loss.backward()
        # update the model's parameters
        optimizer.step()
        
        #model.eval()
        
        #vl = 0
            
        #for x,y in loaderVal:            
        #    out = model.forward(x.type(torch.FloatTensor))
        #    val_loss = criterion(out,y.reshape(-1, 1).type(torch.FloatTensor)) 
        #    vl = val_loss
        
        #print("Epoch {}/{}, Validation Loss: {:.4f}".format(epoch+1, num_epochs, vl.item()))
        #model.train()

    # print the loss at the end of the epoch
    print("Epoch {}/{}, Loss: {:.4f}".format(epoch+1, num_epochs, loss.item()))

Epoch 1/60, Loss: 0.3595
Epoch 2/60, Loss: 0.3859
Epoch 3/60, Loss: 0.2938
Epoch 4/60, Loss: 0.1466
Epoch 5/60, Loss: 0.3526
Epoch 6/60, Loss: 0.0615
Epoch 7/60, Loss: 0.1699
Epoch 8/60, Loss: 0.1445
Epoch 9/60, Loss: 0.1653
Epoch 10/60, Loss: 0.1770
Epoch 11/60, Loss: 0.1973
Epoch 12/60, Loss: 0.3020
Epoch 13/60, Loss: 0.0723
Epoch 14/60, Loss: 0.0622
Epoch 15/60, Loss: 0.0031
Epoch 16/60, Loss: 0.1504
Epoch 17/60, Loss: 0.0081
Epoch 18/60, Loss: 0.0211
Epoch 19/60, Loss: 0.0109
Epoch 20/60, Loss: 0.0075
Epoch 21/60, Loss: 0.0024
Epoch 22/60, Loss: 0.0053
Epoch 23/60, Loss: 0.0005
Epoch 24/60, Loss: 0.0009
Epoch 25/60, Loss: 0.0023
Epoch 26/60, Loss: 0.0008
Epoch 27/60, Loss: 0.0012
Epoch 28/60, Loss: 0.0022
Epoch 29/60, Loss: 0.0024
Epoch 30/60, Loss: 0.0014
Epoch 31/60, Loss: 0.0007
Epoch 32/60, Loss: 0.0021
Epoch 33/60, Loss: 0.0015
Epoch 34/60, Loss: 0.0007
Epoch 35/60, Loss: 0.0013
Epoch 36/60, Loss: 0.0007
Epoch 37/60, Loss: 0.0004
Epoch 38/60, Loss: 0.0013
Epoch 39/60, Loss: 0.

In [19]:
validations = []
val_inputs = []


for x, y in loaderVal:
    for xt in x:
        # print(xt)
        val_inputs.append(xt.numpy())
    for yt in y:
        validations.append(yt)

# print(val_inputs)
        
model.eval()
out = model.forward(torch.from_numpy(np.array(val_inputs)).type(torch.FloatTensor))
val_loss = criterion(out,torch.from_numpy(np.array(validations)).type(torch.FloatTensor).reshape(-1,1)) 
vl = val_loss

print("Validation Loss: {:.4f}".format(vl.item()))


Validation Loss: 1.5806


In [20]:
for data, labels in loader:
    print(torch.cat((model(data.type(torch.FloatTensor)), labels.reshape(-1,1)), -1))

tensor([[1.5121e-13, 0.0000e+00],
        [1.7894e-07, 0.0000e+00],
        [3.8077e-16, 0.0000e+00],
        [9.9999e-01, 1.0000e+00],
        [5.6120e-13, 0.0000e+00],
        [7.5983e-16, 0.0000e+00],
        [3.0356e-05, 0.0000e+00],
        [7.3203e-04, 0.0000e+00],
        [9.9925e-01, 1.0000e+00],
        [8.5510e-10, 0.0000e+00],
        [2.2927e-04, 0.0000e+00],
        [7.1188e-11, 0.0000e+00],
        [7.2184e-20, 0.0000e+00],
        [2.1144e-08, 0.0000e+00],
        [7.9354e-06, 0.0000e+00],
        [9.9980e-01, 1.0000e+00],
        [6.2790e-06, 0.0000e+00],
        [5.5691e-04, 0.0000e+00],
        [3.2884e-04, 0.0000e+00],
        [2.1831e-12, 0.0000e+00],
        [1.8461e-13, 0.0000e+00],
        [7.7145e-14, 0.0000e+00],
        [9.9996e-01, 1.0000e+00],
        [6.4970e-05, 0.0000e+00],
        [1.3389e-18, 0.0000e+00],
        [9.9977e-01, 1.0000e+00],
        [1.3786e-04, 0.0000e+00],
        [9.9975e-01, 1.0000e+00],
        [4.7375e-23, 0.0000e+00],
        [4.823

tensor([[4.6473e-06, 0.0000e+00],
        [1.6973e-04, 0.0000e+00],
        [3.9590e-12, 0.0000e+00],
        [1.9798e-07, 0.0000e+00],
        [6.9829e-11, 0.0000e+00],
        [2.5233e-04, 0.0000e+00],
        [3.0753e-11, 0.0000e+00],
        [7.2576e-07, 0.0000e+00],
        [1.0595e-06, 0.0000e+00],
        [2.9672e-15, 0.0000e+00],
        [4.8511e-09, 0.0000e+00],
        [2.2661e-08, 0.0000e+00],
        [4.1859e-04, 0.0000e+00],
        [1.0000e+00, 1.0000e+00],
        [1.1209e-08, 0.0000e+00],
        [2.9147e-05, 0.0000e+00],
        [3.4673e-07, 0.0000e+00],
        [1.5811e-07, 0.0000e+00],
        [2.1210e-15, 0.0000e+00],
        [9.1188e-06, 0.0000e+00],
        [1.4335e-09, 0.0000e+00],
        [6.9506e-07, 0.0000e+00],
        [1.7257e-09, 0.0000e+00],
        [3.5800e-07, 0.0000e+00],
        [1.2790e-03, 0.0000e+00],
        [2.2758e-15, 0.0000e+00],
        [3.0426e-04, 0.0000e+00],
        [4.5887e-21, 0.0000e+00],
        [5.8898e-11, 0.0000e+00],
        [1.237

In [21]:
ids = dfTest.pop("id")

In [22]:
ids

0       1677
1       1678
2       1679
3       1680
4       1681
        ... 
1114    2791
1115    2792
1116    2793
1117    2794
1118    2795
Name: id, Length: 1119, dtype: int64

In [23]:
dfTest

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EnvironmentSatisfaction,Gender,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,19,Non-Travel,992,Research & Development,1,1,Medical,1,4,Male,...,4,80,0,1,2,2,1,0,0,0
1,45,Travel_Rarely,1136,Sales,4,4,Marketing,1,3,Male,...,3,80,1,7,3,3,2,2,2,2
2,37,Travel_Rarely,155,Research & Development,13,3,Life Sciences,1,4,Male,...,2,80,2,13,2,2,7,7,1,7
3,32,Travel_Rarely,688,Research & Development,1,4,Life Sciences,1,3,Male,...,3,80,0,14,2,2,14,10,11,8
4,29,Travel_Frequently,464,Research & Development,9,1,Life Sciences,1,3,Male,...,1,80,0,1,5,3,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1114,31,Travel_Rarely,755,Sales,1,1,Life Sciences,1,3,Male,...,3,80,1,1,1,3,1,0,0,0
1115,40,Travel_Rarely,654,Research & Development,26,5,Medical,1,3,Male,...,4,80,1,20,4,3,20,10,1,8
1116,42,Travel_Frequently,884,Research & Development,1,4,Medical,1,2,Female,...,1,80,1,10,2,2,5,3,0,2
1117,25,Travel_Frequently,1469,Sales,1,2,Technical Degree,1,3,Male,...,4,80,1,5,2,3,4,2,1,3


In [24]:
dfTest = pd.get_dummies(dfTest, columns=cat_col_names)
# tensor = torch.from_numpy(df.values)



In [25]:
test_setf = sca.transform(dfTest)

In [26]:
test_setf

array([[-2.00327132,  0.26510292, -0.98212515, ...,  3.5084931 ,
        -0.42932742, -1.82574186],
       [ 1.0539068 ,  0.64973414, -0.5986802 , ..., -0.28502265,
        -0.42932742,  0.54772256],
       [ 0.11323661, -1.97056605,  0.55165465, ..., -0.28502265,
        -0.42932742,  0.54772256],
       ...,
       [ 0.70115548, -0.02337049, -0.98212515, ..., -0.28502265,
         2.32922463, -1.82574186],
       [-1.29776868,  1.53919384, -0.98212515, ..., -0.28502265,
         2.32922463, -1.82574186],
       [ 0.70115548,  0.91149705, -0.85431017, ..., -0.28502265,
        -0.42932742,  0.54772256]])

In [27]:
final = []
for data in test_setf:
    final.append(model(torch.from_numpy(data).type(torch.FloatTensor)).detach().numpy()[0])

In [28]:
final

[3.5385216e-07,
 4.964112e-06,
 2.6500596e-16,
 3.183083e-05,
 0.9718551,
 0.0018219773,
 6.390267e-12,
 0.070162654,
 7.149716e-09,
 2.7529745e-07,
 3.5551265e-15,
 3.0557218e-15,
 2.7395742e-12,
 1.125136e-09,
 0.01120105,
 0.0007185613,
 1.0411242e-06,
 2.0247783e-10,
 0.044558473,
 0.000100890866,
 0.9991285,
 1.0950437e-10,
 0.19516331,
 2.1991058e-09,
 3.362217e-07,
 2.7197339e-15,
 0.55028343,
 7.817521e-06,
 3.2551871e-12,
 0.00037350316,
 2.0449622e-06,
 2.510648e-07,
 0.0023991228,
 1.6826414e-06,
 9.0711954e-05,
 0.6848247,
 1.9100605e-11,
 0.026469497,
 1.2054352e-10,
 0.07529929,
 5.199978e-16,
 7.6296004e-08,
 3.8599433e-06,
 1.5488954e-15,
 0.0066619823,
 0.003749763,
 1.6887272e-08,
 2.4113742e-06,
 6.983289e-14,
 3.5061888e-08,
 4.835699e-10,
 3.921261e-05,
 0.010426506,
 5.914186e-07,
 2.7780559e-15,
 0.018992677,
 1.3627116e-08,
 2.279904e-18,
 0.6610402,
 0.00037204014,
 2.4966076e-10,
 8.714942e-15,
 4.5723634e-09,
 1.676408e-18,
 0.00043928038,
 1.333908e-05,
 0.9

In [29]:

Output = pd.DataFrame({'id':ids,'Attrition':final})




In [30]:
Output.to_csv('Submision.csv',index=False)
Output.head()

Unnamed: 0,id,Attrition
0,1677,3.538522e-07
1,1678,4.964112e-06
2,1679,2.65006e-16
3,1680,3.183083e-05
4,1681,0.9718551
