In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np


class Net(nn.Module):

    
    def __init__(self):
        super(Net, self).__init__()
        # 1 input image channel, 6 output channels, 5x5 square convolution
        # kernel
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(16 * 5 * 5, 120)  # 5*5 from image dimension
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)
        self.predprob = 0

    def forward(self, x):
        # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the size is a square, you can specify with a single number
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = torch.flatten(x, 1) # flatten all dimensions except the batch dimension
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        self.predprob = x
        return (x > 0.5).type()


#net = Net()
#print(net)

In [2]:
import pandas as pd

df = pd.read_csv('train.csv')
dfTest = pd.read_csv('test.csv')
dataset = pd.read_csv('train.csv')


In [3]:
assert dataset.isnull().sum().sum() == 0

num_col_names = list(dataset.select_dtypes(include='number').columns)
cat_col_names = list(set(dataset.columns) - set(num_col_names))

# 70:15:15 stratified split
from sklearn.model_selection import train_test_split
train, tmp = train_test_split(dataset, test_size=0.3, stratify = dataset["Attrition"], random_state=42)
val, test  = train_test_split(tmp,     test_size=0.5, stratify =     tmp["Attrition"], random_state=42)


#!pip install -q pytorch_tabular[extra]



In [4]:
#!pip install -q pytorch_tabular[extra]

In [5]:
cat_col_names

['OverTime',
 'Department',
 'MaritalStatus',
 'Gender',
 'BusinessTravel',
 'Over18',
 'JobRole',
 'EducationField']

In [6]:
df["Attrition"]

0       0
1       0
2       0
3       0
4       1
       ..
1672    0
1673    0
1674    1
1675    0
1676    0
Name: Attrition, Length: 1677, dtype: int64

Help from chatgpt:

You can convert a pandas dataframe with categorical data fields to a PyTorch tensor using the torch.from_numpy() function and the .values attribute of the dataframe.

First, you will need to convert the categorical data fields to numerical values using the .cat.codes attribute of the dataframe column, or using pandas' get_dummies() function.

Then you can use the .values attribute to extract the numpy array from the dataframe and pass it to the torch.from_numpy() function to convert it to a PyTorch tensor.

Example:

import pandas as pd
import torch

df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': [1, 2, 3]})
df = pd.get_dummies(df, columns=['A'])
tensor = torch.from_numpy(df.values)

Note that in the above example we use the pd.get_dummies function to convert the categorical column 'A' to multiple columns, one for each category in the column.

Thanks A.I.

You're welcome! Let me know if you have any other questions.

In [7]:

df = pd.get_dummies(df, columns=cat_col_names)
tensor = torch.from_numpy(df.values)

In [8]:
df["Gender_Female"] 

0       0
1       0
2       0
3       1
4       1
       ..
1672    1
1673    0
1674    0
1675    0
1676    1
Name: Gender_Female, Length: 1677, dtype: uint8

In [9]:
df.pop("id")

0          0
1          1
2          2
3          3
4          4
        ... 
1672    1672
1673    1673
1674    1674
1675    1675
1676    1676
Name: id, Length: 1677, dtype: int64

In [10]:
tensor.shape

torch.Size([1677, 56])

In [11]:
tensor

tensor([[   0,   36,  599,  ...,    1,    0,    0],
        [   1,   35,  921,  ...,    0,    1,    0],
        [   2,   32,  718,  ...,    0,    0,    0],
        ...,
        [1674,   29, 1184,  ...,    0,    0,    0],
        [1675,   36,  441,  ...,    0,    0,    0],
        [1676,   36, 1141,  ...,    0,    0,    0]])

how to split dataset into train and test set in pytorch. and then how to use dataloader with the result?

In PyTorch, you can split a dataset into a training set and a test set using the torch.utils.data.random_split() function. This function takes in a dataset and a list of lengths, and splits the dataset into multiple subsets with the specified lengths.

For example, to split a dataset into a training set and a test set, you can use the following code:

from torch.utils.data import random_split

dataset = ... # your dataset
train_ratio = 0.8 # ratio of data to be used for training

train_len = int(train_ratio * len(dataset))
test_len = len(dataset) - train_len

train_set, test_set = random_split(dataset, [train_len, test_len])

Once you have the training and test sets, you can use the DataLoader class from torch.utils.data to load the data in batches.

from torch.utils.data import DataLoader

batch_size = 32

train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False)

In this example, we pass the train_set and test_set as input to the DataLoader, also we set the batch size to 32, and shuffle to True for the training set, and False for the test set.

You can then use these data loaders to iterate through the training and test sets in batches. For example, you can use a for loop to iterate through the training data in batches:

for data, labels in train_loader:
    # do something with the data and labels

This way, the data is loaded in batches of size "batch_size" and can be used for training your model.

In [12]:
#df
labels = df.pop("Attrition")

In [13]:
from torch.utils.data import random_split
from torch.utils.data import TensorDataset


from sklearn.preprocessing import StandardScaler

sca = StandardScaler()

df = sca.fit_transform(df)
# X_test = sca.fit_transform(X_test)
# test = sca.fit_transform(test)


tensor = torch.from_numpy(df)
labeltensor = torch.from_numpy(labels.values)

dataset = TensorDataset(tensor, labeltensor)



train_ratio = 0.85 # ratio of data to be used for training

train_len = int(train_ratio * len(dataset))
test_len = len(dataset) - train_len

train_set, test_set = random_split(dataset, [train_len, test_len])

In [14]:
train_set

<torch.utils.data.dataset.Subset at 0x20aa6db2910>

In [15]:
# loader.next()

In [16]:
from torch.utils.data import Dataset, DataLoader

loader = DataLoader(train_set, batch_size=32, shuffle=True)
loaderVal = DataLoader(test_set, batch_size=32, shuffle=True)

In [17]:
class MihaNetForSwag(nn.Module):

    def __init__(self):
        super(MihaNetForSwag, self).__init__()
        # 1 input image channel, 6 output channels, 5x5 square convolution
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(54, 256)  # 5*5 from image dimension
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 1)

    def forward(self, x):
        # Max pooling over a (2, 2) window
        # x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the size is a square, you can specify with a single number
        # x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        # x = torch.flatten(x, 1) # flatten all dimensions except the batch dimension
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = torch.sigmoid(self.fc4(x))
        #print(x.shape)
        #print(x)
        # x = self.fc4(x)
        return x
    

class MihaNetForSwager(nn.Module):

    def __init__(self):
        super(MihaNetForSwag, self).__init__()
        # 1 input image channel, 6 output channels, 5x5 square convolution
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(55, 256)  # 5*5 from image dimension
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 1)

    def forward(self, x):
        # Max pooling over a (2, 2) window
        # x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the size is a square, you can specify with a single number
        # x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        # x = torch.flatten(x, 1) # flatten all dimensions except the batch dimension
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = torch.sigmoid(self.fc4(x))
        #print(x.shape)
        #print(x)
        # x = self.fc4(x)
        return x
    


## VARIABLES

# Epochs to train for
epochs = 50
# batch size 
batch_size = 128

# Length of sequence
seq_len = 100

# for printing report purposes
# always start at 0
tracker = 0

# number of characters in text
# num_char = max(encoded_text)+1



model = MihaNetForSwag()
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
criterion = nn.MSELoss()



model.train()



# Check to see if using GPU
#if model.use_gpu:
#    model.cuda()

for i in range(epochs):
    
    # hidden = model.hidden_state(batch_size)
    
    # print(i)
    loader = iter(DataLoader(train_set, batch_size=batch_size, shuffle=True))
    
    for x,y in loader:
        
        # print(x)
        
        tracker += 1
        
        # One Hot Encode incoming data
        # x = loader.next() # one_hot_encoder(x,num_char)
        
        # Convert Numpy Arrays to Tensor
        
        inputs = x.type(torch.FloatTensor)
        #torch.from_numpy(x)
        targets = y.type(torch.FloatTensor) #torch.from_numpy(y)
        
        # Adjust for GPU if necessary
        
        # if model.use_gpu:
        #    inputs = inputs.cuda()
        #    targets = targets.cuda()
            
        # Reset Hidden State
        # If we dont' reset we would backpropagate through all training history
        # hidden = tuple([state.data for state in hidden])
        
        model.zero_grad()
        
        output = model.forward(inputs)
        loss = criterion(output, targets)
        
        loss.backward()
        
        # POSSIBLE EXPLODING GRADIENT PROBLEM!
        # LET"S CLIP JUST IN CASE
        nn.utils.clip_grad_norm_(model.parameters(),max_norm=5)
        
        optimizer.step()
        
        
        
        ###################################
        ### CHECK ON VALIDATION SET ######
        #################################
        
        
        
        if tracker % 2 == 0:
            loaderVal = iter(DataLoader(test_set, batch_size=batch_size, shuffle=True))
            # val_hidden = model.hidden_state(batch_size)
            val_losses = []
            model.eval()
            
            for x,y in loaderVal:
                
                # One Hot Encode incoming data
                # x = one_hot_encoder(x,num_char)
                

                # Convert Numpy Arrays to Tensor

                inputs = x.type(torch.FloatTensor) #torch.from_numpy(x)
                targets = y.type(torch.FloatTensor) #torch.from_numpy(y)

                # Adjust for GPU if necessary

                #if model.use_gpu:

                #    inputs = inputs.cuda()
                #    targets = targets.cuda()
                    
                # Reset Hidden State
                # If we dont' reset we would backpropagate through 
                # all training history
                # val_hidden = tuple([state.data for state in val_hidden])
                
                output = model.forward(inputs)
                val_loss = criterion(output, targets)
        
                val_losses.append(val_loss.item())
            
            
            # Reset to training model after val for loop
            model.train()
            
            print(f"Epoch: {i} Step: {tracker} Val Loss: {val_loss.item()}")

In [18]:
# set the number of training iterations (epochs)
num_epochs = 100
#torch.set_grad_enabled(True) 

model = MihaNetForSwag()
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
criterion = nn.BCELoss()

# start the training loop
for epoch in range(num_epochs):
    # loop over the training data in batches
    for data, labels in loader:
        # pass the data through the model
        output = model(data.type(torch.FloatTensor))
        # calculate the loss
        #print(output)
        #print(labels)
        
        #size = 32
        #print(output.shape)
        #print(labels.shape)
        #print("$$$")
        
        #print(output)
        #print((output > 0.5).type(torch.FloatTensor))
        #print(labels)
        
        #print(output)
        #print(labels.reshape(labels.size()[0],1))
        # loss = criterion(output.reshape(output.size()[0]), labels.type(torch.FloatTensor))
        loss = criterion(output, labels.reshape(-1,1).type(torch.FloatTensor))
        # zero the gradients
        optimizer.zero_grad()
        # perform backpropagation
        loss.backward()
        # update the model's parameters
        optimizer.step()

    # print the loss at the end of the epoch
    print("Epoch {}/{}, Loss: {:.4f}".format(epoch+1, num_epochs, loss.item()))

Epoch 1/100, Loss: 0.1082
Epoch 2/100, Loss: 0.3672
Epoch 3/100, Loss: 0.4542
Epoch 4/100, Loss: 0.1235
Epoch 5/100, Loss: 0.5694
Epoch 6/100, Loss: 0.2487
Epoch 7/100, Loss: 0.2749
Epoch 8/100, Loss: 0.4336
Epoch 9/100, Loss: 0.1540
Epoch 10/100, Loss: 0.1056
Epoch 11/100, Loss: 0.0870
Epoch 12/100, Loss: 0.0019
Epoch 13/100, Loss: 0.0037
Epoch 14/100, Loss: 0.0062
Epoch 15/100, Loss: 0.0058
Epoch 16/100, Loss: 0.0113
Epoch 17/100, Loss: 0.0003
Epoch 18/100, Loss: 0.0479
Epoch 19/100, Loss: 0.0065
Epoch 20/100, Loss: 0.0639
Epoch 21/100, Loss: 0.0788
Epoch 22/100, Loss: 0.0328
Epoch 23/100, Loss: 0.0015
Epoch 24/100, Loss: 0.0041
Epoch 25/100, Loss: 0.0036
Epoch 26/100, Loss: 0.0014
Epoch 27/100, Loss: 0.0003
Epoch 28/100, Loss: 0.0004
Epoch 29/100, Loss: 0.0004
Epoch 30/100, Loss: 0.0002
Epoch 31/100, Loss: 0.0001
Epoch 32/100, Loss: 0.0007
Epoch 33/100, Loss: 0.0002
Epoch 34/100, Loss: 0.0002
Epoch 35/100, Loss: 0.0000
Epoch 36/100, Loss: 0.0000
Epoch 37/100, Loss: 0.0001
Epoch 38/1

In [19]:
for data, labels in loader:
    print(torch.cat((model(data.type(torch.FloatTensor)), labels.reshape(-1,1)), -1))

tensor([[6.9308e-10, 0.0000e+00],
        [9.9994e-01, 1.0000e+00],
        [3.7892e-19, 0.0000e+00],
        [1.0000e+00, 1.0000e+00],
        [2.0422e-12, 0.0000e+00],
        [3.9854e-23, 0.0000e+00],
        [2.2493e-05, 0.0000e+00],
        [2.5726e-07, 0.0000e+00],
        [8.3743e-13, 0.0000e+00],
        [6.5271e-18, 0.0000e+00],
        [8.3418e-23, 0.0000e+00],
        [1.1876e-06, 0.0000e+00],
        [1.0000e+00, 1.0000e+00],
        [5.7971e-17, 0.0000e+00],
        [1.8464e-05, 0.0000e+00],
        [2.7590e-14, 0.0000e+00],
        [1.4824e-08, 0.0000e+00],
        [5.6447e-12, 0.0000e+00],
        [5.3662e-16, 0.0000e+00],
        [3.0436e-10, 0.0000e+00],
        [6.8853e-05, 0.0000e+00],
        [4.1657e-06, 0.0000e+00],
        [1.0000e+00, 1.0000e+00],
        [7.2378e-20, 0.0000e+00],
        [2.1777e-07, 0.0000e+00],
        [1.9929e-05, 0.0000e+00],
        [2.0750e-18, 0.0000e+00],
        [7.6264e-07, 0.0000e+00],
        [6.0336e-16, 0.0000e+00],
        [1.679

In [20]:
ids = dfTest.pop("id")

In [21]:
ids

0       1677
1       1678
2       1679
3       1680
4       1681
        ... 
1114    2791
1115    2792
1116    2793
1117    2794
1118    2795
Name: id, Length: 1119, dtype: int64

In [22]:
dfTest

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EnvironmentSatisfaction,Gender,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,19,Non-Travel,992,Research & Development,1,1,Medical,1,4,Male,...,4,80,0,1,2,2,1,0,0,0
1,45,Travel_Rarely,1136,Sales,4,4,Marketing,1,3,Male,...,3,80,1,7,3,3,2,2,2,2
2,37,Travel_Rarely,155,Research & Development,13,3,Life Sciences,1,4,Male,...,2,80,2,13,2,2,7,7,1,7
3,32,Travel_Rarely,688,Research & Development,1,4,Life Sciences,1,3,Male,...,3,80,0,14,2,2,14,10,11,8
4,29,Travel_Frequently,464,Research & Development,9,1,Life Sciences,1,3,Male,...,1,80,0,1,5,3,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1114,31,Travel_Rarely,755,Sales,1,1,Life Sciences,1,3,Male,...,3,80,1,1,1,3,1,0,0,0
1115,40,Travel_Rarely,654,Research & Development,26,5,Medical,1,3,Male,...,4,80,1,20,4,3,20,10,1,8
1116,42,Travel_Frequently,884,Research & Development,1,4,Medical,1,2,Female,...,1,80,1,10,2,2,5,3,0,2
1117,25,Travel_Frequently,1469,Sales,1,2,Technical Degree,1,3,Male,...,4,80,1,5,2,3,4,2,1,3


In [25]:
dfTest = pd.get_dummies(dfTest, columns=cat_col_names)
# tensor = torch.from_numpy(df.values)



KeyError: "None of [Index(['OverTime', 'Department', 'MaritalStatus', 'Gender', 'BusinessTravel',\n       'Over18', 'JobRole', 'EducationField'],\n      dtype='object')] are in the [columns]"

In [27]:
test_set = sca.transform(dfTest)

In [29]:
test_set

array([[-2.00327132,  0.26510292, -0.98212515, ...,  1.43340323,
        -0.22673919, -0.25975555],
       [ 1.0539068 ,  0.64973414, -0.5986802 , ..., -0.6976404 ,
        -0.22673919, -0.25975555],
       [ 0.11323661, -1.97056605,  0.55165465, ..., -0.6976404 ,
        -0.22673919, -0.25975555],
       ...,
       [ 0.70115548, -0.02337049, -0.98212515, ...,  1.43340323,
        -0.22673919, -0.25975555],
       [-1.29776868,  1.53919384, -0.98212515, ..., -0.6976404 ,
        -0.22673919,  3.84977333],
       [ 0.70115548,  0.91149705, -0.85431017, ..., -0.6976404 ,
        -0.22673919, -0.25975555]])

In [46]:
final = []
for data in test_set:
    final.append(model(torch.from_numpy(data).type(torch.FloatTensor)).detach().numpy()[0])

In [47]:
final

[1.2087367e-10,
 9.2770847e-10,
 9.2020446e-14,
 1.5842186e-05,
 0.5043547,
 3.0169816e-09,
 1.232511e-14,
 0.15718132,
 4.1945475e-13,
 2.5778825e-05,
 2.4286147e-16,
 1.2365576e-12,
 9.7108e-16,
 3.4066903e-07,
 0.00011885539,
 0.027245117,
 0.00065033475,
 6.7352346e-11,
 5.912294e-13,
 7.091345e-06,
 0.9998198,
 8.612599e-09,
 0.45721856,
 6.651956e-11,
 1.450085e-07,
 1.3912255e-15,
 2.2380552e-06,
 9.434908e-09,
 1.6413565e-12,
 4.694799e-15,
 0.0003340438,
 1.2084458e-07,
 0.007245498,
 0.00014366205,
 0.03275002,
 0.9658105,
 8.781995e-21,
 0.9903803,
 3.7076595e-07,
 5.8304722e-05,
 2.4896053e-12,
 1.8094941e-09,
 1.4969908e-06,
 7.9047e-18,
 0.3205976,
 1.4897579e-10,
 5.8250284e-06,
 0.9260107,
 1.526591e-25,
 1.2256809e-12,
 1.3123141e-10,
 4.9579575e-07,
 9.36465e-08,
 7.134704e-09,
 1.3267648e-11,
 0.9498455,
 7.2065984e-07,
 2.8910685e-38,
 0.9997614,
 1.4995424e-13,
 1.5338282e-19,
 1.6540618e-13,
 3.3648724e-12,
 1.4357005e-14,
 0.0018399871,
 3.8676372e-08,
 0.9999932

In [48]:

Output = pd.DataFrame({'id':ids,'Attrition':final})




In [49]:
Output.to_csv('Submision.csv',index=False)
Output.head()

Unnamed: 0,id,Attrition
0,1677,1.208737e-10
1,1678,9.277085e-10
2,1679,9.202045e-14
3,1680,1.584219e-05
4,1681,0.5043547
