In [1]:
import torch
import torch.nn as nn

import numpy as np , pandas as pd
import matplotlib.pyplot as plt
import configparser

import pickle


In [2]:
config = configparser.ConfigParser()
config.read('settings.ini')

config= {key: int(value) for key, value in config['INCOME_DATASET'].items()}
config

{'vocab_length': 21979,
 'n_embed': 992,
 'n_heads': 31,
 'context_window': 38,
 'transformer_blocks': 2,
 'batch_size': 32,
 'random_seed': 123,
 'val_split': 20,
 'test_split': 10}

In [3]:
from local_utils import tokenize_dataset , read_dataset , decoder

In [4]:
df = pd.read_csv("../../datasets/income/adult.csv")

In [5]:
df = read_dataset(df)
df

Unnamed: 0,start,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income,end
0,<start>,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K,<end>
1,<start>,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K,<end>
2,<start>,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K,<end>
3,<start>,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K,<end>
4,<start>,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K,<end>
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,<start>,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K,<end>
32557,<start>,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K,<end>
32558,<start>,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K,<end>
32559,<start>,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K,<end>


In [6]:
tokenizer , col_code , encoded_docs = tokenize_dataset(df)


    _______________________________ 
    / Tokenizing your dataset       \
    \     > This might take a while /
    ------------------------------- 
           \   ^__^
            \  (oo)\_______
               (__)\       )\/\
                   ||----w |
                   ||     ||
    


In [7]:
# heres how to decode ...
print(f"encoded docs example : {encoded_docs[0]}")
decoded_eg = decoder(encoded_docs[0],df,col_code)
print(f"Decoded : {decoded_eg}")

encoded docs example : [0, 25, 14, 1, 26, 35, 21683, 21716, 21707, 21705, 21717, 21724, 21739, 21745, 21750, 21830, 21822, 21812, 21802, 21792, 21782, 21772, 21762, 21752, 21906, 21895, 21889, 21881, 21867, 21855, 21851, 21832, 21930, 21923, 21915, 21932, 21974, 21976]
Decoded : ['<start>', '89.69979', '?', '77053', 'HS-grad', '8.93521', 'Widowed', '?', 'Not-in-family', 'White', 'Female', '0.00000', '4355.99989', '39.64639', 'United-States', '<=50K', '<end>']


In [8]:
encoded_docs = torch.tensor(encoded_docs)
encoded_docs.shape

torch.Size([32561, 38])

In [9]:
torch.save(encoded_docs,"encoded_vars/encoded_docs_income.pt")

In [10]:
token_vars = {'col_code': col_code, 'df': df, 'tokenizer': tokenizer}

with open('encoded_vars/token_vars_income.pkl', 'wb') as file:
    pickle.dump(token_vars, file)

In [11]:
# make train - test split here
gen =torch.Generator()
gen.manual_seed(config["random_seed"])
# Shuffle the rows of the tensor
shuffled_indices = torch.randperm(encoded_docs.size(0),generator=gen)
shuffled_tensor = encoded_docs[shuffled_indices]

In [12]:
torch.all(shuffled_tensor == encoded_docs)

tensor(False)

In [13]:
# just use sklearn train_test_split smh
train_size = int((100 - config["val_split"] - config["test_split"]) * 0.01 * shuffled_tensor.shape[0])
splits= shuffled_tensor.split(train_size,0)
train_set = splits[0]
print("Train size",len(train_set), train_size)

print(f"Remaining : {len(splits[1])}")
valid_size = int(len(splits[1]) * (1-config["test_split"]/100))
valid_set, test_set =  splits[1].split(valid_size,0)
print(f"Valid size : {len(valid_set)}")
print(f"Test size : {len(test_set)}")



Train size 22792 22792
Remaining : 9769
Valid size : 8792
Test size : 977


In [14]:
torch.save(train_set,"encoded_vars/train_set.pt")

In [15]:
torch.save(valid_set,"encoded_vars/valid_set.pt")

In [16]:
torch.save(test_set,"encoded_vars/test_split.pt")