# Preprocessing imported datasets

In [1]:
import torch
import pandas as pd
from sklearn.calibration import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

  from .autonotebook import tqdm as notebook_tqdm


## Adult dataset preprocessing

In [2]:
# Load the Adult dataset from the local datasets directory into a DataFrame.
df_adults = pd.read_csv(filepath_or_buffer='datasets/adult.csv')

In [3]:
# Preprocessing the data
# Encode categorical variables: one LabelEncoder per column, kept in
# `label_encoders` (keyed by column name) so each fitted encoder stays available.
# NOTE: the columns of df_adults are overwritten in place and `label_encoders`
# is rebuilt from scratch, so re-running this cell re-fits every encoder on the
# integer codes produced by the previous run rather than on the original labels.
label_encoders = {}
categorical_cols = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country']
for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    df_adults[col] = label_encoders[col].fit_transform(df_adults[col])

# Map income column to binary values
income_map = {'<=50K': 0, '>50K': 1}
# Skip the mapping when the column already holds only 0/1 (e.g. the cell was
# re-run): mapping the integer codes again would turn every value into NaN.
if not set(df_adults['income'].unique()) <= set(income_map.values()):
    income_encoded = df_adults['income'].map(income_map)
    # Series.map yields NaN for any value missing from the dictionary; fail
    # loudly instead of letting NaN targets flow into later cells.
    unmapped = df_adults.loc[income_encoded.isna(), 'income'].unique().tolist()
    if unmapped:
        raise ValueError(f"Unexpected values in 'income' column: {unmapped}")
    df_adults['income'] = income_encoded.astype('int64')

In [4]:
# Separate the target column from the predictor columns
y = df_adults['income']
X = df_adults.drop(columns='income')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both the training and the held-out rows
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Wrap the arrays as PyTorch tensors: float32 features, int64 (long) labels
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.long)
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.long)

In [5]:
# Draw a seeded 80% sample of df_adults (as it stands at this point) for
# training; every row whose index label was not sampled becomes the test set.
# NOTE: this split (seed 200) is drawn independently of the train_test_split
# call above (seed 42), so the CSV partitions need not match the tensor ones.
df_train = df_adults.sample(frac=0.8, random_state=200)
df_test = df_adults.loc[~df_adults.index.isin(df_train.index)]

# Persist both partitions without the index column
df_train.to_csv(path_or_buf='datasets/adult_train.csv', index=False)
df_test.to_csv(path_or_buf='datasets/adult_test.csv', index=False)

## IMDB dataset preprocessing

In [9]:
# Load the IMDB dataset from the local datasets directory into a DataFrame.
df_imdb = pd.read_csv(filepath_or_buffer='datasets/imdb.csv')

In [10]:
# Draw a seeded 80% sample of df_imdb for training; every row whose index
# label was not sampled becomes the test set.
# NOTE: df_train / df_test are rebound here, replacing whatever they held before.
df_train = df_imdb.sample(frac=0.8, random_state=200)
df_test = df_imdb.loc[~df_imdb.index.isin(df_train.index)]

# Persist both partitions without the index column
df_train.to_csv(path_or_buf='datasets/imdb_train.csv', index=False)
df_test.to_csv(path_or_buf='datasets/imdb_test.csv', index=False)