In [1]:
# !pip install --upgrade pip

In [2]:
# !pip install pandas

In [3]:
# !pip install -U scikit-learn

# Prepare the train and test data

In [4]:
# Import the libraries
import ast
from multiprocessing import Pool

import numpy as np
import pandas as pd
from numpy import savez_compressed
from scipy.sparse import save_npz
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

### Load in the data

In [5]:
# Load the tokenized CSV shards (tokenized_no1.csv ... tokenized_no10.csv) and stack them.
# Only the first ROWS_PER_FILE rows of each shard are kept, so every shard contributes
# the same number of instances; `nrows` stops parsing there instead of reading the whole
# file and slicing afterwards.
N_FILES = 10
ROWS_PER_FILE = 85248  # presumably chosen so the split sizes stay divisible by 8 — TODO confirm
data_location = '../Datasets/AmazonCat-13K/processed/'
data_list = [
    pd.read_csv(f'{data_location}tokenized_no{file_no}.csv',
                encoding='latin1',
                nrows=ROWS_PER_FILE)
    for file_no in range(1, N_FILES + 1)
]

# Concatenate all the shards. reset_index() (without drop=True) gives the combined frame
# a fresh 0..n-1 index and keeps each row's position within its shard as an 'index' column.
data = pd.concat(data_list, sort=False)
data = data.reset_index()

# Delete the per-shard frames (to save memory)
del data_list

In [6]:
# Convert the stringified lists read from the CSVs back into Python lists.
# ast.literal_eval only accepts Python literals, so a malformed or malicious CSV cell
# raises an error instead of being executed as code, which is what eval() would do.
data['tokenized_title_and_description'] = data['tokenized_title_and_description'].apply(ast.literal_eval)
data['labels'] = data['labels'].apply(ast.literal_eval)

In [7]:
# # Create parallelization function
# def parallelize_dataframe(df, func, n_cores=12):
#     df_split = np.array_split(df, n_cores)
#     pool = Pool(n_cores)
#     df = pd.concat(pool.map(func, df_split))
#     pool.close()
#     pool.join()
#     return df

In [8]:
# # Create function to convert strings to arrays
# def convert_to_array(df):
#     df['tokenized_title_and_description'] = df['tokenized_title_and_description'].apply(eval)
#     df['labels'] = df['labels'].apply(eval)
#     return df

In [9]:
# # Convert stringified arrays to arrays
# data = parallelize_dataframe(data, convert_to_array)

In [10]:
# Peek at the first three rows to confirm the parsed columns look right
data.head(3)

Unnamed: 0,index,item_id,tokenized_title_and_description,labels
0,0,ID:B0027DQHA0,"[29260, 21551, 12365, 3328, 4450, 19, 237, 211...","[Music, TV, Movies & TV, Classical]"
1,1,ID:0756400120,"[381, 15160, 38609, 41, 5949, 10, 477, 1179, 3...","[Books, General, Science Fiction, United State..."
2,2,ID:B00024YAOQ,"[646, 150, 56, 73, 5, 99, 1, 883, 3, 4, 3470, ...","[Motivation & Self-Improvement, Business & Inv..."


In [11]:
# Show (rows, columns) of the combined frame as a sanity check on the load step
data.shape

(852480, 4)

### Prepare the X data

In [12]:
# Pull the tokenized sequences out of the frame as a plain Python list (one entry per row)
sequences = data['tokenized_title_and_description'].tolist()

In [13]:
# Bring every sequence to exactly MAX_SEQUENCE_LENGTH entries.
# Shorter sequences are padded at the end; longer ones are truncated from the front,
# which is the Keras default and is spelled out here so the asymmetry is visible.
# NOTE(review): truncating='pre' discards the *start* of long sequences — confirm that
# is intended rather than truncating='post'.
MAX_SEQUENCE_LENGTH = 512
padded_sequences = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
    truncating='pre',
)

In [14]:
# The padded array has been built from it, so drop the raw list of sequences (to save memory)
del sequences

### Prepare the y data

In [15]:
# Extract the per-row label lists as a plain Python list
all_labels = data['labels'].tolist()

In [16]:
# Collect the distinct labels across all instances.
# Accumulating straight into a set means duplicate occurrences are never stored,
# instead of first materialising every label occurrence in one large list.
# This stays a plain for-loop on purpose: the loop variable `labels` remains defined
# afterwards, and a later cell deletes it by name.
unique_labels = set()
for labels in all_labels:
    unique_labels.update(labels)

# Get counts of labels and instances
labels_count = len(unique_labels)
instances_count = data.shape[0]

# Delete unused var (to save memory)
del unique_labels

In [17]:
# Encode each row's label list as a binary indicator row (one column per distinct label).
# sparse_output=True keeps the result as a scipy sparse matrix, and the cast to int8
# shrinks the stored values; staying sparse until after the train-test split saves a lot
# of memory.
mlb = MultiLabelBinarizer(sparse_output=True)
sparse_binary_vectors = mlb.fit_transform(all_labels).astype(np.int8)

In [18]:
# Delete unused vars (to save memory).
# `labels` is the loop variable left over from the label-counting cell's
# `for labels in all_labels` loop, so that cell must have run before this one.
del data, all_labels, labels

### Prepare the data for training and testing

In [19]:
# Alias the prepared inputs (padded sequences) and targets (sparse label matrix) for the split
X, y_sparse = padded_sequences, sparse_binary_vectors

In [20]:
# Hold out 25% of the rows for testing; the fixed random_state makes the split reproducible
X_train, X_test, y_train_sparse, y_test_sparse = train_test_split(
    X,
    y_sparse,
    test_size=0.25,
    random_state=100,
)

In [21]:
# The split outputs are all that is needed from here on, so drop the pre-split
# arrays, their aliases and the fitted binarizer (to save memory)
del X, padded_sequences, y_sparse, sparse_binary_vectors, mlb

In [22]:
# Have a look at the shapes
# If all the dimensions are divisible by 8, this will enable the use of Tensor Cores
print('X_train:', X_train.shape)
print('X_test:', X_test.shape)
print('y_train_sparse:', y_train_sparse.shape)
print('y_test_sparse:', y_test_sparse.shape)

X_train: (639360, 512)
X_test: (213120, 512)
y_train_sparse: (639360, 13680)
y_test_sparse: (213120, 13680)


### Save Train and Test data

In [23]:
# Directory the train/test arrays are written to; the string is concatenated directly
# with file names below, so it must keep its trailing slash
save_path = '../Datasets/AmazonCat-13K/processed/'

In [24]:
# Write the dense input arrays as compressed .npz archives.
# Each array is passed positionally, so NumPy stores it under the default key 'arr_0'.
np.savez_compressed(save_path + 'X_train.npz', X_train)
np.savez_compressed(save_path + 'X_test.npz', X_test)

In [25]:
# Write the sparse label matrices in SciPy's .npz sparse format
save_npz(file=save_path + 'y_train_sparse.npz', matrix=y_train_sparse)
save_npz(file=save_path + 'y_test_sparse.npz', matrix=y_test_sparse)