In [1]:
!pip install --upgrade pip

Requirement already up-to-date: pip in /usr/local/lib/python3.6/dist-packages (20.0.2)


In [2]:
!pip install pandas



In [3]:
!pip install -U scikit-learn

Requirement already up-to-date: scikit-learn in /usr/local/lib/python3.6/dist-packages (0.22.1)


# Prepare the train and test data

In [4]:
# Import the libraries
import ast
from multiprocessing import Pool

import numpy as np
import pandas as pd
from numpy import load
from numpy import savez_compressed
from scipy.sparse import save_npz
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

### Load in the data

In [5]:
# Load in the data
# The corpus is stored as ten CSV shards named tokenized_no1.csv ... tokenized_no10.csv.
# Building the list with a comprehension avoids leaving placeholder or loop-index
# names behind in the notebook namespace.
data_location = '../Datasets/AmazonCat-13K/processed/'
data_list = [
    pd.read_csv(data_location + f'tokenized_no{shard_number}.csv', encoding='latin1')
    for shard_number in range(1, 11)
]

# Concatenate all the data and reset the index
# reset_index() is called without drop=True, so the pre-concatenation row labels
# are kept as an extra 'index' column next to the new 0..N-1 index.
data = pd.concat(data_list, sort=False)
data = data.reset_index()

# Delete unused var (to save memory)
del data_list

In [6]:
# # Convert stringified arrays to arrays
# data['tokenized_title_and_description'] = data['tokenized_title_and_description'].apply(eval)
# data['labels'] = data['labels'].apply(eval)

In [7]:
# Create parallelization function
def parallelize_dataframe(df, func, n_cores=12):
    """Apply ``func`` to row-wise chunks of ``df`` in parallel worker processes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process. It is cut into ``n_cores`` consecutive row chunks.
    func : callable
        Takes one DataFrame chunk and returns a DataFrame. It is sent to the
        worker processes, so it must be picklable (e.g. a module-level function).
    n_cores : int, default 12
        Number of chunks and of worker processes.

    Returns
    -------
    pandas.DataFrame
        The per-chunk results concatenated in the original chunk order.

    Raises
    ------
    ValueError
        If ``n_cores`` is smaller than 1.
    """
    if n_cores < 1:
        raise ValueError('n_cores must be a positive integer')

    # Slice by position instead of handing the DataFrame to np.array_split.
    # Chunk sizes follow the same rule: the first len(df) % n_cores chunks
    # receive one extra row.
    chunk_size, remainder = divmod(len(df), n_cores)
    chunks = []
    start = 0
    for chunk_number in range(n_cores):
        stop = start + chunk_size + (1 if chunk_number < remainder else 0)
        chunks.append(df.iloc[start:stop])
        start = stop

    pool = Pool(n_cores)
    try:
        results = pool.map(func, chunks)
    finally:
        # Always shut the workers down, even when func raised in a worker
        pool.close()
        pool.join()
    return pd.concat(results)

In [8]:
# Create function to convert strings to arrays
def convert_to_array(df):
    """Parse the stringified list columns of ``df`` back into Python lists.

    The 'tokenized_title_and_description' and 'labels' columns are expected to
    hold the text form of Python list literals (as written to the CSV shards).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with both columns holding parsed lists.

    Raises
    ------
    ValueError, SyntaxError
        If a cell is not a valid Python literal.
    """
    # ast.literal_eval replaces eval(): it only accepts Python literals, so text
    # coming from the CSV files cannot execute arbitrary code while being parsed.
    for column in ('tokenized_title_and_description', 'labels'):
        df[column] = df[column].apply(ast.literal_eval)
    return df

In [9]:
# Convert stringified arrays to arrays
data = parallelize_dataframe(data, convert_to_array)

In [10]:
# Check the first 3 rows
data.head(n=3)

Unnamed: 0,index,item_id,tokenized_title_and_description,labels
0,0,ID:B0027DQHA0,"[29260, 21551, 12365, 3328, 4450, 19, 237, 211...","[Movies & TV, Music, TV, Classical]"
1,1,ID:0756400120,"[381, 15160, 38609, 41, 5949, 10, 477, 1179, 3...","[Short Stories, United States, Anthologies, Sc..."
2,2,ID:B00024YAOQ,"[646, 150, 56, 73, 5, 99, 1, 883, 3, 4, 3470, ...","[Books, Motivation & Self-Improvement, Busines..."


In [11]:
# Check the shape
data.shape

(1494364, 4)

### Create the document vectors

In [12]:
# Load in the embedding matrix
# NOTE(review): allow_pickle=True lets NumPy unpickle object arrays, which can run
# arbitrary code - only load this file from a trusted source. Drop the flag if the
# matrix is stored as a plain numeric array (TODO confirm).
# The with-block closes the .npz archive once the array has been read.
with load(data_location + 'embedding_matrix.npz', allow_pickle=True) as embedding_archive:
    # 'arr_0' is the key NumPy gives to the first array saved without a name;
    # list() splits it along its first axis into one entry per row.
    embedding_matrix = list(embedding_archive['arr_0'])

In [13]:
# Create function to get document vector
def get_document_vector(instance):
    """Average the embeddings of a document's tokens and append a bias term.

    Parameters
    ----------
    instance : sequence of int
        Token indices, each used to index the module-level ``embedding_matrix``.

    Returns
    -------
    numpy.ndarray
        The element-wise mean of the looked-up embeddings followed by a
        trailing 1. For an empty ``instance`` the entry ``embedding_matrix[0]``
        is used in place of the mean.
    """
    token_count = len(instance)
    if token_count:
        # sum() starts at 0 and adds the rows one by one, left to right
        embedding_total = sum(embedding_matrix[token] for token in instance)
        averaged = embedding_total / token_count
    else:
        averaged = embedding_matrix[0]
    return np.append(averaged, 1)  # Add a bias term

In [14]:
# Create function to get document vectors
def get_document_vectors(df):
    """Replace each row's token list with its document vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'tokenized_title_and_description' column. It is modified
        in place: the column is overwritten with the result of calling
        ``get_document_vector`` on each of its values.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    column = 'tokenized_title_and_description'
    document_vectors = df[column].apply(get_document_vector)
    df[column] = document_vectors
    return df

In [15]:
# Get document vectors
data = parallelize_dataframe(data, get_document_vectors)

In [16]:
# Check the first 3 rows
data.head(n=3)

Unnamed: 0,index,item_id,tokenized_title_and_description,labels
0,0,ID:B0027DQHA0,"[-0.4271874405988833, 0.09367887896099468, 1.2...","[Movies & TV, Music, TV, Classical]"
1,1,ID:0756400120,"[0.0594934499193861, 0.4740327557589373, 0.374...","[Short Stories, United States, Anthologies, Sc..."
2,2,ID:B00024YAOQ,"[-0.10279246065185557, 0.7254439690456465, -1....","[Books, Motivation & Self-Improvement, Busines..."


In [17]:
# Delete unused var (to save memory)
del embedding_matrix

### Prepare the y data

In [18]:
# Get the labels
all_labels = list(data['labels'])

In [19]:
# Get count of unique labels
# Accumulate straight into a set so memory grows with the number of distinct
# labels rather than with the total number of label occurrences.
# `labels` is deliberately a plain loop variable: a later cell runs `del labels`.
unique_labels = set()
for labels in all_labels:
    unique_labels.update(labels)

# Get counts of labels and instances
labels_count = len(unique_labels)
instances_count = data.shape[0]

# Delete unused var (to save memory)
del unique_labels

In [20]:
# Convert the tag sets into a sparse matrix of binary vectors (as int8 to save memory).
# Leaving it as a sparse matrix until after the train-test split will save a lot of memory.
# sparse_output=True makes the binarizer return a SciPy sparse matrix instead of a dense array.
mlb = MultiLabelBinarizer(sparse_output=True)
# fit_transform learns the label set from all_labels and encodes it in one pass;
# the result is then cast to int8.
sparse_binary_vectors = mlb.fit_transform(all_labels).astype('int8')

In [21]:
# Delete unused vars (to save memory)
del all_labels, labels

### Prepare the data for training and testing

In [22]:
# Get the data
# NOTE(review): each cell of this column holds a whole document vector, so np.array
# presumably yields a 1-D object array of per-row vectors rather than a 2-D float
# matrix (the shape printout further down uses len(X_train[0]) for the width) - confirm
# before relying on X.shape[1].
X = np.array(data['tokenized_title_and_description'])
# y_sparse is a second name for the same sparse label matrix (no copy is made)
y_sparse = sparse_binary_vectors

# Delete unused vars (to save memory)
del data

In [23]:
# Create the train-test split
X_train, X_test, y_train_sparse, y_test_sparse = train_test_split(X, y_sparse, test_size=0.25, random_state=100)

In [24]:
# Delete unused vars (to save memory)
del X
del y_sparse, sparse_binary_vectors, mlb

In [25]:
# Have a look at the shapes
print(f'X_train: ({X_train.shape[0]}, {len(X_train[0])})')
print(f'X_test: ({X_test.shape[0]}, {len(X_test[0])})')
print(f'y_train_sparse: {y_train_sparse.shape}')
print(f'y_test_sparse: {y_test_sparse.shape}')

X_train: (1120773, 101)
X_test: (373591, 101)
y_train_sparse: (1120773, 14295)
y_test_sparse: (373591, 14295)


### Save Train and Test data

In [26]:
# Define save location
save_path = '../Datasets/AmazonCat-13K/processed/'

In [27]:
# Save X data
savez_compressed(save_path + 'X_train_averaged.npz', X_train)
savez_compressed(save_path + 'X_test_averaged.npz', X_test)

In [28]:
# Save y data
save_npz(save_path + 'y_train_sparse_averaged.npz', y_train_sparse)
save_npz(save_path + 'y_test_sparse_averaged.npz', y_test_sparse)