In [6]:
import os
import numpy as np
import pandas as pd
import pickle

In [18]:
def unpickle(file):
    """Load a pickled batch file and return its labels and data.

    The file is un-pickled with ``encoding='bytes'``, so the loaded mapping is
    looked up with the bytes keys ``b'labels'`` and ``b'data'``. The result is
    a new dict that exposes those two entries under the str keys ``'labels'``
    and ``'data'``; any other entries in the file are dropped.

    NOTE(security): ``pickle.load`` can execute arbitrary code, so only point
    this at files from a trusted source.
    """
    with open(file, 'rb') as handle:
        raw_batch = pickle.load(handle, encoding='bytes')
    labels = raw_batch[b'labels']
    data = raw_batch[b'data']
    return dict(labels=labels, data=data)

In [21]:
# Load the five training batches into one frame with a 'labels' column and a
# 'data' column holding one numpy array per row.
# The per-batch frames are collected in a list and concatenated once at the
# end: calling pd.concat inside the loop recopies all rows accumulated so far
# on every pass and concatenates against an empty seed frame.
batch_frames = []
for batch in range(1, 6):
    batch_data = unpickle(f"./data/cifar-10-python/cifar-10-batches-py/data_batch_{batch}")
    temp_df = pd.DataFrame({
        'labels': batch_data['labels'],
        'data': [np.array(row) for row in batch_data['data']]
    })
    batch_frames.append(temp_df)
# ignore_index=True renumbers the rows 0..N-1 across all batches.
train_batches = pd.concat(batch_frames, ignore_index=True)

In [22]:
# Widen the training frame: every element of each 'data' array becomes its own
# integer-named column, and the labels are appended as the last column.
expanded_data = pd.DataFrame(train_batches['data'].to_list())
expanded_data.insert(expanded_data.shape[1], 'labels', train_batches['labels'])

In [23]:
# Preview the first rows of the widened training frame (value columns followed by 'labels').
expanded_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3063,3064,3065,3066,3067,3068,3069,3070,3071,labels
0,59,43,50,68,98,119,139,145,149,149,...,58,65,59,46,57,104,140,84,72,6
1,154,126,105,102,125,155,172,180,142,111,...,42,67,101,122,133,136,139,142,144,9
2,255,253,253,253,253,253,253,253,253,253,...,83,80,69,66,72,79,83,83,84,9
3,28,37,38,42,44,40,40,24,32,43,...,39,59,42,44,48,38,28,37,46,4
4,170,168,177,183,181,177,181,184,189,189,...,88,85,82,83,79,78,82,78,80,1


In [24]:
# Write the widened training frame to CSV. No `index=` argument is passed, so
# pandas' default applies and the row index is written as a leading unnamed column.
expanded_data.to_csv("data/newtrainoriginal.csv")

In [4]:
# train_batches.to_csv("data/train.csv")
# Persist the compact training frame ('labels' plus one array per row in 'data')
# in pandas' pickle format.
# NOTE(review): the CSV export above is disabled, presumably because the
# array-valued 'data' column does not round-trip through CSV; confirm before removing.
train_batches.to_pickle("data/train.pkl")

In [26]:
# Read the test batch into a two-column frame: 'labels' and 'data'
# (one numpy array per row).
test_batch_data = unpickle("./data/cifar-10-python/cifar-10-batches-py/test_batch")
test_batch = pd.DataFrame(
    dict(
        labels=test_batch_data['labels'],
        data=[np.array(row) for row in test_batch_data['data']],
    )
)

# Widen the test frame: one integer-named column per element of each 'data'
# array, with the labels appended as the last column.
expanded_test_data = pd.DataFrame(test_batch['data'].to_list())
expanded_test_data.insert(expanded_test_data.shape[1], 'labels', test_batch['labels'])



In [27]:
# Preview the first rows of the widened test frame (value columns followed by 'labels').
expanded_test_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3063,3064,3065,3066,3067,3068,3069,3070,3071,labels
0,158,159,165,166,160,156,162,159,158,159,...,123,145,167,182,175,145,124,129,110,3
1,235,231,232,232,232,232,232,232,232,232,...,117,123,133,141,153,163,178,191,199,8
2,158,158,139,132,166,182,187,193,199,205,...,46,44,44,43,52,37,8,3,7,8
3,155,167,176,190,177,166,168,166,170,179,...,55,70,103,105,72,53,50,52,50,0
4,65,70,48,30,23,40,44,45,45,40,...,127,156,139,131,130,147,136,146,117,6


In [28]:
# Write the widened test frame to CSV. No `index=` argument is passed, so
# pandas' default applies and the row index is written as a leading unnamed column.
expanded_test_data.to_csv("data/newtestoriginal.csv")

In [6]:
# test_batch.to_csv("data/test.csv")

# Persist the compact test frame ('labels' plus one array per row in 'data')
# in pandas' pickle format.
# NOTE(review): the CSV export above is disabled, presumably because the
# array-valued 'data' column does not round-trip through CSV; confirm before removing.
test_batch.to_pickle("data/test.pkl")

In [32]:
def load_cifar10(data_dir="data/cifar-10-python/cifar-10-batches-py"):
    """Load the CIFAR-10 python batches as channels-last image arrays.

    Reads ``data_batch_1`` .. ``data_batch_5`` and ``test_batch`` from
    ``data_dir``. Each file is un-pickled here directly and indexed with the
    bytes keys ``b'data'`` and ``b'labels'``. This file defines ``unpickle``
    twice with different return schemas (str keys vs. bytes keys); reading the
    pickle locally keeps this function working whichever one is bound.

    Parameters
    ----------
    data_dir : str or path-like, optional
        Directory containing the batch files. Defaults to the path that was
        previously hard-coded.

    Returns
    -------
    train_data : numpy.ndarray
        Training images, shape ``(N_train, 32, 32, 3)``.
    train_labels : numpy.ndarray
        Training labels, shape ``(N_train,)``.
    test_data : numpy.ndarray
        Test images, shape ``(N_test, 32, 32, 3)``.
    test_labels : numpy.ndarray
        Test labels, shape ``(N_test,)``.

    Raises
    ------
    FileNotFoundError
        If a batch file is missing from ``data_dir``.
    KeyError
        If a batch file lacks the ``b'data'`` or ``b'labels'`` entry.
    """
    def _read_batch(name):
        # Return (data, labels) for one batch file inside data_dir.
        # NOTE(security): pickle.load can execute arbitrary code; only use
        # this on batch files from a trusted source.
        with open(os.path.join(data_dir, name), 'rb') as fo:
            batch = pickle.load(fo, encoding='bytes')
        return np.asarray(batch[b'data']), batch[b'labels']

    train_chunks, train_labels = [], []
    for i in range(1, 6):
        data, labels = _read_batch(f"data_batch_{i}")
        train_chunks.append(data)
        train_labels.extend(labels)
    train_data = np.concatenate(train_chunks, axis=0)
    train_labels = np.array(train_labels)

    # Load test batch
    test_data, test_labels = _read_batch("test_batch")
    test_labels = np.array(test_labels)

    # Each row is interpreted as three 32x32 channel planes; moving the channel
    # axis last gives (N, 32, 32, 3).
    train_data = train_data.reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1)
    test_data = test_data.reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1)

    return train_data, train_labels, test_data, test_labels

In [33]:
# Save data to CSV
def save_to_csv(data, labels, file_path):
    """Write labels and features side by side as comma-separated text.

    ``labels`` is placed in front of ``data`` column-wise, and every value is
    written with the ``"%f"`` format (so labels come out as e.g. ``3.000000``).
    A confirmation line is printed once the file has been written.

    Parameters
    ----------
    data : array-like
        Feature values, one row per sample.
    labels : array-like
        Label values, one per sample.
    file_path : str or path-like
        Destination of the CSV file.
    """
    # Labels first, then the features of the same row.
    table = np.column_stack([labels, data])
    np.savetxt(file_path, table, fmt="%f", delimiter=",")
    print(f"Saved {file_path} successfully!")

In [34]:
def unpickle(file):
    """Load a pickled batch file and return the un-pickled object.

    The file is read with ``encoding='bytes'``, so a loaded dict keeps its
    bytes keys (``b'data'``, ``b'labels'``, ...).

    This definition replaces an earlier ``unpickle`` in this file that returned
    only the str keys ``'labels'`` and ``'data'``. Code written against that
    version would otherwise fail with ``KeyError`` once this one is bound, so
    when the loaded object is a dict those two entries are also exposed under
    their str names. Existing bytes-keyed entries are left untouched.

    NOTE(security): ``pickle.load`` can execute arbitrary code, so only point
    this at files from a trusted source.
    """
    with open(file, 'rb') as fo:
        loaded = pickle.load(fo, encoding='bytes')
    if isinstance(loaded, dict):
        for key in (b'labels', b'data'):
            if key in loaded:
                # setdefault never overwrites a str key that already exists.
                loaded.setdefault(key.decode('ascii'), loaded[key])
    return loaded

In [36]:
# Prepare data
def normalize_images(data):
    """Return ``data`` divided by 255.0.

    The input is not modified; the division produces a new floating-point
    result. Assumes 8-bit pixel intensities, which this maps onto [0, 1].
    """
    max_pixel_value = 255.0
    return data / max_pixel_value

# Convert labels to one-hot encoding
def one_hot_encode(labels, num_classes):
    """Convert class indices into a one-hot matrix.

    Parameters
    ----------
    labels : array-like of int
        Class indices. Any array-like is accepted and flattened first, so a
        column vector of shape ``(N, 1)`` is treated like a flat ``(N,)``
        array. Floating-point input is accepted only when every value is
        integral (e.g. labels read back from a file written with a float
        format).
    num_classes : int
        Number of columns in the result; valid labels lie in
        ``[0, num_classes)``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(labels), num_classes)`` with a single 1 in
        each row at that row's label position.

    Raises
    ------
    ValueError
        If ``labels`` holds non-integral values.
    IndexError
        If any label falls outside ``[0, num_classes)``.
    """
    # Flatten first: an (N, 1) array would otherwise broadcast against
    # np.arange(N) in the fancy index below and set N*N cells.
    indices = np.asarray(labels).ravel()
    if indices.size == 0:
        return np.zeros((0, num_classes))

    if not np.issubdtype(indices.dtype, np.integer):
        as_int = indices.astype(np.int64)
        if not np.array_equal(as_int, indices):
            raise ValueError("labels must be integer class indices")
        indices = as_int

    # Negative indices are valid numpy indices and would silently wrap around
    # to the last classes, so reject them explicitly.
    lowest, highest = indices.min(), indices.max()
    if lowest < 0 or highest >= num_classes:
        raise IndexError(
            f"labels must lie in [0, {num_classes}); got values in [{lowest}, {highest}]"
        )

    one_hot = np.zeros((indices.size, num_classes))
    one_hot[np.arange(indices.size), indices] = 1
    return one_hot


In [37]:
print("Loading CIFAR-10 dataset...")
x_train, y_train, x_test, y_test = load_cifar10()

# Scale the pixel values, then flatten each image into a single row so it can
# be written as one CSV line. The flattening follows the array layout returned
# by load_cifar10().
print("Preprocessing data...")
x_train = normalize_images(x_train).reshape(len(x_train), -1)
x_test = normalize_images(x_test).reshape(len(x_test), -1)
num_classes = 10

# One file per split, written to the current working directory.
print("Saving to CSV...")
save_to_csv(data=x_train, labels=y_train, file_path="train.csv")
save_to_csv(data=x_test, labels=y_test, file_path="test.csv")
print("CSV files created.")


Loading CIFAR-10 dataset...
Preprocessing data...
Saving to CSV...
Saved train.csv successfully!
Saved test.csv successfully!
CSV files created.
