In [1]:
import pandas as pd
import numpy as np


Retrieve CSV files

In [2]:
# Read the three pre-split CSV files (train / validation / test) in one pass.
df_train, df_validation, df_test = (
    pd.read_csv(f"../data/set_{split_name}.csv")
    for split_name in ("train", "validation", "test")
)
print("successfully retrieved all csv files")

successfully retrieved all csv files


In [3]:
# Build a {genus: array of image paths} lookup for each of the three CSV splits.
def _group_paths_by_genus(frame):
    """Return {genus: [path, ...]} built from the frame's 'Genus' and 'Path' columns."""
    return frame.groupby('Genus')['Path'].apply(list).to_dict()


def _lists_to_arrays(grouped):
    """Return a new dict with every list of paths converted to a NumPy array."""
    return {genus: np.array(paths) for genus, paths in grouped.items()}


grouped_training = _group_paths_by_genus(df_train)
mapping_training = _lists_to_arrays(grouped_training)

grouped_validation = _group_paths_by_genus(df_validation)
mapping_validation = _lists_to_arrays(grouped_validation)

grouped_testing = _group_paths_by_genus(df_test)
mapping_testing = _lists_to_arrays(grouped_testing)

In [4]:
# Report, for every class, how many images it has across the three splits,
# then the total number of images in each split.
# Every class is expected to be present in train/validation/testing: the loop
# indexes the validation and testing mappings with the training keys.
# Separating the data before augmentation ensures the same image won't appear
# in two tensor sets.
totalTrainingImg = totalValImg = totalTestImg = 0
for genus, train_paths in mapping_training.items():
    n_train = len(train_paths)
    n_val = len(mapping_validation[genus])
    n_test = len(mapping_testing[genus])

    totalTrainingImg += n_train
    totalValImg += n_val
    totalTestImg += n_test

    print(f"Class Name: {genus}, Number of images: {n_train + n_val + n_test}")

print(f"Total training Images: {totalTrainingImg}")
print(f"Total validation Images: {totalValImg}")
print(f"Total testing Images: {totalTestImg}")

Class Name: Auricularia, Number of images: 1559
Class Name: Cookeina, Number of images: 2885
Class Name: Entoloma, Number of images: 1944
Class Name: Geastrum, Number of images: 1390
Class Name: Hygrocybe, Number of images: 3919
Class Name: Marasmius, Number of images: 4494
Class Name: Ophiocordyceps, Number of images: 1943
Class Name: Oudemansiella, Number of images: 1870
Class Name: Phallus, Number of images: 1383
Class Name: Trametes, Number of images: 1278
Total training Images: 16962
Total validation Images: 3379
Total testing Images: 2324


In [5]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from PIL import Image

In [6]:
#need to install torchvision
from torch.utils.data import TensorDataset
from torchvision import transforms
# Preprocessing applied to every image: convert it to a tensor, then resize it
# to 224x224 (the input size used for AlexNet in this notebook).
# NOTE(review): no mean/std normalisation is applied here, although the
# torchvision docs for pretrained models describe one -- confirm that leaving
# it out is intentional.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Resize((224, 224)),
    ]
)

In [7]:
#use alexnet
import torchvision.models
# Pretrained AlexNet, used in this notebook only to run images through
# `alexnet.features`. It is switched to evaluation mode immediately so that any
# train-time-only layer behaviour (e.g. dropout) is disabled for inference;
# `.eval()` returns the module itself, so `alexnet` is still the model.
# NOTE(review): newer torchvision releases deprecate `pretrained=` in favour of
# `weights=`; kept unchanged here for compatibility -- confirm against the
# installed torchvision version.
alexnet = torchvision.models.alexnet(pretrained=True).eval()



In [8]:
# Number of distinct classes, i.e. the number of keys in the training mapping.
numClasses = len(mapping_training)

In [9]:
def transferLearning(mapping):
    """Run every image in ``mapping`` through AlexNet's convolutional feature stack.

    Args:
        mapping: dict of {class name: iterable of image file paths}.

    Returns:
        dict of {class name: list of feature tensors}, holding one tensor per
        image in the same order as the input paths. The tensors are produced
        under ``torch.no_grad()`` and therefore carry no autograd graph.
    """
    features_by_class = {}
    print(f"Total number of classes {numClasses}")
    print("Start Transfer Learning Section: AlexNet feature extract for each class")

    # Pure inference: with autograd enabled, every stored feature tensor would
    # keep its whole computation graph alive, inflating memory for no benefit.
    with torch.no_grad():
        for iterCount, (key, images) in enumerate(mapping.items(), start=1):
            print(f"Class {iterCount} out of {numClasses}")
            class_features = []
            for img in images:
                # The context manager closes the file handle as soon as the
                # pixels have been read, instead of leaking one per image.
                with Image.open(img) as raw_img:
                    # Force 3 channels so grayscale / RGBA / palette files do
                    # not fail in the first convolution; a no-op copy for
                    # images that are already RGB.
                    newImg = transform(raw_img.convert("RGB"))
                class_features.append(alexnet.features(newImg))
            features_by_class[key] = class_features
    return features_by_class

In [10]:
training_dict = transferLearning(mapping_training)
validation_dict = transferLearning(mapping_validation)
testing_dict = transferLearning(mapping_testing)

Total number of classes 10
Start Transfer Learning Section: AlexNet feature extract for each class
Class 1 out of 10
Class 2 out of 10
Class 3 out of 10
Class 4 out of 10
Class 5 out of 10
Class 6 out of 10
Class 7 out of 10
Class 8 out of 10
Class 9 out of 10
Class 10 out of 10
Total number of classes 10
Start Transfer Learning Section: AlexNet feature extract for each class
Class 1 out of 10
Class 2 out of 10
Class 3 out of 10
Class 4 out of 10
Class 5 out of 10
Class 6 out of 10
Class 7 out of 10
Class 8 out of 10
Class 9 out of 10
Class 10 out of 10
Total number of classes 10
Start Transfer Learning Section: AlexNet feature extract for each class
Class 1 out of 10
Class 2 out of 10
Class 3 out of 10
Class 4 out of 10
Class 5 out of 10
Class 6 out of 10
Class 7 out of 10
Class 8 out of 10
Class 9 out of 10
Class 10 out of 10


#The mapping across all 3 dictionaries is the same
#Save the conversion dictionaries for qualitative results

In [11]:
# Integer label -> class name lookup, defined once from the training split.
mapping = dict(enumerate(training_dict))

# Inverse lookup (class name -> integer label) used to relabel every split.
# Each split is relabelled by class NAME rather than by its own enumeration
# order, so a class that is missing from, or ordered differently in, one split
# cannot shift that split's labels out of step with the others. A class the
# training split never contained raises KeyError instead of being silently
# assigned the wrong label.
_label_of = {name: idx for idx, name in mapping.items()}

train_numericalDict = {
    _label_of[key]: value for key, value in training_dict.items()
}

validation_numericalDict = {
    _label_of[key]: value for key, value in validation_dict.items()
}

test_numericalDict = {
    _label_of[key]: value for key, value in testing_dict.items()
}

In [13]:
# Assemble one (features, labels) TensorDataset per split from the
# {integer label: [feature tensors]} dictionaries.
def _flatten_split(features_by_label):
    """Flatten {label: [tensor, ...]} into parallel (features, labels) lists."""
    data, labels = [], []
    for label, feats in features_by_label.items():
        labels.extend([label] * len(feats))
        data.extend(feats)
    return data, labels


# Training
tData, tLabel = _flatten_split(train_numericalDict)
tTensor = torch.stack(tData)
tLabelTensor = torch.tensor(tLabel)

# Validation
vData, vLabel = _flatten_split(validation_numericalDict)
vTensor = torch.stack(vData)
vLabelTensor = torch.tensor(vLabel)

# Testing
testingData, testingLabel = _flatten_split(test_numericalDict)
testingTensor = torch.stack(testingData)
testingLabelTensor = torch.tensor(testingLabel)

train_set = TensorDataset(tTensor, tLabelTensor)
validation_set = TensorDataset(vTensor, vLabelTensor)
test_set = TensorDataset(testingTensor, testingLabelTensor)

In [14]:
# Sanity check: print the shape of the stacked feature tensor for each split.
print(f'tTensor shape: {tTensor.shape}')
print(f'vTensor shape: {vTensor.shape}')
print(f'testingTensor shape: {testingTensor.shape}')

tTensor shape: torch.Size([16962, 256, 6, 6])
vTensor shape: torch.Size([3379, 256, 6, 6])
testingTensor shape: torch.Size([2324, 256, 6, 6])


In [15]:
# Sanity check: print the number of samples in the train, validation and test
# datasets, in that order.
for _dataset in (train_set, validation_set, test_set):
    print(len(_dataset))

16962
3379
2324


In [16]:
# Persist each TensorDataset to its own .pt file in the working directory.
# NOTE(review): these files hold whole pickled TensorDataset objects, not bare
# tensors; newer PyTorch versions may need `weights_only=False` (or an
# allow-list) to load them -- confirm against the consuming notebook.
for _dataset, _filename in (
    (train_set, 'Btrain_set.pt'),
    (validation_set, 'Bvalidation_set.pt'),
    (test_set, 'Btest_set.pt'),
):
    torch.save(_dataset, _filename)

In [12]:
# Write the integer-label -> class-name dictionary to disk for later lookup.
import pickle
with open('mapping.pkl', 'wb') as mapping_fh:
    pickle.dump(mapping, mapping_fh)

Note: the use of `pickle` in the cell above might need a citation.