In [None]:
import torch 
from torch.utils.data import Dataset, DataLoader

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import cv2 as cv


# Prepare dataset

In [None]:
# Root folder of the sports image dataset. The trailing slash matters:
# paths are built from it by plain string concatenation.
data_dir = "../sportClassification/data/sportData/"

# Index CSV of the dataset, loaded into a DataFrame and previewed.
csv_path = data_dir + "sports.csv"
data_df = pd.read_csv(csv_path)
data_df.head()

In [None]:
# Distinct class names, in order of first appearance in the index table.
unique_labels = data_df["labels"].unique().tolist()

# Class name -> integer index (its position in `unique_labels`).
# enumerate() builds the mapping in a single pass; looking each label up
# with list.index() rescans the list for every entry (quadratic).
labels_map = {label: idx for idx, label in enumerate(unique_labels)}

# Number of classes.
len(unique_labels)


In [None]:
# Size of the index table as (rows, columns).
data_df.shape

In [None]:
# Distinct values of the `dataset` column, i.e. which split each row belongs to.
data_df.dataset.unique()

In [None]:
# Column names available in the index table.
data_df.columns

In [None]:
# Peek at the first record: full path of its image file (despite the
# variable name, this is a file path) and its class label.
first_idx = 0
image_dir = data_dir + data_df["filepaths"].iloc[first_idx]
label = data_df["labels"].iloc[first_idx]
label

In [None]:
def showSamples(df, filepathColumnName, labelColumnName, data_dir, rows=2, cols=4, figsize=(10, 5)):
    """Plot a grid of randomly chosen images, each titled with its label.

    Rows are drawn independently with ``torch.randint`` (i.e. with
    replacement), so the same image may appear more than once.

    Args:
        df: DataFrame holding one image record per row.
        filepathColumnName: Column with image paths relative to ``data_dir``.
        labelColumnName: Column with the label shown above each image.
        data_dir: Prefix prepended to every path by string concatenation,
            so it must already end with a path separator.
        rows: Number of grid rows (default 2).
        cols: Number of grid columns (default 4).
        figsize: Figure size in inches as ``(width, height)``.

    Returns:
        None. The figure is displayed with ``plt.show()``.

    Raises:
        ValueError: If ``df`` has no rows to sample from.
    """
    n_records = df.shape[0]
    if n_records == 0:
        raise ValueError("showSamples requires a DataFrame with at least one row")

    # squeeze=False keeps `axes` two-dimensional even for a 1xN or Nx1 grid,
    # so `.flat` always walks the cells in row-major order.
    fig, axes = plt.subplots(rows, cols, figsize=figsize, squeeze=False)
    for ax in axes.flat:
        sample_idx = torch.randint(n_records, size=(1,)).item()
        image_path = data_dir + df[filepathColumnName].iloc[sample_idx]
        # The context manager releases the file handle as soon as the pixels
        # have been handed to matplotlib.
        with Image.open(image_path) as img:
            ax.imshow(img)
        ax.set_title(df[labelColumnName].iloc[sample_idx])
        ax.axis("off")
    plt.show()

In [None]:
# Preview a random grid of labelled images drawn from the whole index table.
showSamples(
    df=data_df,
    filepathColumnName="filepaths",
    labelColumnName="labels",
    data_dir=data_dir,
)

# Creating a Custom Dataset

In [None]:
# Work on a copy so the loaded index table itself is left untouched.
df = data_df.copy()

# Each row is assigned to a split through its `dataset` column.
split_column = df["dataset"]
df_train = df[split_column == "train"]
df_test = df[split_column == "test"]
df_valid = df[split_column == "valid"]
df_train.head()

In [None]:
def _with_data_dir(frame):
    """Return the `filepaths` column of ``frame`` as a list, each entry prefixed with ``data_dir``."""
    return [data_dir + relative_path for relative_path in frame["filepaths"].to_list()]


# Full image paths per split.
train_img_dirs = _with_data_dir(df_train)
test_img_dirs = _with_data_dir(df_test)
valid_img_dirs = _with_data_dir(df_valid)

# Labels per split, in the same row order as the path lists above.
train_labels = df_train["labels"].to_list()
test_labels = df_test["labels"].to_list()
valid_labels = df_valid["labels"].to_list()

In [None]:
class SportDataset(Dataset):
    """Map-style dataset that loads one image from disk per requested item.

    Args:
        paths: Indexable collection of image file paths.
        labels: Indexable collection of labels, aligned index-by-index
            with ``paths``.
        unique_labels: Collection of all class labels; passed unchanged as
            the first argument of ``target_transform``.
        transform: Optional callable applied to the loaded image.
        target_transform: Optional callable invoked as
            ``target_transform(unique_labels, label)``.

    Raises:
        ValueError: If ``paths`` and ``labels`` have different lengths.
    """

    def __init__(self, paths, labels, unique_labels, transform=None, target_transform=None):
        # __len__ is derived from labels while __getitem__ indexes both
        # collections, so a length mismatch would otherwise only surface
        # (or stay hidden) in the middle of an epoch.
        if len(paths) != len(labels):
            raise ValueError(
                f"paths and labels must have the same length, "
                f"got {len(paths)} and {len(labels)}"
            )
        self.paths = paths
        self.labels = labels
        self.unique_labels = unique_labels
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Load the image at position ``idx`` and return ``(image, label)``.

        Raises:
            OSError: If the image file is missing or cannot be decoded.
        """
        imgPath = self.paths[idx]
        # cv.imread reports failure by returning None instead of raising;
        # fail here with the offending path rather than deep inside a transform.
        # NOTE(review): OpenCV loads colour images in BGR channel order by
        # default - confirm downstream code expects BGR rather than RGB.
        img = cv.imread(imgPath)
        if img is None:
            raise OSError(f"Could not read image (missing or undecodable): {imgPath!r}")
        label = self.labels[idx]

        if self.transform is not None:
            img = self.transform(img)
        if self.target_transform is not None:
            label = self.target_transform(self.unique_labels, label)

        return img, label



In [None]:
def target_transforms(unique_labels, label):
    """One-hot encode ``label`` against ``unique_labels``.

    Args:
        unique_labels: Sequence of all class labels; the position of each
            entry determines its slot in the encoding.
        label: The label to encode.

    Returns:
        A 1-D ``torch.float`` tensor with one element per entry of
        ``unique_labels``: 1.0 where the entry equals ``label``, 0.0 elsewhere.

    Raises:
        ValueError: If ``label`` does not occur in ``unique_labels``. Without
            this check an unknown label would silently become an all-zero
            target.
    """
    label_one_hot = np.asarray(unique_labels) == label
    # np.any also copes with the scalar result some NumPy versions return
    # when the comparison cannot be done element-wise (e.g. empty input).
    if not np.any(label_one_hot):
        raise ValueError(f"label {label!r} is not present in unique_labels")
    return torch.tensor(label_one_hot, dtype=torch.float)

def features_transforms(img, size=(224, 224)):
    """Resize an image and convert it to a channel-first float tensor.

    Args:
        img: Image array with the channel axis last (height, width, channels).
        size: Target size handed to ``cv.resize``; OpenCV interprets it as
            ``(width, height)``. Defaults to ``(224, 224)``.

    Returns:
        A ``torch.float`` tensor laid out as (channels, height, width) whose
        values are the resized pixels divided by 255 (i.e. in [0, 1] for
        8-bit input).

    Raises:
        ValueError: If ``img`` is ``None``, which is what ``cv.imread``
            returns for a file it could not read.
    """
    if img is None:
        raise ValueError("features_transforms received None instead of an image")

    resized = cv.resize(img, size)
    channel_first = resized.transpose((2, 0, 1))  # channel axis must come first
    tensor = torch.tensor(channel_first, dtype=torch.float)
    return tensor / 255.0  # scale pixel values by 1/255

In [None]:
# All three splits share the same label vocabulary and the same transforms.
_dataset_kwargs = dict(
    unique_labels=unique_labels,
    transform=features_transforms,
    target_transform=target_transforms,
)

train_dataset = SportDataset(train_img_dirs, train_labels, **_dataset_kwargs)
valid_dataset = SportDataset(valid_img_dirs, valid_labels, **_dataset_kwargs)
test_dataset = SportDataset(test_img_dirs, test_labels, **_dataset_kwargs)



# Preparing the data for training with DataLoaders

In [None]:
# Number of samples per mini-batch.
batch_size = 64

# Training loader; reshuffles the sample order on every pass.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
)

# Pull a single batch to inspect what the loader yields.
batch = next(iter(train_dataloader))
X, y = batch

In [None]:
# Shape of the image batch. With batch_size=64 and the 224x224 channel-first
# transform this should be (64, C, 224, 224) - presumably C == 3; verify.
X.shape

In [None]:
# Shape of the target batch: one one-hot row per sample, so expected to be
# (64, len(unique_labels)).
y.shape