# Labeling the dataset

This notebook requires the following libraries:

- Numpy
- Pandas
- glob
- os

The output of this notebook reads data in the $data\_dir$ and creates a CSV containing labels, saving that as per the $output$ path.

In [7]:
import os


# This is the main data directory
height = 25
width = 25

data_dir = os.path.join(os.path.pardir, "Datasets", 'Resized_data_{}_{}'.format(height, width))

# This is for the output CSV file: it will reside under the name "output_filename" in output_dir
output_dir = data_dir
output_filename = 'labels.csv'

## Imports

In [8]:
import numpy as np
import glob


import pandas as pd

## Loading the data and normalising dimensions

### Directories

### Exception handling

In [9]:
if not os.path.isdir(data_dir):
    raise Exception("Data Directory not found! Please run the data_download.ipynb notebook before proceeding.")

In [10]:
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)

### Data loading

In [11]:
file_list_parasitized = glob.glob(os.path.join(data_dir, 'Parasitized', '*.png'))
file_list_uninfected  = glob.glob(os.path.join(data_dir, 'Uninfected', '*.png'))

In [12]:
# os.path.join(os.path.dirname(__file__),'corrected_images','False_uninfected.csv')

In [13]:
n_parasitized = len(file_list_parasitized)
n_uninfected  = len(file_list_uninfected)

n_parasitized, n_uninfected

(13132, 13029)

In [14]:
file_list_parasitized = np.array(file_list_parasitized)
file_list_uninfected  = np.array(file_list_uninfected)

file_list_parasitized = np.reshape(file_list_parasitized, newshape = (n_parasitized, 1))
file_list_uninfected  = np.reshape(file_list_uninfected , newshape = (n_uninfected , 1))

In [15]:
file_list_parasitized = np.append(file_list_parasitized, np.ones(file_list_parasitized.shape), axis = 1)
file_list_uninfected  = np.append(file_list_uninfected , np.zeros(file_list_uninfected.shape), axis = 1)

In [16]:
file_list_parasitized.shape, file_list_parasitized.shape

((13132, 2), (13132, 2))

In [17]:
file_list = np.append(file_list_uninfected, file_list_parasitized, axis = 0)

file_list.shape

(26161, 2)

### Removing mislabeled examples

In [28]:
thoo = pd.read_csv('corrected_images/False_parasitized.csv', index_col= 0)

thoo['False_parasitized'] = thoo['False_parasitized'].apply(lambda row: os.path.join(data_dir, 'Parasitized', row))

In [29]:
thoo

Unnamed: 0,False_parasitized
0,..\Datasets\Resized_data_25_25\Parasitized\C11...
1,..\Datasets\Resized_data_25_25\Parasitized\C59...
2,..\Datasets\Resized_data_25_25\Parasitized\C12...
3,..\Datasets\Resized_data_25_25\Parasitized\C12...
4,..\Datasets\Resized_data_25_25\Parasitized\C67...
...,...
642,..\Datasets\Resized_data_25_25\Parasitized\C18...
643,..\Datasets\Resized_data_25_25\Parasitized\C39...
644,..\Datasets\Resized_data_25_25\Parasitized\C13...
645,..\Datasets\Resized_data_25_25\Parasitized\C99...


We see that there are 13779 images in both directories, indicating balanced data.

## Creating a labeled dataframe

In [18]:
df = pd.DataFrame(file_list, columns = ['Image_Path', 'Parasitized'])
df

Unnamed: 0,Image_Path,Parasitized
0,..\Datasets\Resized_data_25_25\Uninfected\25x2...,0.0
1,..\Datasets\Resized_data_25_25\Uninfected\25x2...,0.0
2,..\Datasets\Resized_data_25_25\Uninfected\25x2...,0.0
3,..\Datasets\Resized_data_25_25\Uninfected\25x2...,0.0
4,..\Datasets\Resized_data_25_25\Uninfected\25x2...,0.0
...,...,...
26156,..\Datasets\Resized_data_25_25\Parasitized\25x...,1.0
26157,..\Datasets\Resized_data_25_25\Parasitized\25x...,1.0
26158,..\Datasets\Resized_data_25_25\Parasitized\25x...,1.0
26159,..\Datasets\Resized_data_25_25\Parasitized\25x...,1.0


In [19]:
df.to_csv(os.path.join(output_dir, output_filename), index = False)