## Importing Data

In [None]:
import zipfile
# Unpack the dataset archive into a local working directory next to the notebook.
archive_path = '/content/drive/MyDrive/ML_Team101/Image_classification_data.zip'
with zipfile.ZipFile(archive_path, mode='r') as archive:
    archive.extractall(path='./image_classification_data')

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the label table shipped inside the extracted archive, then preview
# the first five rows as the cell's output.
data = pd.read_csv(
    filepath_or_buffer='./image_classification_data/data_labels_mainData.csv',
)
data.head(n=5)

Unnamed: 0,InstanceID,patientID,ImageName,cellTypeName,cellType,isCancerous
0,22405,1,22405.png,fibroblast,0,0
1,22406,1,22406.png,fibroblast,0,0
2,22407,1,22407.png,fibroblast,0,0
3,22408,1,22408.png,fibroblast,0,0
4,22409,1,22409.png,fibroblast,0,0


## EDA

### Column Names and Data Types


In [None]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9896 entries, 0 to 9895
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   InstanceID    9896 non-null   int64 
 1   patientID     9896 non-null   int64 
 2   ImageName     9896 non-null   object
 3   cellTypeName  9896 non-null   object
 4   cellType      9896 non-null   int64 
 5   isCancerous   9896 non-null   int64 
dtypes: int64(4), object(2)
memory usage: 464.0+ KB


### Verify CellType Values

As we can see from the above output, `cellType` is an attribute that encodes the categorical attribute `cellTypeName` as an integer value.

We will validate whether the mapping was done correctly.

In [35]:
# Expected integer code for every cell type name.
correlation_dict = {'fibroblast': 0, 'inflammatory': 1, 'epithelial': 2, 'others': 3}

# Derive the code each row *should* carry from its cell type name.
# Names missing from the dictionary map to NaN and therefore never match.
data['expectedCellType'] = data['cellTypeName'].map(correlation_dict)

# Row-wise flag: does the stored code agree with the expected one?
data['results'] = data['cellType'] == data['expectedCellType']

# The mapping is consistent only if EVERY row agrees.
# (The previous `.sum().all()` reduced the column to a single count, which is
# truthy as soon as one row matches, so mismatches went unreported.)
if data['results'].all():
    print("Consistent Mapping of CellType Values")
else:
    print("Inconsistent Mapping of CellType Values")

Consistent Mapping of CellType Values


### Verify If Missing Values Exist

In [None]:
# Count null cells per column, then total them across the whole frame.
# (The previous `.sum().all() == 0` was True whenever at least one column had
# zero nulls, so missing values in other columns were silently ignored.)
total_missing = data.isnull().sum().sum()

if total_missing == 0:
    print("No Missing Values")
else:
    print("Missing Values Found")

No Missing Values


In [None]:
# Report how many rows fall into each class for both label columns.
for label_column in ('cellTypeName', 'isCancerous'):
    print("Data count by {}".format(label_column))
    print(data[label_column].value_counts())

Data count by cellTypeName
epithelial      4079
inflammatory    2543
fibroblast      1888
others          1386
Name: cellTypeName, dtype: int64
Data count by isCancerous
0    5817
1    4079
Name: isCancerous, dtype: int64


## Splitting Data

In [None]:
# Two-stage random split with a fixed seed: hold out 20% of the rows for
# testing, then take 25% of the remainder for validation.
# NOTE(review): rows are split individually, so images from the same
# patientID may end up in more than one split - confirm this is intended.
train_val_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_val_data, test_size=0.25, random_state=42)

print(f"Train data : {len(train_data)}, Val Data: {len(val_data)}, Test Data: {len(test_data)}")

Train data : 5937, Val Data: 1979, Test Data: 1980
