In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**File + Data Field Descriptions:**
* train/: A folder containing images in the TIFF format to be used as training data.
* test/: A folder containing images to be used as test data. The actual test data comprises about 280 images.
* other/: A supplemental set of images with a either an unknown etiology or an etiology other than CE or LAA.
* train.csv: Contains annotations for images in the train/ folder.
    * image_id: A unique identifier for this instance having the form {patient_id}_{image_num}. Corresponds to the image {image_id}.tif.
    * center_id: Identifies the medical center where the slide was obtained.
    * patient_id: Identifies the patient from whom the slide was obtained.
    * image_num: Enumerates images of clots obtained from the same patient.
    * label: The etiology of the clot, either CE or LAA. This field is the classification target.
* test.csv: Annotations for images in the test/ folder. Has the same fields as train.csv excluding label.
* other.csv: Annotations for images in the other/ folder. Has the same fields as train.csv. The center_id is unavailable for these images however.
* label: The etiology of the clot, either Unknown or Other.
* other_specified: The specific etiology, when known, in case the etiology is labeled as Other.
* sample_submission.csv: A sample submission file in the correct format. Note in particular that you should make one prediction per patient_id, not per image_id.

https://www.kaggle.com/code/nghihuynh/mc-strip-ai-exploratory-data-analysis

In [5]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cv2

from glob import glob
from pprint import pprint
from collections import defaultdict
import gc

import plotly
from plotly import tools
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.offline as pyo
import plotly.io as pio
import plotly.graph_objects as go
#pio.templates.default = 'plotly_white'
sns.set_theme(style="dark")
import cv2
import tifffile as tiff
from PIL import Image

import warnings
warnings.simplefilter("ignore")

In [6]:
train_df = pd.read_csv('../input/mayo-clinic-strip-ai/train.csv')
test_df = pd.read_csv('../input/mayo-clinic-strip-ai/test.csv')
other_df = pd.read_csv('../input/mayo-clinic-strip-ai/other.csv')


In [7]:
gmap = np.array([[1,2,3], [2,3,4], [1,2,3],[2,3,4],[1,2,3]])
train_df.head(5).style.background_gradient(axis=None,gmap=gmap, cmap='Purples', 
                                            subset=['image_id','patient_id','label'])

Unnamed: 0,image_id,center_id,patient_id,image_num,label
0,006388_0,11,006388,0,CE
1,008e5c_0,11,008e5c,0,CE
2,00c058_0,11,00c058,0,LAA
3,01adc5_0,11,01adc5,0,LAA
4,026c97_0,4,026c97,0,CE


In [8]:
test_df.head().style.background_gradient(axis=None,gmap=[[1,2],[2,3],[1,2],[2,3]], 
                                          cmap='Purples', 
                                          subset=['image_id','patient_id'])

Unnamed: 0,image_id,center_id,patient_id,image_num
0,006388_0,11,006388,0
1,008e5c_0,11,008e5c,0
2,00c058_0,11,00c058,0
3,01adc5_0,11,01adc5,0


In [9]:
other_df.head(5).style.background_gradient(axis=None,gmap=gmap, cmap='Purples', 
                                            subset=['image_id','image_num','label'])

Unnamed: 0,image_id,patient_id,image_num,other_specified,label
0,01f2b3_0,01f2b3,0,,Unknown
1,01f2b3_1,01f2b3,1,,Unknown
2,02ebd5_0,02ebd5,0,,Unknown
3,0412ab_0,0412ab,0,,Unknown
4,04414e_0,04414e,0,Hypercoagulable,Other


In [10]:
# https://www.kaggle.com/code/toomuchsauce/mental-health-plotly-interactive-viz
def EDA(df):
    
    print('\033[1m' +'EXPLORATORY DATA ANALYSIS :'+ '\033[0m\n')
    print('\033[1m' + 'Shape of the data (rows, columns):' + '\033[0m')
    print(df.shape, 
          '\n------------------------------------------------------------------------------------\n')
    
    print('\033[1m' + 'All columns from the dataframe :' + '\033[0m')
    print(df.columns, 
          '\n------------------------------------------------------------------------------------\n')
    
    print('\033[1m' + 'Datatypes and Missing values:' + '\033[0m')
    print(df.info(), 
          '\n------------------------------------------------------------------------------------\n')
    
    for col in df.columns:
        print('\033[1m' + 'Unique values in {} :'.format(col) + '\033[0m',len(df[col].unique()))
    print('\n------------------------------------------------------------------------------------\n')
    
    print('\033[1m' + 'Summary statistics for the data :' + '\033[0m')
    print(df.describe(include='all'), 
          '\n------------------------------------------------------------------------------------\n')
    
        
    print('\033[1m' + 'Memory used by the data :' + '\033[0m')
    print(df.memory_usage(), 
          '\n------------------------------------------------------------------------------------\n')
    
    print('\033[1m' + 'Number of duplicate values :' + '\033[0m')
    print(df.duplicated().sum())
          
EDA(train_df)

[1mEXPLORATORY DATA ANALYSIS :[0m

[1mShape of the data (rows, columns):[0m
(754, 5) 
------------------------------------------------------------------------------------

[1mAll columns from the dataframe :[0m
Index(['image_id', 'center_id', 'patient_id', 'image_num', 'label'], dtype='object') 
------------------------------------------------------------------------------------

[1mDatatypes and Missing values:[0m
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 754 entries, 0 to 753
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   image_id    754 non-null    object
 1   center_id   754 non-null    int64 
 2   patient_id  754 non-null    object
 3   image_num   754 non-null    int64 
 4   label       754 non-null    object
dtypes: int64(2), object(3)
memory usage: 29.6+ KB
None 
------------------------------------------------------------------------------------

[1mUnique values in image_id :[0m 754
[1mUn

In [11]:
export CV_IO_MAX_IMAGE_PIXELS=1099511627776
import cv2

img = cv2.imread("/kaggle/input/mayo-clinic-strip-ai/train/a4c7df_0.tif")
print(img.shape)

SyntaxError: invalid syntax (1884500399.py, line 1)

In [12]:
from PIL import Image

for image in data_dir_list_Train:    
    print(image)
    img = Image.open(image)
    try:
        img = Image.open(image)
        width, height = img.size
        print(width, height)
#         img = img.resize((width/2, height/2))

#         #Saved in the same relative location
#         img.save("./"+ image)
#         print("Done")
    except IOError:
         print("Not Done")

NameError: name 'data_dir_list_Train' is not defined

In [None]:
from PIL import Image
  
def main():
    for image_class in os.listdir(data_dir_list_Train): 
        for image in os.listdir(os.path.join(data_dir_list_Train, image_class)):
            image_path = os.path.join(data_dir_list_Train, image_class, image)
            try:
                 #Relative Path
                img = Image.open(image)
                width, height = img.size

                img = img.resize((width/2, height/2))

                #Saved in the same relative location
                img.save("resized_picture.jpg") 
            except IOError:
                pass

In [None]:
train = pd.read_csv("/kaggle/input/mayo-clinic-strip-ai/train.csv")

In [None]:
train.info()

In [13]:
train.shape

NameError: name 'train' is not defined

In [14]:
train.label.describe()

NameError: name 'train' is not defined

In [15]:
train.head(10)

NameError: name 'train' is not defined

In [16]:
y_train = train.label

X_train = train.drop(labels = ["label"], axis =1)


NameError: name 'train' is not defined

In [17]:
y_train.head()

NameError: name 'y_train' is not defined

In [18]:
X_train.head()

NameError: name 'X_train' is not defined

In [19]:
train.isnull().sum().sum()

NameError: name 'train' is not defined

https://www.kaggle.com/code/orhansertkaya/cnn-humpback-whale-identification-with-keras

In [20]:
os.environ["OPENCV_IO_MAX_IMAGE_PIXELS"] = pow(2,40).__str__()
import cv2
import glob
import os

In [21]:
inputFolder="/kaggle/input/mayo-clinic-strip-ai/train/"
folderLen = len(inputFolder)
#os.mkdir("./")


for img in glob.glob(inputFolder +"/*.*"):
    image = cv2.imread(img)
    imageResized = cv2.resize(image, (150,150))
    cv2.imwrite("./"+img[folderLen:],imgResized)

error: OpenCV(4.5.4) /tmp/pip-req-build-jpmv6t9_/opencv/modules/imgcodecs/src/loadsave.cpp:77: error: (-215:Assertion failed) pixels <= CV_IO_MAX_IMAGE_PIXELS in function 'validateInputImageSize'


In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds

In [None]:
from keras.preprocessing import image
from keras.applications.imagenet_utils import preprocess_input
import PIL.Image


def prepareImages(train, shape, path):
    
    x_train = np.zeros((shape, 100, 100, 3))
    count = 0
    PIL.Image.MAX_IMAGE_PIXELS = 5067523579
    for fig in train['image_id']:
        
        #load images into images of size 100x100x3
        img = image.load_img("/kaggle/input/mayo-clinic-strip-ai/"+path+"/"+fig+".tif", target_size=(100, 100, 3))
        
        x = image.img_to_array(img)
        x = preprocess_input(x)

        x_train[count] = x
        if (count%500 == 0):
            print("Processing image: ", count+1, ", ", fig)
        count += 1
    
    return x_train

In [None]:

x_train = prepareImages(train, train.shape[0], "train")