In [2]:
import os
import gc
import cv2
import math
import copy
import time
import random

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp

# Albumentations for augmentations
import albumentations as A
from albumentations.pytorch import ToTensorV2


from sklearn.metrics import f1_score,roc_auc_score


import timm
from timm.models.efficientnet import *

# Utils
import joblib
from tqdm import tqdm
from collections import defaultdict


import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import scipy
from scipy import ndimage

import glob

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
test_ct_all_list=list(glob.glob("/ssd2/ming/2024COVID/test_crop/*/*")) 

In [14]:
len(test_ct_all_list)

436394

In [4]:
test_area=[]
for path in tqdm(test_ct_all_list):
    img = cv2.imread(path)
    img2=ndimage.minimum_filter(img,5)
    img_b=np.where(img2<100,0,255)
    mask=scipy.ndimage.binary_fill_holes(img_b[:,:,0])
    mask_=mask*255
    aaa=mask_-img_b[:,:,0]
    test_area.append(aaa.sum()/255)

100%|██████████| 436395/436395 [1:45:38<00:00, 68.85it/s]  


In [6]:
test_area_df=pd.DataFrame((zip(test_ct_all_list, test_area)), columns = ['path', 'area'])

In [7]:
test_area_df

Unnamed: 0,path,area
0,/ssd2/ming/2024COVID/test_crop/11mmey0x.3c4/21...,21565.0
1,/ssd2/ming/2024COVID/test_crop/11mmey0x.3c4/54...,13964.0
2,/ssd2/ming/2024COVID/test_crop/11mmey0x.3c4/98...,25716.0
3,/ssd2/ming/2024COVID/test_crop/11mmey0x.3c4/16...,27389.0
4,/ssd2/ming/2024COVID/test_crop/11mmey0x.3c4/21...,21975.0
...,...,...
436390,/ssd2/ming/2024COVID/test_crop/vekbjvxc.ylo/43...,3310.0
436391,/ssd2/ming/2024COVID/test_crop/vekbjvxc.ylo/38...,1396.0
436392,/ssd2/ming/2024COVID/test_crop/vekbjvxc.ylo/23...,2326.0
436393,/ssd2/ming/2024COVID/test_crop/vekbjvxc.ylo/22...,2743.0


In [12]:
test_area_df.to_csv("/ssd2/ming/2024COVID/test_area_df.csv",index=False)

NameError: name 'test_area_df' is not defined

In [16]:
test_area_df=pd.read_csv("/ssd2/ming/2024COVID/test_area_df.csv")

In [17]:
test_area_df["ct_path"]=test_area_df["path"].apply(lambda x: "/".join(x.split("/")[:-1]))
test_area_df["ct_slice"]=test_area_df["path"].apply(lambda x: int(x.split("/")[-1].split(".")[0]))

In [18]:
test_area_df.sort_values(by=['ct_path', 'ct_slice'], inplace=True)

In [19]:
test_area_df=test_area_df.reset_index(drop=True)

In [21]:
test_area_df

Unnamed: 0,path,area,ct_path,ct_slice
0,/ssd2/ming/2024COVID/test_crop/00c5hbz5.4rp/0.jpg,827.0,/ssd2/ming/2024COVID/test_crop/00c5hbz5.4rp,0
1,/ssd2/ming/2024COVID/test_crop/00c5hbz5.4rp/1.jpg,1371.0,/ssd2/ming/2024COVID/test_crop/00c5hbz5.4rp,1
2,/ssd2/ming/2024COVID/test_crop/00c5hbz5.4rp/2.jpg,1121.0,/ssd2/ming/2024COVID/test_crop/00c5hbz5.4rp,2
3,/ssd2/ming/2024COVID/test_crop/00c5hbz5.4rp/3.jpg,976.0,/ssd2/ming/2024COVID/test_crop/00c5hbz5.4rp,3
4,/ssd2/ming/2024COVID/test_crop/00c5hbz5.4rp/4.jpg,852.0,/ssd2/ming/2024COVID/test_crop/00c5hbz5.4rp,4
...,...,...,...,...
436390,/ssd2/ming/2024COVID/test_crop/zge41qxy.04q/36...,14040.0,/ssd2/ming/2024COVID/test_crop/zge41qxy.04q,365
436391,/ssd2/ming/2024COVID/test_crop/zge41qxy.04q/36...,13967.0,/ssd2/ming/2024COVID/test_crop/zge41qxy.04q,366
436392,/ssd2/ming/2024COVID/test_crop/zge41qxy.04q/36...,10400.0,/ssd2/ming/2024COVID/test_crop/zge41qxy.04q,367
436393,/ssd2/ming/2024COVID/test_crop/zge41qxy.04q/36...,8487.0,/ssd2/ming/2024COVID/test_crop/zge41qxy.04q,368


In [22]:
def sum_max(a,w=0.4):
    l=len(a)
    k=int(np.ceil(l*w))
    d=0
    tmp_max=0
    # print(l, k)
    for i in range(l-k+1):
        if np.sum(a[i:i+k])>tmp_max:
            tmp_max=np.sum(a[i:i+k])
            d=i
    return d,d+k

def remove_elements(arr):
    # Determine the action based on the array length
    if len(arr) < 60:
        # Remove the first 20 and the last 20 elements
        return arr[20:-20]
    else:
        # Remove the last 40 and the first 20 elements
        return arr[20:-40]
def sum_max_with_adjustment(a, w=0.4):
    # Adjust the array based on its length
    if len(a) < 60:
        # Remove the first 20 and the last 20 elements
        adjusted_a = a[20:-20]
    else:
        # Remove the last 40 and the first 20 elements
        adjusted_a = a[20:-45]
    
    l = len(adjusted_a)
    k = int(np.ceil(l * w))
    d = 0
    tmp_max = 0
    
    for i in range(l - k + 1):
        current_sum = np.sum(adjusted_a[i:i+k])
        if current_sum > tmp_max:
            tmp_max = current_sum
            d = i
    
    # Adjust d and d+k back to the original array's index space
    if len(a) < 60:
        d += 20  # Adjust index if the array was shorter than 60
    else:
        d += 20  # Always adjust by 20 because we always remove the first 20 elements
    
    return d, d + k    

In [23]:
from scipy.stats import gaussian_kde
import numpy as np
from scipy.integrate import cumtrapz
def kd_sampling(area_list,start_index,k,sample):
# 提供的数据
    data = area_list
    data_np = np.array(data)
    # 使用gaussian_kde计算Kernel Density Estimate
    kde = gaussian_kde(data)
    # 为了绘制KDE, 我们需要一个范围和密度值
    x_grid = np.linspace(min(data), max(data), 100)
    kde_values = kde.evaluate(x_grid)
    # 计算KDE曲线下的面积，以此近似CDF
    cdf = cumtrapz(kde_values, x_grid, initial=0)
    cdf /= cdf[-1]  # 归一化，使得CDF的最大值为1
    # 找出等分的百分位数对应的值作为区间边界
    percentiles = np.linspace(0, 1, k+1)
    bins_percentiles = np.interp(percentiles, cdf, x_grid)

    # 根据新的区间边界进行抽样和绘图过程的剩余部分
    bin_counts_percentiles = [sum((data >= bins_percentiles[i]) & (data < bins_percentiles[i+1])) for i in range(len(bins_percentiles)-1)]
    samples_per_bin_percentiles = np.ones(k, dtype=int)
    remaining_samples_percentiles = sample - k

    proportions_percentiles = np.array(bin_counts_percentiles) / sum(bin_counts_percentiles)
    additional_samples_percentiles = np.floor(proportions_percentiles * remaining_samples_percentiles).astype(int)
    samples_per_bin_percentiles += additional_samples_percentiles

    remaining_samples_percentiles -= additional_samples_percentiles.sum()
    while remaining_samples_percentiles > 0:
        indices_percentiles = np.argsort(proportions_percentiles)[::-1][:remaining_samples_percentiles]
        for i in indices_percentiles:
            samples_per_bin_percentiles[i] += 1
            remaining_samples_percentiles -= 1

    sampled_values_corrected_percentiles = []
    for i in range(len(bins_percentiles) - 1):
        bin_data = data_np[(data_np >= bins_percentiles[i]) & (data_np < bins_percentiles[i+1])]
        samples_count = samples_per_bin_percentiles[i]
        if len(bin_data) <= samples_count:
            sampled_values_corrected_percentiles.extend(bin_data.tolist())
        else:
            sampled_values_corrected_percentiles.extend(np.random.choice(bin_data, samples_count, replace=False).tolist())

    sampled_values_corrected_percentiles = np.sort(sampled_values_corrected_percentiles)
    sampled_indices = [np.where(data_np == value)[0][0] for value in sampled_values_corrected_percentiles]
    sampled_indices = [x+start_index for x in sampled_indices]
    #print(sampled_indices)
    return sampled_indices


In [24]:
ct_path_list=test_area_df["ct_path"].unique()

In [54]:
test_dic={}
for i in tqdm(range(len(ct_path_list))):
    tmp_df=test_area_df[test_area_df["ct_path"]==ct_path_list[i]].reset_index(drop=True)
    tmp_df=test_area_df[test_area_df["ct_path"]==ct_path_list[i]].reset_index(drop=True)
    #test_dic[ct_path_list[i]]=list(sum_max_with_adjustment(tmp_df["area"].values,0.5))
    #test_dic[ct_path_list[i]]=[list(sum_max_with_adjustment(tmp_df["area"].values,0.5)),len(tmp_df)]
    d1,d2=list(sum_max(tmp_df["area"].values,0.5))
    test_dic[ct_path_list[i]]=[d1,d2]
    k=[]
    for j in range(len(tmp_df['area'][d1:d2])):
        k.append(tmp_df['area'][j+d1])
    test_dic[ct_path_list[i]].append(k)
    #print(test_dic[ct_path_list[i]][2],d1,d2)
    if d2-d1>=8:
        test_dic[ct_path_list[i]].append(sorted(kd_sampling(test_dic[ct_path_list[i]][2],d1,4,8)))
    elif (d2-d1)>=2 and (d2-d1)<8:
        test_dic[ct_path_list[i]].append(sorted([random.randint(d1, d2-1) for _ in range(8)]))
    else:
        test_dic[ct_path_list[i]].append([d1 for _ in range(8)])

100%|██████████| 1413/1413 [02:08<00:00, 10.98it/s]


In [55]:
for i in range(len(test_dic)):
    print(test_dic[ct_path_list[i]][3],test_dic[ct_path_list[i]][0],test_dic[ct_path_list[i]][1])

[104, 122, 181, 191, 227, 235, 247, 263] 91 274
[35, 36, 39, 65, 98, 99, 128, 132] 29 152
[77, 84, 96, 156, 173, 205, 210, 214] 75 249
[99, 108, 129, 134, 150, 168, 171, 230] 81 246
[86, 96, 99, 113, 154, 161, 174, 195] 68 201
[53, 58, 68, 86, 108, 119, 124, 141] 48 153
[115, 135, 142, 153, 179, 184, 212, 246] 97 275
[107, 139, 146, 164, 183, 186, 194, 244] 99 255
[18, 24, 27, 35, 40, 48, 50, 54] 17 55
[116, 137, 143, 205, 208, 229, 250, 279] 99 286
[78, 88, 106, 141, 193, 196, 203, 215] 52 236
[81, 89, 104, 151, 170, 201, 221, 224] 81 255
[31, 34, 36, 38, 70, 81, 86, 101] 31 124
[194, 216, 307, 347, 350, 383, 391, 434] 164 438
[106, 113, 133, 159, 163, 172, 194, 200] 62 216
[120, 136, 150, 175, 180, 208, 217, 245] 80 255
[139, 174, 178, 200, 232, 233, 250, 264] 110 283
[140, 173, 211, 223, 237, 261, 278, 333] 118 358
[8, 12, 16, 17, 18, 26, 28, 32] 7 33
[70, 80, 87, 112, 176, 186, 198, 219] 70 240
[113, 167, 203, 221, 257, 263, 282, 295] 87 302
[12, 13, 16, 18, 19, 20, 22, 28] 11 33
[

In [60]:
import pickle
with open('/ssd2/ming/2024COVID/test_dic1_05.pickle', 'wb') as handle:
    pickle.dump(test_dic, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
32mwmw5y.zzs  287 1
gbwa1jlx.ajf 163 1
21kqbeui.1pz 243 1
43evg34y.vsa 91 1

In [56]:
test_dic['/ssd2/ming/2024COVID/test_crop/43evg34y.vsa']

[0, 1, [14780.0], [0, 0, 0, 0, 0, 0, 0, 0]]

In [41]:
test_dic['/ssd2/ming/2024COVID/test_crop/32mwmw5y.zzs']=[20, 20, [], [20, 20, 20, 20, 20, 20, 20, 20]]

In [57]:
len(test_dic)

1413

In [58]:
k=0
for i in range(len(test_dic)):
    if test_dic[ct_path_list[i]][1] - test_dic[ct_path_list[i]][0] <5:
        print(test_dic[ct_path_list[i]])
        k+=1
print(k)

[0, 1, [80555.0], [0, 0, 0, 0, 0, 0, 0, 0]]
[0, 1, [35098.0], [0, 0, 0, 0, 0, 0, 0, 0]]
[0, 1, [14780.0], [0, 0, 0, 0, 0, 0, 0, 0]]
[0, 4, [5500.0, 3638.0, 3162.0, 2167.0], [1, 1, 2, 2, 2, 2, 3, 3]]
[0, 4, [6464.0, 4357.0, 3444.0, 2468.0], [0, 0, 0, 0, 1, 2, 3, 3]]
[0, 1, [0.0], [0, 0, 0, 0, 0, 0, 0, 0]]
[0, 1, [25752.0], [0, 0, 0, 0, 0, 0, 0, 0]]
7


In [27]:
#ct_path_list=test_area_df["ct_path"].unique()
#for i in range(100):
    
#    tmp_df=test_area_df[test_area_df["ct_path"]==ct_path_list[i][:1]].reset_index(drop=True)
#    a,b=test_dic[ct_path_list[i]]
#    print(ct_path_list[i])
#    plt.plot(tmp_df["area"])
#    plt.plot(tmp_df["area"][a:b])
#    print(a,b)
#    plt.show()

In [59]:
test_dic

{'/ssd2/ming/2024COVID/test_crop/00c5hbz5.4rp': [91,
  274,
  [43080.0,
   43754.0,
   44345.0,
   44857.0,
   45446.0,
   46042.0,
   46685.0,
   47218.0,
   47806.0,
   48316.0,
   48839.0,
   49416.0,
   49933.0,
   50475.0,
   51095.0,
   51795.0,
   52502.0,
   53028.0,
   53514.0,
   53916.0,
   54361.0,
   54718.0,
   55124.0,
   55443.0,
   55829.0,
   56194.0,
   56551.0,
   56899.0,
   57149.0,
   57397.0,
   57687.0,
   57853.0,
   58031.0,
   58106.0,
   58345.0,
   58576.0,
   58486.0,
   58596.0,
   58673.0,
   58684.0,
   58703.0,
   58648.0,
   58705.0,
   58671.0,
   58746.0,
   58814.0,
   58845.0,
   58819.0,
   58867.0,
   58866.0,
   58945.0,
   59015.0,
   59082.0,
   59158.0,
   59253.0,
   59378.0,
   59534.0,
   59656.0,
   59770.0,
   59868.0,
   59999.0,
   60087.0,
   60118.0,
   60143.0,
   60197.0,
   60197.0,
   60065.0,
   60079.0,
   60076.0,
   60091.0,
   60060.0,
   60004.0,
   59881.0,
   59782.0,
   59677.0,
   59665.0,
   59587.0,
   59535.0,
   5