In [1]:
import gc
import os

import numpy as np
import pandas as pd
from tqdm import *
# Register tqdm's pandas integration so `progress_apply` is available on pandas objects.
tqdm.pandas()

In [2]:
def softmax(X, theta = 1.0, axis = None):
    """
    Compute the softmax of each element along an axis of X.

    Parameters
    ----------
    X: array-like (ND-array, list or tuple) of numbers. It is converted
        with ``np.asarray`` so plain Python sequences are accepted too.
    theta (optional): float parameter, used as a multiplier
        prior to exponentiation. Default = 1.0
    axis (optional): axis to compute values along. Default is the
        first non-singleton axis of the (at least 2-D) input; if every
        axis has length 1, axis 0 is used.

    Returns an array the same size as X (for inputs with one or more
    dimensions). The result will sum to 1 along the specified axis.
    """

    # accept lists/tuples as well as arrays (they have no .shape attribute)
    X = np.asarray(X)

    # make X at least 2d
    y = np.atleast_2d(X)

    # find axis; fall back to 0 when all dimensions are singletons, where a
    # bare next() would raise StopIteration
    if axis is None:
        axis = next((i for i, size in enumerate(y.shape) if size > 1), 0)

    # multiply y against the theta parameter,
    y = y * float(theta)

    # subtract the max for numerical stability (does not change the result)
    y = y - np.expand_dims(np.max(y, axis = axis), axis)

    # exponentiate y
    y = np.exp(y)

    # take the sum along the specified axis
    ax_sum = np.expand_dims(np.sum(y, axis = axis), axis)

    # finally: divide elementwise
    p = y / ax_sum

    # flatten if X was 1D
    if X.ndim == 1:
        p = p.flatten()

    return p

In [3]:
from sklearn.preprocessing import LabelEncoder


# Category names used as class labels. Names use underscores here; the loading
# cells replace them with spaces to build the per-class CSV file names.
CLASS_NAME=\
['The_Eiffel_Tower', 'The_Great_Wall_of_China', 'The_Mona_Lisa', 'airplane', 'alarm_clock', 'ambulance', 'angel',
 'animal_migration', 'ant', 'anvil', 'apple', 'arm', 'asparagus', 'axe', 'backpack', 'banana', 'bandage', 'barn',
 'baseball', 'baseball_bat', 'basket', 'basketball', 'bat', 'bathtub', 'beach', 'bear', 'beard', 'bed', 'bee',
 'belt', 'bench', 'bicycle', 'binoculars', 'bird', 'birthday_cake', 'blackberry', 'blueberry', 'book',
 'boomerang', 'bottlecap', 'bowtie', 'bracelet', 'brain', 'bread', 'bridge', 'broccoli', 'broom',
 'bucket', 'bulldozer', 'bus', 'bush', 'butterfly', 'cactus', 'cake', 'calculator', 'calendar', 'camel',
 'camera', 'camouflage', 'campfire', 'candle', 'cannon', 'canoe', 'car', 'carrot', 'castle', 'cat', 'ceiling_fan',
 'cell_phone', 'cello', 'chair', 'chandelier', 'church', 'circle', 'clarinet', 'clock', 'cloud', 'coffee_cup',
 'compass', 'computer', 'cookie', 'cooler', 'couch', 'cow', 'crab', 'crayon', 'crocodile', 'crown', 'cruise_ship',
 'cup', 'diamond', 'dishwasher', 'diving_board', 'dog', 'dolphin', 'donut', 'door', 'dragon', 'dresser',
 'drill', 'drums', 'duck', 'dumbbell', 'ear', 'elbow', 'elephant', 'envelope', 'eraser', 'eye', 'eyeglasses',
 'face', 'fan', 'feather', 'fence', 'finger', 'fire_hydrant', 'fireplace', 'firetruck', 'fish', 'flamingo',
 'flashlight', 'flip_flops', 'floor_lamp', 'flower', 'flying_saucer', 'foot', 'fork', 'frog', 'frying_pan',
 'garden', 'garden_hose', 'giraffe', 'goatee', 'golf_club', 'grapes', 'grass', 'guitar', 'hamburger',
 'hammer', 'hand', 'harp', 'hat', 'headphones', 'hedgehog', 'helicopter', 'helmet', 'hexagon', 'hockey_puck',
 'hockey_stick', 'horse', 'hospital', 'hot_air_balloon', 'hot_dog', 'hot_tub', 'hourglass', 'house', 'house_plant',
 'hurricane', 'ice_cream', 'jacket', 'jail', 'kangaroo', 'key', 'keyboard', 'knee', 'ladder', 'lantern', 'laptop',
 'leaf', 'leg', 'light_bulb', 'lighthouse', 'lightning', 'line', 'lion', 'lipstick', 'lobster', 'lollipop', 'mailbox',
 'map', 'marker', 'matches', 'megaphone', 'mermaid', 'microphone', 'microwave', 'monkey', 'moon', 'mosquito',
 'motorbike', 'mountain', 'mouse', 'moustache', 'mouth', 'mug', 'mushroom', 'nail', 'necklace', 'nose', 'ocean',
 'octagon', 'octopus', 'onion', 'oven', 'owl', 'paint_can', 'paintbrush', 'palm_tree', 'panda', 'pants',
 'paper_clip', 'parachute', 'parrot', 'passport', 'peanut', 'pear', 'peas', 'pencil', 'penguin', 'piano',
 'pickup_truck', 'picture_frame', 'pig', 'pillow', 'pineapple', 'pizza', 'pliers', 'police_car', 'pond',
 'pool', 'popsicle', 'postcard', 'potato', 'power_outlet', 'purse', 'rabbit', 'raccoon', 'radio', 'rain',
 'rainbow', 'rake', 'remote_control', 'rhinoceros', 'river', 'roller_coaster', 'rollerskates', 'sailboat',
 'sandwich', 'saw', 'saxophone', 'school_bus', 'scissors', 'scorpion', 'screwdriver', 'sea_turtle', 'see_saw',
 'shark', 'sheep', 'shoe', 'shorts', 'shovel', 'sink', 'skateboard', 'skull', 'skyscraper', 'sleeping_bag',
 'smiley_face', 'snail', 'snake', 'snorkel', 'snowflake', 'snowman', 'soccer_ball', 'sock', 'speedboat',
 'spider', 'spoon', 'spreadsheet', 'square', 'squiggle', 'squirrel', 'stairs', 'star', 'steak', 'stereo',
 'stethoscope', 'stitches', 'stop_sign', 'stove', 'strawberry', 'streetlight', 'string_bean', 'submarine',
 'suitcase', 'sun', 'swan', 'sweater', 'swing_set', 'sword', 't-shirt', 'table', 'teapot', 'teddy-bear',
 'telephone', 'television', 'tennis_racquet', 'tent', 'tiger', 'toaster', 'toe', 'toilet', 'tooth',
 'toothbrush', 'toothpaste', 'tornado', 'tractor', 'traffic_light', 'train', 'tree', 'triangle',
 'trombone', 'truck', 'trumpet', 'umbrella', 'underwear', 'van', 'vase', 'violin', 'washing_machine',
 'watermelon', 'waterslide', 'whale', 'wheel', 'windmill', 'wine_bottle', 'wine_glass', 'wristwatch',
 'yoga', 'zebra', 'zigzag']

# Fit a label encoder on the class names; `fit` returns the encoder itself.
lb = LabelEncoder().fit(CLASS_NAME)
all_cls = lb.classes_

# Object-dtype lookup table so that idx2cls[k] is the class name at position k
# of the encoder's `classes_` array.
idx2cls = np.empty(all_cls.shape, dtype=object)
for i in range(len(all_cls)):
    idx2cls[i] = all_cls[i]

In [4]:
# Directories holding the per-class CSV files of the "data_2" train and valid splits.
# NOTE(review): absolute, machine-specific paths — adjust before running elsewhere.
data_2_train = "/media/ngxbac/Bac/competition/kaggle/competition_data/quickdraw/data/30k/data_2/train/"
data_2_valid = "/media/ngxbac/Bac/competition/kaggle/competition_data/quickdraw/data/30k/data_2/valid/"

In [5]:
# Load the saved model outputs for the data_2 train split. The file name says
# these are logits; the following cell turns them into probabilities via softmax.
train_predict = np.load("./logs/clean_model_1_resnet34/dataset.predictions.data_2_train.logits.satge1.5.npy")
# valid_predict = np.load("./logs/clean_model_1_resnet34/dataset.predictions.data_2_valid.logits.satge1.5.npy")

In [6]:
# Turn each row of logits into probabilities (softmax along axis 1).
# NOTE(review): `train_predict` is overwritten, so re-running this cell applies
# softmax a second time — reload the logits before re-executing it.
train_predict = softmax(train_predict, axis=1)

In [7]:
# Read one CSV per class from the train directory (file names use spaces where
# CLASS_NAME uses underscores), set each frame's "word" column to the class name,
# and stack all frames row-wise into a single DataFrame.
dfs = pd.concat(
    [
        pd.read_csv(os.path.join(data_2_train, spaced_name + ".csv")).assign(
            word=spaced_name.split(".")[0]
        )
        for spaced_name in (name.replace("_", ' ') for name in CLASS_NAME)
    ],
    axis=0,
)

In [8]:
# Preview the first rows of the combined train frame.
dfs.head()

Unnamed: 0,countrycode,drawing,key_id,recognized,word
0,US,"[[[0, 27, 59, 70, 74, 83, 93, 99, 121, 151], [...",5594562984673280,True,The Eiffel Tower
1,US,"[[[0, 18, 48, 73, 79, 77, 71, 99, 110, 132, 17...",5559551606652928,True,The Eiffel Tower
2,GB,"[[[0, 7, 29, 56, 86, 107, 109, 106, 103, 122, ...",5310851772841984,True,The Eiffel Tower
3,US,"[[[0, 51, 87, 117, 143, 153, 157, 154, 163, 17...",6680277264891904,True,The Eiffel Tower
4,QA,"[[[169, 123, 91, 87, 75, 75, 81, 79, 54, 0, 21...",4801613069811712,True,The Eiffel Tower


In [9]:
# Column index of the highest probability in each row, and the class name
# found at that index in the lookup table.
pred = train_predict.argmax(axis=1)
pred_cls = idx2cls.take(pred)

In [10]:
# Probability of the predicted (arg-max) class for every row.
probs = [row_probs[best] for row_probs, best in zip(train_predict, pred)]

In [11]:
# Store the predicted class name (underscores replaced by spaces, the same form
# as the "word" column) and the probability of that prediction on the frame.
dfs["predict_word"] = [name.replace("_", " ") for name in pred_cls]
dfs["prob"] = probs

In [12]:
# The probability matrix is not used after `pred`/`probs` were derived from it;
# drop the reference and trigger a collection to release the memory.
del train_predict
gc.collect()

14

In [13]:
# Minimum predicted probability (exclusive) required to overwrite a label.
threshold = 0.90


def re_assign_label(row):
    """Return a copy of `row` with a "new_word" entry holding the cleaned label.

    The label is replaced by the model's prediction only when all of these hold:
    the drawing is flagged as not recognized, the prediction differs from the
    current label, and the prediction's probability is strictly greater than the
    module-level `threshold` (read at call time). Otherwise "new_word" is the
    original "word".

    Parameters
    ----------
    row : pandas.Series with the entries "recognized", "word", "predict_word"
        and "prob" (as passed by `DataFrame.apply(..., axis=1)`).

    Returns
    -------
    pandas.Series: a new Series with the same entries plus "new_word".
    The input `row` is left untouched.
    """
    # pandas documents that a function passed to apply must not mutate the
    # object it receives, so add the new entry to a copy instead.
    row = row.copy()
    relabel = (
        row["recognized"] == False
        and row["word"] != row["predict_word"]
        and row["prob"] > threshold
    )
    row["new_word"] = row["predict_word"] if relabel else row["word"]
    return row

In [14]:
# Split the rows by their "recognized" flag, then drop the combined frame and
# collect garbage to release memory.
recognized_df = dfs.loc[dfs["recognized"].eq(True)]
unrecognized_df = dfs.loc[dfs["recognized"].eq(False)]
del dfs
gc.collect()

0

In [15]:
# Apply `re_assign_label` to every unrecognized row (axis=1) with a tqdm progress
# bar; the returned frame carries the extra "new_word" column.
# NOTE(review): this row-wise apply takes minutes (see the progress output); the
# same rule looks expressible with vectorised column operations — confirm first.
clean_df = unrecognized_df.progress_apply(re_assign_label, axis=1)

100%|██████████| 377596/377596 [03:53<00:00, 1615.13it/s]


In [16]:
# Preview the relabelled unrecognized rows.
clean_df.head()

Unnamed: 0,countrycode,drawing,key_id,recognized,word,predict_word,prob,new_word
26,TH,"[[[87, 86, 92, 87, 84, 87, 80, 63], [96, 93, 9...",5114298047135744,False,The Eiffel Tower,passport,0.231137,The Eiffel Tower
135,US,"[[[2, 0, 3, 15, 104, 112, 129, 126, 128], [245...",4867259245789184,False,The Eiffel Tower,skyscraper,0.862774,The Eiffel Tower
221,US,"[[[117, 133, 142, 126], [70, 73, 69, 39]], [[1...",4827162169311232,False,The Eiffel Tower,camouflage,0.079629,The Eiffel Tower
248,FI,"[[[1, 2, 22, 27, 38, 46, 58, 65, 68, 54, 48, 5...",6616599626776576,False,The Eiffel Tower,The Eiffel Tower,0.609578,The Eiffel Tower
256,US,"[[[6, 0, 3, 11, 47, 66, 60, 91, 171, 172, 181,...",5675405107265536,False,The Eiffel Tower,The Eiffel Tower,0.999657,The Eiffel Tower


In [17]:
# Recognized drawings keep their original label. `recognized_df` was produced by
# a boolean-mask selection, so adding a column to it in place raises pandas'
# SettingWithCopyWarning; `assign` builds a new frame with the column instead.
recognized_df = recognized_df.assign(new_word=recognized_df["word"])

In [18]:
# Stack the relabelled unrecognized rows on top of the recognized rows
# (row-wise concat; the original index values are kept).
clean_dfs = pd.concat([clean_df, recognized_df], axis=0)

In [19]:
# Both partial frames now live on inside `clean_dfs`; release them.
del clean_df
del recognized_df
gc.collect()

0

In [20]:
# Preview the recombined train frame.
clean_dfs.head()

Unnamed: 0,countrycode,drawing,key_id,recognized,word,predict_word,prob,new_word
26,TH,"[[[87, 86, 92, 87, 84, 87, 80, 63], [96, 93, 9...",5114298047135744,False,The Eiffel Tower,passport,0.231137,The Eiffel Tower
135,US,"[[[2, 0, 3, 15, 104, 112, 129, 126, 128], [245...",4867259245789184,False,The Eiffel Tower,skyscraper,0.862774,The Eiffel Tower
221,US,"[[[117, 133, 142, 126], [70, 73, 69, 39]], [[1...",4827162169311232,False,The Eiffel Tower,camouflage,0.079629,The Eiffel Tower
248,FI,"[[[1, 2, 22, 27, 38, 46, 58, 65, 68, 54, 48, 5...",6616599626776576,False,The Eiffel Tower,The Eiffel Tower,0.609578,The Eiffel Tower
256,US,"[[[6, 0, 3, 11, 47, 66, 60, 91, 171, 172, 181,...",5675405107265536,False,The Eiffel Tower,The Eiffel Tower,0.999657,The Eiffel Tower


In [21]:
# Distinct labels present after re-assignment, in order of first appearance.
all_class = clean_dfs["new_word"].unique()

In [22]:
# Output directory for the cleaned train CSVs; created if it does not exist yet
# (exist_ok=True makes re-running the cell harmless).
# NOTE(review): absolute, machine-specific path.
data_2_clean_train = "/media/ngxbac/Bac/competition/kaggle/competition_data/quickdraw/data/30k/data_2_clean/train/"
os.makedirs(data_2_clean_train, exist_ok=True)

In [23]:
# Write one CSV per (possibly re-assigned) label.
# A single groupby pass hands out each label's rows in turn, instead of scanning
# the whole frame with a boolean mask once per label. sort=False keeps the groups
# in order of first appearance, and rows keep their original order within a group.
# The groupby object is created inline so no extra reference to `clean_dfs`
# outlives the loop.
for cls, cls_df in tqdm(clean_dfs.groupby("new_word", sort=False)):
    cls_df.to_csv(os.path.join(data_2_clean_train, cls + ".csv"), index=False)

100%|██████████| 340/340 [02:07<00:00,  2.84it/s]


In [24]:
# The train split has been written to disk; release its frames before the valid split.
del cls_df, clean_dfs
gc.collect()

0

In [25]:
# Input directories for the data_2 train and valid splits.
# NOTE(review): these repeat, with identical values, the assignments made near
# the top of the notebook.
data_2_train = "/media/ngxbac/Bac/competition/kaggle/competition_data/quickdraw/data/30k/data_2/train/"
data_2_valid = "/media/ngxbac/Bac/competition/kaggle/competition_data/quickdraw/data/30k/data_2/valid/"

In [26]:
# Load the saved model outputs for the data_2 valid split. The file name says
# these are logits; the following cell turns them into probabilities via softmax.
# train_predict = np.load("./logs/clean_model_1_resnet34/dataset.predictions.data_2_train.logits.satge1.5.npy")
valid_predict = np.load("./logs/clean_model_1_resnet34/dataset.predictions.data_2_valid.logits.satge1.5.npy")

In [27]:
# Turn each row of logits into probabilities (softmax along axis 1).
# NOTE(review): `valid_predict` is overwritten, so re-running this cell applies
# softmax a second time — reload the logits before re-executing it.
valid_predict = softmax(valid_predict, axis=1)

In [28]:
# all_class = os.listdir(data_2_train)
dfs = []
for cls in CLASS_NAME:
    cls = cls.replace("_", ' ')
    df = pd.read_csv(os.path.join(data_2_valid, cls + ".csv"))
    df["word"] = cls.split(".")[0]
    dfs.append(df)
dfs = pd.concat(dfs, axis=0)

In [29]:
# Preview the first rows of the combined valid frame.
dfs.head()

Unnamed: 0,countrycode,drawing,key_id,recognized,word
0,US,"[[[0, 6, 17, 60, 62, 54, 52, 71, 73, 73], [246...",6396323441934336,True,The Eiffel Tower
1,US,"[[[0, 32, 2], [227, 230, 228]], [[17, 69, 69, ...",4765333883715584,True,The Eiffel Tower
2,US,"[[[202, 191, 165, 146, 133, 103, 95, 90], [253...",6154169775816704,True,The Eiffel Tower
3,CA,"[[[13, 24, 12, 3, 0], [1, 197, 243, 255, 243]]...",6336834588114944,True,The Eiffel Tower
4,RU,"[[[1, 4, 15, 73, 114, 124, 140, 129, 91, 70, 5...",5500606401740800,True,The Eiffel Tower


In [30]:
# Column index of the highest probability in each row, and the class name
# found at that index in the lookup table.
pred = valid_predict.argmax(axis=1)
pred_cls = idx2cls.take(pred)

In [31]:
# Probability of the predicted (arg-max) class for every row.
probs = [row_probs[best] for row_probs, best in zip(valid_predict, pred)]

In [32]:
# Store the predicted class name (underscores replaced by spaces, the same form
# as the "word" column) and the probability of that prediction on the frame.
dfs["predict_word"] = [name.replace("_", " ") for name in pred_cls]
dfs["prob"] = probs

In [33]:
# The probability matrix is not used after `pred`/`probs` were derived from it;
# drop the reference and trigger a collection to release the memory.
del valid_predict
gc.collect()

7

In [34]:
# NOTE(review): this cell repeats the `threshold` / `re_assign_label` definitions
# used for the train split; it is kept so the valid section runs on its own.

# Minimum predicted probability (exclusive) required to overwrite a label.
threshold = 0.90


def re_assign_label(row):
    """Return a copy of `row` with a "new_word" entry holding the cleaned label.

    The label is replaced by the model's prediction only when all of these hold:
    the drawing is flagged as not recognized, the prediction differs from the
    current label, and the prediction's probability is strictly greater than the
    module-level `threshold` (read at call time). Otherwise "new_word" is the
    original "word".

    Parameters
    ----------
    row : pandas.Series with the entries "recognized", "word", "predict_word"
        and "prob" (as passed by `DataFrame.apply(..., axis=1)`).

    Returns
    -------
    pandas.Series: a new Series with the same entries plus "new_word".
    The input `row` is left untouched.
    """
    # pandas documents that a function passed to apply must not mutate the
    # object it receives, so add the new entry to a copy instead.
    row = row.copy()
    relabel = (
        row["recognized"] == False
        and row["word"] != row["predict_word"]
        and row["prob"] > threshold
    )
    row["new_word"] = row["predict_word"] if relabel else row["word"]
    return row

In [35]:
# Split the rows by their "recognized" flag, then drop the combined frame and
# collect garbage to release memory.
recognized_df = dfs.loc[dfs["recognized"].eq(True)]
unrecognized_df = dfs.loc[dfs["recognized"].eq(False)]
del dfs
gc.collect()

7

In [36]:
# Apply `re_assign_label` to every unrecognized row (axis=1) with a tqdm progress
# bar; the returned frame carries the extra "new_word" column.
# NOTE(review): row-wise apply; the same rule looks expressible with vectorised
# column operations — confirm first.
clean_df = unrecognized_df.progress_apply(re_assign_label, axis=1)

100%|██████████| 42215/42215 [00:26<00:00, 1622.57it/s]


In [37]:
# Preview the relabelled unrecognized rows of the valid split.
clean_df.head()

Unnamed: 0,countrycode,drawing,key_id,recognized,word,predict_word,prob,new_word
168,US,"[[[29, 27, 12, 0, 36, 59, 75, 88, 114, 118, 11...",6456669942841344,False,The Eiffel Tower,The Eiffel Tower,0.999952,The Eiffel Tower
187,BR,"[[[30, 29, 31, 37], [66, 225, 232, 229]], [[14...",4868656318119936,False,The Eiffel Tower,tree,0.230682,The Eiffel Tower
198,US,"[[[0, 28, 59, 84], [229, 157, 110, 53]], [[75,...",6341731920379904,False,The Eiffel Tower,remote control,0.481222,The Eiffel Tower
231,GB,"[[[72, 80, 134, 177, 221], [128, 101, 0, 93, 1...",5225712921346048,False,The Eiffel Tower,triangle,0.445684,The Eiffel Tower
270,US,"[[[111, 116, 143, 155, 153, 156, 124, 116, 107...",4960697769263104,False,The Eiffel Tower,The Eiffel Tower,0.970289,The Eiffel Tower


In [38]:
# Recognized drawings keep their original label. `recognized_df` was produced by
# a boolean-mask selection, so adding a column to it in place raises pandas'
# SettingWithCopyWarning; `assign` builds a new frame with the column instead.
recognized_df = recognized_df.assign(new_word=recognized_df["word"])

In [39]:
# Stack the relabelled unrecognized rows on top of the recognized rows
# (row-wise concat; the original index values are kept).
clean_dfs = pd.concat([clean_df, recognized_df], axis=0)

In [40]:
# Both partial frames now live on inside `clean_dfs`; release them.
del clean_df
del recognized_df
gc.collect()

0

In [41]:
# Preview the recombined valid frame.
clean_dfs.head()

Unnamed: 0,countrycode,drawing,key_id,recognized,word,predict_word,prob,new_word
168,US,"[[[29, 27, 12, 0, 36, 59, 75, 88, 114, 118, 11...",6456669942841344,False,The Eiffel Tower,The Eiffel Tower,0.999952,The Eiffel Tower
187,BR,"[[[30, 29, 31, 37], [66, 225, 232, 229]], [[14...",4868656318119936,False,The Eiffel Tower,tree,0.230682,The Eiffel Tower
198,US,"[[[0, 28, 59, 84], [229, 157, 110, 53]], [[75,...",6341731920379904,False,The Eiffel Tower,remote control,0.481222,The Eiffel Tower
231,GB,"[[[72, 80, 134, 177, 221], [128, 101, 0, 93, 1...",5225712921346048,False,The Eiffel Tower,triangle,0.445684,The Eiffel Tower
270,US,"[[[111, 116, 143, 155, 153, 156, 124, 116, 107...",4960697769263104,False,The Eiffel Tower,The Eiffel Tower,0.970289,The Eiffel Tower


In [42]:
# Distinct labels present after re-assignment, in order of first appearance.
all_class = clean_dfs["new_word"].unique()

In [43]:
# Output directory for the cleaned valid CSVs; created if it does not exist yet
# (exist_ok=True makes re-running the cell harmless).
# NOTE(review): absolute, machine-specific path.
data_2_clean_valid = "/media/ngxbac/Bac/competition/kaggle/competition_data/quickdraw/data/30k/data_2_clean/valid/"
os.makedirs(data_2_clean_valid, exist_ok=True)

In [44]:
# Write one CSV per (possibly re-assigned) label.
# A single groupby pass hands out each label's rows in turn, instead of scanning
# the whole frame with a boolean mask once per label. sort=False keeps the groups
# in order of first appearance, and rows keep their original order within a group.
for cls, cls_df in tqdm(clean_dfs.groupby("new_word", sort=False)):
    cls_df.to_csv(os.path.join(data_2_clean_valid, cls + ".csv"), index=False)

100%|██████████| 340/340 [00:15<00:00, 22.56it/s]


In [45]:
# Show the first rows whose label was changed by the re-assignment
# ("word" differs from "new_word").
clean_dfs[clean_dfs["word"] != clean_dfs["new_word"]].head()

Unnamed: 0,countrycode,drawing,key_id,recognized,word,predict_word,prob,new_word
297,US,"[[[12, 12], [241, 241]], [[3, 7, 26, 46, 54], ...",5531471441821696,False,The Eiffel Tower,crayon,0.901299,crayon
321,US,"[[[0, 15, 45, 80, 100, 107, 115, 151, 187, 201...",6585399507943424,False,The Eiffel Tower,roller coaster,0.989047,roller coaster
930,IQ,"[[[3, 42, 46, 51, 79, 81, 84, 105, 106, 109, 1...",6070255761752064,False,The Eiffel Tower,castle,0.987388,castle
133,US,"[[[14, 21, 20, 113, 252, 255, 249, 142, 105, 5...",5748407941464064,False,The Great Wall of China,envelope,0.958819,envelope
566,US,"[[[4, 249, 253, 253, 1, 3], [4, 2, 11, 73, 71,...",4893926362185728,False,The Great Wall of China,keyboard,0.916405,keyboard
