In [1]:
import os
import re

import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from pathlib import Path
from collections import Counter
from sklearn.model_selection import train_test_split

In [2]:
# Show the TensorFlow version this notebook runs against (for reproducibility).
tf.__version__

'2.3.0'

In [3]:
# Let TensorFlow grow GPU memory on demand instead of reserving it all upfront.
# Memory growth has to be configured identically on every visible GPU, so the
# setting is applied to all of them rather than only to the first one.
physical_devices = tf.config.list_physical_devices('GPU')
if not physical_devices:
  print("No GPU detected; running on CPU.")
for gpu in physical_devices:
  try:
    tf.config.experimental.set_memory_growth(gpu, True)
  except (ValueError, RuntimeError) as err:
    # ValueError: invalid device; RuntimeError: the runtime is already
    # initialized, so the setting can no longer be changed.
    print("Invalid device or cannot modify virtual devices once initialized.")
    print(f"  {gpu.name}: {err}")

In [4]:
def preprocess(string):
    """Normalise a caption for character-level encoding.

    The text is lower-cased, then every run of non-word characters and/or
    underscores is collapsed into a single space. Leading or trailing
    separators therefore become a single leading or trailing space.
    """
    return re.sub(r"[\W_]+", " ", string.lower())

In [5]:
# Read the training split (JSON Lines), normalise the caption text and prefix
# each image path with the data directory, all in one chained expression.
train_df = pd.read_json('data/train.jsonl', lines = True).assign(
    text = lambda frame: frame["text"].apply(preprocess),
    img = lambda frame: "data/" + frame["img"],
)
train_df

Unnamed: 0,id,img,label,text
0,42953,data/img/42953.png,0,its their character not their color that matters
1,23058,data/img/23058.png,0,don t be afraid to love again everyone is not ...
2,13894,data/img/13894.png,0,putting bows on your pet
3,37408,data/img/37408.png,0,i love everything and everybody except for squ...
4,82403,data/img/82403.png,0,everybody loves chocolate chip cookies even hi...
...,...,...,...,...
8495,10423,data/img/10423.png,1,nobody wants to hang auschwitz me
8496,98203,data/img/98203.png,1,when god grants you a child after 20 years of ...
8497,36947,data/img/36947.png,1,gays on social media equality body positivity ...
8498,16492,data/img/16492.png,1,having a bad day you could be a siamese twin a...


In [6]:
# Read the dev split (JSON Lines, one record per line); it is used as the
# validation set in the cells below.
val_df = pd.read_json('data/dev.jsonl', lines = True)

In [7]:
# Prefix image paths with the data directory and normalise the caption text.
# The startswith guard keeps this cell idempotent: re-running it without
# re-loading val_df must not produce "data/data/..." paths.
_val_img = val_df["img"]
val_df["img"] = _val_img.where(_val_img.str.startswith("data/"), "data/" + _val_img)
val_df["text"] = val_df["text"].apply(preprocess)

In [8]:
# Display the validation frame after path prefixing and text preprocessing.
val_df

Unnamed: 0,id,img,label,text
0,8291,data/img/08291.png,1,white people is this a shooting range
1,46971,data/img/46971.png,1,bravery at its finest
2,3745,data/img/03745.png,1,your order comes to 37 50 and your white privi...
3,83745,data/img/83745.png,1,it is time to send these parasites back to the...
4,80243,data/img/80243.png,1,mississippi wind chime
...,...,...,...,...
495,83675,data/img/83675.png,0,i m gonna be like phelps one day
496,37198,data/img/37198.png,0,when you re so relaxed you can feel yourself g...
497,48670,data/img/48670.png,0,look at this sandwich maker club i found on wi...
498,9863,data/img/09863.png,0,diverse group of women


In [9]:
# Character vocabulary: every distinct character in the training captions.
# Sorted so that the character -> index mapping is identical across kernel
# restarts (plain set iteration order over strings varies between processes).
chars = sorted({char for label in train_df["text"] for char in label})

In [10]:
# Target size (in pixels) that every image is resized to when it is loaded.
img_width = 128
img_height = 128

In [11]:
# NOTE(review): not referenced in the cells visible here; presumably the total
# spatial downsampling applied by the model's conv/pool stack - TODO confirm.
downsample_factor = 4

In [12]:
# Length, in characters, of the longest preprocessed training caption.
max_len = max(map(len, train_df["text"]))
max_len

432

In [13]:
# Forward lookup: maps each character of the training vocabulary to an
# integer index. No out-of-vocabulary bucket and no mask token are reserved.
# NOTE(review): the vocabulary comes from the training captions only; confirm
# how characters that appear solely in the validation captions are handled.
char_to_num = tf.keras.layers.experimental.preprocessing.StringLookup(
    vocabulary = list(chars), num_oov_indices = 0, mask_token = None
)

# Inverse lookup: maps integer indices back to characters, built from the
# forward layer's vocabulary.
num_to_char = tf.keras.layers.experimental.preprocessing.StringLookup(
    vocabulary = char_to_num.get_vocabulary(), mask_token = None, invert = True
)

In [14]:
# Column aliases for the two splits: the train frame supplies the *_train
# series and the dev frame supplies the *_test series.
images_train, images_test = train_df["img"], val_df["img"]
text_train, text_test = train_df["text"], val_df["text"]
label_train, label_test = train_df["label"], val_df["label"]

In [None]:
def encode_single_sample(img_path, label, text):
    """Turn one (image path, label, caption) triple into model-ready tensors.

    Args:
        img_path: scalar string tensor, path to a PNG file.
        label: scalar numeric tensor holding the class label.
        text: scalar string tensor with the (already preprocessed) caption.

    Returns:
        dict with keys
          "image": float32 tensor of shape (img_height, img_width, 3),
          "label": float32 scalar,
          "text":  1-D integer tensor of character indices (variable length).
    """
    img = tf.io.read_file(img_path)
    img = tf.io.decode_png(img, channels = 3)
    # Converting uint8 -> float32 also rescales pixel values to [0, 1].
    img = tf.image.convert_image_dtype(img, tf.float32)
    img = tf.image.resize(img, [img_height, img_width])
    # Split the caption into unicode characters and map each to its index.
    text = char_to_num(tf.strings.unicode_split(text, input_encoding = "UTF-8"))
    # The original line cast an undefined name `x` to a non-existent dtype.
    # NOTE(review): float32 is chosen so the 0/1 label can feed a binary
    # cross-entropy loss directly; switch to an integer dtype if the
    # downstream loss expects class ids.
    label = tf.cast(label, tf.float32)
    return {"image": img, "label": label, "text": text}

In [None]:
def _build_dataset(image_paths, labels, texts, batch_size = 16):
    """Build a batched, prefetched tf.data pipeline over one data split.

    Each element is encoded with `encode_single_sample`. Captions have
    different lengths, so a plain `.batch()` cannot stack them; `padded_batch`
    pads every component to the longest example in the batch instead.

    NOTE(review): the default padding value is 0, which is also a valid
    character index because the lookup layer reserves no mask token. Keep
    track of caption lengths or reserve a padding index if the model must
    tell padding from real characters.
    """
    dataset = tf.data.Dataset.from_tensor_slices((image_paths, labels, texts))
    return (
        dataset.map(
            encode_single_sample, num_parallel_calls = tf.data.experimental.AUTOTUNE
        )
        .padded_batch(batch_size)
        .prefetch(buffer_size = tf.data.experimental.AUTOTUNE)
    )


train_dataset = _build_dataset(images_train, label_train, text_train)
validation_dataset = _build_dataset(images_test, label_test, text_test)