In [1]:
from tensorflow.keras import layers
from tensorflow import keras
import tensorflow as tf

from sklearn.model_selection import train_test_split
from ast import literal_eval

import pathlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

2023-06-03 12:56:50.173089: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Resolve the data folder relative to the notebook's working directory
# (one level up, then into 'data') and load the preprocessed CSV from it.
DATA_DIR = pathlib.Path.cwd().parent.joinpath('data')
processed_csv_path = DATA_DIR.joinpath('processed_data.csv')

dataframe = pd.read_csv(processed_csv_path)

In [3]:
df = dataframe.copy()

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1154491 entries, 0 to 1154490
Data columns (total 4 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   Unnamed: 0  1154491 non-null  int64 
 1   Id          1154491 non-null  int64 
 2   Body        1154491 non-null  object
 3   Tag         1154491 non-null  object
dtypes: int64(2), object(2)
memory usage: 35.2+ MB


In [5]:
def get_size(x: str) -> int:
    """Count the tags in a stringified tag list.

    The input is the textual form of a Python list of tags, e.g.
    "['flex', 'actionscript-3']". The string is inspected directly
    (brackets and quotes removed, then split on commas) instead of being
    parsed.

    Args:
        x: String representation of a list of tags.

    Returns:
        The number of non-empty comma-separated entries; 0 for "[]".
    """
    inner = x.strip("[]").replace("'", "")
    # "".split(",") yields [""], so an empty list would otherwise be counted
    # as one tag; only count tokens that still hold text after trimming.
    return sum(1 for token in inner.split(",") if token.strip())

df['size'] = df['Tag'].apply(get_size)

In [6]:
df['size'].describe()

count    1.154491e+06
mean     1.959110e+00
std      9.438675e-01
min      1.000000e+00
25%      1.000000e+00
50%      2.000000e+00
75%      3.000000e+00
max      5.000000e+00
Name: size, dtype: float64

In [7]:
# Keep only the rows whose tag count is greater than one, then take an
# independent copy so later column assignments operate on their own frame.
has_multiple_tags = df['size'] > 1
df_f = df.loc[has_multiple_tags]
df_filtered = df_f.copy()

In [8]:
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 722200 entries, 0 to 1154489
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  722200 non-null  int64 
 1   Id          722200 non-null  int64 
 2   Body        722200 non-null  object
 3   Tag         722200 non-null  object
 4   size        722200 non-null  int64 
dtypes: int64(3), object(2)
memory usage: 33.1+ MB


In [9]:
def _parse_tags(value):
    """Parse a stringified tag list into a real list; pass through parsed values."""
    # Guarding on str keeps the cell re-runnable: on a second execution the
    # column already holds lists, and literal_eval raises ValueError on those.
    return literal_eval(value) if isinstance(value, str) else value


df_filtered['Tag'] = df_filtered['Tag'].apply(_parse_tags)
df_filtered['Tag'].values[:5]

array([list(['flex', 'actionscript-3']), list(['sql', 'asp.net']),
       list(['c#', '.net']), list(['c++', 'oop', 'class']),
       list(['.net', 'web-services'])], dtype=object)

In [10]:
# Random split into train / validation / test sets.
# NOTE: this split is NOT stratified -- each row carries a list of tags
# (multi-label), which cannot be passed to train_test_split's `stratify`.
SEED = 42  # fixed seed so the three splits are identical on every run

train_df, holdout_df = train_test_split(
    df_filtered,
    test_size=0.15,
    random_state=SEED,
)

# Carve the validation set out of the 15% hold-out; the remainder is the test
# set. Building test_df with a non-mutating drop keeps this cell idempotent.
val_df = holdout_df.sample(frac=0.3, random_state=SEED)
test_df = holdout_df.drop(val_df.index)

print(f"Number of rows in training set: {len(train_df)}")
print(f"Number of rows in test set: {len(test_df)}")
print(f"Number of rows in validation set: {len(val_df)}")

Number of rows in training set: 613870
Number of rows in test set: 75831
Number of rows in validation set: 32499


In [11]:
# Multi-hot encoder for the tag lists: the StringLookup layer learns its
# vocabulary from every tag list in df_filtered (ragged, since rows hold a
# varying number of tags).
lookup = layers.StringLookup(output_mode="multi_hot")
tags = tf.ragged.constant(df_filtered["Tag"].values)
lookup.adapt(tags)

# Vocabulary learned by the layer; used to decode multi-hot vectors.
vocab = lookup.get_vocabulary()

2023-06-03 12:57:13.846178: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-06-03 12:57:13.847036: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2023-06-03 12:57:14.341919: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Pla

In [12]:
def invert_multi_hot(encoded_labels):
    """Decode a multi-hot label vector into the vocabulary entries it marks.

    Every position whose value equals 1.0 is looked up in the module-level
    `vocab` list.

    Args:
        encoded_labels: array of 0./1. flags (one flag per vocabulary entry).

    Returns:
        NumPy array holding the vocabulary entries at the flagged positions.
    """
    hot_mask = encoded_labels == 1.0
    # argwhere yields one row of coordinates per hit; keep the first coordinate.
    hot_positions = np.argwhere(hot_mask)[:, 0]
    return np.asarray(vocab).take(hot_positions)


print("Vocabulary:\n")
len(vocab)

Vocabulary:



301

In [13]:
sample_label = df_filtered["Tag"].iloc[0]
print(f"Original label: {sample_label}")

label_binarized = lookup([sample_label])
print(f"Label-binarized representation: {label_binarized}")

Original label: ['flex', 'actionscript-3']
Label-binarized representation: [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

In [14]:
train_df['Body'].apply(lambda x: len(x.split(" "))).describe()

count    613870.000000
mean        217.639556
std         247.414756
min           5.000000
25%          94.000000
50%         153.000000
75%         253.000000
max       10871.000000
Name: Body, dtype: float64

In [15]:
max_seqlen = 150
batch_size = 128

def make_dataset(dataframe, is_train=True):
    """Build a batched tf.data pipeline of (Body text, multi-hot tags) pairs.

    Args:
        dataframe: frame providing a "Body" column and a "Tag" column.
        is_train: when truthy, examples are shuffled (buffer of
            ``batch_size * 10``) before batching.

    Returns:
        A ``tf.data.Dataset`` batched by the module-level ``batch_size``.
    """
    ragged_tags = tf.ragged.constant(dataframe["Tag"].values)
    # Encode all tag lists up front with the module-level `lookup` layer.
    multi_hot_tags = lookup(ragged_tags).numpy()
    bodies = dataframe["Body"].values

    pipeline = tf.data.Dataset.from_tensor_slices((bodies, multi_hot_tags))
    if is_train:
        pipeline = pipeline.shuffle(batch_size * 10)
    return pipeline.batch(batch_size)

In [16]:
train_dataset = make_dataset(train_df, is_train=True)
validation_dataset = make_dataset(val_df, is_train=False)
test_dataset = make_dataset(test_df, is_train=False)

2023-06-03 12:57:32.324641: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 739099480 exceeds 10% of free system memory.
2023-06-03 12:57:32.974672: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 739099480 exceeds 10% of free system memory.


In [17]:
text_batch, label_batch = next(iter(train_dataset))

# Show the first five texts of the batch next to their decoded tag names.
for text, encoded in zip(text_batch[:5], label_batch[:5]):
    print(f"Abstract: {text}")
    print(f"Label(s): {invert_multi_hot(encoded.numpy())}")
    print(" ")

2023-06-03 12:57:33.998110: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 739099480 exceeds 10% of free system memory.


Abstract: b' I have a pretty simple looking problem but I can t seem to find the answer anywhere Given I have several arrays of tasks of different types e g Task lt Dog gt dogTasks GetDogTasks Task lt Cat gt catTasks GetCatTasks Task lt Fish gt fishTasks GetFishTasks code pre What would be the best way to code WaitAll code for these tasks '
Label(s): ['c#' 'asynchronous']
 
Abstract: b' Ok I have a cookie set and I can clearly see it if I go to private data in Firefox ok so when I echo it on one page in a certain directory it works www example com dir but on the index page of the site www example com it wont echo it says the cookie is not set Yes I have cookies enabled yes I tried clearing cache and all that Any ideas PHP btw '
Label(s): ['php' 'cookies']
 
Abstract: b' I have the following code that gives the error blockquote Default parameter specifiers are not permitted blockquote How can this be fixed bool listSubscribe string apikey string id string email address string merge vars

2023-06-03 12:57:34.500855: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype float and shape [613870,301]
	 [[{{node Placeholder/_1}}]]
2023-06-03 12:57:34.501062: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype float and shape [613870,301]
	 [[{{node Placeholder/_1}}]]


In [18]:
# Collect the distinct lower-cased, whitespace-separated tokens found in the
# training bodies. Streaming row by row means only one row's token list is
# alive at a time, instead of a Series of token lists for every row plus a
# throwaway Series of None values returned by apply().
vocabulary = set()
for body in train_df["Body"]:
    vocabulary.update(body.lower().split())

vocabulary_size = len(vocabulary)
print(vocabulary_size)

: 

: 