# Exercise: multi-class classification on Stack Overflow questions

In [1]:
import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import tensorflow as tf  
from tensorflow.keras import layers
from tensorflow.keras import losses

In [6]:
url = 'https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz'

# Download the archive and unpack it directly into the current working
# directory (cache_dir='.' with an empty cache_subdir).
dataset = tf.keras.utils.get_file(
    fname='stack_overflow_16k',
    origin=url,
    untar=True,
    cache_dir='.',
    cache_subdir='',
)

# NOTE(review): assumes the extracted files sit in a 'stack_overflow' folder
# next to the downloaded file -- confirm on a clean checkout.
dataset_dir = os.path.join(os.path.dirname(dataset), 'stack_overflow')

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz


In [7]:
# Show the top-level entries of the extracted dataset directory.
os.listdir(dataset_dir)

['README.md', 'test', 'train']

In [8]:
# Path of the training split inside the dataset directory; the bare name on
# the last line displays it as the cell output.
train_dir = os.path.join(dataset_dir, 'train')
train_dir

'.\\stack_overflow\\train'

In [9]:
# Show the entries of the training directory (the listing has one folder per
# programming-language tag).
os.listdir(train_dir)

['csharp', 'java', 'javascript', 'python']

In [10]:
# Peek at one raw training example to see what a question looks like.
# The path components are passed separately so os.path.join chooses the
# separator, and the encoding is pinned so the read does not depend on the
# platform's default locale encoding (e.g. cp1252 on Windows).
# NOTE(review): assumes the dataset files are UTF-8 encoded -- confirm.
sample_file = os.path.join(train_dir, 'python', '15.txt')
with open(sample_file, encoding='utf-8') as f:
    print(f.read())

"float rounding problems in blank amount = 0.002638309660058967.price = 1392.18..lowest_ask = 1391.6..result = price*amount/lowest_ask..print(result)...the above code will print out:..0.002639409271731024...however when i perform the calculation here: http://web2.0calc.com/.it gives me: 0.0026394092717310237698..so obviously blank is rounding up the result of this calculation......my question is, how do you prevent blank from rounding up result? i.e. i want result to be: 0.002639409271731023"


In [27]:
batch_size = 32
seed = 42

# Training subset of an 80/20 split of the files under the training directory.
# The path comes from train_dir (built from dataset_dir earlier) rather than a
# second hard-coded literal, so the loader always follows the download
# location. The validation subset must be created with the same seed and
# validation_split so the two subsets do not overlap.
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed
)

Found 8000 files belonging to 4 classes.
Using 6400 files for training.


In [29]:
# Print the first two raw questions of a single training batch together with
# their integer labels.
for text_batch, label_batch in raw_train_ds.take(1):
    for i in range(2):
        print("Questions", text_batch[i].numpy())
        print("Label", label_batch[i].numpy())

Questions b'"blank8 why is my solution faster than the neat solution? (hackerrank chocolate feast) edit: simplified my solution..edit: removed opinion based secondary question...background: atarted learning blank a week or two ago using hackerranks problems as exercises and stackoverflow search + google as my teacher, i\'ve had some limited experience learning other languages...i did the exercise my own ""noobish learner way"" which i can\'t help but feel is a ""botched job"" when i see ""neat &amp; short"" solutions...however, when submitting both solutions one after another a couple of times i found the ""neat"" solution was quite a bit slower. ..i vaguely remember something about % operations being costly, is mine faster because of no % operations or is there more to it than just that?..exercise: https://www.hackerrank.com/challenges/chocolate-feast..neat solution from discussion:..import blank.io.*;.import blank.util.*;..public class solution {.    static int cc; .    public static

In [31]:
# Class name that integer label 0 stands for.
raw_train_ds.class_names[0]

'csharp'

In [32]:
# Class name that integer label 1 stands for.
raw_train_ds.class_names[1]

'java'

In [33]:
# Class name that integer label 2 stands for.
raw_train_ds.class_names[2]

'javascript'

In [34]:
# Class name that integer label 3 stands for.
raw_train_ds.class_names[3]

'python'

In [36]:
# Validation subset: the remaining 20% of the training directory. It uses the
# same seed and validation_split as the training subset so the two do not
# overlap, and reads from train_dir instead of a duplicated path literal.
raw_val_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed
)

Found 8000 files belonging to 4 classes.
Using 1600 files for validation.


In [37]:
# Held-out test set, used only for the final evaluation. The path is derived
# from dataset_dir so it follows the download location instead of relying on
# a hard-coded relative path.
raw_test_ds = tf.keras.utils.text_dataset_from_directory(
    os.path.join(dataset_dir, 'test'),
    batch_size=batch_size
)

Found 8000 files belonging to 4 classes.


In [72]:
max_features = 10000    # vocabulary size cap passed as max_tokens
sequence_length = 250   # every question is padded/truncated to this many tokens

# Turns raw question strings into fixed-length sequences of integer token ids.
# standardize='lower' only lower-cases the text.
# NOTE(review): punctuation is therefore kept attached to tokens -- confirm
# this is intended rather than the default 'lower_and_strip_punctuation'.
vectorize_layer = layers.TextVectorization(
    max_tokens=max_features,
    standardize='lower',
    output_mode='int',
    output_sequence_length=sequence_length,
)

In [73]:
# Build the vocabulary from the training questions only: drop the labels,
# then let the layer learn its token index from the remaining text.
train_text = raw_train_ds.map(lambda question, _label: question)
vectorize_layer.adapt(train_text)

In [74]:
def vectorize_text(text, label):
    """Convert raw text to token ids, passing the label through unchanged.

    Args:
        text: string tensor holding one question or a batch of questions.
        label: label(s) belonging to ``text``; returned as-is.

    Returns:
        A ``(token_ids, label)`` tuple, where ``token_ids`` is the output of
        ``vectorize_layer`` applied to ``text`` with a trailing axis added.
    """
    # Add a trailing axis so each string becomes its own row for the layer.
    text_column = tf.expand_dims(text, -1)
    token_ids = vectorize_layer(text_column)
    return token_ids, label

In [75]:
# Take one batch and inspect its first example before and after vectorization.
text_batch, label_batch = next(iter(raw_train_ds))
first_question = text_batch[0]
first_label = label_batch[0]

print("Question: ", first_question)
print("Label: ", raw_train_ds.class_names[first_label])
print("Vectorized question: ", vectorize_text(first_question, first_label))

Question:  tf.Tensor(b'"how to change data format in write function in blank? how to change the data format in f.write function? ..loaded_data = 349.00  or 3.00..i want to change data format in write function like %6f in print function. ..ex)  349.00 -> 349.000000 ,   3.00 -> 3.000000..f = open(""test.txt"", \'w\').f.write( str.(loaded_data).zfill(?) )  ...what is the code that performs above function?"\n', shape=(), dtype=string)
Label:  python
Vectorized question:  (<tf.Tensor: shape=(1, 250), dtype=int64, numpy=
array([[  90,    4,  174,   99,  375,    7,  167,   53,    7,  278,   41,
           4,  174,    2,   99,  375,    7,    1, 1433,    1,    3,    1,
          46,    1,   45,    4,  174,   99,  375,    7,  167,   53,   51,
           1,    7,   88,  702,    1,    1, 2335,    1,  222,    1, 2335,
           1,    3,    1,    1,    1,  367, 4370,    8,    2,   37,   15,
        3890,  282, 5033,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    

In [76]:
# Look up which tokens two example ids stand for; the vocabulary list is
# fetched once and indexed twice.
vocabulary = vectorize_layer.get_vocabulary()
print("55 => ", vocabulary[55])
print("550 => ", vocabulary[550])

55 =>  static
550 =>  const


In [77]:
# Apply the text vectorization to every split, yielding (token_ids, label)
# pairs in place of (raw_text, label).
train_ds, val_ds, test_ds = (
    raw_split.map(vectorize_text)
    for raw_split in (raw_train_ds, raw_val_ds, raw_test_ds)
)

### Configuring the dataset for performance

In [78]:
AUTOTUNE = tf.data.AUTOTUNE

# Cache each vectorized split and prefetch with an automatically tuned buffer
# size. Each name is rebound from its own previous value, so re-running this
# cell wraps the already cached/prefetched datasets once more.
train_ds, val_ds, test_ds = (
    split.cache().prefetch(buffer_size=AUTOTUNE)
    for split in (train_ds, val_ds, test_ds)
)

### Creating the model

In [87]:
embedding_dim = 16

# Seed Python, NumPy and TensorFlow before the weights are created so that
# initialisation, dropout and training are repeatable on a fresh run.
tf.keras.utils.set_random_seed(seed)

# One output logit per class; derived from the dataset instead of a literal
# so the model follows the number of label folders.
num_classes = len(raw_train_ds.class_names)

# Token ids -> embeddings -> average over the sequence axis -> class logits.
# The final Dense layer has no activation: it emits raw logits.
model = tf.keras.Sequential([
    layers.Embedding(max_features + 1, embedding_dim),
    layers.Dropout(0.2),
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.2),
    layers.Dense(num_classes)
])

model.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, None, 16)          160016    
                                                                 
 dropout_16 (Dropout)        (None, None, 16)          0         
                                                                 
 global_average_pooling1d_8  (None, 16)                0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dropout_17 (Dropout)        (None, 16)                0         
                                                                 
 dense_9 (Dense)             (None, 4)                 68        
                                                                 
Total params: 160084 (625.33 KB)
Trainable params: 160084 (625.33 KB)
Non-trainable params: 0 (0.00 Byte)
______________

### Loss function and optimizer

In [88]:
# from_logits=True because the model's last Dense layer has no activation and
# therefore outputs raw logits; the sparse variant of the loss takes integer
# class ids as labels.
model.compile(
    optimizer='adam',
    loss=losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

### Training the model

In [89]:
epochs = 40

# Train on the training split and evaluate on the validation split after
# every epoch; the returned History object keeps the per-epoch metrics.
history = model.fit(
    train_ds,
    epochs=epochs,
    validation_data=val_ds,
)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


### Evaluating the model

In [90]:
# Final check on the held-out test split. evaluate() returns the loss followed
# by the metrics given to compile(), which here is accuracy only.
loss, accuracy = model.evaluate(test_ds)

print("Loss = ", loss)
print("Accuracy = ", accuracy)

Loss =  0.5244626998901367
Accuracy =  0.8028749823570251


In [91]:
# Per-epoch metrics recorded by model.fit, keyed by metric name.
history_dict = history.history

# Training accuracy per epoch. The original lookup used the empty string '',
# which is not one of the recorded keys and raises KeyError.
acc = history_dict['accuracy']

# Display the available metric names as the cell output.
history_dict.keys()

dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])