In [4]:
import pandas as pd
import random
import tensorflow as tf 

from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers import Embedding
from tensorflow.keras import Sequential
from sklearn.model_selection import train_test_split

## Лабораторна робота 3
## Абрамова Марія

### Get a text dataset

In [5]:
# Load the siamese-neural-network sentences from a local CSV (default comma separator).
siameseData = pd.read_csv("siamese_nn.csv")
# Disabled string label; the preview of this frame already shows a numeric `target` column (value 1).
# siameseData["target"] = "siamese neural network"

In [6]:
# Load the two ';'-separated corpora and tag every row with its numeric class id:
# 3 = emotion recognition, 2 = coronavirus disease.
emotionData = pd.read_csv("emotion_recognition.csv", sep=";").assign(target=3)
covidData = pd.read_csv("coranavirus_disease.csv", sep=";").assign(target=2)

In [7]:
# Preview the first ten rows of the siamese corpus.
siameseData.head(10)

Unnamed: 0,text,target
0,Similarity has always been a key aspect in com...,1
1,"Any time two element vectors are compared, man...",1
2,But if the comparison has to be applied to mor...,1
3,"In these cases, a siamese neural network may b...",1
4,The two neural networks are both feedforward p...,1
5,they work parallelly in tandem and compare th...,1
6,The output generated by a siamese neural netwo...,1
7,In this overview we first describe the siamese...,1
8,"Additionally, we list the programming language...",1
9,This section reviews existing tracking method...,1


In [8]:
# Stack the three corpora into a single frame; ignore_index=True renumbers
# the rows consecutively instead of keeping each source frame's own index.
train_df = pd.concat(
    objs=[emotionData, covidData, siameseData],
    ignore_index=True,
)
train_df

Unnamed: 0,text,target
0,The primary objective of Speech Emotion Recogn...,3
1,"The ideal way to reach this objective, as the ...",3
2,"Nowadays, we are at the dawn of Deep Learning ...",3
3,SER is not an exception since convolutional ne...,3
4,The main advantage of DL is the fact that it r...,3
...,...,...
295,The network applies a ReLU activation function...,1
296,Thus the kth filter map in each layer takes th...,1
297,We have presented a strategy for performing on...,1
298,We outlined new results comparing the performa...,1


### Shuffle training dataframe


In [9]:
# Shuffle all rows: frac=1 samples the whole frame, random_state pins the resulting order.
train_shuffle=train_df.sample(frac=1,random_state=9) 
train_shuffle.head(10)

Unnamed: 0,text,target
39,In order to demonstrate the high effectivennes...,3
38,"In this experimental work, we have used Multiv...",3
183,Hao has constructed the ensemble predictor of ...,2
13,. After the last convolutional layer we divide...,3
90,Speech is the main and direct means of transmi...,3
165,Region-wise this distribution depicts total de...,2
44,. It contains 535 utterances spoken by 10 acto...,3
4,The main advantage of DL is the fact that it r...,3
57,The researchers are still debating for what fe...,3
149,"In addition, the introduced method learns to g...",2


In [10]:
# Check the class balance: number of rows per target id.
train_shuffle.target.value_counts()

3    100
2    100
1    100
Name: target, dtype: int64

In [11]:
# Hold out 10% of the shuffled rows as a test split; the fixed random_state
# makes the split repeatable.
train_sentences, test_sentences, train_labels, test_labels = train_test_split(
    train_shuffle["text"].to_numpy(),
    train_shuffle["target"].to_numpy(),
    test_size=0.1,
    random_state=40,
)

# Sizes of the four resulting arrays, in the order they were unpacked.
tuple(len(part) for part in (train_sentences, test_sentences, train_labels, test_labels))

(270, 30, 270, 30)

### Converting text into numbers 

In [12]:
# Average number of whitespace-separated tokens per training sentence,
# rounded to an integer. Despite the "max" in the name this is the mean length;
# it is later passed to the vectorizer as its output sequence length.
max_sq_len = round(
    sum(len(sentence.split()) for sentence in train_sentences) / len(train_sentences)
)
max_sq_len

24

In [13]:
# Text -> integer-id preprocessing layer; its vocabulary is learned in a later cell via adapt().
# NOTE(review): pad_to_max_tokens is documented for the multi_hot/count/tf_idf output
# modes -- confirm it has any effect together with output_mode="int".
text_vectorizer = TextVectorization(max_tokens=10000,  # upper bound on the vocabulary size
                                    standardize="lower_and_strip_punctuation", # lowercase the text and drop punctuation
                                    split ="whitespace", # tokenize on whitespace
                                    output_mode="int", # emit one integer id per token
                                    output_sequence_length=max_sq_len, # pad/truncate every sentence to this many ids
                                    pad_to_max_tokens=True)

2021-11-18 17:54:10.539109: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-11-18 17:54:10.592881: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2021-11-18 17:54:10.593029: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (mariia-HP-ProBook-430-G2): /proc/driver/nvidia/version does not exist
2021-11-18 17:54:10.593711: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [14]:
# Fit the vectorizer's vocabulary on the training sentences only (the test split is not used here).
text_vectorizer.adapt(train_sentences)

2021-11-18 17:54:10.979157: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


In [15]:
### Embedding

# Trainable lookup table that turns each token id into a 128-dimensional vector.
embedding = Embedding(
    input_dim=10000,  # same number as the vectorizer's max_tokens
    output_dim=128,
    input_length=max_sq_len,
    name='embeding_1',
)

In [16]:
# Pick one training sentence at random and show its embedded representation.
# No random.seed call is visible in this notebook section, so the chosen sentence
# can differ between runs.
random_text = random.choice(train_sentences)
print(random_text)
# Vectorize the single sentence (wrapped in a list to form a batch of one),
# then look up the embedding vector of every token id.
sample_embed = embedding(text_vectorizer([random_text]))
sample_embed 

In addition to asymptomatic infection, SARS-CoV-2 virus, the disease’s etiological agent, is capable of producing acute respiratory syndrome, varying between mild cases (around 80%) to very severe cases (between 5% and 10%), which develop respiratory insufficiency and require medical care in hospital. 


<tf.Tensor: shape=(1, 24, 128), dtype=float32, numpy=
array([[[-0.02726536,  0.00975678,  0.00276816, ..., -0.00205692,
          0.03157706,  0.04043994],
        [ 0.047692  , -0.03201105,  0.03351451, ...,  0.04547742,
         -0.00546256, -0.04595573],
        [-0.02897015, -0.02936975,  0.00482843, ..., -0.02307955,
         -0.02342745, -0.03784245],
        ...,
        [ 0.04430914, -0.00957718, -0.04431483, ...,  0.01402852,
          0.03528208, -0.00907106],
        [ 0.01899046,  0.03779479,  0.00416166, ...,  0.04132314,
         -0.02292631, -0.0364764 ],
        [ 0.0203934 ,  0.03986657,  0.00136803, ..., -0.01176503,
          0.01919306, -0.0465233 ]]], dtype=float32)>

## Helper Function to evaluate: accuracy, precision, recall, f1-score

In [17]:

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """
  Compute accuracy, precision, recall and F1 for a set of class predictions.

  Precision, recall and F1 are averaged over the classes with
  average="weighted". Accuracy is returned as a percentage (0-100), while
  the other three values are returned exactly as scikit-learn reports them.

  Args:
  -----
  y_true = true labels in the form of a 1D array
  y_pred = predicted labels in the form of a 1D array

  Returns a dictionary with the keys "accuracy", "precision", "recall", "f1".
  """
  # Accuracy is scaled to percent; the remaining metrics are left untouched.
  accuracy_pct = accuracy_score(y_true, y_pred) * 100
  # The fourth returned element (support) is not needed, so only the first three are kept.
  precision, recall, f1 = precision_recall_fscore_support(
      y_true, y_pred, average="weighted"
  )[:3]
  return dict(accuracy=accuracy_pct, precision=precision, recall=recall, f1=f1)

## Model 0: Naive Bayes

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [19]:
# Baseline pipeline: TF-IDF features fed into a multinomial Naive Bayes classifier.
model_0 = Pipeline(steps=[
    ("tfidf", TfidfVectorizer()),
    ("clf", MultinomialNB()),
])

In [20]:
# Train the baseline on the training split, then predict labels for the held-out sentences.
model_0.fit(train_sentences,train_labels)
y_hat0=model_0.predict(test_sentences)

In [21]:
# Score the baseline pipeline on the held-out split and display the value.
baseline_score= model_0.score(test_sentences,test_labels)

baseline_score

0.9666666666666667

In [22]:
# Full metric breakdown for the baseline predictions (true labels first, predictions second).
calculate_results(test_labels,y_hat0)

{'accuracy': 96.66666666666667,
 'precision': 0.9694444444444444,
 'recall': 0.9666666666666667,
 'f1': 0.9664109121909632}

In [42]:
# Embed the previously sampled sentence again.
# NOTE(review): this repeats the earlier `sample_embed` call, and the cell's execution
# count (42) is higher than the cells that follow it, so it was run out of order --
# consider removing it or moving it to where it belongs.
y = embedding(text_vectorizer([random_text]))
y

<tf.Tensor: shape=(1, 24, 128), dtype=float32, numpy=
array([[[-0.07142892,  0.05905135, -0.04300959, ..., -0.05127347,
          0.08046202, -0.00239245],
        [ 0.02947135, -0.01375762,  0.01498308, ...,  0.0268305 ,
          0.01335344, -0.06382021],
        [-0.07319129,  0.02022594, -0.04089615, ..., -0.07241718,
          0.02536135, -0.08075272],
        ...,
        [ 0.00753059,  0.03182925, -0.08248116, ..., -0.02725436,
          0.07600991, -0.04474253],
        [ 0.00519415,  0.05171209, -0.00996297, ...,  0.02707604,
         -0.00852028, -0.04991342],
        [ 0.0023114 ,  0.06064826, -0.01731135, ..., -0.03225316,
          0.03919969, -0.0639346 ]]], dtype=float32)>

## Model 1: Feed forward NN

In [36]:
from tensorflow.keras import layers



# Functional-API classifier:
# raw string -> token ids -> token embeddings -> mean over tokens -> 3-way softmax.
inputs = layers.Input(shape=(1,), dtype="string")
x = layers.GlobalAveragePooling1D()(embedding(text_vectorizer(inputs)))
outputs = layers.Dense(3, activation="softmax")(x)

model_1 = tf.keras.Model(inputs=inputs, outputs=outputs, name="model_1_dense")


In [37]:
# Configure training: Adam with default settings, accuracy reported per epoch.
# categorical_crossentropy compares the softmax output with one-hot encoded targets.
model_1.compile(loss="categorical_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [38]:
# Print the layer-by-layer summary.
# NOTE(review): the saved output below lists a Dense layer with output shape (None, 1),
# while model_1 is defined with Dense(3) -- the output looks stale; re-run to refresh it.
model_1.summary()

Model: "model_1_dense"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 1)]               0         
_________________________________________________________________
text_vectorization (TextVect (None, 24)                0         
_________________________________________________________________
embeding_1 (Embedding)       (None, 24, 128)           1280000   
_________________________________________________________________
global_average_pooling1d_1 ( (None, 128)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 129       
Total params: 1,280,129
Trainable params: 1,280,129
Non-trainable params: 0
_________________________________________________________________


In [39]:
# Fit the model
# The `target` column holds the class ids 1, 2 and 3, while model_1 ends in a
# 3-unit softmax trained with categorical_crossentropy, which expects one-hot
# rows of width 3. Shift the ids to 0..2 and one-hot encode them so the
# targets match the model output instead of passing the raw integers.
train_labels_one_hot = tf.keras.utils.to_categorical(train_labels - 1, num_classes=3)
test_labels_one_hot = tf.keras.utils.to_categorical(test_labels - 1, num_classes=3)

model_1_history = model_1.fit(train_sentences,
                              train_labels_one_hot,
                              epochs=5,
                              validation_data=(test_sentences, test_labels_one_hot))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
from tensorflow.keras import layers
# NOTE(review): this model ends in a single sigmoid unit (binary output), but the
# `target` column built earlier contains three classes (1, 2, 3) -- confirm whether a
# binary model is really intended here.
# NOTE(review): `text_vectorizer` and `embedding` are the same layer objects already used
# by model_1, and the model name "model_1_dense" is also reused -- confirm this sharing is wanted.
inputs = layers.Input(shape=(1,), dtype="string") # each example is a single string
x = text_vectorizer(inputs) # turn the input text into integer token ids
x = embedding(x) # look up an embedding vector for every token id
x = layers.GlobalAveragePooling1D()(x) # average over the token axis so each example becomes one vector
outputs = layers.Dense(1, activation="sigmoid")(x) # single sigmoid unit: one value in (0, 1) per example
model_1_5 = tf.keras.Model(inputs, outputs, name="model_1_dense") # construct the model

In [None]:
# Compile model: binary cross-entropy loss, Adam with default settings, accuracy metric.
# NOTE(review): binary_crossentropy assumes targets in [0, 1]; the labels used in this
# notebook are 1, 2 and 3 -- confirm the loss matches the task.
model_1_5.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [None]:
# Print the layer-by-layer summary of model_1_5.
model_1_5.summary()

Model: "model_1_dense"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_13 (InputLayer)        [(None, 1)]               0         
_________________________________________________________________
text_vectorization_3 (TextVe (None, 24)                0         
_________________________________________________________________
embeding_1 (Embedding)       (None, 24, 128)           1280000   
_________________________________________________________________
global_average_pooling1d_5 ( (None, 128)               0         
_________________________________________________________________
dense_15 (Dense)             (None, 1)                 129       
Total params: 1,280,129
Trainable params: 1,280,129
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Fit model_1_5 for five epochs, validating on the held-out split after each epoch.
# NOTE(review): the result is stored in `model_1_history`, which overwrites the history
# returned earlier by model_1.fit -- confirm that is intended.
# NOTE(review): train_labels/test_labels hold the class ids 1, 2, 3, while the model was
# compiled with binary_crossentropy on a single sigmoid output.
model_1_history = model_1_5.fit(train_sentences, # raw strings are accepted because the vectorizer is part of the model
                              train_labels,
                              epochs=5,
                              validation_data=(test_sentences, test_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Run model_1_5 on the held-out sentences; the sigmoid output gives one value in (0, 1) per sentence.
model_1_pred_probs = model_1_5.predict(test_sentences)
# Show only the first ten predictions.
model_1_pred_probs[:10] 

array([[0.6420795 ],
       [0.67691565],
       [0.62670267],
       [0.6556613 ],
       [0.6573303 ],
       [0.6160018 ],
       [0.5897418 ],
       [0.6404785 ],
       [0.6620507 ],
       [0.5861466 ]], dtype=float32)