<a href="https://colab.research.google.com/github/malgorzatagwinner/text_classification/blob/main/sbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<b>Training overview</b>

In [13]:
!pip install -U sentence-transformers



In [14]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
from sentence_transformers import SentenceTransformer, models, util
import nltk
nltk.download('punkt')
import glob
import os

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
from tqdm.notebook import tqdm_notebook
import pickle

In [17]:
l_sents= []
sents = set()
for filename in tqdm_notebook(glob.glob(f'/content/drive/My Drive/corpus/*.txt')):
    sents = set()
    with open(filename, 'r', encoding="utf8", errors='ignore') as in_file: 
        text = in_file.read().replace('\n', ' ')
    sents |= set(nltk.sent_tokenize(text))
    sents = list(sents)
    _, tail = os.path.split(filename)
    auth = tail.split("_")[0]
    l_sents.append((auth,sents,tail))

#l_sents = l_sents[:50]


  0%|          | 0/100 [00:00<?, ?it/s]

In [18]:
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [19]:
import numpy as np

In [20]:
from os.path import exists

PATH = '/content/drive/My Drive/encodings/'
embeddings_l = []
authors_d = {}
authors_l = []
for i in tqdm_notebook(l_sents):
    filename = PATH + i[2] + '.pickle'
    if exists(filename): # load
        with open(filename, 'rb') as f:
            embeddings = pickle.load(f)
    else: # calculate and save
        embeddings = model.encode(i[1])
        with open(filename, 'wb') as f:
            pickle.dump(embeddings, f, pickle.HIGHEST_PROTOCOL)
    embeddings_l.append(embeddings)
    if not i[0] in authors_d:
        authors_d[i[0]] = len(authors_d)
    authors_l.append(authors_d[i[0]])

authors_l = np.array(authors_l)
model = 0

sents_len = max([x.shape[0] for x in embeddings_l])
print(sents_len)
for i in range(len(embeddings_l)):
    embeddings_l[i] = np.pad(embeddings_l[i], pad_width=((sents_len-embeddings_l[i].shape[0],0),(0,0)))
embeddings_l = np.stack(embeddings_l)

  0%|          | 0/50 [00:00<?, ?it/s]

23674


**Neural Network**


In [21]:
from tensorflow import keras
import tensorflow as tf

In [22]:
import random

In [23]:
#NUMBER = min(map(len, embeddings_l))
#sample = map(lambda x: random.sample(list(x), NUMBER), embeddings_l)
#x = tf.data.Dataset.from_tensor_slices(list(sample))

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(embeddings_l, authors_l, test_size=0.20)
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)
#X_train = x.take(80)
#X_test = x.skip(80)

#y_train = tf.data.Dataset.from_tensor_slices(list(map(lambda x: tf.convert_to_tensor(x),authors_l[:80])))
#y_test = tf.data.Dataset.from_tensor_slices(list(map(lambda x: tf.convert_to_tensor(x),authors_l[80:100])))

In [25]:
y_train.shape

(40,)

In [26]:
X_train.shape

(40, 23674, 384, 1)

In [27]:
from keras.models import Sequential
from keras import Input, Model
from keras.layers import Dense, Conv2D, Conv1D, MaxPooling2D, Reshape

'''
nn_model = Sequential()
#nn_model.add(Input(shape=(None,384,1)))
nn_model.add(Input(shape=(None,384)))
nn_model.add(Conv1D(filters=48, kernel_size=3, strides=1, activation="relu"))
nn_model.add(MaxPooling1D(pool_size=2, strides=4))
nn_model.add(Dense(len(authors_d)))
nn_model.compile(loss='binary_crossentropy',metrics=['accuracy'])
nn_model.summary()
'''

_inputs = Input(shape=(sents_len, 384, 1), batch_size=None)#embeddings_l.shape[0])
_outputs = _inputs
#_outputs = Reshape((384,1))(_outputs)
_outputs = Conv2D(filters=20, kernel_size=3, activation="relu")(_outputs)
_outputs = MaxPooling2D(pool_size=(1,64), strides=4)(_outputs)
_outputs = Dense(len(authors_d))(_outputs)
_outputs = Dense(1)(_outputs)
nn_model = Model(inputs=_inputs, outputs=_outputs)
nn_model.compile(loss='binary_crossentropy',metrics=['accuracy'])
nn_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 23674, 384, 1)]   0         
                                                                 
 conv2d (Conv2D)             (None, 23672, 382, 20)    200       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 5918, 80, 20)     0         
 )                                                               
                                                                 
 dense (Dense)               (None, 5918, 80, 27)      567       
                                                                 
 dense_1 (Dense)             (None, 5918, 80, 1)       28        
                                                                 
Total params: 795
Trainable params: 795
Non-trainable params: 0
_______________________________________________________________

In [None]:
history = nn_model.fit(X_train, y_train,
                    epochs=50,
                    verbose=False)
loss, accuracy = nn_model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = nn_model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

In [None]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

def plot_history(history):
    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()


In [None]:
plot_history(history)

In [None]:
import tensorflow as tf
# Define Sequential model with 3 layers
mmodel = keras.Sequential(
    [   Input(shape=(None, 3)),
        Dense(2, activation="relu", name="layer1"),
        #Dense(3, activation="relu", name="layer2"),
        #Dense(4, name="layer3"),
    ]
)
# Call model on a test input
x = tf.ones((3, 3))
y = mmodel(x)
y