In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): no visible cell reads from /content/drive (the CSV is loaded
# from /content/sample_data) — confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import regularizers

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Record the TensorFlow version this notebook was executed with.
print(tf.__version__)

2.7.0


In [8]:
# Load the dataset; index_col=0 makes the first CSV column the DataFrame index.
df = pd.read_csv("/content/sample_data/data.csv", index_col=0)

# Preview the first rows (columns: main_text, target, text2).
df.head()

Unnamed: 0,main_text,target,text2
0,Oil prices behind downward revisions to inflat...,1,oil price behind downward revision inflation e...
1,The European Central Bank (ECB) today publishe...,1,european central bank ecb today publishes dece...
2,"353,000 counterfeit euro banknotes were withdr...",1,counterfeit euro banknote withdrawn circulatio...
3,The Governing Council of the European Central ...,1,governing council european central bank today ...
4,The Governing Council of the European Central ...,1,governing council european central bank ecb de...


In [9]:
# Inspect dtypes and per-column non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1216 entries, 0 to 1215
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   main_text  1215 non-null   object
 1   target     1216 non-null   int64 
 2   text2      1215 non-null   object
dtypes: int64(1), object(2)
memory usage: 38.0+ KB


In [10]:
# Remove rows that contain missing values; inplace=True mutates df directly.
df.dropna(inplace=True)

In [11]:
# Re-inspect the frame to confirm the non-null counts now match the row count.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1215 entries, 0 to 1215
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   main_text  1215 non-null   object
 1   target     1215 non-null   int64 
 2   text2      1215 non-null   object
dtypes: int64(1), object(2)
memory usage: 38.0+ KB


In [12]:
# NOTE(review): duplicate of the earlier dropna call. The preceding df.info()
# output already reports equal non-null counts for every column, so this
# should remove nothing — candidate for deletion.
df.dropna(inplace=True)

In [13]:
# Shuffle all rows before the positional test/train slicing done in later cells.
# frac=1 returns every row in random order without hard-coding the row count
# (a fixed n raises if the frame has fewer rows and silently drops data if it
# has more). random_state makes the shuffle, and therefore the split,
# reproducible across kernel restarts.
df = df.sample(frac=1, random_state=42)

In [14]:
# --- Hyperparameters and preprocessing settings ---

# Embedding / sequence geometry
embedding_dim = 100   # number of columns in the embedding matrix
max_length = 160      # length every sequence is padded/truncated to

# pad_sequences behaviour: pad and cut at the end of each sequence
padding_type = 'post'
trunc_type = 'post'

# Out-of-vocabulary placeholder string
# NOTE(review): defined here but not passed to the Tokenizer in the visible cells.
oov_tok = "<OOV>"

# Dataset sizing
training_size = 1215  # upper index used when slicing the padded sequences
test_portion = 0.1    # fraction of training_size held out as the test set

# NOTE(review): not referenced by any visible cell.
corpus = []

In [15]:
# Pull the pre-processed text column out of the frame as a plain Python list.
sentences = df["text2"].tolist()
type(sentences)

list

In [16]:
# Eyeball the first document (after shuffling) to sanity-check the cleaned text.
sentences[0]

'joint statement european commission european central bank european commission commission european central bank ecb share common vision single euro payment area sepa process leading realisation institution closely process encourage european banking industry relevant stakeholder create technical condition realisation single euro payment area end vision commission ecb see sepa integrated market payment service subject effective competition distinction national payment within euro area call removal technical legal commercial barrier current national payment market introduction euro single currency euro area completed sepa become reality consumer business government able make cashless payment throughout euro area single payment account anywhere euro area using single set payment instrument easily efficiently safely make payment today domestic context creating open common standard overcome technical commercial barrier fostering effective competition improved payment service level benefit se

In [17]:
# Build the word -> integer-id vocabulary from the corpus.
# NOTE(review): the tokenizer is fitted on every sentence, including those
# later sliced off as the test set, and `oov_tok` is defined but not passed
# here (Tokenizer(oov_token=...)) — confirm both are intended.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

In [18]:
# word_index maps each word to its integer id; vocab_size is the number of
# distinct words. The embedding matrix built later allocates vocab_size+1 rows.
word_index = tokenizer.word_index
vocab_size=len(word_index)

In [19]:
# Encode each sentence as a list of word ids, then force every sequence to
# exactly max_length ids (padding/truncation side taken from the config values).
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(
    sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

In [20]:
# Number of leading rows reserved for the test set:
# test_portion of training_size, truncated to an integer.
split = int(test_portion * training_size)

In [21]:
# Class labels as a plain list, in the same row order as `sentences`.
labels = df["target"].tolist()

In [22]:
# Positional hold-out split: the first `split` rows form the test set, the
# rows from `split` up to `training_size` form the training set.
test_sequences, training_sequences = padded[:split], padded[split:training_size]
test_labels, training_labels = labels[:split], labels[split:training_size]

In [23]:
# Count test examples labelled 0 to check the class balance of the hold-out set.
test_labels.count(0)


68

In [24]:
!gdown --id 1W5vZy2etitAblLdFn8_DxnsQKzfFJ98g

# Parse the GloVe text file into {word: vector}. Each line is split on
# whitespace: the first field is the word, the rest are its coefficients.
# The encoding is pinned to UTF-8 so parsing does not depend on the platform's
# default locale encoding.
embeddings_index = {}
with open('./glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Assemble the initial weights for the Embedding layer: row i receives the
# GloVe vector of the word whose tokenizer id is i. Rows that are never
# assigned (words absent from GloVe, and any id not in word_index) stay zero.
embeddings_matrix = np.zeros((vocab_size+1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector

Downloading...
From: https://drive.google.com/uc?id=1W5vZy2etitAblLdFn8_DxnsQKzfFJ98g
To: /content/glove.6B.100d.txt
100% 347M/347M [00:04<00:00, 71.0MB/s]


In [26]:
# Architecture: frozen pre-trained embedding -> Conv1D -> MaxPooling1D ->
# two stacked LSTMs -> Dropout -> single sigmoid unit.
model = tf.keras.Sequential([
    # Embedding initialised from embeddings_matrix and frozen (trainable=False).
    tf.keras.layers.Embedding(vocab_size+1, embedding_dim, input_length=max_length, weights=[embeddings_matrix], trainable=False),
    # 128 filters with kernel size 5 along the sequence axis.
    tf.keras.layers.Conv1D(128, 5, activation='relu'),
    # Shrinks the sequence axis by a factor of 4 (156 -> 39 per the summary output).
    tf.keras.layers.MaxPooling1D(pool_size=4),
    # NOTE(review): both LSTMs override the default activation with 'relu' —
    # confirm this is deliberate.
    tf.keras.layers.LSTM(64,activation='relu',return_sequences=True),
    tf.keras.layers.LSTM(32, activation='relu', return_sequences=False),
    tf.keras.layers.Dropout(0.4),
    # Single sigmoid unit, paired with the binary_crossentropy loss below.
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

num_epochs = 100

# Convert the split data to NumPy arrays before passing it to model.fit.
# Note that training_labels is rebound from a list to an array here.
training_padded = np.array(training_sequences)
training_labels = np.array(training_labels)
testing_padded = np.array(test_sequences)
testing_labels = np.array(test_labels)

# NOTE(review): the held-out test split doubles as validation data, and no
# random seed or early-stopping callback is set for the 100-epoch run —
# confirm this is intended.
history = model.fit(training_padded, training_labels, epochs=num_epochs, validation_data=(testing_padded, testing_labels), verbose=2)

print("Training Complete")

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 160, 100)          878400    
                                                                 
 conv1d_1 (Conv1D)           (None, 156, 128)          64128     
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 39, 128)          0         
 1D)                                                             
                                                                 
 lstm_2 (LSTM)               (None, 39, 64)            49408     
                                                                 
 lstm_3 (LSTM)               (None, 32)                12416     
                                                                 
 dropout_1 (Dropout)         (None, 32)                0         
                                                      

In [27]:
# Invert the tokenizer vocabulary: integer id -> word.
reverse_word_index = {index: word for word, index in word_index.items()}

def decode_sentence(text):
    """Turn an iterable of word ids back into a space-separated string.

    Ids that are missing from ``reverse_word_index`` are rendered as ``'?'``.
    """
    words = (reverse_word_index.get(token_id, '?') for token_id in text)
    return ' '.join(words)


In [28]:
# Take the first layer of the model (the Embedding layer) and extract its
# weight matrix; the printed shape is (rows, embedding_dim).
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) 

(8784, 100)


In [29]:
import io

# Export the embedding weights as two parallel TSV files: vecs.tsv holds one
# tab-separated vector per line, meta.tsv holds the matching word per line.
# Tokenizer ids run from 1 to vocab_size inclusive (vocab_size is
# len(word_index) and the weight matrix has vocab_size + 1 rows), so the range
# must end at vocab_size + 1; stopping at vocab_size would omit the last word.
# `with` guarantees both files are closed even if a lookup fails mid-loop.
with open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
     open('meta.tsv', 'w', encoding='utf-8') as out_m:
    for word_num in range(1, vocab_size + 1):
        word = reverse_word_index[word_num]
        embeddings = weights[word_num]
        out_m.write(word + "\n")
        out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")

In [30]:
# Offer the exported TSV files as browser downloads when running in Colab.
# Outside Colab the import raises ImportError and the download step is skipped.
try:
  from google.colab import files
except ImportError:
  pass
else:
  files.download('vecs.tsv')
  files.download('meta.tsv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>