In [2]:
import re
import sys
import pandas as pd
import string
from sklearn.model_selection import train_test_split
import os
from tensorflow.python.keras.layers import LSTM, GRU, Dense, Embedding, Dropout
from tensorflow.python.keras.models import Sequential
from nltk.tokenize import word_tokenize
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

2023-04-08 10:36:12.359922: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
#URLS_RE = re.compile(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b')
URLS_RE = re.compile(r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*')

LISTING_RE = re.compile(r'^(|[a-z]?|[0-9]{0,3})(\-|\.)+( |\n)')

def remove_urls(text):
    return URLS_RE.sub('', text)

def replace_multi_whitespaces(line):
    return ' '.join(line.split())

def remove_listing(line):
    return LISTING_RE.sub('', line)

def remove_punctuation(text):
    text = text.replace('!','')
    text = text.replace('"','')
    return text.translate(str.maketrans('','',string.punctuation))

def normalize(s):
    replacements = (
        ("á", "a"),
        ("é", "e"),
        ("í", "i"),
        ("ó", "o"),
        ("ú", "u"),
        ("ü","u"),
        ("ñ","n"),
        ("ç","c"),
        ("\u2026","..."),
    )
    for a, b in replacements:
        s = s.replace(a, b).replace(a.upper(), b.upper())
    return s

def remove_stopwords(text,stop_words):
    words = text.split(' ')
    not_stop_words = set(words) ^ set(stop_words)
    return ' '.join(not_stop_words)
      

def clean_text(text,stop_words):	

    text = text.lower()
    text = normalize(text)
    text = remove_urls(text)
    text = remove_listing(text)
    text = replace_multi_whitespaces(text)
    text = remove_punctuation(text)
    text = remove_stopwords(text,stop_words)
    text = text.encode('ascii', 'ignore')

    return text.decode()

In [4]:
# load spanish stop words and remove accents (tweets dont have accents)
stop_words_df = pd.read_csv('../spanish-stop-words.txt',header=None)
stop_words = [normalize(w) for w in list(stop_words_df[0])] + ['q','ma']

data = pd.read_excel('../cleaned_users.xlsx')
username_list = data['username']

all_texts = []
for username in username_list:
    with open(f'../Cleaned Documents/{username}.txt','r') as f:
        text = f.read().replace('\n',' ')
        cleaned = clean_text(text,stop_words)
        all_texts.append(cleaned)

df = pd.DataFrame()
df['text'] = all_texts

gender_dict = {'female':0,'male':1}
df['gender'] = data['gender'].map(gender_dict)

X_train, X_test, y_train, y_test = train_test_split(df['text'],df['gender'],shuffle=True,stratify=df['gender'],test_size=0.3)

In [34]:
X_train_tokenized = []
for text in X_train:
    X_train_tokenized.append(' '.join([token for token in word_tokenize(text,language='spanish',preserve_line=True)]))

X_test_tokenized = []
for text in X_test:
    X_test_tokenized.append(' '.join([token for token in word_tokenize(text,language='spanish',preserve_line=True)]))

In [35]:
max_words = 10000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train_tokenized)
X_train_seq = tokenizer.texts_to_sequences(X_train_tokenized)
X_test_seq = tokenizer.texts_to_sequences(X_test_tokenized)

In [37]:
max_len = X_train.apply(lambda x: len(x)).max()
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)
word_index = tokenizer.word_index

In [47]:
embedding_vectors = {}
with open('glove-sbwc.i25.vec','r') as f:
    first_line = f.readline().split(' ')
    for line in f.readlines()[1:]:
        row = line.split(' ')
        # remove accents
        word = normalize(row[0])
        weights = np.asarray([float(val) for val in row[1:]])
        embedding_vectors[word] = weights
    
    
    num_words = int(first_line[0])
    emb_dim = int(first_line[1])

In [48]:
#initialize the embedding_matrix with zeros
if max_words is not None: 
    vocab_len = max_words 
else:
    vocab_len = len(word_index)+1
    
embedding_matrix = np.zeros((vocab_len, emb_dim))
for word, idx in word_index.items():
    if idx < vocab_len:
        embedding_vector = embedding_vectors.get(word)
        if embedding_vector is not None:
            embedding_matrix[idx] = embedding_vector


In [49]:
lstm_model = Sequential()
lstm_model.add(Embedding(vocab_len, emb_dim, trainable = False, weights=[embedding_matrix]))
lstm_model.add(LSTM(128, return_sequences=False))
lstm_model.add(Dropout(0.5))
lstm_model.add(Dense(1, activation = 'sigmoid'))
lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [52]:
print(lstm_model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 300)         3000000   
_________________________________________________________________
lstm (LSTM)                  (None, 128)               219648    
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 3,219,777
Trainable params: 219,777
Non-trainable params: 3,000,000
_________________________________________________________________
None


In [50]:
batch_size = 128
epochs  = 10
history = lstm_model.fit(X_train_pad, np.asarray(y_train), validation_data=(X_test_pad, np.asarray(y_test)), batch_size = batch_size, epochs = epochs)

2023-04-07 17:15:16.188092: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 1/10


2023-04-07 17:15:17.493432: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


ResourceExhaustedError: Graph execution error:

Detected at node 'sequential_1/embedding/embedding_lookup' defined at (most recent call last):
    File "/Users/lunamancebo/miniconda3/lib/python3.10/runpy.py", line 196, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/Users/lunamancebo/miniconda3/lib/python3.10/runpy.py", line 86, in _run_code
      exec(code, run_globals)
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/traitlets/config/application.py", line 1043, in launch_instance
      app.start()
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 712, in start
      self.io_loop.start()
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 215, in start
      self.asyncio_loop.run_forever()
    File "/Users/lunamancebo/miniconda3/lib/python3.10/asyncio/base_events.py", line 603, in run_forever
      self._run_once()
    File "/Users/lunamancebo/miniconda3/lib/python3.10/asyncio/base_events.py", line 1906, in _run_once
      handle._run()
    File "/Users/lunamancebo/miniconda3/lib/python3.10/asyncio/events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell
      await result
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 730, in execute_request
      reply_content = await reply_content
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 383, in do_execute
      res = shell.run_cell(
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 528, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3006, in run_cell
      result = self._run_cell(
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3061, in _run_cell
      result = runner(coro)
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3266, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3445, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3505, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/var/folders/_x/cddkf2k97y19q18kzrrrxrzw0000gn/T/ipykernel_9185/1021022132.py", line 3, in <module>
      history = lstm_model.fit(X_train_pad, np.asarray(y_train), validation_data=(X_test_pad, np.asarray(y_test)), batch_size = batch_size, epochs = epochs)
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/tensorflow/python/keras/engine/training.py", line 1189, in fit
      tmp_logs = self.train_function(iterator)
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/tensorflow/python/keras/engine/training.py", line 859, in train_function
      return step_function(self, iterator)
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/tensorflow/python/keras/engine/training.py", line 849, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/tensorflow/python/keras/engine/training.py", line 842, in run_step
      outputs = model.train_step(data)
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/tensorflow/python/keras/engine/training.py", line 799, in train_step
      y_pred = self(x, training=True)
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/tensorflow/python/keras/engine/base_layer.py", line 1044, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/tensorflow/python/keras/engine/sequential.py", line 379, in call
      return super(Sequential, self).call(inputs, training=training, mask=mask)
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/tensorflow/python/keras/engine/functional.py", line 419, in call
      return self._run_internal_graph(
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/tensorflow/python/keras/engine/functional.py", line 555, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/tensorflow/python/keras/engine/base_layer.py", line 1044, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/tensorflow/python/keras/layers/embeddings.py", line 191, in call
      out = embedding_ops.embedding_lookup_v2(self.embeddings, inputs)
Node: 'sequential_1/embedding/embedding_lookup'
Detected at node 'sequential_1/embedding/embedding_lookup' defined at (most recent call last):
    File "/Users/lunamancebo/miniconda3/lib/python3.10/runpy.py", line 196, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/Users/lunamancebo/miniconda3/lib/python3.10/runpy.py", line 86, in _run_code
      exec(code, run_globals)
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/traitlets/config/application.py", line 1043, in launch_instance
      app.start()
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 712, in start
      self.io_loop.start()
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 215, in start
      self.asyncio_loop.run_forever()
    File "/Users/lunamancebo/miniconda3/lib/python3.10/asyncio/base_events.py", line 603, in run_forever
      self._run_once()
    File "/Users/lunamancebo/miniconda3/lib/python3.10/asyncio/base_events.py", line 1906, in _run_once
      handle._run()
    File "/Users/lunamancebo/miniconda3/lib/python3.10/asyncio/events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell
      await result
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 730, in execute_request
      reply_content = await reply_content
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 383, in do_execute
      res = shell.run_cell(
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 528, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3006, in run_cell
      result = self._run_cell(
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3061, in _run_cell
      result = runner(coro)
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3266, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3445, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3505, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/var/folders/_x/cddkf2k97y19q18kzrrrxrzw0000gn/T/ipykernel_9185/1021022132.py", line 3, in <module>
      history = lstm_model.fit(X_train_pad, np.asarray(y_train), validation_data=(X_test_pad, np.asarray(y_test)), batch_size = batch_size, epochs = epochs)
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/tensorflow/python/keras/engine/training.py", line 1189, in fit
      tmp_logs = self.train_function(iterator)
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/tensorflow/python/keras/engine/training.py", line 859, in train_function
      return step_function(self, iterator)
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/tensorflow/python/keras/engine/training.py", line 849, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/tensorflow/python/keras/engine/training.py", line 842, in run_step
      outputs = model.train_step(data)
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/tensorflow/python/keras/engine/training.py", line 799, in train_step
      y_pred = self(x, training=True)
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/tensorflow/python/keras/engine/base_layer.py", line 1044, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/tensorflow/python/keras/engine/sequential.py", line 379, in call
      return super(Sequential, self).call(inputs, training=training, mask=mask)
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/tensorflow/python/keras/engine/functional.py", line 419, in call
      return self._run_internal_graph(
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/tensorflow/python/keras/engine/functional.py", line 555, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/tensorflow/python/keras/engine/base_layer.py", line 1044, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/Users/lunamancebo/miniconda3/lib/python3.10/site-packages/tensorflow/python/keras/layers/embeddings.py", line 191, in call
      out = embedding_ops.embedding_lookup_v2(self.embeddings, inputs)
Node: 'sequential_1/embedding/embedding_lookup'
2 root error(s) found.
  (0) RESOURCE_EXHAUSTED:  OOM when allocating tensor with shape[256,20450,300] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator Simple allocator
	 [[{{node sequential_1/embedding/embedding_lookup}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.

	 [[Func/gradient_tape/sequential_1/lstm/while/sequential_1/lstm/while_grad/body/_364/input/_1074/_102]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.

  (1) RESOURCE_EXHAUSTED:  OOM when allocating tensor with shape[256,20450,300] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator Simple allocator
	 [[{{node sequential_1/embedding/embedding_lookup}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.

0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_3882]