In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the combined-set CSV and keep only the 'selftext' and 'is_suicide'
# columns, renamed to 'Input' and 'Sentiment' respectively.
df_full = (
    pd.read_csv('/kaggle/input/redit-suicide-dataset/combined-set.csv', encoding='utf-8')
    .loc[:, ['selftext', 'is_suicide']]
    .rename(columns={'selftext': 'Input', 'is_suicide': 'Sentiment'})
)
df_full

In [None]:
!pip install text_hammer

In [None]:
import text_hammer as th

In [None]:
%%time

# Import from the public `tqdm.notebook` module; `tqdm._tqdm_notebook` is a
# deprecated private module.
from tqdm.notebook import tqdm_notebook

# Register `progress_apply` on pandas objects so each preprocessing step
# can show a progress bar.
tqdm_notebook.pandas()

def text_preprocessing(df,col_name):
    """Return a copy of ``df`` whose ``col_name`` text column has been cleaned.

    Every value of the column is cast to ``str`` and lower-cased, then passed
    through ``th.remove_emails``, ``th.remove_html_tags``,
    ``th.remove_special_chars`` and ``th.remove_accented_chars``, in that
    order. Each step runs through ``progress_apply``, so tqdm's pandas
    integration must have been registered before this function is called.

    Args:
        df: DataFrame that holds the text column.
        col_name: Name of the column to clean.

    Returns:
        A new DataFrame with the cleaned column; the caller's ``df`` is left
        unmodified.
    """
    # Work on a copy: assigning into the caller's frame would silently
    # overwrite the raw text it still holds.
    df = df.copy()
    column = col_name

    # The str() cast also turns missing values into the literal text of their
    # repr before lower-casing.
    df[column] = df[column].progress_apply(lambda x: str(x).lower())

    # Apply the remaining cleaners one after another, one progress bar each.
    cleaning_steps = (
        th.remove_emails,
        th.remove_html_tags,
        th.remove_special_chars,
        th.remove_accented_chars,
    )
    for clean in cleaning_steps:
        df[column] = df[column].progress_apply(clean)
    return df

In [None]:
# Clean the 'Input' text column of the loaded frame.
df_cleaned = text_preprocessing(df=df_full, col_name='Input')

In [None]:
# Rebind df_cleaned to an independent copy of the preprocessed frame.
df_cleaned = df_cleaned.copy()

In [None]:
# Display the preprocessed frame as the cell output.
df_cleaned

In [None]:
def _count_words(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    return len(text.split())

# Per-row word count of the cleaned text.
df_cleaned['num_words'] = df_cleaned['Input'].apply(_count_words)

In [None]:
# Class name -> integer class id.
encoded_dict = {label: class_id for class_id, label in enumerate(('Not Suicide', 'Suicide'))}

In [None]:
# Largest and average word count across all rows.
word_counts = df_cleaned['num_words']
print(word_counts.max())
print(word_counts.mean())

In [None]:
from sklearn.model_selection import train_test_split
# 70/30 split with a fixed seed, stratified on the 'Sentiment' label column.
data_train, data_test = train_test_split(
    df_cleaned,
    test_size=0.3,
    random_state=42,
    stratify=df_cleaned['Sentiment'],
)

In [None]:
# Row/column counts of the two partitions, train first.
for split in (data_train, data_test):
    print(split.shape)

# Loading BERT Model

In [None]:
!pip install transformers

In [None]:
from transformers import AutoTokenizer,TFBertModel
# Tokenizer and encoder are loaded from the same pretrained checkpoint.
# NOTE(review): the text column is lower-cased during preprocessing while this
# is a cased checkpoint — confirm an uncased checkpoint is not the better fit.
checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
bert = TFBertModel.from_pretrained(checkpoint)

# from transformers import BertTokenizer,TFBertForSequenceClassification
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# bert = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')

# from transformers import RobertaTokenizer, TFRobertaModel
# tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# roberta = TFRobertaModel.from_pretrained('roberta-base')

# from transformers import DistilBertTokenizer, TFDistilBertModel
# tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# dbert = TFDistilBertModel.from_pretrained("distilbert-base-uncased")

In [None]:
# Pull one random post as a plain string. `Series.to_string()` returns the
# display rendering of the Series (row index label included), so the index
# would be tokenized as part of the text; `.iloc[0]` yields just the value.
query = df_cleaned['Input'].sample(n=1).iloc[0]
tokenizer(query)

In [None]:
# Tokenize the train and test texts with the tokenizer loaded earlier in the
# notebook (takes some time).
def _encode_texts(texts, max_length=170):
    """Tokenize ``texts`` into fixed-width TensorFlow tensors.

    Sequences longer than ``max_length`` are truncated and shorter ones are
    padded up to exactly ``max_length``. ``padding='max_length'`` is used
    instead of ``padding=True`` (pad to the longest sequence in the call) so
    the tensor width never depends on the data, matching both the
    fixed-length model inputs and the padding used at prediction time.

    Args:
        texts: List of strings to encode.
        max_length: Fixed sequence length passed to the tokenizer.

    Returns:
        The tokenizer output holding ``input_ids`` and ``attention_mask``
        (token type ids are not requested).
    """
    return tokenizer(
        text=texts,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_tensors='tf',
        return_token_type_ids=False,
        return_attention_mask=True,
        verbose=True,
    )


x_train = _encode_texts(data_train.Input.tolist())
x_test = _encode_texts(data_test.Input.tolist())

In [None]:
# Inspect the token-id tensor produced for the test split.
x_test['input_ids']

In [None]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical

In [None]:
import tensorflow as tf
# List the GPU devices visible to TensorFlow (cell output).
tf.config.list_physical_devices('GPU')

In [None]:
max_len = 170
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense

# Two fixed-length integer inputs: the token ids and their attention mask.
input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

# Element 0 of the BERT output is the last hidden state (element 1 would be
# the pooler output).
bert_outputs = bert(input_ids, attention_mask=input_mask)
embeddings = bert_outputs[0]

# Classification head: global max-pool, then Dense(128) -> Dropout(0.1) -> Dense(32).
out = tf.keras.layers.GlobalMaxPool1D()(embeddings)
for head_layer in (
    Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.1),
    Dense(32, activation='relu'),
):
    out = head_layer(out)

# Two-unit output layer with a sigmoid activation.
y = Dense(2, activation='sigmoid')(out)

model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)
# Mark the layer at index 2 as trainable — presumably the BERT encoder, which
# follows the two input layers; confirm against model.summary().
model.layers[2].trainable = True

In [None]:
# Adam optimizer with gradient-norm clipping.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=5e-05, # As suggested by huggingface
    epsilon=1e-08,
    clipnorm=1.0
)

# Set loss and metrics.
# The model's output layer applies a sigmoid activation, so its predictions
# are not raw logits; `from_logits=True` would tell the loss to treat them as
# un-normalised logits, which does not match that layer.
loss = CategoricalCrossentropy(from_logits=False)
# NOTE: this is plain categorical accuracy. The name 'balanced_accuracy' is
# kept because it is the key later used to read the training history.
metric = CategoricalAccuracy('balanced_accuracy')

# Compile the model (metrics passed as a list, as documented by Keras).
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[metric]
)

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
# Make tf.function-wrapped code run eagerly. Only the stable API is called:
# `tf.config.experimental_run_functions_eagerly` is the deprecated spelling of
# the same switch, so calling both was redundant.
# NOTE(review): eager execution is typically slower than graph execution —
# confirm it is still required for training.
tf.config.run_functions_eagerly(True)

# Model Fitting and Evaluation

In [None]:
# Check the shape of the tokenized training ids.
x_train['input_ids'].shape

In [None]:
from tensorflow.keras.utils import plot_model
# Render the model graph, with tensor shapes and layer names, to model_plot.png.
plot_model(
    model,
    to_file='model_plot.png',
    show_shapes=True,
    show_layer_names=True,
)

In [None]:
# Pack the tokenizer outputs under the names of the model's two inputs and
# one-hot encode the labels.
train_features = {'input_ids': x_train['input_ids'], 'attention_mask': x_train['attention_mask']}
train_targets = to_categorical(data_train.Sentiment)
test_features = {'input_ids': x_test['input_ids'], 'attention_mask': x_test['attention_mask']}
test_targets = to_categorical(data_test.Sentiment)

# Train for 5 epochs in batches of 10; the test split is used as validation data.
train_history = model.fit(
    x=train_features,
    y=train_targets,
    validation_data=(test_features, test_targets),
    epochs=5,
    batch_size=10,
)

In [None]:
import matplotlib.pyplot as plt

# Per-epoch training accuracy; the key is the name given to the metric when
# the model was compiled.
accuracy = train_history.history['balanced_accuracy']

# 1-based epoch numbers for the x axis.
epochs = list(range(1, len(accuracy)+1))

plt.figure(figsize=(6,6))
plt.plot(epochs, accuracy)
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.title('Accuracy by Epochs')
# Save before showing: after plt.show() the current figure may already be
# closed, in which case savefig() would write a blank image.
plt.savefig('accuracy_by_epochs.png')
plt.show()

In [None]:
# Score the test split, then take the index of the larger of the two outputs
# as the predicted class for each row.
predicted_raw = model.predict(
    {name: x_test[name] for name in ('input_ids', 'attention_mask')}
)
y_predicted = np.argmax(predicted_raw, axis=1)
y_predicted

In [None]:
from sklearn.metrics import classification_report
# Text report of the per-class metrics on the test split.
report = classification_report(data_test.Sentiment, y_predicted)
print(report)

In [None]:
import seaborn as sns
from sklearn import metrics
import matplotlib.pyplot as plt

# Confusion matrix of true vs. predicted labels on the test split.
cf_matrix = metrics.confusion_matrix(data_test.Sentiment, y_predicted)

# One caption per cell, in row-major order: name, raw count, share of all samples.
# The fixed names and the reshape below assume a 2x2 (binary) matrix.
group_names = ['True Neg','False Pos','False Neg','True Pos']
group_counts = ["{0:0.0f}".format(value) for value in cf_matrix.flatten()]
group_percentages = ["{0:.2%}".format(value) for value in cf_matrix.flatten()/np.sum(cf_matrix)]
labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names,group_counts,group_percentages)]
labels = np.asarray(labels).reshape(2,2)

plt.figure(figsize = (10,10))
# fmt='' because the annotations are already formatted strings.
ax = sns.heatmap(cf_matrix, annot=labels, annot_kws={"fontsize": 20}, fmt='', cmap='GnBu')
ax.set_xticklabels(['Not Suicide', 'Suicide'])
ax.set_yticklabels(['Not Suicide', 'Suicide'], va='center')
plt.xlabel('Predicted')
plt.ylabel('True')
# Save before showing: after plt.show() the current figure may already be
# closed, in which case savefig() would write a blank image.
plt.savefig('confusion_matrix.png')
plt.show()

In [None]:
def single_text_preprocessing(text):
    """Clean one piece of text with the same steps applied to the training column.

    The value is cast to ``str`` and lower-cased, then passed through
    ``th.remove_emails``, ``th.remove_html_tags``, ``th.remove_special_chars``
    and ``th.remove_accented_chars``, in that order.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned string.
    """
    # Cast to str first, mirroring text_preprocessing, so that non-string
    # input does not raise on .lower().
    text = str(text).lower()
    cleaning_steps = (
        th.remove_emails,
        th.remove_html_tags,
        th.remove_special_chars,
        th.remove_accented_chars,
    )
    for clean in cleaning_steps:
        text = clean(text)
    return text

def predict_sentiment(texts):
    """Classify one piece of text as 'Not Suicide' or 'Suicide'.

    The text is cleaned with ``single_text_preprocessing``, tokenized to a
    fixed length of 170 tokens, and scored by ``model``; the class whose
    output is largest is returned.

    Args:
        texts: The raw text to classify.

    Returns:
        Either 'Not Suicide' or 'Suicide'.
    """
    cleaned = single_text_preprocessing(texts)
    encoded = tokenizer(
        text=cleaned,
        add_special_tokens=True,
        max_length=170,
        truncation=True,
        padding='max_length',
        return_tensors='tf',
        return_token_type_ids=False,
        return_attention_mask=True,
        verbose=True,
    )
    model_inputs = {
        'input_ids': encoded['input_ids'],
        'attention_mask': encoded['attention_mask'],
    }
    # The model outputs are multiplied by 100 before the arg-max is taken.
    scores = model.predict(model_inputs) * 100
    class_names = ['Not Suicide', 'Suicide']
    best_index = np.argmax(scores[0])
    return class_names[best_index]

In [None]:
# Try the classifier on a hand-written example.
sample_post = 'Today, I felt good in the morning; everything was good, but in the evening, it rained, and as a result, I got stuck in traffic. My life sucks; I should end it; I should kill myself.'
predict_sentiment(sample_post)

In [None]:
# Write the model's current weights to best_model.h5 in the working directory.
model.save_weights('best_model.h5')