In [2]:
import pandas as pd
# Load the humor-detection dataset; later cells read its 'text' and 'humor' columns.
jokes_df = pd.read_csv('/kaggle/input/200k-short-texts-for-humor-detection/dataset.csv')


In [3]:
# De-duplicate on the joke text (the first occurrence of each text is kept)
jokes_df = jokes_df.drop_duplicates(subset=['text'])

# Define a function to count words in a text
def word_count(text):
    """Return how many whitespace-delimited tokens `text` contains."""
    tokens = text.split()
    return len(tokens)

# Keep only texts of 30-100 characters and 5-25 whitespace-separated words (bounds inclusive).
# Each measure is computed once over the column and reused for both of its bounds.
text_char_lengths = jokes_df['text'].str.len()
text_word_counts = jokes_df['text'].apply(word_count)
jokes_df = jokes_df[text_char_lengths.between(30, 100) & text_word_counts.between(5, 25)]


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; random_state pins the shuffle so the split is repeatable.
# NOTE(review): the .npy embeddings loaded further down are presumably row-aligned with this exact
# split (same filtering, test_size and random_state) — confirm before changing any of them.
X_train, X_val, y_train, y_val = train_test_split(jokes_df['text'], jokes_df['humor'], test_size=0.2, random_state=42)


In [5]:
# Define a function to split text into sentences
from textblob import TextBlob

def split_sentences(text):
    """Break `text` into sentences with TextBlob and return them as a list of plain strings."""
    blob = TextBlob(text)
    return list(map(str, blob.sentences))

# Split every training and validation text into its list of sentences.
# (No embeddings are computed here; the embeddings used below are loaded from .npy files.)
X_train_sentences = X_train.apply(split_sentences)
X_val_sentences = X_val.apply(split_sentences)

# Pad or truncate the sentences to have exactly 3 sentences per text
def pad_or_truncate_sentences(sentences_list, num_sentences=3):
    """Return `sentences_list` cut or padded to exactly `num_sentences` items.

    Longer lists keep only their first `num_sentences` entries; shorter ones
    are extended with empty strings. The input list is not modified.
    """
    missing = num_sentences - len(sentences_list)
    if missing < 0:
        # Too many sentences: keep the leading ones only
        return sentences_list[:num_sentences]
    return sentences_list + [''] * missing



In [6]:
# Normalise every text to exactly 3 sentence slots (the helper's default), padding with ''.
X_train_sentences = X_train_sentences.apply(pad_or_truncate_sentences)
X_val_sentences = X_val_sentences.apply(pad_or_truncate_sentences)



In [7]:
# Load the embeddings from disk (later when you need them)
import numpy as np

# Pre-computed embeddings for the train/validation splits, passed as the model's first input.
# NOTE(review): presumably shaped (n_rows, 5, 512) to match the model's whole_text_input — confirm.
X_train_embeddings = np.load('/kaggle/input/embeddings-data/X_train_embeddings.npy')
X_val_embeddings = np.load('/kaggle/input/embeddings-data/X_val_embeddings.npy')

In [8]:
import os

# Load the embeddings from disk
def load_embeddings(prefix, base_dir='/kaggle/input/embeddings-data'):
    """Load consecutively numbered embedding arrays saved as ``.npy`` files.

    Files are looked up at ``{base_dir}/{prefix}_embedding_{i}.npy`` for
    i = 0, 1, 2, ...; loading stops at the first index whose file is missing,
    so files that come after a gap in the numbering are ignored.

    Args:
        prefix: File-name prefix, e.g. ``'X_train_sentence'``.
        base_dir: Directory that holds the files. Defaults to the Kaggle input
            directory this notebook reads from.

    Returns:
        list: The loaded arrays in index order; an empty list if no file exists.
    """
    def _path(index):
        return f'{base_dir}/{prefix}_embedding_{index}.npy'

    embeddings = []
    # len(embeddings) is always the next index to try.
    while os.path.exists(_path(len(embeddings))):
        # SECURITY: allow_pickle=True lets np.load unpickle object arrays, which can execute
        # arbitrary code from a malicious file. Only point this at files you trust.
        embeddings.append(np.load(_path(len(embeddings)), allow_pickle=True))
    return embeddings

# Lists of per-sentence embedding arrays (one list entry per numbered file found on disk).
X_train_sentence_embeddings = load_embeddings('X_train_sentence')
X_val_sentence_embeddings = load_embeddings('X_val_sentence')


In [9]:
# Drop the last entry of each embedding list when it does not have the expected
# (rows in split, 5, 512) shape
for _sentence_embeddings, _split_texts in (
    (X_train_sentence_embeddings, X_train),
    (X_val_sentence_embeddings, X_val),
):
    if _sentence_embeddings[-1].shape != (len(_split_texts), 5, 512):
        _sentence_embeddings.pop()


In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from textblob import TextBlob
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam


In [11]:
from tensorflow.keras.models import load_model

# Load the previously saved Keras model (.h5 file) from the attached Kaggle dataset.
model = load_model('/kaggle/input/model-beta/model.h5')


In [12]:
# Evaluate the model on the validation set.
# Inputs are the whole-text embeddings followed by each per-sentence embedding array.
scores = model.evaluate([X_val_embeddings] + X_val_sentence_embeddings, y_val, verbose=1)
# NOTE(review): scores[1] is read as accuracy, which assumes the loaded model was compiled with
# accuracy as its first metric (scores[0] being the loss) — confirm for the loaded model.
print("Validation Accuracy: %.2f%%" % (scores[1] * 100))

Validation Accuracy: 91.42%


In [None]:
#y_pred = [inner_list[0] for inner_list in y_pred]
#len(y_pred)

In [13]:

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Run inference once: the predictions do not depend on the loop index, so calling
# model.predict inside the loop repeated the same forward pass five times.
y_pred_probs = model.predict([X_val_embeddings] + X_val_sentence_embeddings)

# Convert predicted probabilities to binary predictions
y_pred_labels = np.where(y_pred_probs > 0.5, 1, 0)

# Score each of the first 5 entries along axis 1 of the prediction separately.
# NOTE(review): presumably one prediction per position of the (5, 512) inputs — confirm.
for i in range(5):
    # Column i for every validation row, flattened to 1-D for scikit-learn
    y_pred = y_pred_labels[:, i].ravel()

    # Calculate precision, recall, and F1-score for the positive class
    precision, recall, f1, _ = precision_recall_fscore_support(y_val, y_pred, labels=[0, 1], average='binary')

    # Calculate accuracy
    accuracy = accuracy_score(y_val, y_pred)

    print("Validation Accuracy: %.2f%%" % (accuracy * 100))
    print("Validation Precision: %.2f%%" % (precision * 100))
    print("Validation Recall: %.2f%%" % (recall * 100))
    print("Validation F1-score: %.2f%%" % (f1 * 100))
    print("-----------------")


Validation Accuracy: 91.44%
Validation Precision: 92.89%
Validation Recall: 89.75%
Validation F1-score: 91.29%
-----------------
Validation Accuracy: 91.65%
Validation Precision: 93.01%
Validation Recall: 90.06%
Validation F1-score: 91.52%
-----------------
Validation Accuracy: 91.54%
Validation Precision: 92.60%
Validation Recall: 90.29%
Validation F1-score: 91.43%
-----------------
Validation Accuracy: 91.13%
Validation Precision: 92.64%
Validation Recall: 89.35%
Validation F1-score: 90.97%
-----------------
Validation Accuracy: 91.31%
Validation Precision: 92.83%
Validation Recall: 89.54%
Validation F1-score: 91.16%
-----------------


In [None]:
from tensorflow.keras import metrics

# Keras metric objects for precision and recall.
# NOTE(review): the compile() call in this notebook passes only metrics=['accuracy'], so these
# objects look unused; the names also rebind the `precision`/`recall` values computed in the
# scikit-learn evaluation loop — confirm whether they were meant to be passed to compile().
precision = metrics.Precision(name='precision')
recall = metrics.Recall(name='recall')

# Experiment note (presumably): Adam optimizer, batch size 16, LeakyReLU activations and more layers

In [None]:
def build_model():
    """Build the (uncompiled) humor classifier that consumes pre-computed embeddings.

    Architecture:
      * one whole-text input and two sentence inputs, each of shape (5, 512);
      * every sentence input passes through Dense(20, relu) -> Dense(40, LeakyReLU);
      * the whole-text input passes through Dense(60, relu);
      * all branches are concatenated and fed through ten Dense layers
        (200 down to 20 units, alternating LeakyReLU and relu);
      * a single sigmoid unit produces the output.

    Dense layers act on the last axis only, so the length-5 axis of the inputs
    is carried through to the output.

    Returns:
        Model: Keras model with inputs
        ``[whole_text_input, sentence_input_0, sentence_input_1]``.
    """
    # NOTE(review): the 'LeakyReLU' activation string is resolved by the installed Keras;
    # newer Keras releases may expect 'leaky_relu' instead — confirm against the runtime version.

    # Define input layers
    whole_text_input = Input(shape=(5, 512), dtype='float32', name='whole_text_input')
    sentence_inputs = [
        Input(shape=(5, 512), dtype='float32', name=f'sentence_input_{i}')
        for i in range(2)
    ]

    # Sentence-specific hidden layers
    sentence_hidden_layers = []
    for sentence_input in sentence_inputs:
        hidden = Dense(20, activation='relu')(sentence_input)
        # Chain onto the previous layer's output. Applying this layer to sentence_input
        # again (as before) silently dropped the Dense(20) layer from the model.
        hidden = Dense(40, activation='LeakyReLU')(hidden)
        sentence_hidden_layers.append(hidden)

    # Concatenate sentence-specific hidden layers
    concatenated_sentences = Concatenate()(sentence_hidden_layers)

    # Whole text hidden layers
    whole_text_hidden = Dense(60, activation='relu')(whole_text_input)

    # Combine sentence-specific and whole text hidden layers
    combined = Concatenate()([concatenated_sentences, whole_text_hidden])

    # Final hidden layers: (units, activation) in the order they are stacked
    hidden_stack = [
        (200, 'LeakyReLU'), (180, 'relu'),
        (160, 'LeakyReLU'), (140, 'relu'),
        (120, 'LeakyReLU'), (100, 'relu'),
        (80, 'LeakyReLU'), (60, 'relu'),
        (40, 'LeakyReLU'), (20, 'relu'),
    ]
    hidden = combined
    for units, activation in hidden_stack:
        hidden = Dense(units, activation=activation)(hidden)

    output = Dense(1, activation='sigmoid')(hidden)

    # Build the model
    model = Model(inputs=[whole_text_input] + sentence_inputs, outputs=output)
    return model


In [None]:
# Sanity check: report the shape of every array that feeds training and evaluation
_shape_report = {
    "X_train_embeddings shape:": X_train_embeddings.shape,
    "X_val_embeddings shape:": X_val_embeddings.shape,
    "X_train_sentence_embeddings shapes:": [x.shape for x in X_train_sentence_embeddings],
    "X_val_sentence_embeddings shapes:": [x.shape for x in X_val_sentence_embeddings],
    "y_train shape:": y_train.shape,
    "y_val shape:": y_val.shape,
}
for _label, _shape in _shape_report.items():
    print(_label, _shape)


In [None]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam

# Use MirroredStrategy for multi-GPU training.
# No explicit device list is passed: the strategy then uses every GPU visible to TensorFlow,
# instead of failing on runtimes where "/gpu:0" and "/gpu:1" are not both present.
strategy = tf.distribute.MirroredStrategy()
print(f"Number of devices: {strategy.num_replicas_in_sync}")

# Variables must be created inside the strategy scope so they are mirrored across replicas
with strategy.scope():
    # Build the model
    model = build_model()

    # Compile the model
    model.compile(optimizer=Adam(learning_rate=1e-4), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model.
# Inputs are the whole-text embeddings followed by each per-sentence embedding array,
# matching the input order returned by build_model().
history = model.fit(
    [X_train_embeddings] + X_train_sentence_embeddings, y_train,
    validation_data=([X_val_embeddings] + X_val_sentence_embeddings, y_val),
    epochs=30,
    batch_size=16
)


In [None]:
# Persist the trained model to model.h5 (path is relative to the current working directory).
model.save('model.h5')