In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/hackaton-god4/SampleSubmission.csv
/kaggle/input/hackaton-god4/train.csv
/kaggle/input/hackaton-god4/test.csv


In [2]:
import torch
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using GPU")
else:
    device = torch.device("cpu")
    print("Using CPU")

Using GPU


In [3]:
import pandas as pd
train_df=pd.read_csv("/kaggle/input/hackaton-god4/train.csv")

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from tf_keras.optimizers import Adam
from tf_keras.callbacks import ReduceLROnPlateau, EarlyStopping , LearningRateScheduler
from tf_keras.layers import Dropout, Dense, Input
from tf_keras.models import Model
from tf_keras.losses import SparseCategoricalCrossentropy
from transformers import AdamW
import tf_keras.preprocessing.text as kpt
from tf_keras.preprocessing.text import Tokenizer
import random

# Check if a GPU is available
if tf.test.gpu_device_name():
    print(f"Using GPU: {tf.test.gpu_device_name()}")
else:
    print("No GPU found, using CPU.")

# Combine title and content for analysis
train_df['content'] = train_df['content'].fillna('')
train_df['text'] = train_df['title'] + " " + train_df['content']

# Map string labels to integers
label_encoder = LabelEncoder()
train_df['target'] = label_encoder.fit_transform(train_df['target'])

# Split the data into training and validation sets
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the data
train_encodings = tokenizer(train_df['text'].tolist(), padding='max_length', truncation=True, max_length=128)
val_encodings = tokenizer(val_df['text'].tolist(), padding='max_length', truncation=True, max_length=128)

# Data augmentation function
def augment_data(text):
    aug_text = kpt.text_to_word_sequence(text)
    # Simple augmentation: shuffle the words
    random.shuffle(aug_text)
    return ' '.join(aug_text)

# Apply data augmentation
train_df['aug_text'] = train_df['text'].apply(augment_data)
train_encodings_aug = tokenizer(train_df['aug_text'].tolist(), padding='max_length', truncation=True, max_length=128)

# Convert to TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': tf.constant(train_encodings['input_ids']),
     'attention_mask': tf.constant(train_encodings['attention_mask'])},
    tf.constant(train_df['target'])
)).shuffle(100).batch(32)

train_dataset_aug = tf.data.Dataset.from_tensor_slices((
    {'input_ids': tf.constant(train_encodings_aug['input_ids']),
     'attention_mask': tf.constant(train_encodings_aug['attention_mask'])},
    tf.constant(train_df['target'])
)).shuffle(100).batch(32)

train_dataset_combined = train_dataset.concatenate(train_dataset_aug)

val_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': tf.constant(val_encodings['input_ids']),
     'attention_mask': tf.constant(val_encodings['attention_mask'])},
    tf.constant(val_df['target'])
)).batch(32)


Using GPU: /device:GPU:0


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [5]:
from sklearn.model_selection import KFold

# Load pre-trained BERT model
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Create a new model with a dropout layer
input_ids = Input(shape=(128,), dtype=tf.int32, name="input_ids")
attention_mask = Input(shape=(128,), dtype=tf.int32, name="attention_mask")
bert_outputs = bert_model(input_ids, attention_mask=attention_mask)
dropout = Dropout(0.5)(bert_outputs.pooler_output)
output = Dense(len(label_encoder.classes_), activation='softmax')(dropout)
model = Model(inputs=[input_ids, attention_mask], outputs=output)

# Prepare optimizer and loss function
loss = SparseCategoricalCrossentropy(from_logits=False)
optimizer = Adam(learning_rate=0.000001)

# Define callbacks
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=0.0000001)
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Function to adjust the learning rate
def scheduler(epoch, lr):
    if epoch < 10:
        return lr
    else:
        return lr * tf.math.exp(-0.1)

lr_scheduler = LearningRateScheduler(scheduler)

# Number of folds for cross-validation
num_folds = 5

# Initialize KFold
kf = KFold(n_splits=num_folds)

# Prepare datasets
X_input_ids = np.array(train_encodings['input_ids'])
X_attention_mask = np.array(train_encodings['attention_mask'])
y = np.array(train_df['target'])

# Cross-validation
fold_no = 1
for train_index, val_index in kf.split(X_input_ids):
    print(f"Training fold {fold_no}...")

    X_train_input_ids, X_val_input_ids = X_input_ids[train_index], X_input_ids[val_index]
    X_train_attention_mask, X_val_attention_mask = X_attention_mask[train_index], X_attention_mask[val_index]
    y_train, y_val = y[train_index], y[val_index]
    
    train_dataset = tf.data.Dataset.from_tensor_slices((
        {'input_ids': tf.constant(X_train_input_ids),
         'attention_mask': tf.constant(X_train_attention_mask)},
        tf.constant(y_train)
    )).batch(32)
    
    val_dataset = tf.data.Dataset.from_tensor_slices((
        {'input_ids': tf.constant(X_val_input_ids),
         'attention_mask': tf.constant(X_val_attention_mask)},
        tf.constant(y_val)
    )).batch(32)

    # Compile and train the model
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
    history = model.fit(train_dataset, validation_data=val_dataset, epochs=10, callbacks=[reduce_lr, early_stopping, lr_scheduler])
    
    # Evaluate the model
    y_pred_prob = model.predict(val_dataset)
    y_pred = y_pred_prob.argmax(axis=1)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"Fold {fold_no} Validation Accuracy: {accuracy:.4f}")

    fold_no += 1


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Training fold 1...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Fold 1 Validation Accuracy: 0.7418
Training fold 2...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Fold 2 Validation Accuracy: 0.7824
Training fold 3...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Fold 3 Validation Accuracy: 0.8028
Training fold 4...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Fold 4 Validation Accuracy: 0.8078
Training fold 5...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Fold 5 Validation Accuracy: 0.8228


In [6]:
import pandas as pd
test_df=pd.read_csv("/kaggle/input/hackaton-god4/test.csv")

In [7]:
# Combine title and content for the test data
test_df['content'] = test_df['content'].fillna('')
test_df['text'] = test_df['title'] + " " + test_df['content']

# Tokenize the test data
test_encodings = tokenizer(test_df['text'].tolist(), padding='max_length', truncation=True, max_length=128)

# Convert to TensorFlow dataset
test_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': tf.constant(test_encodings['input_ids']),
     'attention_mask': tf.constant(test_encodings['attention_mask'])}
)).batch(32)

# Predict on the test data
y_test_pred_prob = model.predict(test_dataset)
y_test_pred = y_test_pred_prob.argmax(axis=1)

# Map integer predictions back to string labels
test_df['predicted_target'] = label_encoder.inverse_transform(y_test_pred)

# Display the first few predictions
print(test_df[['id', 'title', 'predicted_target']].head())

      id                                              title  \
0   3639                                          Tailgated   
1  21493  I am a model and because of mean things my mot...   
2  21215     On-and-off nothingness. I don’t know any more.   
3  13466                              Feeling really scared   
4  14084                  Looking for people who understand   

                 predicted_target  
0                         anxiety  
1  relationship-and-family-issues  
2  relationship-and-family-issues  
3                      depression  
4                         anxiety  


In [18]:
test_df.iloc[3].content

"Hello I am new to all this online forums and stuff, but Im just putting this out there, I didnt think I was depressed, but now Im feeling like it cant be anything else.  I am always so angry when I do the slightest thing wrong, such as spill something, and over the weekend, everytime I did something like that I got so angry started yelling then just broke down in tears.  I also suffer bouts of what I can admit is anorexia, I am currently 54 kgs but whenever I look in the mirror, I just see a fat woman staring back, I try not to talk about my weight with my family, they get upset when I call myself fat, its not that I think Im fat so much, I just feel fat, whether I have a medical condition that makes me feel bloated and weighed down or what I dont know.  I always feel tired and unmotivated to do anything, and can have trouble sleeping too. I also have a very low sex drive which is really taking its toll on my relationship, my partner thinks that I dont have any feelings for him anymor

In [8]:
import os
import joblib  # Import joblib for saving the label encoder
from sklearn.preprocessing import LabelEncoder

save_directory = '/kaggle/working/saved_model'
os.makedirs(save_directory, exist_ok=True)

# Save the tokenizer
tokenizer.save_pretrained(save_directory)

# Save the Keras model
model.save(os.path.join(save_directory, 'bert_model'))

# Save the label encoder
label_encoder_filename = os.path.join(save_directory, 'label_encoder.pkl')
joblib.dump(label_encoder, label_encoder_filename)

print("Model, tokenizer, and label encoder saved successfully!")

Model, tokenizer, and label encoder saved successfully!


In [9]:
import shutil

# Define the directory to be zipped
directory_to_zip = '/kaggle/working/saved_model'

# Define the output zip file path
zip_output_path = '/kaggle/working/saved_model.zip'

# Zip the directory
shutil.make_archive(directory_to_zip, 'zip', directory_to_zip)

'/kaggle/working/saved_model.zip'

In [10]:
from IPython.display import FileLink
FileLink(r'saved_model.zip')