In [5]:
import pandas as pd
import numpy as np
import os
import pickle
import datetime

import tensorflow as tf
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Bidirectional, LSTM, Dense, Dropout, Input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score, cohen_kappa_score

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
import emoji

# Download NLTK resources (one nltk.download call per package, in this order)
for _nltk_package in (
    'averaged_perceptron_tagger',
    'punkt',
    'stopwords',
    'wordnet',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
    'omw-1.4',
):
    nltk.download(_nltk_package)

# Shared WordNet lemmatizer instance and the English stop-word set
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Kiran\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Kiran\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kiran\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Kiran\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Kiran\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Kiran\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_pe

In [6]:
# Base variables: project folders, given relative to the current working directory.
# MODEL_DIR receives the pickled tokenizer and IMAGE_DIR the training plots.
DATASET_DIR = os.path.join('..', 'Dataset')
MODEL_DIR = os.path.join('..', 'Model')
IMAGE_DIR = os.path.join('..', 'Image')

# Full path of the dataset CSV inside DATASET_DIR
DATA_PATH = os.path.join(DATASET_DIR, 'Suicide_Detection.csv')

In [None]:
# Functions

def read_csv(csv_file_path):
    """
    Load the dataset CSV and encode the 'class' column as integers.

    The labels 'suicide' and 'non-suicide' are replaced by 1 and 0; any other
    value found in the column is left unchanged. As a side effect the first
    rows and the DataFrame summary are shown, each followed by a separator line.

    Parameters
    ----------
    csv_file_path : str or path-like
        Location of the CSV file. It must contain a 'class' column.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with the 'class' column re-encoded.

    Raises
    ------
    KeyError
        If the file has no 'class' column.
    """
    # read csv
    df = pd.read_csv(csv_file_path)

    # replace class column values
    # The result is assigned back instead of calling
    # `df['class'].replace(..., inplace=True)`: an in-place method on a selected
    # column is chained assignment, which pandas deprecates and which does not
    # update `df` when copy-on-write is active.
    # infer_objects() keeps the column numeric on pandas versions where
    # replace() no longer downcasts object columns by itself.
    label_codes = {'suicide': 1, 'non-suicide': 0}
    df['class'] = df['class'].replace(label_codes).infer_objects()

    # show head
    display(df.head())
    print('_' * 50)

    # show info
    # info() prints the summary itself and returns None, so wrapping it in
    # display() only added a stray "None" to the output.
    df.info()
    print('_' * 50)

    return df

def save_model(model, file_path):
    """
    Serialize an object to a pickle (.pkl) file.

    Parameters
    ----------
    model : object
        Any picklable object to persist (for example a fitted Keras Tokenizer).
    file_path : str or path-like
        Destination file. Missing parent folders are created.

    Returns
    -------
    None
    """
    # open() does not create folders, so make sure the target directory exists
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, 'wb') as file:
        # Dump the object that was passed in; the previous code pickled a
        # global named `tokenizer` and ignored the `model` argument.
        pickle.dump(model, file)

    print(f"Model saved to {file_path}")
    
def load_model(file_path):
    """
    Read back an object that was stored in a pickle (.pkl) file.

    Parameters
    ----------
    file_path : str or path-like
        Pickle file to read.

    Returns
    -------
    object
        The un-pickled object.

    Notes
    -----
    pickle.load can execute arbitrary code while loading, so only open files
    that come from a trusted source.
    """
    with open(file_path, 'rb') as pkl_handle:
        restored = pickle.load(pkl_handle)

    print(f"Model loaded from {file_path}")

    return restored

'''
Usage example:
# Load the tokenizer when needed
loaded_tokenizer = load_model("tokenizer.pkl")
# Use the loaded tokenizer for new data
new_texts = ["I enjoy coding"]
new_sequences = loaded_tokenizer.texts_to_sequences(new_texts)
# max_len must be defined first and should equal the padding length used for the training data
new_x_data = pad_sequences(new_sequences, maxlen=max_len)
print("Transformed new data:", new_x_data)
'''

def preprocess_tokenization_padding(df, max_words=20000, max_len=200):
    """
    Tokenize the 'text' column of `df` and pad the sequences to a fixed length.

    A Keras Tokenizer is fitted on all texts and pickled to
    MODEL_DIR/tokenizer.pkl so the same vocabulary can be reused later.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a 'text' column (inputs) and a 'class' column (labels).
    max_words : int, default 20000
        Passed to Tokenizer(num_words=...); adjust based on vocabulary size.
    max_len : int, default 200
        Passed to pad_sequences(maxlen=...); adjust based on average post length.

    Returns
    -------
    tuple
        (x_data, y_data): the output of pad_sequences and the labels as a
        numpy array.
    """
    # split text and label
    # Read from the `df` argument; the previous code used an undefined global `data`.
    texts = df['text'].values
    labels = df['class'].values

    # Tokenization
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(texts)

    # Persist the fitted tokenizer as a pickle.
    # It is written directly here because the helper that used to be called
    # (`save_to_pkl`) is not defined in the visible cells. The file gets a .pkl
    # extension because its content is a pickle, not a CSV.
    os.makedirs(MODEL_DIR, exist_ok=True)
    tokenizer_file_path = os.path.join(MODEL_DIR, 'tokenizer.pkl')
    with open(tokenizer_file_path, 'wb') as tokenizer_file:
        pickle.dump(tokenizer, tokenizer_file)
    print(f"Tokenizer saved to {tokenizer_file_path}")

    # Padding
    sequences = tokenizer.texts_to_sequences(texts)
    x_data = pad_sequences(sequences, maxlen=max_len)
    y_data = np.array(labels)

    return x_data, y_data

def split_data(x_data, y_data, test_size=0.2, random_state=42):
    '''
    Split the data into disjoint train, validation and test sets.

    The validation set is split off the full data first; the remainder is then
    split again, with the same `test_size`, into train and test. With the
    default test_size=0.2 this gives roughly 64% train / 20% validation /
    16% test. Both splits are stratified on the labels.

    Parameters
    ----------
    x_data, y_data : array-like
        Features and labels of equal length.
    test_size : float or int, default 0.2
        Forwarded to both train_test_split calls.
    random_state : int, default 42
        Forwarded to both train_test_split calls.

    Returns
    -------
    tuple
        ((x_train, y_train), (x_val, y_val), (x_test, y_test))
    '''
    # Step 1: hold out the validation set (same call as before, so the
    # validation partition is unchanged).
    x_rest, x_val, y_rest, y_val = train_test_split(x_data, y_data,
                                                    test_size = test_size,
                                                    random_state = random_state,
                                                    stratify = y_data)

    # Step 2: split the remainder into train and test.
    # Previously the "test" set was drawn from x_train while the full x_train
    # was still returned as the training set, so every test sample had also
    # been trained on.
    x_train, x_test, y_train, y_test = train_test_split(x_rest, y_rest,
                                                        test_size = test_size,
                                                        random_state = random_state,
                                                        stratify = y_rest)

    return (x_train, y_train), (x_val, y_val), (x_test, y_test)

def compile_model(model, optimizer='adam', loss='binary_crossentropy', metrics='accuracy'):
    '''
    Configure `model` for training and print its summary.

    `metrics` is wrapped in a one-element list before it is handed to
    model.compile. The same model object is returned.
    '''
    compile_options = {
        'optimizer': optimizer,
        'loss': loss,
        'metrics': [metrics],
    }
    model.compile(**compile_options)

    # Print the model summary
    model.summary()

    return model

def plot_model(history):
    """
    Plot, save and show the training/validation accuracy and loss curves.

    Parameters
    ----------
    history : object
        Its `history` attribute must be a mapping with the keys 'accuracy',
        'val_accuracy', 'loss' and 'val_loss' (one value per epoch each).

    Returns
    -------
    None

    Side effects
    ------------
    Writes two PNG files (dpi=200) into IMAGE_DIR, named with a
    YYYYmmddHHMMSS timestamp prefix, shows both figures and prints the
    file names.

    Raises
    ------
    KeyError
        If one of the four expected keys is missing from history.history.
    """
    # NOTE(review): this function uses `plt` (matplotlib.pyplot), which is not
    # imported in the visible cells -- confirm that
    # `import matplotlib.pyplot as plt` is present in the imports cell.
    name_prefix = datetime.datetime.now().strftime("%Y%m%d%H%M%S")

    # Read every series up front so a missing key fails before anything is drawn
    recorded = history.history
    curves = {
        metric: (recorded[metric], recorded[f'val_{metric}'])
        for metric in ('accuracy', 'loss')
    }
    epochs = range(1, len(curves['accuracy'][0]) + 1)

    # savefig does not create missing folders
    os.makedirs(IMAGE_DIR, exist_ok=True)

    titles = {
        'accuracy': 'Training validation Accuracy',
        'loss': 'Training validation Loss',
    }
    file_names = []
    for metric, (train_values, val_values) in curves.items():
        file_name = f'{name_prefix}_{metric}_and_val_{metric}.png'

        # Explicit figure per metric so the curves never land on a figure
        # left open by earlier code
        fig = plt.figure()
        plt.plot(epochs, train_values, label=f'training {metric}')
        plt.plot(epochs, val_values, label=f'validation {metric}')
        plt.legend()
        plt.xlabel('epochs')
        plt.ylabel(metric)
        plt.title(titles[metric])
        plt.savefig(os.path.join(IMAGE_DIR, file_name), dpi=200)
        plt.show()
        # Release the figure on backends where show() does not close it
        plt.close(fig)

        file_names.append(file_name)

    print(f'Image saved to {IMAGE_DIR} - [{", ".join(file_names)}]')
    
def train_model(model, batch_size=32, epochs=10, x_train=None, y_train=None, x_val=None, y_val=None):
    '''
    Train the model and plot its training history.

    The previous signature placed parameters without defaults after
    parameters with defaults, which is a SyntaxError. The parameter order is
    kept; the data parameters now default to None and are checked explicitly,
    so both positional calls in the original order and keyword calls work.

    Parameters
    ----------
    model : object
        Model exposing fit(x, y, epochs=..., batch_size=...,
        validation_data=..., verbose=...).
    batch_size : int, default 32
    epochs : int, default 10
    x_train, y_train : array-like
        Training features and labels (required).
    x_val, y_val : array-like
        Validation features and labels (required).

    Returns
    -------
    tuple
        (model, history) where history is the value returned by model.fit.

    Raises
    ------
    TypeError
        If any of x_train, y_train, x_val, y_val is not supplied.
    '''
    required = (('x_train', x_train), ('y_train', y_train),
                ('x_val', x_val), ('y_val', y_val))
    missing = [name for name, value in required if value is None]
    if missing:
        raise TypeError(f"train_model() missing required data argument(s): {', '.join(missing)}")

    history = model.fit(x_train, y_train,
                        epochs = epochs,
                        batch_size = batch_size,
                        validation_data = (x_val, y_val),
                        verbose=1)
    plot_model(history)

    return model, history

def evaluate_model():
    """Placeholder for model evaluation: not implemented yet, does nothing and returns None."""
    return None
    


'20241116220121'