In [None]:
# Install KerasTuner, which provides the RandomSearch tuner imported in the next cell.
!pip install keras-tuner --quiet

In [None]:
import os
import glob
import pandas as pd
from tqdm import tqdm
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from kerastuner import RandomSearch
from kerastuner.engine.hyperparameters import HyperParameters


In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the transcript
# folder used below lives inside this mount.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# 3. Define dataset path
# Root folder of the transcript CSVs; the loading cell reads its
# 'Depression' and 'Non-Depression' subfolders.
dataset_path = '/content/drive/MyDrive/Transcript'

# 4. Function to load all CSVs and assign labels
def load_dialogues(folder_path, label):
    """Read every ``*.csv`` in ``folder_path`` into one labelled text per file.

    Each CSV is flattened row by row and its non-missing cells are joined
    with single spaces, giving one dialogue string per file.

    Args:
        folder_path: Directory that directly contains the CSV transcripts.
        label: Class label attached to every dialogue loaded from the folder.

    Returns:
        A list of ``(dialogue_text, label)`` tuples ordered by file path.
        Files that cannot be read are reported on stdout and skipped.
    """
    all_data = []
    # glob returns files in arbitrary, filesystem-dependent order; sorting
    # keeps the result (and any seeded shuffle applied to it) reproducible.
    files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))
    if not files:
        print(f"No CSV files found in {folder_path}")
    for file in tqdm(files, desc=f"Loading {os.path.basename(folder_path)}"):
        try:
            df = pd.read_csv(file)
        except Exception as e:
            # Best effort: one unreadable transcript must not abort the load.
            print(f"Error reading {file}: {e}")
            continue
        # Skip missing cells: casting them to str would inject the literal
        # token "nan" into the dialogue text.
        cells = df.to_numpy(dtype=object).ravel()
        dialogue_text = " ".join(str(cell) for cell in cells if not pd.isna(cell))
        all_data.append((dialogue_text, label))
    return all_data

In [None]:
# Sanity check: show what the transcript root folder contains.
path = '/content/drive/MyDrive/Transcript'  # Replace with your actual path
folder_entries = os.listdir(path)
print("Folders inside:", folder_entries)

In [None]:
# 5. Load both classes: label 1 = Depression, label 0 = Non-Depression
depression_dir = os.path.join(dataset_path, 'Depression')
non_depression_dir = os.path.join(dataset_path, 'Non-Depression')
depression_data = load_dialogues(depression_dir, 1)
non_depression_data = load_dialogues(non_depression_dir, 0)

# Build one frame from both lists, then shuffle the rows with a fixed seed
# and renumber the index.
labelled_dialogues = [*depression_data, *non_depression_data]
all_data = (
    pd.DataFrame(labelled_dialogues, columns=["text", "label"])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Show the first rows
all_data.head()

In [None]:
# Read the augmented dataset from the Colab working directory and preview it.
augmented_csv_path = '/content/augmented_dataset.csv'
aug_df = pd.read_csv(augmented_csv_path)
aug_df.head()

In [None]:
# (rows, columns) of the transcript frame built from the Drive folders.
all_data.shape

In [None]:
# (rows, columns) of the augmented dataset loaded from CSV.
aug_df.shape

In [None]:
# Install Sentence-Transformers and load the sentence encoder.
!pip install sentence-transformers

from sentence_transformers import SentenceTransformer
# NOTE: get_dialogue_embedding reads this global `model`, and later cells
# rebind the same name to Keras models. Re-run this cell before re-computing
# embeddings once the training cells have executed.
model = SentenceTransformer('all-mpnet-base-v2')  # Fast & good for Colab

In [None]:
def get_dialogue_embedding(text):
    """Embed a dialogue as concatenated mean- and max-pooled sentence vectors.

    The text is split into sentences, every sentence is encoded with the
    global ``model`` (which must expose ``encode``), and the per-sentence
    vectors are pooled over the sentence axis.

    Args:
        text: Dialogue text. ``None`` or a float NaN (e.g. a missing CSV cell)
            is treated as an empty string.

    Returns:
        A 1-D array holding the mean-pooled vector followed by the max-pooled
        vector, i.e. twice the encoder's embedding width.
    """
    if text is None or (isinstance(text, float) and np.isnan(text)):
        text = ""

    # Empty or whitespace-only text yields no sentences, and pooling zero rows
    # makes np.max raise. Encode the raw text as a single "sentence" instead so
    # every dialogue produces a vector of the same length.
    sentences = sent_tokenize(text) or [text]
    embeddings = model.encode(sentences)

    mean_pool = np.mean(embeddings, axis=0)
    max_pool = np.max(embeddings, axis=0)

    combined = np.concatenate([mean_pool, max_pool])
    return combined

In [None]:
# Feature matrix: one pooled embedding per dialogue; targets: the label column.
dialogue_vectors = list(map(get_dialogue_embedding, aug_df['text']))
X = np.array(dialogue_vectors)
y = np.array(aug_df['label'])

In [None]:
def build_model(hp):
    """Build and compile the tunable MLP binary classifier.

    The network has three Dense(relu) + Dropout stages followed by a single
    sigmoid unit. Stage widths, dropout rates and the Adam learning rate are
    drawn from ``hp``; the input width is read from the global ``X``.

    Args:
        hp: Hyperparameter container exposing ``Int``, ``Float`` and ``Choice``.

    Returns:
        The compiled model (binary cross-entropy loss, accuracy metric).
    """
    # (min_units, max_units, step) for hidden stages 1..3.
    stage_ranges = [(128, 256, 64), (64, 128, 64), (32, 64, 32)]

    net = Sequential()
    net.add(Input(shape=(X.shape[1],)))
    for stage, (min_units, max_units, unit_step) in enumerate(stage_ranges, start=1):
        width = hp.Int(f'units{stage}', min_units, max_units, step=unit_step)
        net.add(Dense(width, activation='relu'))
        rate = hp.Float(f'dropout{stage}', 0.2, 0.5, step=0.1)
        net.add(Dropout(rate))
    net.add(Dense(1, activation='sigmoid'))

    learning_rate = hp.Choice('lr', [1e-2, 1e-3, 1e-4])
    net.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    return net

In [None]:
# Random search over the hyperparameter space declared in build_model.
tuner = RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=20,
    executions_per_trial=1,
    seed=42,  # fixed seed so the sampled trials are repeatable
    directory='tuner_results',
    project_name='mlp_depression_classification'
)

# Keras' validation_split takes the *last* 10% of the rows without shuffling,
# so on data stored in label (or augmentation) order the validation set can
# end up single-class. Hold out a shuffled, stratified 10% instead.
X_tune, X_tune_val, y_tune, y_tune_val = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=42
)

tuner.search(
    X_tune,
    y_tune,
    validation_data=(X_tune_val, y_tune_val),
    epochs=150,
    batch_size=16,
    verbose=1,
)

In [None]:
import matplotlib.pyplot as plt

# Get Best Hyperparameters
best_hp = tuner.get_best_hyperparameters(num_trials=1)[0]

# Stratified K-Fold Evaluation
# NOTE: best_hp was selected by a search that saw all of (X, y), including
# every fold's test rows, so the scores below are optimistic estimates.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
all_reports = []

for fold, (train_idx, test_idx) in enumerate(kf.split(X, y), start=1):
    print(f"\n Fold {fold}")
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Keras' validation_split would take the last 10% of the training rows
    # without shuffling; use a shuffled, stratified hold-out so the monitored
    # validation curves reflect both classes.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.1, stratify=y_train, random_state=42
    )

    # Build through the tuner's hypermodel (i.e. build_model) so the evaluated
    # architecture cannot drift from the one that was tuned.
    model = tuner.hypermodel.build(best_hp)

    history = model.fit(
        X_fit,
        y_fit,
        validation_data=(X_val, y_val),
        epochs=150,
        batch_size=16,
        verbose=0,
    )

    # Threshold the sigmoid outputs at 0.5 to get hard class predictions.
    y_pred_probs = model.predict(X_test).flatten()
    y_pred = (y_pred_probs > 0.5).astype(int)

    report = classification_report(y_test, y_pred, output_dict=True)
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))

    # Persist the model of the final fold (not selected by score).
    if fold == kf.get_n_splits():
        model.save("best_fold5_model.h5")

    # Plot training accuracy and loss per fold
    fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 5))

    ax_acc.plot(history.history['accuracy'], label='Train Acc')
    ax_acc.plot(history.history['val_accuracy'], label='Val Acc')
    ax_acc.set_title(f'Fold {fold} - Accuracy')
    ax_acc.set_xlabel('Epoch')
    ax_acc.set_ylabel('Accuracy')
    ax_acc.legend()

    ax_loss.plot(history.history['loss'], label='Train Loss')
    ax_loss.plot(history.history['val_loss'], label='Val Loss')
    ax_loss.set_title(f'Fold {fold} - Loss')
    ax_loss.set_xlabel('Epoch')
    ax_loss.set_ylabel('Loss')
    ax_loss.legend()

    fig.tight_layout()
    plt.show()
    plt.close(fig)  # release the figure so figures do not pile up across folds

    all_reports.append(report)

In [None]:
# 10. Average Metrics Across Folds
# One row per fold, taken from each report's 'weighted avg' entry.
weighted_rows = [fold_report['weighted avg'] for fold_report in all_reports]
report_df = pd.DataFrame(weighted_rows)
fold_means = report_df.mean()
print("\n📊 Average Metrics Across Folds:")
print(fold_means)

In [None]:
# Per-fold weighted-average metrics, one row per fold.
report_df

In [None]:
from tensorflow.keras.models import load_model

# Reload the model file written during cross-validation and print its layers.
saved_model_path = "/content/best_fold5_model.h5"
model = load_model(saved_model_path)
model.summary()

In [None]:
# Re-score the reloaded model. X_test / y_test are leftovers from the last
# iteration of the cross-validation loop, so this cell only works after that
# loop has run and it repeats the final fold's evaluation.
y_pred_probs = model.predict(X_test).flatten()
# Threshold the sigmoid outputs at 0.5 to get hard class predictions.
y_pred = (y_pred_probs > 0.5).astype(int)

print(classification_report(y_test, y_pred))