In [None]:
# Visualize t-SNE with actual labels
# One scatter layer per ground-truth status, drawn on an explicit Axes object.
fig, ax = plt.subplots(figsize=(14, 10))

# Labels of the sampled rows only, in the same order as the rows of features_tsne
statuses = df['status'].iloc[sample_indices]
unique_statuses = statuses.unique()
colors = plt.cm.tab10(np.linspace(0, 1, len(unique_statuses)))

for color, status in zip(colors, unique_statuses):
    # Positional boolean mask selecting the embedded points that carry this label
    mask = (statuses == status).to_numpy()
    ax.scatter(
        features_tsne[mask, 0],
        features_tsne[mask, 1],
        c=[color],
        label=status,
        alpha=0.6,
        s=30,
    )

ax.set_xlabel('t-SNE Component 1', fontsize=12)
ax.set_ylabel('t-SNE Component 2', fontsize=12)
ax.set_title(
    't-SNE Visualization of Mental Health Statements (Colored by True Labels)',
    fontsize=14,
    fontweight='bold',
)
# Anchor the legend just outside the right edge so it does not cover any points
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Apply t-SNE for visualization (on a sample for speed)
print("Applying t-SNE (this may take a few minutes)...")

# Embed at most 5,000 rows. The subsample is drawn from a seeded generator so the
# same rows (and therefore the same figure) come back on every Restart & Run All;
# an unseeded draw makes the plot change between runs even though TSNE is seeded.
sample_size = min(5000, len(features_pca))
sample_rng = np.random.default_rng(42)
sample_indices = sample_rng.choice(len(features_pca), size=sample_size, replace=False)

# The iteration count is left at the library default (1000, the value that used to be
# passed explicitly): the keyword was renamed from `n_iter` to `max_iter` in
# scikit-learn 1.5, so spelling out either name fails on some installed versions.
tsne = TSNE(n_components=2, perplexity=30, random_state=42, verbose=1)
features_tsne = tsne.fit_transform(features_pca[sample_indices])

print("✓ t-SNE complete!")
print(f"  t-SNE shape: {features_tsne.shape}")

## 8. Dimensionality Reduction - t-SNE

In [None]:
# Apply PCA
print("Applying PCA...")

# Single source of truth for the retained-variance target: it drives both the number
# of components PCA keeps and the threshold line drawn on the cumulative plot below.
variance_target = 0.95
pca = PCA(n_components=variance_target)  # float target: keep enough components to reach it
features_pca = pca.fit_transform(features_scaled)

print("✓ PCA complete")
print(f"  Original dimensions: {features_scaled.shape[1]}")
print(f"  Reduced dimensions: {features_pca.shape[1]}")
print(f"  Variance explained: {pca.explained_variance_ratio_.sum():.2%}")

# Plot explained variance
fig, axes = plt.subplots(1, 2, figsize=(16, 5))

# Scree plot: variance share of each individual component
axes[0].plot(range(1, len(pca.explained_variance_ratio_) + 1),
             pca.explained_variance_ratio_, 'bo-')
axes[0].set_xlabel('Principal Component')
axes[0].set_ylabel('Explained Variance Ratio')
axes[0].set_title('PCA Scree Plot')
axes[0].grid(True, alpha=0.3)

# Cumulative variance: running total across the retained components
cumsum = np.cumsum(pca.explained_variance_ratio_)
axes[1].plot(range(1, len(cumsum) + 1), cumsum, 'ro-')
axes[1].axhline(y=variance_target, color='g', linestyle='--',
                label=f'{variance_target:.0%} threshold')
axes[1].set_xlabel('Number of Components')
axes[1].set_ylabel('Cumulative Explained Variance')
axes[1].set_title('Cumulative Explained Variance')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 7. Dimensionality Reduction - PCA

In [None]:
# Standardize features
# Fit the scaler on the combined feature matrix and transform it in one step.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(all_features)

# Summary statistics below are taken over the whole scaled matrix, not per column
print("✓ Features normalized")
print(f"  Mean: {features_scaled.mean():.6f}")
print(f"  Std:  {features_scaled.std():.6f}")
print(f"  Min:  {features_scaled.min():.2f}")
print(f"  Max:  {features_scaled.max():.2f}")

## 6. Feature Normalization

In [None]:
# Combine all features
print("Combining all features...")

# Linguistic features (the per-statement values held in features_df)
linguistic_features = features_df.values

# TF-IDF features (dense array built from the fitted vectorizer)
tfidf_features_array = tfidf_array

# The two blocks are stacked column-wise, so they must have one row per statement
# each; check this explicitly so a mismatch is reported in terms of the pipeline.
assert linguistic_features.shape[0] == tfidf_features_array.shape[0], (
    f"Row mismatch: {linguistic_features.shape[0]} linguistic rows vs "
    f"{tfidf_features_array.shape[0]} TF-IDF rows"
)

# Combine
all_features = np.concatenate([linguistic_features, tfidf_features_array], axis=1)

print(f"✓ Combined feature matrix shape: {all_features.shape}")
print(f"  - Linguistic features: {linguistic_features.shape[1]}")
print(f"  - TF-IDF features: {tfidf_features_array.shape[1]}")
print(f"  - Total features: {all_features.shape[1]}")

In [None]:
# Create TF-IDF features
print("Creating TF-IDF features...")
tfidf = TfidfVectorizer(
    max_features=100,  # Top 100 words
    ngram_range=(1, 2),  # Unigrams and bigrams
    min_df=5,  # Minimum document frequency
    max_df=0.8  # Maximum document frequency
)

tfidf_features = tfidf.fit_transform(df['processed_text'])
# Densify for the later concatenation with the linguistic features
tfidf_array = tfidf_features.toarray()

print(f"✓ TF-IDF shape: {tfidf_array.shape}")
print(f"  - Samples: {tfidf_array.shape[0]:,}")
print(f"  - TF-IDF features: {tfidf_array.shape[1]}")

# Get feature names
feature_names = tfidf.get_feature_names_out()

# feature_names follows the matrix column order (alphabetical for scikit-learn's
# vectorizers), so slicing the first 20 does not give the "top" terms. Rank the
# columns by their mean TF-IDF weight across all documents instead.
mean_weights = tfidf_array.mean(axis=0)
top_indices = np.argsort(mean_weights)[::-1][:20]
print("\nTop 20 TF-IDF features (by mean weight):")
print(list(feature_names[top_indices]))

## 5. TF-IDF Vectorization

In [None]:
# Extract basic text features

# Keyword lexicons, built once as frozensets so every membership test is a hash
# lookup and the collections are not re-created for each statement.
_NEGATIVE_WORDS = frozenset([
    'sad', 'depressed', 'hopeless', 'anxious', 'worried', 'scared',
    'hurt', 'pain', 'lonely', 'empty', 'guilty', 'restless', 'nervous',
])
_POSITIVE_WORDS = frozenset([
    'happy', 'joy', 'good', 'great', 'love', 'wonderful', 'calm', 'peace',
])
_FIRST_PERSON_PRONOUNS = frozenset(['i', 'me', 'my', 'mine', 'myself'])


def extract_features(text):
    """Extract simple linguistic features from a whitespace-tokenized string.

    Tokens are produced with ``text.split()`` and compared to the keyword sets
    exactly (case-sensitive, no stemming), so callers are expected to pass text
    that is already normalized the way they want it matched.

    Args:
        text: The string to analyze. An empty or whitespace-only string is valid.

    Returns:
        dict with the following keys, in this order:
            word_count: number of whitespace-separated tokens.
            char_count: ``len(text)``, whitespace included.
            avg_word_length: mean token length (0.0 when there are no tokens).
            negative_words: tokens found in the negative keyword set.
            positive_words: tokens found in the positive keyword set.
            first_person_pronouns: tokens found in the first-person pronoun set.
            negative_ratio: negative_words / word_count (0.0 when no tokens).
            first_person_ratio: first_person_pronouns / word_count (0.0 when no tokens).
    """
    tokens = text.split()
    word_count = len(tokens)

    # Summing booleans counts the matching tokens
    negative_count = sum(token in _NEGATIVE_WORDS for token in tokens)
    positive_count = sum(token in _POSITIVE_WORDS for token in tokens)
    first_person = sum(token in _FIRST_PERSON_PRONOUNS for token in tokens)

    if word_count:
        avg_word_length = sum(map(len, tokens)) / word_count
        negative_ratio = negative_count / word_count
        first_person_ratio = first_person / word_count
    else:
        # No tokens: every per-word average is defined as zero rather than dividing by zero
        avg_word_length = negative_ratio = first_person_ratio = 0.0

    return {
        'word_count': word_count,
        'char_count': len(text),
        'avg_word_length': avg_word_length,
        'negative_words': negative_count,
        'positive_words': positive_count,
        'first_person_pronouns': first_person,
        'negative_ratio': negative_ratio,
        'first_person_ratio': first_person_ratio,
    }

# Extract features for all texts
print("Extracting linguistic features...")
# NOTE(review): features are computed on 'processed_text', which preprocess_text has
# already stripped of stopwords and of every token shorter than 3 characters. 'i',
# 'me' and 'my' can therefore never be counted, so the first-person features are
# close to constant here — consider deriving them from the raw 'statement' column.
features_list = df['processed_text'].apply(extract_features).tolist()

# Reuse df's index: df was loaded with index_col=0, and a default 0..n-1 index on
# features_df would make the axis=1 concat below align on mismatched labels whenever
# the CSV index is not exactly 0..n-1.
features_df = pd.DataFrame(features_list, index=df.index)

# Combine with original dataframe
df_with_features = pd.concat([df, features_df], axis=1)

print("✓ Feature extraction complete!")
print(f"\nExtracted Features: {list(features_df.columns)}")
print("\nFeature Statistics:")
print(features_df.describe())

## 4. Linguistic Feature Extraction

In [None]:
# Download NLTK data if needed
# Each resource is probed on its own: checking only 'punkt' would skip the stopwords
# and WordNet downloads on machines where punkt happens to be installed already.
# 'punkt_tab' is included because newer NLTK releases load the tokenizer data from
# that package; on older releases it is simply unused.
_NLTK_RESOURCES = {
    'punkt': 'tokenizers/punkt',
    'punkt_tab': 'tokenizers/punkt_tab',
    'stopwords': 'corpora/stopwords',
    'wordnet': 'corpora/wordnet',
    'omw-1.4': 'corpora/omw-1.4',
}
for package, resource_path in _NLTK_RESOURCES.items():
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package, quiet=True)

print("✓ NLTK data ready")

# Initialize preprocessing tools
lemmatizer = WordNetLemmatizer()
# A set gives constant-time stopword lookups inside preprocess_text
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """
    Normalize one raw statement into a space-joined string of lemmas.

    Steps: return "" for missing values, lowercase, strip every character that
    is not an ASCII letter or whitespace, tokenize, drop stopwords and tokens
    of two characters or fewer, lemmatize what remains.
    """
    # Missing statements become an empty document
    if pd.isna(text):
        return ""

    # Lowercase first, then keep letters and whitespace only
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    kept = []
    for token in word_tokenize(letters_only):
        # Both filters look at the raw token, before it is lemmatized
        if len(token) <= 2 or token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))

    return ' '.join(kept)

# Apply preprocessing
# (`re`, used by preprocess_text, is imported with the other modules in the import
# cell instead of in the middle of this cell.)
print("Preprocessing text...")
df['processed_text'] = df['statement'].apply(preprocess_text)

print("✓ Text preprocessing complete!")
# Show one before/after pair as a sanity check
print("\nExample:")
print(f"Original:  {df['statement'].iloc[0]}")
print(f"Processed: {df['processed_text'].iloc[0]}")

## 3. Text Preprocessing

In [None]:
# Visualize class distribution
# Count each status once and reuse the result for both charts and the printed table
status_counts = df['status'].value_counts()

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Bar plot
status_counts.plot(kind='bar', ax=axes[0], color='skyblue', edgecolor='black')
axes[0].set_title('Mental Health Status Distribution', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Mental Health Status', fontsize=12)
axes[0].set_ylabel('Count', fontsize=12)
axes[0].tick_params(axis='x', rotation=45)
axes[0].grid(axis='y', alpha=0.3)

# Pie chart
status_counts.plot(kind='pie', ax=axes[1], autopct='%1.1f%%', startangle=90)
axes[1].set_title('Mental Health Status Percentage', fontsize=14, fontweight='bold')
axes[1].set_ylabel('')  # drop the default y-label pandas puts on pie charts

plt.tight_layout()
plt.show()

# Print statistics
print("\n📊 Class Distribution Statistics:")
for status, count in status_counts.items():
    # Percentages are relative to all rows of df
    percentage = (count / len(df)) * 100
    print(f"   {status:25s}: {count:6,} ({percentage:5.2f}%)")

In [None]:
# Load dataset
# The first CSV column is used as the row index
df = pd.read_csv('../data/Combined Data.csv', index_col=0)

print("="*60)
print("DATASET OVERVIEW")
print("="*60)
print(f"\n📊 Dataset Shape: {df.shape}")
print(f"   - Total samples: {len(df):,}")
print(f"   - Features: {len(df.columns)}")

print("\n📋 Column Names:")
print(df.columns.tolist())

print("\n🔍 First 5 rows:")
print(df.head())

print("\n📈 Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so wrapping
# it in print() would append a stray "None" line after the report.
df.info()

print("\n❓ Missing Values:")
print(df.isnull().sum())

print("\n🏷️ Mental Health Status Distribution:")
print(df['status'].value_counts())

## 2. Load and Explore Dataset

In [None]:
# Standard library
import re
import warnings

# Core libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings('ignore')

# NLP
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Dimensionality reduction
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Clustering
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.mixture import GaussianMixture

# Evaluation
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.metrics import confusion_matrix, classification_report

# Visualization
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Statistics
from scipy import stats

# Settings
# Notebook-wide plotting and display defaults
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 8)
pd.set_option('display.max_columns', None)  # never truncate columns when displaying frames

print("✓ All libraries loaded successfully!")
print(f"NumPy: {np.__version__}")
print(f"Pandas: {pd.__version__}")

## 1. Import Libraries

# 🧠 Mental Health Biomarker Discovery - Text Analysis

## Unsupervised Learning on Mental Health Sentiment Dataset

**Dataset:** Kaggle Mental Health Sentiment Analysis  
**Size:** 94,025 text statements  
**Labels:** Anxiety, Depression, Normal, Bipolar, Personality Disorder, Stress, Suicidal

---

### 🎯 Project Goals:
1. **Discover hidden subtypes** within each mental health condition
2. **Extract linguistic biomarkers** that distinguish conditions
3. **Apply unsupervised clustering** to find patterns
4. **Validate with statistical analysis**

### 📊 Pipeline:
`Text Data → Preprocessing → Feature Extraction → Dimensionality Reduction → Clustering → Analysis`