In [None]:
# Colab-only helper: ask the runtime for a file upload (google.colab `files.upload()`).
from google.colab import files
# `upload` keeps whatever files.upload() returns; no visible cell reads it again.
upload = files.upload()

In [None]:

import pandas as pd
import numpy as np
pd.set_option('max_colwidth', None)

import matplotlib.pyplot as plt
import seaborn as sns


import re

from nltk.corpus import stopwords


from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer


from sklearn.model_selection import train_test_split


from sklearn.ensemble import RandomForestClassifier


from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


from sklearn.model_selection import GridSearchCV

In [None]:

# Read the Excel dataset from the Colab working directory into `df`,
# then preview the first rows.
df = pd.read_excel('/content/synthetic_rural_health_data.xlsx')
df.head()

In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): no visible cell reads from /content/drive (the dataset is loaded
# from /content directly) — confirm this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#️⃣ Install Extra Libraries (for embeddings and preprocessing)
!pip install -q gensim sentence-transformers emoji contractions


In [None]:
# Basic data info and structure: shape, column names, dtypes, and a preview.

print(f"Dataset Shape: {df.shape}")
print(f"\nColumn Names: {df.columns.tolist()}")
df.info()
df.head()


In [None]:
# Data-quality check: null count per column and number of fully duplicated rows.

print(f"Missing values per column:\n {df.isnull().sum()}")
print(f"\nTotal Duplicates: {df.duplicated().sum()}")


In [None]:
# Display the column index so the text/label column names can be picked in the next cell.
df.columns


In [None]:
# Column holding the free-text input and column holding the target label.
text_col, label_col = 'symptoms', 'diabetes'


In [None]:
# Keep only the text and label columns and drop rows with no text.
# The selection and dropna are chained and followed by an explicit copy so that
# `df` is an independent frame: the original in-place dropna on a column slice
# (and the later `df['clean_text'] = ...` assignment) triggers pandas'
# SettingWithCopyWarning.
df = df[[text_col, label_col]].dropna(subset=[text_col]).copy()
df.head()


In [None]:
import nltk

# Tokenizer model and stop-word list are always (re)requested.
for resource_name in ('punkt', 'stopwords'):
    nltk.download(resource_name)

# 'punkt_tab' is only downloaded when it cannot already be found locally.
try:
    nltk.data.find("tokenizers/punkt_tab")
except LookupError:
    nltk.download("punkt_tab")


In [None]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

# Single Porter stemmer instance, reused for every token inside clean_text().
ps = PorterStemmer()
# English stop words stored as a set, which clean_text() uses for its membership test.
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalise one document into a space-separated string of stemmed tokens.

    The input is coerced to ``str``, every character outside ``a-z``/``A-Z`` is
    replaced by a space, and the result is lower-cased and tokenised. Tokens
    found in the module-level ``stop_words`` set are discarded; the remaining
    tokens are stemmed with the module-level Porter stemmer ``ps``.

    Args:
        text: Raw document; any object is accepted because it is passed
            through ``str()`` first.

    Returns:
        str: The surviving stems joined by single spaces (``''`` when no
        token survives).
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', str(text)).lower()

    stems = []
    for token in word_tokenize(letters_only):
        # Stop words are filtered on the raw (unstemmed) token.
        if token in stop_words:
            continue
        stems.append(ps.stem(token))

    return ' '.join(stems)




In [None]:
# Run every raw text entry through clean_text() and store the result alongside it.
df['clean_text'] = df[text_col].map(clean_text)
df.head()

In [None]:
# --- Basic dataset info ---
print(f"Dataset shape: {df.shape}")
print("\nClass distribution:")
print(df[label_col].value_counts())

# --- Check missing values ---
print("\nMissing values per column:")
print(df.isnull().sum())

# --- Text length analysis ---
# Character count and whitespace-separated token count of the cleaned text.
df['text_length'] = df['clean_text'].map(len)
df['word_count'] = df['clean_text'].map(lambda cleaned: len(cleaned.split()))

print(f"\nAverage text length: {df['text_length'].mean()}")
print(f"Average word count: {df['word_count'].mean()}")

In [None]:
# --- Visualize text length distribution ---
# Histogram (20 bins) of words per cleaned document, with a KDE overlay.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df['word_count'], bins=20, kde=True, ax=ax)
ax.set_title('Word Count Distribution')
ax.set_xlabel('Word Count')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# --- Visualize label distribution ---
# One bar per distinct value of the label column.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=label_col, data=df, ax=ax)
ax.set_title('Label Distribution')
plt.show()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectoriser with the vocabulary capped at 5000 features.
cv = CountVectorizer(max_features=5000)

# Learn the vocabulary from the cleaned text, then materialise the counts as a
# dense array.
# NOTE(review): the vocabulary is fit on all rows before the train/test split
# performed in a later cell — confirm this is acceptable for the evaluation.
bow_counts = cv.fit_transform(df['clean_text'])
X = bow_counts.toarray()

# Target labels
y = df[label_col]

print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")


In [None]:
# Peek at the first 50 entries of the learned vocabulary.
vocabulary = list(cv.get_feature_names_out())
print(f"Sample vocabulary words: {vocabulary[:50]}")


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Stratified 80/20 split of the bag-of-words features.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

# Fit a 100-tree random forest on the training part and predict the held-out part.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Report accuracy, per-class metrics and the confusion matrix on the test split.
print(f"\nAccuracy: {accuracy_score(y_test, y_pred)}")
print(f"\nClassification Report:\n {classification_report(y_test, y_pred)}")
print(f"\nConfusion Matrix:\n {confusion_matrix(y_test, y_pred)}")


In [None]:
!pip install gensim
from gensim.models import Word2Vec


In [None]:
from nltk.tokenize import word_tokenize

# One token list per cleaned document; this is the corpus format passed to Word2Vec.
sentences = list(map(word_tokenize, df['clean_text']))
print(f"Example tokenized sentence:\n {sentences[0]}")


In [None]:
# Train Word2Vec model.
# The seed matches the random_state=42 used for the splits and forests so the
# embeddings can be reproduced between runs. gensim documents that a
# reproducible run also needs a single worker thread (multi-threaded training
# is subject to OS scheduling jitter), hence workers=1.
# NOTE(review): gensim additionally mentions fixing PYTHONHASHSEED for full
# determinism — confirm whether that matters for this runtime.
w2v_model = Word2Vec(
    sentences,
    vector_size=100,   # size of each word vector
    window=5,          # context window
    min_count=1,       # ignore words with freq < 1 (i.e. keep every word)
    workers=1,         # single thread for reproducibility
    sg=1,              # 1 = Skip-gram; 0 = CBOW
    seed=42            # fixed RNG seed
)

# Save model for later use
w2v_model.save("word2vec_model.bin")

print("Word2Vec model trained successfully!")
print("Vocabulary size:", len(w2v_model.wv))


In [None]:
import numpy as np

def get_vector(sentence, model=None):
    """Average the word vectors of the in-vocabulary tokens of one sentence.

    Args:
        sentence: Iterable of string tokens.
        model: Object exposing a ``wv`` attribute that supports ``token in wv``,
            ``wv[list_of_tokens]`` and ``wv.vector_size``. Defaults to the
            module-level ``w2v_model`` so existing one-argument calls keep
            working.

    Returns:
        numpy.ndarray: 1-D mean vector of the known tokens, or an all-zero
        vector of length ``wv.vector_size`` when no token is in the vocabulary.
    """
    if model is None:
        model = w2v_model
    keyed_vectors = model.wv

    known_tokens = [token for token in sentence if token in keyed_vectors]
    if not known_tokens:
        # Size the fallback from the model itself instead of a hard-coded 100,
        # so it cannot drift from the dimensionality the model was trained with.
        return np.zeros(keyed_vectors.vector_size)
    return np.mean(keyed_vectors[known_tokens], axis=0)

# Stack one averaged sentence vector per document into the feature matrix.
X_w2v = np.array(list(map(get_vector, sentences)))
print(f"Word2Vec feature matrix shape: {X_w2v.shape}")


In [None]:
# Stratified 80/20 split of the averaged Word2Vec features.
X_train, X_test, y_train, y_test = train_test_split(
    X_w2v,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Same 100-tree forest configuration as the bag-of-words run.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print(f"\nAccuracy: {accuracy_score(y_test, y_pred)}")
print(f"\nClassification Report:\n {classification_report(y_test, y_pred)}")


In [None]:
!pip install transformers torch


In [None]:
import torch
from transformers import BertTokenizer, BertModel
import numpy as np


In [None]:
# Load BERT model and tokenizer.
# Both are loaded from the same 'bert-base-uncased' checkpoint name.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')


In [None]:
def get_bert_embeddings(text_list, tokenizer, model, max_length=64, batch_size=32):
    """Encode texts into mean-pooled embeddings of the model's last hidden state.

    Texts are processed in mini-batches instead of one forward pass per text,
    and each batch is padded only to its own longest sequence. Padding
    positions are excluded from the mean through the attention mask, so the
    pooled vector does not depend on how much padding was added.

    Args:
        text_list: Iterable of input texts.
        tokenizer: Callable accepting a list of texts plus ``return_tensors``,
            ``truncation``, ``padding`` and ``max_length`` keyword arguments and
            returning a mapping of tensors that contains ``'attention_mask'``.
        model: Callable accepting that mapping as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor. It must
            provide ``eval()``; if it exposes a ``device`` attribute the inputs
            are moved there, otherwise the CPU is used.
        max_length: Maximum number of tokens kept per text (longer inputs are
            truncated).
        batch_size: Number of texts per forward pass; must be >= 1.

    Returns:
        numpy.ndarray: One row per input text (``n_texts x hidden_size``), or
        an empty 1-D array when ``text_list`` is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    model.eval()
    texts = list(text_list)

    # Run on whatever device the model lives on so a GPU-resident model works.
    device = getattr(model, "device", None)
    if device is None:
        device = torch.device("cpu")

    batch_embeddings = []
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]

        # Tokenize and encode; padding=True pads to the longest text in the batch.
        encoded = tokenizer(
            batch_texts,
            return_tensors='pt',
            truncation=True,
            padding=True,
            max_length=max_length,
        )
        inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

        # Inference only: no gradients needed.
        with torch.no_grad():
            outputs = model(**inputs)

        # Masked mean over the sequence axis (padding positions contribute 0).
        last_hidden_state = outputs.last_hidden_state
        mask = inputs['attention_mask'].unsqueeze(-1).to(last_hidden_state.dtype)
        token_sum = (last_hidden_state * mask).sum(dim=1)
        # Clamp guards against division by zero for an all-padding row.
        token_count = torch.clamp(mask.sum(dim=1), min=1e-9)

        # .cpu() is required before .numpy() when the tensors live on a GPU.
        batch_embeddings.append((token_sum / token_count).cpu().numpy())

    if not batch_embeddings:
        return np.array([])
    return np.concatenate(batch_embeddings, axis=0)


In [None]:
# Embed only a random subset to keep the BERT pass fast. The request is capped
# at the number of available rows because DataFrame.sample raises ValueError
# when asked for more rows than exist (sampling without replacement).
# NOTE(review): the text passed to BERT is the stemmed, stop-word-stripped
# `clean_text` column — consider whether the raw text column should be used instead.
sample_size = min(300, len(df))
sample_df = df.sample(n=sample_size, random_state=42)
X_bert = get_bert_embeddings(sample_df['clean_text'].tolist(), tokenizer, bert_model)
y_bert = sample_df[label_col].values

print("BERT embeddings shape:", X_bert.shape)


In [None]:
# Stratified 80/20 split of the BERT embeddings computed for the sampled rows.
X_train, X_test, y_train, y_test = train_test_split(
    X_bert,
    y_bert,
    test_size=0.2,
    random_state=42,
    stratify=y_bert,
)

# Same 100-tree forest configuration as the previous feature sets.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print(f"\nAccuracy: {accuracy_score(y_test, y_pred)}")
print(f"\nClassification Report:\n {classification_report(y_test, y_pred)}")


In [None]:
# Persist the BERT feature matrix as a NumPy .npy file.
np.save('bert_embeddings.npy', X_bert)
import joblib
# Serialize the current `rf`; when cells are run top to bottom this is the
# forest fit on the BERT features in the preceding cell.
joblib.dump(rf, 'bert_rf_model.pkl')


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import matplotlib.pyplot as plt


def evaluate_features(features, labels, test_size=0.2, seed=42):
    """Train and score a random forest on one feature representation.

    Uses the same protocol as the per-embedding cells: a stratified split with
    ``random_state=seed`` and a 100-tree ``RandomForestClassifier``.

    Args:
        features: Feature matrix with one row per sample.
        labels: Target labels aligned with ``features``.
        test_size: Fraction of samples held out for evaluation.
        seed: Random state for both the split and the forest.

    Returns:
        dict: ``Accuracy``, ``Precision``, ``Recall`` and ``F1-score`` measured
        on the held-out split. Precision/recall/F1 are support-weighted
        averages over the classes, so the computation works for any label
        encoding and for more than two classes.
    """
    feat_train, feat_test, lab_train, lab_test = train_test_split(
        features, labels, test_size=test_size, random_state=seed, stratify=labels
    )
    forest = RandomForestClassifier(n_estimators=100, random_state=seed)
    forest.fit(feat_train, lab_train)
    predicted = forest.predict(feat_test)

    # zero_division=0 reports 0 (instead of warning) for a class that is never predicted.
    averaging = {'average': 'weighted', 'zero_division': 0}
    return {
        'Accuracy': accuracy_score(lab_test, predicted),
        'Precision': precision_score(lab_test, predicted, **averaging),
        'Recall': recall_score(lab_test, predicted, **averaging),
        'F1-score': f1_score(lab_test, predicted, **averaging),
    }


# Scores are measured from the feature matrices built earlier in the notebook
# rather than typed in by hand, so the table always reflects the current data.
# NOTE(review): no GloVe feature matrix is built in the visible cells, so no
# 'GloVe' row is reported; add it here once such features are computed.
results = {
    'BoW': evaluate_features(X, y),
    'Word2Vec': evaluate_features(X_w2v, y),
    'BERT': evaluate_features(X_bert, y_bert),
}

# Convert to DataFrame (one row per embedding technique, one column per metric)
df_results = pd.DataFrame(results).T
print(df_results)


In [None]:
# Grouped bar chart: one group per embedding technique, one bar per metric.
# DataFrame.plot creates its own figure here, so no separate plt.figure() call
# is made (that would leave a second, empty figure in the output).
ax = df_results.plot(kind='bar', figsize=(10, 6), rot=0)
ax.set_title('Model Performance Comparison Across Embedding Techniques')
ax.set_ylabel('Score')

# Keep the 0.7 lower bound when every score fits, otherwise extend the axis
# down to the next lower tenth so that no bar is clipped.
lowest_score = float(df_results.min().min())
ax.set_ylim(min(0.7, int(lowest_score * 10) / 10), 1.0)

ax.legend(title='Metrics')
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()


In [None]:
# Write the comparison table to CSV; index=True keeps the row labels
# (the embedding names) as the first column.
df_results.to_csv('embedding_comparison_results.csv', index=True)
