#### Experimenting with variations of the keyword model  

In this file, I'm trying to experiment with variations of the keyword model to see which one has the best performance. For different random settings, the numbers will be a bit different, so we should probably err on the side of simpler models.

In [9]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack, csr_matrix
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np

class TextRelevanceModel:
    def __init__(self, keyword_categories, general_keywords=None, negative_keywords=None, model=None):
        self.keyword_categories = keyword_categories
        self.general_keywords = general_keywords if general_keywords is not None else []
        self.negative_keywords = negative_keywords if negative_keywords is not None else []
        self.vectorizer = TfidfVectorizer()
        self.model = model if model is not None else LogisticRegression(max_iter=1000, random_state=42)
    
    def keyword_count(self, text, keywords):
        return sum(1 for word in text.lower().split() if word in keywords)
    
    def prepare_data(self, df, text_column, fit_vectorizer=False):
        # Preprocess text data (convert to lowercase)
        df[text_column] = df[text_column].str.lower()

        keyword_counts = pd.DataFrame()
        
        for category, keywords in self.keyword_categories.items():
            keyword_counts[category + '_count'] = df[text_column].apply(lambda x: self.keyword_count(x, keywords))
        
        keyword_counts['negative_keyword_count'] = df[text_column].apply(lambda x: self.keyword_count(x, self.negative_keywords))
        
        if fit_vectorizer:
            X_text = self.vectorizer.fit_transform(df[text_column])
        else:
            X_text = self.vectorizer.transform(df[text_column])
        
        X_keywords = keyword_counts.to_numpy()
        X = hstack([X_text, csr_matrix(X_keywords)])
        
        return X
    
    def train(self, X, y):
        keyword_present = np.any(X[:, -len(self.keyword_categories):].toarray(), axis=1)
        X_train = X[keyword_present]
        y_train = y[keyword_present]
        
        self.model.fit(X_train, y_train)
    
    def predict_proba(self, text):
        text = text.lower()
        X_text_new = self.vectorizer.transform([text])
        keyword_counts_new = np.array([[self.keyword_count(text, keywords) for keywords in self.keyword_categories.values()]])
        negative_keyword_count_new = np.array([[self.keyword_count(text, self.negative_keywords)]])
        
        # Check if any keywords from keyword categories or general keywords are present
        any_keyword_present = np.any(keyword_counts_new) or self.keyword_count(text, self.general_keywords) > 0
        
        if not any_keyword_present:
            return 0.0
        
        X_new = hstack([X_text_new, csr_matrix(keyword_counts_new), csr_matrix(negative_keyword_count_new)])
        
        return self.model.predict_proba(X_new)[0, 1]




In [2]:
# Example usage
if __name__ == "__main__":
    # Example DataFrame
    training_data = {
        'text': [
            'This is the first document.', 'This document is the second document.', 
            'And this is the third one.', 'Is this the first document?',
            'More text data.', 'Even more text.', 'Text data again.', 
            'And another one.', 'More examples.', 'Additional text.',
            'Sample text.', 'Another sample.', 'More samples.', 'Final example.'
        ],
        'relevance': [1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0]  # Relevance labels
    }
    df_train = pd.DataFrame(training_data)
    
    # Define categories of keywords
    keyword_categories = {
        'Category1': ['first', 'second', 'third'],
        'Category2': ['more', 'additional', 'sample']
    }
    
    # Define general keywords
    general_keywords = ['document', 'text']
    
    # Define negative keywords
    negative_keywords = ['another', 'final']
    
    # Create an instance of the model
    model = TextRelevanceModel(keyword_categories, general_keywords, negative_keywords)
    
    # Prepare data
    X_train = model.prepare_data(df_train, text_column='text')
    y_train = df_train['relevance']
    
    # Train the model
    model.train(X_train, y_train)
    
    # Example DataFrame with 5 text samples
    data = {
        'text': [
            'This is the first document.', 
            'This document is the second document.', 
            'And this is the third one.', 
            'Is this the first document?',
            'More text data.'
        ]
    }
    df_samples = pd.DataFrame(data)

    # Predict the probability of relevance for each text sample
    for i, text_sample in enumerate(df_samples['text']):
        relevance_probability = model.predict_proba(text_sample)
        print(f"Probability of relevance for sample {i+1}: {relevance_probability:.2f}")


Probability of relevance for sample 1: 0.36
Probability of relevance for sample 2: 0.36
Probability of relevance for sample 3: 0.34
Probability of relevance for sample 4: 0.36
Probability of relevance for sample 5: 0.63


In [5]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split

In [59]:
rand =614

In [60]:
# Using this on the actual dataset
if __name__ == "__main__":
    # Importing and dropping rows from Frame

    df_coded = pd.read_csv('../data/processed_and_coded_posts.csv')
    df = df_coded[['processed_text','highly_relevant']]
    
    #Importing keywords

    csv_file_path = '../keywords/medications.csv'

    # Read the CSV file
    df_med = pd.read_csv(csv_file_path)

    # Extract the first column as a list of keywords
    medications = df_med.iloc[:, 0].tolist()

    csv_file_path_2 = '../keywords/Treatment.csv'

    # Read the CSV file
    df_therapy = pd.read_csv(csv_file_path_2)

    # Extract the first column as a list of keywords
    therapy = df_therapy.iloc[:, 0].tolist()

    general_keywords = ['diagnose', 'diagnosed', 'dosage','dose', 'drug', 'drugs', 'harming', 'med', 'medication', 'medicine', 'medicines', 'meds', 'prescribe', 'prescribed', 'psychiatrist', 'psychiatrists', 'psychotherapy', 'recovery', 'session', 'therapist', 'therapists', 'therapy', 'treatment']

    # Define categories of keywords
    
    keyword_categories = {
    'medications': medications,
    'therapy': therapy
}
    
    # Define negative keywords
    negative_keywords = ['relationship', 'friend', 'fp', 'partner', 'dating']
    
    # Create an instance of the model
    model = TextRelevanceModel(keyword_categories, general_keywords, negative_keywords)
    model2 = TextRelevanceModel(keyword_categories, general_keywords, negative_keywords,AdaBoostClassifier())

    # Test train split the data
    df_train, df_test = train_test_split(df, test_size=0.4, stratify=df['highly_relevant'], random_state=rand)

     # Prepare and train the model
    X_train = model.prepare_data(df_train, text_column='processed_text', fit_vectorizer=True)
    y_train = df_train['highly_relevant']
    model.train(X_train, y_train)

    
    # Prepare the test data
    X_test = model.prepare_data(df_test, text_column='processed_text', fit_vectorizer=False)
    y_test = df_test['highly_relevant']
    
    # Predict the probability of relevance for each text sample in the test set
    y_pred_proba = []
    for text_sample in df_test['processed_text']:
        relevance_probability = model.predict_proba(text_sample)
        y_pred_proba.append(relevance_probability)
    
    
    # Convert probabilities to binary predictions using a threshold (e.g., 0.05)
    threshold = 0.05
    y_pred = [1 if prob >= threshold else 0 for prob in y_pred_proba]

   # Evaluate the model performance
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Accuracy Score:", accuracy_score(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.77      0.86       232
           1       0.10      0.75      0.18         8

    accuracy                           0.77       240
   macro avg       0.54      0.76      0.52       240
weighted avg       0.96      0.77      0.84       240

Confusion Matrix:
 [[178  54]
 [  2   6]]
Accuracy Score: 0.7666666666666667


In [76]:
model = TextRelevanceModel(keyword_categories, general_keywords, negative_keywords,AdaBoostClassifier())

model.train(X_train, y_train)

# Test train split the data
df_train, df_test = train_test_split(df, test_size=0.4, stratify=df['highly_relevant'], random_state=rand)

    # Prepare and train the model
X_train = model.prepare_data(df_train, text_column='processed_text', fit_vectorizer=True)
y_train = df_train['highly_relevant']
model.train(X_train, y_train)


# Prepare the test data
X_test = model.prepare_data(df_test, text_column='processed_text', fit_vectorizer=False)
y_test = df_test['highly_relevant']

# Predict the probability of relevance for each text sample in the test set
y_pred_proba = []
for text_sample in df_test['processed_text']:
    relevance_probability = model.predict_proba(text_sample)
    y_pred_proba.append(relevance_probability)


# Convert probabilities to binary predictions using a threshold (e.g., 0.05)
threshold = 0.2
y_pred = [1 if prob >= threshold else 0 for prob in y_pred_proba]

# Evaluate the model performance
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))




Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.74      0.84       232
           1       0.09      0.75      0.16         8

    accuracy                           0.74       240
   macro avg       0.54      0.74      0.50       240
weighted avg       0.96      0.74      0.82       240

Confusion Matrix:
 [[171  61]
 [  2   6]]
Accuracy Score: 0.7375


In [62]:
keyword_categories = {
    'medications': medications,
    'therapy': therapy,
    'general': general_keywords
}

general_keywords_2 = []

model = TextRelevanceModel(keyword_categories,general_keywords_2,negative_keywords)

model.train(X_train, y_train)

# Test train split the data
df_train, df_test = train_test_split(df, test_size=0.4, stratify=df['highly_relevant'], random_state=rand)

    # Prepare and train the model
X_train = model.prepare_data(df_train, text_column='processed_text', fit_vectorizer=True)
y_train = df_train['highly_relevant']
model.train(X_train, y_train)


# Prepare the test data
X_test = model.prepare_data(df_test, text_column='processed_text', fit_vectorizer=False)
y_test = df_test['highly_relevant']

# Predict the probability of relevance for each text sample in the test set
y_pred_proba = []
for text_sample in df_test['processed_text']:
    relevance_probability = model.predict_proba(text_sample)
    y_pred_proba.append(relevance_probability)


# Convert probabilities to binary predictions using a threshold (e.g., 0.05)
threshold = 0.05
y_pred = [1 if prob >= threshold else 0 for prob in y_pred_proba]

# Evaluate the model performance
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.85      0.92       232
           1       0.15      0.75      0.25         8

    accuracy                           0.85       240
   macro avg       0.57      0.80      0.58       240
weighted avg       0.96      0.85      0.89       240

Confusion Matrix:
 [[198  34]
 [  2   6]]
Accuracy Score: 0.85


In [68]:
keyword_categories = {
    'medications': medications,
    'therapy': therapy,
    'general': general_keywords
}

general_keywords_blank = []
negative_keywords_blank = []

model = TextRelevanceModel(keyword_categories,general_keywords_blank,negative_keywords_blank)

model.train(X_train, y_train)

# Test train split the data
df_train, df_test = train_test_split(df, test_size=0.4, stratify=df['highly_relevant'], random_state=rand)

    # Prepare and train the model
X_train = model.prepare_data(df_train, text_column='processed_text', fit_vectorizer=True)
y_train = df_train['highly_relevant']
model.train(X_train, y_train)


# Prepare the test data
X_test = model.prepare_data(df_test, text_column='processed_text', fit_vectorizer=False)
y_test = df_test['highly_relevant']

# Predict the probability of relevance for each text sample in the test set
y_pred_proba = []
for text_sample in df_test['processed_text']:
    relevance_probability = model.predict_proba(text_sample)
    y_pred_proba.append(relevance_probability)


# Convert probabilities to binary predictions using a threshold (e.g., 0.05)
threshold = 0.05
y_pred = [1 if prob >= threshold else 0 for prob in y_pred_proba]

# Evaluate the model performance
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.90      0.95       232
           1       0.23      0.88      0.37         8

    accuracy                           0.90       240
   macro avg       0.61      0.89      0.66       240
weighted avg       0.97      0.90      0.93       240

Confusion Matrix:
 [[209  23]
 [  1   7]]
Accuracy Score: 0.9


For a fairly large number of settings, this model seems to be doing the best. Here, we don't include any 'negative keywords' and just count the number of keywords in each category. In particular, there is no 'general keyword' category which isn't counted.

In [75]:
keyword_categories = {
    'medications': medications,
    'therapy': therapy,
    'general': general_keywords
}


model = TextRelevanceModel(keyword_categories,general_keywords_blank,negative_keywords_blank,AdaBoostClassifier())

model.train(X_train, y_train)

# Test train split the data
df_train, df_test = train_test_split(df, test_size=0.4, stratify=df['highly_relevant'], random_state=rand)

    # Prepare and train the model
X_train = model.prepare_data(df_train, text_column='processed_text', fit_vectorizer=True)
y_train = df_train['highly_relevant']
model.train(X_train, y_train)


# Prepare the test data
X_test = model.prepare_data(df_test, text_column='processed_text', fit_vectorizer=False)
y_test = df_test['highly_relevant']

# Predict the probability of relevance for each text sample in the test set
y_pred_proba = []
for text_sample in df_test['processed_text']:
    relevance_probability = model.predict_proba(text_sample)
    y_pred_proba.append(relevance_probability)


# Convert probabilities to binary predictions using a threshold (e.g., 0.05)
threshold = 0.3
y_pred = [1 if prob >= threshold else 0 for prob in y_pred_proba]

# Evaluate the model performance
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))




Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.88      0.93       232
           1       0.17      0.75      0.28         8

    accuracy                           0.87       240
   macro avg       0.58      0.81      0.60       240
weighted avg       0.96      0.87      0.91       240

Confusion Matrix:
 [[203  29]
 [  2   6]]
Accuracy Score: 0.8708333333333333
