#### Decision tree model

This model uses a keyword search to determine if posts contain a keyword. If not, it marks them as irrelevant. If they do contain a keyword, it performs logistic regression to determine relevance. To improve performance, it also takes various lists of keywords (such as medications, therapies, etc) and a list of negative keywords which are negatively associated with relevance (e.g. 'dating', 'fp'). These negative keywords were determined by the baseline model, but can easily be altered to improve (or worsen) model performance. In addition, there is a parameter called threshold which determines the probability needed to mark a post as relevant or not. Right now, it is set to .1 (although it might be better to make it even smaller). Feel free to alter the keywords.

Right now, the model incorporates the test train split within the model, and at some point I'll try to remove that without breaking everything, but my initial attempts to do so did not work.

In [25]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

class TextRelevanceModel:
    def __init__(self, keyword_categories, negative_keywords=None, model=None):
        """
        Initialize the model with keyword categories and an optional model.
        
        :param keyword_categories: Dictionary where keys are category names and values are lists of keywords.
        :param negative_keywords: List of negative keywords.
        :param model: An optional machine learning model. Defaults to LogisticRegression.
        """
        self.keyword_categories = keyword_categories
        self.negative_keywords = negative_keywords if negative_keywords is not None else []
        self.vectorizer = TfidfVectorizer()
        self.model = model if model is not None else LogisticRegression(max_iter=1000, random_state=42)
    
    def keyword_count(self, text, keywords):
        """Count the number of keywords present in the text."""
        return sum(1 for word in text.lower().split() if word in keywords)
    
    def prepare_data(self, df, text_column, target_column):
        """Prepare features and target variable from the dataframe."""
        # Initialize a DataFrame to store keyword counts for each category
        keyword_counts = pd.DataFrame()
        
        for category, keywords in self.keyword_categories.items():
            keyword_counts[category + '_count'] = df[text_column].apply(lambda x: self.keyword_count(x, keywords))
        
        # Calculate negative keyword counts
        keyword_counts['negative_keyword_count'] = df[text_column].apply(lambda x: self.keyword_count(x, self.negative_keywords))
        
        # Vectorize the text data
        X_text = self.vectorizer.fit_transform(df[text_column])
        
        # Combine the text features and the keyword count features
        X_keywords = keyword_counts.to_numpy()
        X = hstack([X_text, csr_matrix(X_keywords)])
        
        # Target variable
        y = df[target_column]
        
        return X, y
    
    def train(self, X, y):
        """Train the model using the provided features and target variable."""
        # Filter out rows where no keywords are present in any category
        keyword_present = np.any(X[:, -len(self.keyword_categories):].toarray(), axis=1)
        X_train = X[keyword_present]
        y_train = y[keyword_present]
        
        # Fit the model on the filtered data
        self.model.fit(X_train, y_train)
    
    def predict_proba(self, text):
        """Predict the probability of relevance for a new text."""
        # Vectorize the input text
        X_text_new = self.vectorizer.transform([text])
        
        # Calculate keyword counts for the new text
        keyword_counts_new = np.array([[self.keyword_count(text, keywords) for keywords in self.keyword_categories.values()]])
        negative_keyword_count_new = np.array([[self.keyword_count(text, self.negative_keywords)]])
        
        # Combine features
        X_new = hstack([X_text_new, csr_matrix(keyword_counts_new), csr_matrix(negative_keyword_count_new)])
        
        # Check if any keyword from the categories is present
        if not np.any(keyword_counts_new):
            return 0.0
        
        # Predict using the model
        return self.model.predict_proba(X_new)[0, 1]  # Return the probability of the positive class
    
    def evaluate(self, df, text_column, target_column, threshold=0.5):
        """Evaluate the model performance on the provided dataframe."""
        X, y = self.prepare_data(df, text_column, target_column)
        
        # Split the data into train and test sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
        
        # Train the model on the training set
        self.train(X_train, y_train)
        
        # Predict probabilities on the test set
        y_proba = np.array([self.predict_proba(text) for text in df[text_column].iloc[y_test.index]])
        
        # Apply the threshold to get the predicted classes
        y_pred = (y_proba >= threshold).astype(int)
        
        # Print the classification report
        print(classification_report(y_test, y_pred))
        
        # Print the confusion matrix
        print(confusion_matrix(y_test, y_pred))
        
        return y_pred

In [26]:
# Example usage
if __name__ == "__main__":
    # Importing and dropping rows from ataFrame

    df_coded = pd.read_csv('../data/processed_and_coded_posts.csv')
    df = df_coded[['processed_text','highly_relevant']]
    
    #Importing keywords

    csv_file_path = '../keywords/medications.csv'

    # Read the CSV file
    df_med = pd.read_csv(csv_file_path)

    # Extract the first column as a list of keywords
    medications = df_med.iloc[:, 0].tolist()

    csv_file_path_2 = '../keywords/Treatment.csv'

    # Read the CSV file
    df_therapy = pd.read_csv(csv_file_path_2)

    # Extract the first column as a list of keywords
    therapy = df_therapy.iloc[:, 0].tolist()

    general_keywords = ['medicine','therapy','treatment','recovery','prescribed','diagnosed','med','meds','prescribe','therapist','session','psychiatrist','psychiatrists','dosage','medication', 'dbt', 'abilify', 'outpatient', 'therapist', 'harming','medicine','therapy','treatment','recovery','prescribed','diagnosed','therapists','prescribe','diagnose','medicines','drugs','drug','therapist','session']

    # Define categories of keywords
    
    keyword_categories = {
    'general_keywords': general_keywords,
    'medications': medications,
    'therapy': therapy
    }
    
    # Define negative keywords
    negative_keywords = ['relationship', 'friend', 'together', 'fp', 'people', 'person', 'partner', 'dating']
    
    

 # Create an instance of the model
    model = TextRelevanceModel(keyword_categories, negative_keywords)
    
    # Evaluate the model on the DataFrame with a specified threshold to increase sensitivity
    model.evaluate(df, text_column='processed_text', target_column='highly_relevant', threshold=0.1)



              precision    recall  f1-score   support

           0       1.00      0.95      0.97       116
           1       0.40      1.00      0.57         4

    accuracy                           0.95       120
   macro avg       0.70      0.97      0.77       120
weighted avg       0.98      0.95      0.96       120

[[110   6]
 [  0   4]]
