#### Initial decision tree model

This model works as written, but I wanted to add a few more features to the class for ease of use. I recommend using the 'Decision_tree_model_in_one' model instead.

In [34]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack, csr_matrix
import numpy as np



Prediction probability for the sample text: 0.5922116401377012


In [35]:
class TextRelevanceModel:
    """
    Keyword-gated text relevance classifier.

    The feature matrix is the vectorizer output followed by one keyword-count
    column per category (in ``keyword_categories`` iteration order) and a final
    negative-keyword-count column. Texts containing no category keyword are
    excluded from training and always score 0.0 at prediction time.
    """

    def __init__(self, keyword_categories, negative_keywords=None, model=None):
        """
        Initialize the model with keyword categories and an optional model.

        :param keyword_categories: Dictionary where keys are category names and values are lists of keywords.
        :param negative_keywords: List of negative keywords. Defaults to an empty list.
        :param model: An optional machine learning model exposing ``fit`` and
            ``predict_proba``. Defaults to LogisticRegression.
        """
        self.keyword_categories = keyword_categories
        self.negative_keywords = negative_keywords if negative_keywords is not None else []
        self.vectorizer = TfidfVectorizer()
        self.model = model if model is not None else LogisticRegression(max_iter=1000, random_state=42)

    def keyword_count(self, text, keywords):
        """
        Count the whitespace-separated tokens of the lowercased text that are in ``keywords``.

        :param text: String to scan.
        :param keywords: Any container supporting ``in`` (list, set, ...).
        :return: Number of matching tokens; repeated tokens are counted each time.
        """
        return sum(1 for word in text.lower().split() if word in keywords)

    def _keyword_count_matrix(self, texts):
        """
        Build the keyword-count feature block for an iterable of texts.

        :param texts: Iterable of strings.
        :return: Integer array of shape (n_texts, n_categories + 1): one column
            per category in ``keyword_categories`` order, then the negative
            keyword count as the last column.
        """
        # Convert each keyword list to a set once per call so every token
        # lookup is a hash test instead of a scan over the whole list.
        lookups = [set(keywords) for keywords in self.keyword_categories.values()]
        lookups.append(set(self.negative_keywords))
        rows = [[self.keyword_count(text, lookup) for lookup in lookups] for text in texts]
        # reshape keeps the array 2-D even when ``texts`` is empty
        return np.array(rows, dtype=np.int64).reshape(len(rows), len(lookups))

    def prepare_data(self, df, text_column, target_column):
        """
        Prepare features and target variable from the dataframe.

        Fits the vectorizer on ``df[text_column]`` as a side effect.

        :param df: DataFrame holding the text and target columns.
        :param text_column: Name of the column containing the text.
        :param target_column: Name of the column containing the target.
        :return: Tuple ``(X, y)`` where ``X`` is the sparse matrix
            ``[vectorized text | category counts | negative count]`` and ``y``
            is ``df[target_column]``.
        """
        texts = df[text_column]

        # Keyword counts per category plus the trailing negative-keyword column
        X_keywords = self._keyword_count_matrix(texts)

        # Vectorize the text data
        X_text = self.vectorizer.fit_transform(texts)

        # Combine the text features and the keyword count features
        X = hstack([X_text, csr_matrix(X_keywords)])

        # Target variable
        y = df[target_column]

        return X, y

    def train(self, df, text_column, target_column):
        """
        Train the model on the rows that contain at least one category keyword.

        :param df: DataFrame holding the text and target columns.
        :param text_column: Name of the column containing the text.
        :param target_column: Name of the column containing the target.
        :raises ValueError: If no row contains a keyword from any category.
        """
        X, y = self.prepare_data(df, text_column, target_column)

        # Convert X to csr_matrix for slicing
        X = csr_matrix(X)

        # The last n_categories + 1 columns are the keyword counts, and the very
        # last one is the negative-keyword count. Select only the category
        # columns so the filter matches the gate applied in predict().
        n_categories = len(self.keyword_categories)
        first_count_col = X.shape[1] - (n_categories + 1)
        category_counts = X[:, first_count_col:first_count_col + n_categories]

        # Filter out rows where no keywords are present in any category
        keyword_present = np.any(category_counts.toarray(), axis=1)
        if not keyword_present.any():
            raise ValueError("No training rows contain a keyword from any category; nothing to fit.")
        X = X[keyword_present]
        y = y[keyword_present]

        # Fit the model on the filtered data
        self.model.fit(X, y)

    def predict(self, text):
        """
        Predict the relevance of a new text.

        :param text: String to score.
        :return: 0.0 if the text contains no category keyword; otherwise the
            model's probability for the positive class (column 1 of
            ``predict_proba``).
        """
        counts = self._keyword_count_matrix([text])

        # Gate on the category columns only; the negative count is the last column
        n_categories = len(self.keyword_categories)
        if not np.any(counts[:, :n_categories]):
            return 0.0

        # Vectorize the input text and append the counts in the training layout
        X_text_new = self.vectorizer.transform([text])
        X_new = hstack([X_text_new, csr_matrix(counts)])

        # Predict using the model
        return self.model.predict_proba(X_new)[0, 1]  # Return the probability of the positive class


In [36]:
 from sklearn.model_selection import train_test_split, cross_val_score

In [40]:
# Example usage
if __name__ == "__main__":
    # Load the coded posts and keep only the text and label columns
    df_coded = pd.read_csv('../data/processed_and_coded_posts.csv')
    df = df_coded[['processed_text', 'highly_relevant']]

    # Medication keywords: first column of the medications CSV
    csv_file_path = '../keywords/medications.csv'
    df_med = pd.read_csv(csv_file_path)
    medications = df_med.iloc[:, 0].tolist()

    # Therapy keywords: first column of the treatment CSV
    csv_file_path_2 = '../keywords/Treatment.csv'
    df_therapy = pd.read_csv(csv_file_path_2)
    therapy = df_therapy.iloc[:, 0].tolist()

    # Hand-picked general keywords (the list contains repeated entries, kept as-is)
    general_keywords = [
        'medicine', 'therapy', 'treatment', 'recovery', 'prescribed', 'diagnosed',
        'med', 'meds', 'prescribe', 'therapist', 'session',
        'psychiatrist', 'psychiatrists', 'dosage', 'medication',
        'dbt', 'abilify', 'outpatient', 'therapist', 'harming',
        'medicine', 'therapy', 'treatment', 'recovery', 'prescribed', 'diagnosed',
        'therapists', 'prescribe', 'diagnose', 'medicines',
        'drugs', 'drug', 'therapist', 'session',
    ]

    # Keyword categories, in the order their count columns are built
    keyword_categories = dict(
        general_keywords=general_keywords,
        medications=medications,
        therapy=therapy,
    )

    # Keywords counted separately as the negative keyword feature
    negative_keywords = [
        'relationship', 'friend', 'together', 'fp',
        'people', 'person', 'partner', 'dating',
    ]

    # Build the model with the keyword configuration above
    model = TextRelevanceModel(keyword_categories, negative_keywords)

    # Stratified 80/20 split on the label, with a fixed seed
    df_train, df_test = train_test_split(
        df,
        test_size=0.2,
        random_state=17,
        stratify=df['highly_relevant'],
    )

    # Fit on the training portion only
    model.train(df_train, text_column='processed_text', target_column='highly_relevant')
    
   