#### Testing the relevance model on the manually coded comments

As a final step, we run the relevance model on the testing set coded by Eunbin. Right now, this only contains relevant comments, so we can only assess the recall of the model, not the precision. We will have to enlarge the testing set with the irrelevant comments to assess performance.


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot  as plt

In [5]:
#Loading the manually coded relevant comments

df1=pd.read_csv('../../esk_working/val_df.csv')
df1=df1.dropna(how='any')
# Dropping the data points with null values 
df1 = df1.dropna(how = 'any', axis = 0)
# lowercasing the column names so it will be easier for access ^^
df1.columns = df1.columns.str.lower()

In [8]:
df1.head()

Unnamed: 0,comment,author,post,relevance
0,I’ve been seeing a therapist since the age of ...,ChocCoveredSarcasm,1bmk9m2,1
1,i’ve been in and out of therapy (mostly in) fo...,oddthing757,1bmk9m2,1
2,Been in regular (twice a week) therapy for ove...,bedrock_BEWD,1bmk9m2,1
3,In therapy since I was 14. It's been over 20 y...,Own_Collection_8916,1bmk9m2,1
4,"Tried CBT for years, and never got anywhere. \...",sky-amethyst23,1bmk9m2,1


In [9]:
# Step 1: Changing to Lower Case
df1['comment'] = df1['comment'].str.lower()

# Step 2: Replacing the Repeating Pattern of '&#039;'
df1['comment'] = df1['comment'].str.replace("&#039;", "")

# Step 3: Removing All Special Characters
df1['comment'] = df1['comment'].str.replace(r'[^\w\d\s]', '')

# Step 4: Removing Leading and Trailing Whitespaces
df1['comment'] = df1['comment'].str.strip()

# Step 5: Replacing Multiple Spaces with Single Space
df1['comment'] = df1['comment'].str.replace(r'\s+', ' ')

In [11]:
# Assuming 'selftext' is one of the columns you expect in df1
# You should check the actual columns in your DataFrame
# Make sure to load your DataFrame properly before running these operations

# Check if 'selftext' is in the columns
if 'comment' in df1.columns:
    # Drop rows where 'selftext' is '[removed]' or '\[removed\]'
    df1.drop(df1[(df1['comment'] =='\\[removed\\]')].index, inplace=True)
    df1.drop(df1[(df1['comment'] =='[removed]')].index, inplace=True)

    df1.drop(df1[(df1['comment'] =='\\[deleted\\]')].index, inplace=True)
    df1.drop(df1[(df1['comment'] =='[deleted]')].index, inplace=True)


    # Drop rows with missing values
    df1.dropna(inplace=True)


    # Randomly sample 2 rows
    print(df1.sample(2))
else:
    print("'selftext' column not found in DataFrame")

                                               comment            author  \
71   >i don't currently go to therapy,\n\nunfortuna...  Throwawayabc2345   
282  its a no for me, none of them really helped.\n...         trikkiirl   

        post  relevance  
71   12pyiln          1  
282  14s0tfr          1  


In [14]:
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, make_scorer, accuracy_score
from sklearn.metrics import recall_score, precision_score
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

In [15]:


# Punctuation removal
def remove_punctuation(text):
    return ''.join([char for char in text if char not in string.punctuation])

# Text preprocessing function
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    
    # Tokenization
    tokens = remove_punctuation(text).split()
    
    # Lowercase and remove stopwords
    tokens = [word.lower() for word in tokens if word.lower() not in stop_words]
    
    # Lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    return ' '.join(tokens)

# Apply preprocessing to the combined text column
df1['processed_comment'] = df1['comment'].apply(preprocess_text)
df1['processed_comment'] = df1['processed_comment'].apply(remove_punctuation)


#### Loading and training the decision tree model

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack, csr_matrix
import numpy as np

class TextRelevanceModel:
    def __init__(self, keyword_categories, negative_keywords=None, model=None):
        self.keyword_categories = keyword_categories
        self.negative_keywords = negative_keywords if negative_keywords is not None else []
        self.vectorizer = TfidfVectorizer()
        self.model = model if model is not None else LogisticRegression(max_iter=1000, random_state=42)
    
    def keyword_count(self, text, keywords):
        return sum(1 for word in text.lower().split() if word in keywords)
    
    def prepare_data(self, df, text_column):
        df[text_column] = df[text_column].str.lower()

        keyword_counts = pd.DataFrame()
        
        for category, keywords in self.keyword_categories.items():
            keyword_counts[category + '_count'] = df[text_column].apply(lambda x: self.keyword_count(x, keywords))
        
        keyword_counts['negative_keyword_count'] = df[text_column].apply(lambda x: self.keyword_count(x, self.negative_keywords))
        
        X_text = self.vectorizer.fit_transform(df[text_column])
        X_keywords = keyword_counts.to_numpy()
        X = hstack([X_text, csr_matrix(X_keywords)])
        
        return X
    
    def train(self, X, y):
        keyword_present = np.any(X[:, -len(self.keyword_categories):].toarray(), axis=1)
        X_train = X[keyword_present]
        y_train = y[keyword_present]
        
        self.model.fit(X_train, y_train)
    
    def predict_proba(self, text):
        X_text_new = self.vectorizer.transform([text])
        keyword_counts_new = np.array([[self.keyword_count(text, keywords) for keywords in self.keyword_categories.values()]])
        negative_keyword_count_new = np.array([[self.keyword_count(text, self.negative_keywords)]])
        X_new = hstack([X_text_new, csr_matrix(keyword_counts_new), csr_matrix(negative_keyword_count_new)])
        
        if not np.any(keyword_counts_new):
            return 0.0
        
        return self.model.predict_proba(X_new)[0, 1]


In [17]:
# Example usage
if __name__ == "__main__":
    # Importing and dropping rows from Frame

    df_coded = pd.read_csv('../data/processed_and_coded_posts.csv')
    df2 = df_coded[['processed_text','highly_relevant']]
    
    #Importing keywords

    csv_file_path = '../keywords/medications.csv'

    # Read the CSV file
    df_med = pd.read_csv(csv_file_path)

    # Extract the first column as a list of keywords
    medications = df_med.iloc[:, 0].tolist()

    csv_file_path_2 = '../keywords/Treatment.csv'

    # Read the CSV file
    df_therapy = pd.read_csv(csv_file_path_2)

    # Extract the first column as a list of keywords
    therapy = df_therapy.iloc[:, 0].tolist()

    general_keywords = ['diagnose', 'diagnosed', 'dosage','dose', 'drug', 'drugs', 'harming', 'med', 'medication', 'medicine', 'medicines', 'meds', 'prescribe', 'prescribed', 'psychiatrist', 'psychiatrists', 'psychotherapy', 'recovery', 'session', 'therapist', 'therapists', 'therapy', 'treatment']

    # Define categories of keywords
    
    keyword_categories = {
    'general_keywords': general_keywords,
    'medications': medications,
    'therapy': therapy
}
    
    # Define negative keywords
    #negative_keywords = ['relationship', 'friend', 'together', 'fp', 'people', 'person', 'partner', 'dating']
    negative_keywords_2 = []


    # Create an instance of the model
    model = TextRelevanceModel(keyword_categories,negative_keywords_2)
    



    #Prepare data
    X = model.prepare_data(df2, text_column='processed_text')
    y = df2['highly_relevant']

    # Train the model
    model.train(X, y)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df[text_column].str.lower()


### Running the model on the manually coded dataset

Here, I use .1 as a threshold because that is what was done in the previous evaluation.

In [18]:
df1['relevance_probability']=0.0

In [19]:
#Making sure the code is working
model.predict_proba('This is a random string with dbt in it.')

0.14374214917702083

In [21]:
df1.head()

Unnamed: 0,comment,author,post,relevance,processed_comment,relevance_probability
0,i’ve been seeing a therapist since the age of ...,ChocCoveredSarcasm,1bmk9m2,1,i’ve seeing therapist since age 25 i’m 47 i’ve...,0.0
1,i’ve been in and out of therapy (mostly in) fo...,oddthing757,1bmk9m2,1,i’ve therapy mostly past 10ish year helpful ha...,0.0
2,been in regular (twice a week) therapy for ove...,bedrock_BEWD,1bmk9m2,1,regular twice week therapy ten year started ad...,0.0
3,in therapy since i was 14. it's been over 20 y...,Own_Collection_8916,1bmk9m2,1,therapy since 14 20 year ive learned lot year ...,0.0
4,"tried cbt for years, and never got anywhere. \...",sky-amethyst23,1bmk9m2,1,tried cbt year never got anywhere dbt iop save...,0.0


In [22]:
# Function to get the relevance probability for a given text
def get_relevance_probability(text):
    return model.predict_proba(text)

# Apply the function to each row in the 'processed_text' column and create the new column
df1['relevance_probability'] = df1['processed_comment'].apply(get_relevance_probability)


In [31]:
df2=df1[df1['relevance_probability']>.1]

In [32]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 294 entries, 0 to 310
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   comment                294 non-null    object 
 1   author                 294 non-null    object 
 2   post                   294 non-null    object 
 3   relevance              294 non-null    int64  
 4   processed_comment      294 non-null    object 
 5   relevance_probability  294 non-null    float64
dtypes: float64(1), int64(1), object(4)
memory usage: 16.1+ KB


This has length 294, which means that the model thought that 294 out of the original 311 comments were relevant. Therefore, we have a recall of 95%.