In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack, csr_matrix
import numpy as np

class TextRelevanceModel:
    def __init__(self, keyword_categories, negative_keywords=None, model=None):
        self.keyword_categories = keyword_categories
        self.negative_keywords = negative_keywords if negative_keywords is not None else []
        self.vectorizer = TfidfVectorizer()
        self.model = model if model is not None else LogisticRegression(max_iter=1000, random_state=42)
    
    def keyword_count(self, text, keywords):
        return sum(1 for word in text.lower().split() if word in keywords)
    
    def prepare_data(self, df, text_column):
        df[text_column] = df[text_column].str.lower()

        keyword_counts = pd.DataFrame()
        
        for category, keywords in self.keyword_categories.items():
            keyword_counts[category + '_count'] = df[text_column].apply(lambda x: self.keyword_count(x, keywords))
        
        keyword_counts['negative_keyword_count'] = df[text_column].apply(lambda x: self.keyword_count(x, self.negative_keywords))
        
        X_text = self.vectorizer.fit_transform(df[text_column])
        X_keywords = keyword_counts.to_numpy()
        X = hstack([X_text, csr_matrix(X_keywords)])
        
        return X
    
    def train(self, X, y):
        keyword_present = np.any(X[:, -len(self.keyword_categories):].toarray(), axis=1)
        X_train = X[keyword_present]
        y_train = y[keyword_present]
        
        self.model.fit(X_train, y_train)
    
    def predict_proba(self, text):
        X_text_new = self.vectorizer.transform([text])
        keyword_counts_new = np.array([[self.keyword_count(text, keywords) for keywords in self.keyword_categories.values()]])
        negative_keyword_count_new = np.array([[self.keyword_count(text, self.negative_keywords)]])
        X_new = hstack([X_text_new, csr_matrix(keyword_counts_new), csr_matrix(negative_keyword_count_new)])
        
        if not np.any(keyword_counts_new):
            return 0.0
        
        return self.model.predict_proba(X_new)[0, 1]


In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVR

In [7]:
# Example usage
if __name__ == "__main__":
    # Example DataFrame
    data = {
        'text': [
            'This is the first document.', 'This document is the second document.', 
            'And this is the third one.', 'Is this the first document?',
            'More text data.', 'Even more text.', 'Text data again.', 
            'And another one.', 'More examples.', 'Additional text.',
            'Sample text.', 'Another sample.', 'More samples.', 'Final example.'
        ],
        'relevance': [1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0]  # Relevance labels
    }
    df = pd.DataFrame(data)
    
    # Define categories of keywords
    keyword_categories = {
        'Category1': ['first', 'second', 'third'],
        'Category2': ['more', 'additional', 'sample']
    }
    
    # Define negative keywords
    negative_keywords = ['another', 'final']
    
    # Create an instance of the model
    model = TextRelevanceModel(keyword_categories, negative_keywords,KNeighborsClassifier())
    
    # Prepare data
    X = model.prepare_data(df, text_column='text')
    y = df['relevance']
    
    # Train the model
    model.train(X, y)
    
    # Predict the probability of relevance for a new sample of text
    text_sample = "This is a final document"
    relevance_probability = model.predict_proba(text_sample)
    print("Probability of relevance:", relevance_probability)


Probability of relevance: 0.0


In [20]:
# Example DataFrame with 5 text samples
data = {
    'text': [
        'This is the first document.', 
        'This document is the second document.', 
        'And this is the third one.', 
        'Is this the first document?',
        'More text data.'
    ]
}
df_samples = pd.DataFrame(data)

# Predict the probability of relevance for each text sample
for i, text_sample in enumerate(df_samples['text']):
    relevance_probability = model.predict_proba(text_sample)
    print(f"Probability of relevance for sample {i+1}: {relevance_probability:.2f}")


Probability of relevance for sample 1: 0.36
Probability of relevance for sample 2: 0.36
Probability of relevance for sample 3: 0.34
Probability of relevance for sample 4: 0.36
Probability of relevance for sample 5: 0.63


In [9]:
# Example usage
if __name__ == "__main__":
    # Importing and dropping rows from Frame

    df_coded = pd.read_csv('../data/processed_and_coded_posts.csv')
    df = df_coded[['processed_text','highly_relevant']]
    
    #Importing keywords

    csv_file_path = '../keywords/medications.csv'

    # Read the CSV file
    df_med = pd.read_csv(csv_file_path)

    # Extract the first column as a list of keywords
    medications = df_med.iloc[:, 0].tolist()

    csv_file_path_2 = '../keywords/Treatment.csv'

    # Read the CSV file
    df_therapy = pd.read_csv(csv_file_path_2)

    # Extract the first column as a list of keywords
    therapy = df_therapy.iloc[:, 0].tolist()

    general_keywords = ['diagnose', 'diagnosed', 'dosage','dose', 'drug', 'drugs', 'harming', 'med', 'medication', 'medicine', 'medicines', 'meds', 'prescribe', 'prescribed', 'psychiatrist', 'psychiatrists', 'psychotherapy', 'recovery', 'session', 'therapist', 'therapists', 'therapy', 'treatment']

    # Define categories of keywords
    
    keyword_categories = {
    'general_keywords': general_keywords,
    'medications': medications,
    'therapy': therapy
}
    
    # Define negative keywords
    #negative_keywords = ['relationship', 'friend', 'together', 'fp', 'people', 'person', 'partner', 'dating']
    negative_keywords_2 = []


    # Create an instance of the model
    model = TextRelevanceModel(keyword_categories,negative_keywords_2)
    



    #Prepare data
    X = model.prepare_data(df, text_column='processed_text')
    y = df['highly_relevant']

    # Train the model
    model.train(X, y)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[text_column] = df[text_column].str.lower()


In [11]:
df.columns

Index(['processed_text', 'highly_relevant'], dtype='object')

#### Using the model to predict relevance

Now we can turn the model loose on the data set. Note that this code has a fundamental flaw, in that I am fitting the model using the entire data set and then using those to predict the probabilities. In practice, we will want to use the entire coded data set to fit the probabilities, and then code up a test data set to evaluate the model. Finally, we will predict the probabilities for all the posts in the entire BPD subreddit dataset. This will give us our collection of relevant posts.

In [59]:
# Predict the probability of relevance for each text sample
df_coded['relevance_probability']=0.0

for i, text_sample in enumerate(df_coded['processed_text']):
    relevance_probability = model.predict_proba(text_sample)
    df_coded['relevance_probability'][i]=relevance_probability
    print(f"Probability of relevance for sample {i+1}: {relevance_probability:.2f}")

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_test['relevance_probability'][i]=relevance_probability
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_te

Probability of relevance for sample 1: 0.26
Probability of relevance for sample 2: 0.00
Probability of relevance for sample 3: 0.00
Probability of relevance for sample 4: 0.00
Probability of relevance for sample 5: 0.00
Probability of relevance for sample 6: 0.00
Probability of relevance for sample 7: 0.37
Probability of relevance for sample 8: 0.02
Probability of relevance for sample 9: 0.29
Probability of relevance for sample 10: 0.00
Probability of relevance for sample 11: 0.28
Probability of relevance for sample 12: 0.30
Probability of relevance for sample 13: 0.12
Probability of relevance for sample 14: 0.00
Probability of relevance for sample 15: 0.00
Probability of relevance for sample 16: 0.00
Probability of relevance for sample 17: 0.04
Probability of relevance for sample 18: 0.00
Probability of relevance for sample 19: 0.22
Probability of relevance for sample 20: 0.00


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_test['relevance_probability'][i]=relevance_probability
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_te

Probability of relevance for sample 21: 0.00
Probability of relevance for sample 22: 0.00
Probability of relevance for sample 23: 0.00
Probability of relevance for sample 24: 0.00
Probability of relevance for sample 25: 0.00
Probability of relevance for sample 26: 0.13
Probability of relevance for sample 27: 0.00
Probability of relevance for sample 28: 0.37
Probability of relevance for sample 29: 0.00
Probability of relevance for sample 30: 0.00
Probability of relevance for sample 31: 0.65
Probability of relevance for sample 32: 0.13
Probability of relevance for sample 33: 0.00
Probability of relevance for sample 34: 0.00
Probability of relevance for sample 35: 0.16
Probability of relevance for sample 36: 0.21
Probability of relevance for sample 37: 0.00
Probability of relevance for sample 38: 0.00
Probability of relevance for sample 39: 0.69
Probability of relevance for sample 40: 0.00
Probability of relevance for sample 41: 0.00
Probability of relevance for sample 42: 0.00
Probabilit

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_test['relevance_probability'][i]=relevance_probability
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_te

Probability of relevance for sample 45: 0.19
Probability of relevance for sample 46: 0.00
Probability of relevance for sample 47: 0.00
Probability of relevance for sample 48: 0.00
Probability of relevance for sample 49: 0.00
Probability of relevance for sample 50: 0.25
Probability of relevance for sample 51: 0.00
Probability of relevance for sample 52: 0.00
Probability of relevance for sample 53: 0.00
Probability of relevance for sample 54: 0.00
Probability of relevance for sample 55: 0.27
Probability of relevance for sample 56: 0.00
Probability of relevance for sample 57: 0.00
Probability of relevance for sample 58: 0.04
Probability of relevance for sample 59: 0.00
Probability of relevance for sample 60: 0.09
Probability of relevance for sample 61: 0.00
Probability of relevance for sample 62: 0.23
Probability of relevance for sample 63: 0.11
Probability of relevance for sample 64: 0.00
Probability of relevance for sample 65: 0.00
Probability of relevance for sample 66: 0.00
Probabilit

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_test['relevance_probability'][i]=relevance_probability
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_te

Probability of relevance for sample 72: 0.19
Probability of relevance for sample 73: 0.00
Probability of relevance for sample 74: 0.00
Probability of relevance for sample 75: 0.14
Probability of relevance for sample 76: 0.29
Probability of relevance for sample 77: 0.19
Probability of relevance for sample 78: 0.33
Probability of relevance for sample 79: 0.23
Probability of relevance for sample 80: 0.45
Probability of relevance for sample 81: 0.16
Probability of relevance for sample 82: 0.00
Probability of relevance for sample 83: 0.00
Probability of relevance for sample 84: 0.30
Probability of relevance for sample 85: 0.00
Probability of relevance for sample 86: 0.00
Probability of relevance for sample 87: 0.00
Probability of relevance for sample 88: 0.11
Probability of relevance for sample 89: 0.00
Probability of relevance for sample 90: 0.40
Probability of relevance for sample 91: 0.24
Probability of relevance for sample 92: 0.00
Probability of relevance for sample 93: 0.00
Probabilit

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_test['relevance_probability'][i]=relevance_probability
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_te

Probability of relevance for sample 96: 0.29
Probability of relevance for sample 97: 0.00
Probability of relevance for sample 98: 0.27
Probability of relevance for sample 99: 0.00
Probability of relevance for sample 100: 0.73
Probability of relevance for sample 101: 0.29
Probability of relevance for sample 102: 0.00
Probability of relevance for sample 103: 0.18
Probability of relevance for sample 104: 0.22
Probability of relevance for sample 105: 0.00
Probability of relevance for sample 106: 0.00
Probability of relevance for sample 107: 0.12
Probability of relevance for sample 108: 0.00
Probability of relevance for sample 109: 0.28


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_test['relevance_probability'][i]=relevance_probability
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_te

Probability of relevance for sample 110: 0.52
Probability of relevance for sample 111: 0.46
Probability of relevance for sample 112: 0.00
Probability of relevance for sample 113: 0.00
Probability of relevance for sample 114: 0.27
Probability of relevance for sample 115: 0.00
Probability of relevance for sample 116: 0.00
Probability of relevance for sample 117: 0.00
Probability of relevance for sample 118: 0.32
Probability of relevance for sample 119: 0.22
Probability of relevance for sample 120: 0.00
Probability of relevance for sample 121: 0.59
Probability of relevance for sample 122: 0.43
Probability of relevance for sample 123: 0.36
Probability of relevance for sample 124: 0.21
Probability of relevance for sample 125: 0.00
Probability of relevance for sample 126: 0.00
Probability of relevance for sample 127: 0.33
Probability of relevance for sample 128: 0.00
Probability of relevance for sample 129: 0.00
Probability of relevance for sample 130: 0.00
Probability of relevance for sampl

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_test['relevance_probability'][i]=relevance_probability
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_te

Probability of relevance for sample 132: 0.35
Probability of relevance for sample 133: 0.00
Probability of relevance for sample 134: 0.00
Probability of relevance for sample 135: 0.32
Probability of relevance for sample 136: 0.33
Probability of relevance for sample 137: 0.18
Probability of relevance for sample 138: 0.00
Probability of relevance for sample 139: 0.00
Probability of relevance for sample 140: 0.00
Probability of relevance for sample 141: 0.30
Probability of relevance for sample 142: 0.35
Probability of relevance for sample 143: 0.00
Probability of relevance for sample 144: 0.00


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_test['relevance_probability'][i]=relevance_probability
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_te

Probability of relevance for sample 145: 0.33
Probability of relevance for sample 146: 0.00
Probability of relevance for sample 147: 0.00
Probability of relevance for sample 148: 0.32
Probability of relevance for sample 149: 0.13
Probability of relevance for sample 150: 0.00
Probability of relevance for sample 151: 0.00
Probability of relevance for sample 152: 0.05
Probability of relevance for sample 153: 0.00
Probability of relevance for sample 154: 0.41
Probability of relevance for sample 155: 0.00


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_test['relevance_probability'][i]=relevance_probability
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_te

Probability of relevance for sample 156: 0.00
Probability of relevance for sample 157: 0.00
Probability of relevance for sample 158: 0.00
Probability of relevance for sample 159: 0.00
Probability of relevance for sample 160: 0.00
Probability of relevance for sample 161: 0.00
Probability of relevance for sample 162: 0.00
Probability of relevance for sample 163: 0.00
Probability of relevance for sample 164: 0.00
Probability of relevance for sample 165: 0.00
Probability of relevance for sample 166: 0.00
Probability of relevance for sample 167: 0.00
Probability of relevance for sample 168: 0.00
Probability of relevance for sample 169: 0.00
Probability of relevance for sample 170: 0.00
Probability of relevance for sample 171: 0.15
Probability of relevance for sample 172: 0.13
Probability of relevance for sample 173: 0.00
Probability of relevance for sample 174: 0.00
Probability of relevance for sample 175: 0.23
Probability of relevance for sample 176: 0.00
Probability of relevance for sampl

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_test['relevance_probability'][i]=relevance_probability
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_te

In [61]:
df_test[df_test.relevance_probability>.1]

Unnamed: 0,processed_text,highly_relevant,relevance_probability
110,desperate need advice dealing someone romantic...,0,0.457136
77,went ton first date i’m attracted make feel li...,0,0.328666
10,advice starting therapy today hi nb18 finally ...,0,0.280403
78,feel cold bpd predominantly diagnosed girl fee...,0,0.231191
118,started regressing past behaviour mention sici...,0,0.221909
109,hard people diagnosed bpd get attached anyone ...,0,0.515074
30,argument spiraled borderline episode long sorr...,0,0.654304
135,social anxiety treatment study hi im phd stude...,0,0.326204
192,getting cheated bpd hi everyone question haven...,0,0.219492
76,fill void i’ve fallen many friend irl online f...,0,0.192197
