# Reddit Depression Final Project
Link to the paper: https://dl.acm.org/doi/pdf/10.1145/3578503.3583621

Read through the paper fully before starting the assignment!

In [1]:
!pip install "pandas<2.0.0"

Collecting pandas<2.0.0
  Downloading pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m43.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 2.2.2
    Uninstalling pandas-2.2.2:
      Successfully uninstalled pandas-2.2.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 24.10.1 requires pandas<2.2.3dev0,>=2.0, but you have pandas 1.5.3 which is incompatible.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 1.5.3 which is incompatible.
mizani 0.13.1 requires pandas>=2.2.0, but you have pandas 1.5.3 which is incompatible.
plotn

In [2]:
import pickle
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_validate, cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier

from google.colab import drive
drive.mount('/content/drive')

FILEPATH = 'drive/MyDrive/student.pkl'
processed_control_path = 'drive/MyDrive/control1.pkl'
processed_symptom_path = 'drive/MyDrive/symptom1.pkl'

Mounted at /content/drive


## Preprocessing

In [3]:
# List of depression subreddits in the paper
depression_subreddits = ["Anger",
    "anhedonia", "DeadBedrooms",
    "Anxiety", "AnxietyDepression", "HealthAnxiety", "PanicAttack",
    "DecisionMaking", "shouldi",
    "bingeeating", "BingeEatingDisorder", "EatingDisorders", "eating_disorders", "EDAnonymous",
    "chronicfatigue", "Fatigue",
    "ForeverAlone", "lonely",
    "cry", "grief", "sad", "Sadness",
    "AvPD", "SelfHate", "selfhelp", "socialanxiety", "whatsbotheringyou",
    "insomnia", "sleep",
    "cfs", "ChronicPain", "Constipation", "EssentialTremor", "headaches", "ibs", "tinnitus",
    "AdultSelfHarm", "selfharm", "SuicideWatch",
    "Guilt", "Pessimism", "selfhelp", "whatsbotheringyou"
]

symptom_subreddits = [["Anger"],
    ["anhedonia", "DeadBedrooms"],
    ["Anxiety", "AnxietyDepression", "HealthAnxiety", "PanicAttack"],
    ["DecisionMaking", "shouldi"],
    ["bingeeating", "BingeEatingDisorder", "EatingDisorders", "eating_disorders", "EDAnonymous"],
    ["chronicfatigue", "Fatigue"],
    ["ForeverAlone", "lonely"],
    ["cry", "grief", "sad", "Sadness"],
    ["AvPD", "SelfHate", "selfhelp", "socialanxiety", "whatsbotheringyou"],
    ["insomnia", "sleep"],
    ["cfs", "ChronicPain", "Constipation", "EssentialTremor", "headaches", "ibs", "tinnitus"],
    ["AdultSelfHarm", "selfharm", "SuicideWatch"],
    ["Guilt", "Pessimism", "selfhelp", "whatsbotheringyou"]
]

In [4]:
def load(filepath):
  """Load pickles"""
  with open(filepath, 'rb') as f:
      data = pickle.load(f)
      return data

In [58]:
def sym_dataset_generation(raw_data):
  """Build control and symptom datasets"""
  symptom_dfs = [dict(),dict(),dict(),dict(),dict(),dict(),dict(),dict(),dict(),dict(),dict(),dict(),dict()]
  for i in range(13):
    symptom_dfs[i] = raw_data[raw_data['subreddit'].isin(symptom_subreddits[i])].copy()
    symptom_dfs[i]['created_utc'] = pd.to_datetime(symptom_dfs[i]['created_utc'], unit='s')

  return symptom_dfs

In [6]:
from datetime import datetime, timedelta
def dataset_generation(raw_data):
  """Build control and symptom datasets"""
  symptom_dfs = []
  for i in range(13):
    symptom_dfs[i] = raw_data[raw_data['subreddit'].isin(symptom_subreddits[i])].copy()
    symptom_dfs[i]['created_utc'] = pd.to_datetime(symptom_dfs[i]['created_utc'], unit='s')

  control_df = []
  authors = symptom_dfs['author'].unique()
  authors = set(authors)
  print('number of unique:', len(authors))

  i=0
  for author in authors:
      print(i/len(authors)*100)
      i+=1
      author_symptom_posts = symptom_dfs[symptom_dfs['author'] == author]
      earliest_symptom_date = author_symptom_posts['created_utc'].min()

      author_posts = raw_data[(raw_data['author'] == author) &
                              (~raw_data['subreddit'].isin(depression_subreddits))].copy()
      author_posts['created_utc'] = pd.to_datetime(author_posts['created_utc'], unit='s')

      valid_control_posts = author_posts[author_posts['created_utc'] <= earliest_symptom_date - timedelta(days=180)]
      control_df.append(valid_control_posts)

  control_df = pd.concat(control_df, ignore_index=True)
  print(f"Filtered {len(control_df)} control posts.")

  return symptom_dfs, control_df


In [23]:
#load raw data from handout
raw_data = load(FILEPATH)

In [None]:
#generate data from scratch (obsolete)

symptom_dfs, control_df = dataset_generation(raw_data)
pickle.dump(symptom_df, open(processed_symptom_path, 'wb'))
pickle.dump(control_df, open(processed_control_path, 'wb'))

#save symptom dataframes to drive
for i in range(13):
  symptom_dfs[i].to_pickle(f'drive/MyDrive/symptom{i+1}.pkl')

#control too
control_df.to_pickle('drive/MyDrive/control1.pkl')

In [None]:
#only regenerate symptom data (control takes soo long)
symptom_dfs = sym_dataset_generation(raw_data)
control_df = load(processed_control_path)



In [39]:
#load them back
symptom_dfs = []
for i in range(13):
  symptom_dfs.append(load('drive/MyDrive/symptom{i+1}.pkl'))
control_df = load('drive/MyDrive/control1.pkl')

## Reddit Topics with LDA

 - Don't use MALLET (as the paper does), use some other LDA implementation.

In [10]:
from gensim.models import LdaMulticore
from gensim.corpora import Dictionary
import spacy
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm


In [57]:
# We highly recommend you using the LdaMulticore interface, but feel free to use any other implementations if you prefer.

from gensim.models import LdaModel
from tqdm import tqdm

nlp = spacy.load("en_core_web_sm", disable=["ner", "parser", "tagger"])

def preprocess_text_spacy(text):
    #take out stop words (100 most frequent)
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
    return tokens
def preprocess_texts_spacy_batched(texts):
    #with tqdm progress bar, batched (faster)
    processed_texts = []
    for doc in tqdm(nlp.pipe(texts, batch_size=1000), total=len(texts), desc="Preprocessing texts"):
        tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
        processed_texts.append(tokens)
    return processed_texts

def run_lda(data, num_topics=200, passes=1,loadData=False):
    # Preprocess text data
    tqdm.pandas(desc="Processing text")
    if not loadData:
      data['processed_text'] = data['text'].progress_apply(preprocess_text_spacy)
      data.to_pickle('drive/MyDrive/all_processed_texts.pkl')
    else:

      processed_texts = load('drive/MyDrive/processed_texts.pkl')
      data['processed_text'] = processed_texts

    #make dictionary and corpus
    print("creating dict/corpus")
    dictionary = Dictionary(data['processed_text'])

    corpus = [dictionary.doc2bow(text) for text in data['processed_text']]

    print("Training LDA")
    lda_model = LdaMulticore(corpus, num_topics=num_topics, id2word=dictionary, passes=passes, workers=4)
    # lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=passes)

    topics = lda_model.print_topics()
    for topic in topics:
        print(topic)

    return lda_model, dictionary, corpus, data

# TODO: Your LDA code!

In [60]:
#combine symptom_df and control_df into one flattened df to train LDA

all_df = pd.DataFrame(symptom_dfs[0])
for i in range(1,13):

  all_df = pd.concat([all_df, symptom_dfs[i]])
control_df = pd.DataFrame(control_df)

all_data = pd.concat([all_df, control_df])


In [None]:
processed_texts = pd.read_csv('drive/MyDrive/symptom_df1.csv')
lda_model, dictionary, corpus, proc = run_lda(processed_texts, loadData=False)





In [None]:
lda_model, dictionary, corpus, proc = run_lda(processed_texts, loadData=True)


X_lda = []
y_lda = []
for i in range(13):
  symptom_corpus = [dictionary.doc2bow(text) for text in processed_sdf[i]['processed_text']]
  lda_features = [lda_model.get_document_topics(doc, minimum_probability=0) for doc in symptom_corpus]
  lda_array = np.array([[prob for _, prob in doc] for doc in lda_features])
  X_lda.append(np.vstack((lda_array, lda_array_c)))
  y_lda.append(np.concatenate((np.ones(len(lda_array)), np.zeros(len(lda_array_c)))))

for i in range(13):
  main(X_lda[i],y_lda[i])

## RoBERTa Embeddings

In [7]:
def get_roberta_embeddings(data, batch_size=32, layer=5, model_name="distilroberta-base"):

    print("Loading tokenizer and model...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    model.to("cuda")  # Use GPU if available
    model.eval()

    embeddings = []

    print("Processing text for embeddings...")
    for i in tqdm(range(0, len(data), batch_size), desc="Embedding batches"):
        batch_texts = data['text'][i:i + batch_size].tolist()
        inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt").to("cuda")

        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=True)
            hidden_states = outputs.hidden_states
            batch_embeddings = hidden_states[layer].mean(dim=1).cpu().numpy()  # Mean pooling

        embeddings.extend(batch_embeddings)

    return np.array(embeddings)


In [25]:
def combine_features(lda_model, dictionary, corpus, roberta_embeddings):
    """Combine LDA and RoBERTa embeddings into a single feature set."""
    print("Generating LDA topic distributions...")
    lda_features = [lda_model.get_document_topics(doc, minimum_probability=0) for doc in corpus]
    lda_array = np.array([[prob for _, prob in doc] for doc in lda_features])

    print("Combining features...")
    roberta_embedding = roberta_embeddings.reshape(-1,1)
    combined_features = np.hstack((lda_array, roberta_embeddings))
    return combined_features

def train_and_evaluate_classifier(features, labels):
    """Train and evaluate a random forest classifier."""
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

    print("Training Random Forest Classifier...")
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)

    print("Evaluating model...")
    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred))

    print("Cross-validation scores:")
    scores = cross_val_score(clf, features, labels, cv=5)
    print(f"Mean accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")

    return clf


In [53]:
symptom_embeddings = []
control_embeddings = get_roberta_embeddings(control_df)

X_list = []
y_list = []
for i in range(13):
  symptom_embeddings.append(get_roberta_embeddings(symptom_dfs[i]))
  symptom_embeddings[i].tofile(f'drive/MyDrive/symptom_embeddings{i+1}.npy')
  X_list.append(np.vstack((symptom_embeddings[i], control_embeddings)))
  y_list.append(np.concatenate((np.ones(len(symptom_embeddings[i])), np.zeros(len(control_embeddings)))))


Loading tokenizer and model...
Processing text for embeddings...


Embedding batches: 100%|██████████| 137/137 [00:50<00:00,  2.71it/s]


Loading tokenizer and model...
Processing text for embeddings...


Embedding batches: 100%|██████████| 18/18 [00:08<00:00,  2.09it/s]


Loading tokenizer and model...
Processing text for embeddings...


Embedding batches: 100%|██████████| 186/186 [01:33<00:00,  1.99it/s]


Loading tokenizer and model...
Processing text for embeddings...


Embedding batches: 100%|██████████| 767/767 [05:54<00:00,  2.16it/s]


Loading tokenizer and model...
Processing text for embeddings...


Embedding batches: 100%|██████████| 1/1 [00:00<00:00, 28.27it/s]

Loading tokenizer and model...





Processing text for embeddings...


Embedding batches: 100%|██████████| 56/56 [00:25<00:00,  2.17it/s]


Loading tokenizer and model...
Processing text for embeddings...


Embedding batches: 100%|██████████| 1/1 [00:00<00:00, 29.22it/s]

Loading tokenizer and model...





Processing text for embeddings...


Embedding batches: 100%|██████████| 361/361 [02:42<00:00,  2.23it/s]


Loading tokenizer and model...
Processing text for embeddings...


Embedding batches: 100%|██████████| 70/70 [00:31<00:00,  2.20it/s]


Loading tokenizer and model...
Processing text for embeddings...


Embedding batches: 100%|██████████| 309/309 [02:20<00:00,  2.20it/s]


Loading tokenizer and model...
Processing text for embeddings...


Embedding batches: 100%|██████████| 100/100 [00:43<00:00,  2.27it/s]


Loading tokenizer and model...
Processing text for embeddings...


Embedding batches: 100%|██████████| 261/261 [01:57<00:00,  2.23it/s]


Loading tokenizer and model...
Processing text for embeddings...


Embedding batches: 100%|██████████| 829/829 [06:24<00:00,  2.15it/s]


Loading tokenizer and model...
Processing text for embeddings...


Embedding batches: 100%|██████████| 57/57 [00:24<00:00,  2.32it/s]


In [55]:

pickle.dump(X_list, open('drive/MyDrive/Xlist.pkl', 'wb'))
pickle.dump(y_list, open('drive/MyDrive/ylist.pkl', 'wb'))

for i in range(13):
  main(X_list[i],y_list[i])

Running 5-fold cross-validation...

Training Scores:
[1. 1. 1. 1. 1.]
Testing Scores:
[0.93443713 0.94066512 0.94470621 0.94233488 0.95202882]
Running 5-fold cross-validation...

Training Scores:
[1. 1. 1. 1. 1.]
Testing Scores:
[0.94294323 0.95636956 0.95960694 0.95172144 0.96160825]
Running 5-fold cross-validation...

Training Scores:
[1. 1. 1. 1. 1.]
Testing Scores:
[0.95685672 0.95019626 0.95500194 0.95619455 0.95430696]
Running 5-fold cross-validation...

Training Scores:
[1. 1. 1. 1. 1.]
Testing Scores:
[0.4271789  0.84228571 0.62614416 0.39530892 0.93421053]
Running 5-fold cross-validation...

Training Scores:
[1. 1. 1. 1. 1.]
Testing Scores:
[0.95344863 0.95332038 0.95419402 0.95580584 0.95659453]
Running 5-fold cross-validation...


Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 371, in _score
    y_pred = method_caller(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 89, in _cached_call
    result, _ = _get_response_values(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_response.py", line 214, in _get_response_values
    y_pred = _process_predict_proba(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_response.py", line 51, in _process_predict_proba
    raise ValueError(
ValueError: Got predict_proba of shape (874, 1), but need clas


Training Scores:
[nan  1.  1.  1.  1.]
Testing Scores:
[nan nan nan nan nan]
Running 5-fold cross-validation...

Training Scores:
[0.99999168 0.99998813 1.         0.9999499  0.99998534]
Testing Scores:
[0.91754488 0.9266538  0.9214281  0.91198491 0.90955192]
Running 5-fold cross-validation...

Training Scores:
[1. 1. 1. 1. 1.]
Testing Scores:
[0.93503715 0.93163891 0.93369737 0.94000984 0.92986778]
Running 5-fold cross-validation...

Training Scores:
[1. 1. 1. 1. 1.]
Testing Scores:
[0.93776312 0.92807865 0.94200903 0.93524902 0.92695224]
Running 5-fold cross-validation...

Training Scores:
[1. 1. 1. 1. 1.]
Testing Scores:
[0.95685038 0.95918502 0.95620205 0.9575227  0.96252877]
Running 5-fold cross-validation...

Training Scores:
[1. 1. 1. 1. 1.]
Testing Scores:
[0.92874342 0.93433323 0.94067554 0.91880437 0.93839913]
Running 5-fold cross-validation...

Training Scores:
[1. 1. 1. 1. 1.]
Testing Scores:
[0.96142462 0.96329066 0.96070375 0.96590418 0.96936106]
Running 5-fold cross-val

## Main

In [52]:
from tqdm import tqdm

from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
def main(X, y):
    """
    Here's the basic structure of the main block! It should run
    5-fold cross validation with random forest to evaluate your RoBERTa and LDA
    performance.
    """
    rf_classifier = RandomForestClassifier(verbose=1)
    cv = KFold(n_splits=5, shuffle=True)



    print("Running 5-fold cross-validation...")
    rf_classifier = RandomForestClassifier()
    cv = KFold(n_splits=5, shuffle=True)
    results = cross_validate(rf_classifier, X=X, y=y, cv=cv, scoring='roc_auc', return_train_score=True)

    print("\nTraining Scores:")
    print(results.get('train_score'))
    print("Testing Scores:")
    print(results.get('test_score'))






In [43]:
lda_model, dictionary, corpus = run_lda(all_data,loadData=False)

Preprocessing text...


Processing text: 100%|██████████| 100633/100633 [29:56<00:00, 56.02it/s] 


Creating dictionary and corpus...
Training LDA model...
(56, '0.029*"friend" + 0.019*"dreams" + 0.017*"anymore" + 0.015*"friends" + 0.013*"time" + 0.013*"know" + 0.012*"like" + 0.012*"best" + 0.011*"life" + 0.009*"going"')
(77, '0.016*"like" + 0.014*"time" + 0.010*"years" + 0.008*"want" + 0.007*"feel" + 0.007*"life" + 0.006*"artist" + 0.006*"going" + 0.006*"know" + 0.006*"relationship"')
(198, '0.014*"anxiety" + 0.011*"know" + 0.011*"like" + 0.010*"series" + 0.010*"razors" + 0.009*"tired" + 0.008*"feel" + 0.007*"sleepy" + 0.007*"adderall" + 0.006*"aids"')
(49, '0.016*"audio" + 0.012*"like" + 0.011*"time" + 0.009*"kratom" + 0.007*"want" + 0.007*"know" + 0.007*"work" + 0.007*"going" + 0.007*"feel" + 0.007*"help"')
(91, '0.021*"life" + 0.013*"patients" + 0.013*"like" + 0.012*"know" + 0.010*"people" + 0.008*"want" + 0.008*"self" + 0.006*"way" + 0.006*"think" + 0.005*"things"')
(168, '0.024*"suicidal" + 0.019*"going" + 0.019*"like" + 0.014*"feel" + 0.012*"want" + 0.012*"thoughts" + 0.011*"l

In [47]:
lda_model.save('drive/MyDrive/lda_model.model')


In [12]:
#load lda model
lda_model = LdaMulticore.load('drive/MyDrive/lda_model.model')

5338       Advice on dealing with anger? Normally I'm a c...
5594       I've been to anger management 10 times all it'...
11246                         Ripping heads off :) [removed]
13284      Things that piss me off most. Being lonely. \n...
20424                              Weird black guy [removed]
                                 ...                        
1955075    How To Stop This Before it Gets Out Of Hand? H...
1958806           Hypocrisy has never been so real [removed]
1960247    I hate myself after losing control First time ...
1960734    I’m not a particularly angry person, but today...
1964193    Sometimes I can literally feel my neurology sh...
Name: text, Length: 555, dtype: object


In [None]:
symptom_embeddings.tofile('drive/MyDrive/symptom_embeddings.npy')
control_embeddings.tofile('drive/MyDrive/control_embeddings.npy')

In [26]:
symptom_embeddings = np.fromfile('drive/MyDrive/symptom_embeddings.npy')
control_embeddings = np.fromfile('drive/MyDrive/control_embeddings.npy')

In [27]:
lda_model_symptom, dictionary_symptom, corpus_symptom = run_lda(symptom_df, loadData=True)  # LDA for symptom data


Preprocessing text...
Creating dictionary and corpus...
Training LDA model...
(38, '0.016*"weight" + 0.015*"like" + 0.013*"anxiety" + 0.011*"feel" + 0.010*"eat" + 0.010*"know" + 0.009*"felt" + 0.008*"time" + 0.008*"day" + 0.008*"appetite"')
(11, '0.031*"feel" + 0.021*"like" + 0.020*"know" + 0.017*"want" + 0.011*"hug" + 0.009*"day" + 0.009*"going" + 0.008*"life" + 0.007*"help" + 0.007*"better"')
(79, '0.014*"headphones" + 0.009*"pain" + 0.008*"want" + 0.007*"like" + 0.007*"going" + 0.007*"day" + 0.007*"injections" + 0.007*"shot" + 0.007*"recommendation" + 0.006*"pool"')
(138, '0.027*"like" + 0.025*"feel" + 0.012*"anxiety" + 0.012*"know" + 0.010*"want" + 0.009*"life" + 0.008*"time" + 0.007*"drawn" + 0.006*"going" + 0.006*"people"')
(150, '0.027*"birthday" + 0.015*"like" + 0.013*"know" + 0.012*"said" + 0.011*"feel" + 0.009*"got" + 0.008*"time" + 0.008*"day" + 0.008*"going" + 0.007*"told"')
(156, '0.018*"feel" + 0.017*"want" + 0.017*"like" + 0.014*"m" + 0.013*"know" + 0.011*"people" + 0.01

In [28]:
lda_model_control, dictionary_control, corpus_control = run_lda(control_df, loadData=False)  # LDA for control data


Preprocessing text...


Processing text: 100%|██████████| 4369/4369 [00:34<00:00, 127.59it/s]


Creating dictionary and corpus...




Training LDA model...
(82, '0.019*"think" + 0.013*"tiger" + 0.013*"tooth" + 0.013*"black" + 0.013*"karambit" + 0.013*"keys" + 0.013*"games" + 0.013*"going" + 0.012*"okay" + 0.011*"time"')
(125, '0.022*"like" + 0.019*"game" + 0.014*"know" + 0.014*"season" + 0.013*"think" + 0.012*"got" + 0.011*"saying" + 0.011*"goes" + 0.010*"people" + 0.010*"time"')
(67, '0.020*"like" + 0.013*"feel" + 0.013*"want" + 0.011*"going" + 0.010*"internet" + 0.010*"asked" + 0.010*"nt" + 0.009*"play" + 0.009*"water" + 0.009*"wanted"')
(3, '0.041*"like" + 0.026*"people" + 0.018*"want" + 0.015*"know" + 0.012*"bad" + 0.011*"find" + 0.010*"life" + 0.010*"sign" + 0.009*"day" + 0.008*"going"')
(62, '0.015*"going" + 0.013*"year" + 0.011*"told" + 0.011*"old" + 0.010*"time" + 0.010*"dream" + 0.009*"friends" + 0.009*"wo" + 0.009*"random" + 0.008*"floor"')
(55, '0.016*"like" + 0.011*"know" + 0.010*"s" + 0.010*"nt" + 0.009*"got" + 0.008*"point" + 0.008*"body" + 0.007*"shit" + 0.007*"pc" + 0.007*"time"')
(9, '0.014*"people" 

In [29]:
lda_features = [lda_model_symptom.get_document_topics(doc, minimum_probability=0) for doc in corpus_symptom]
lda_array = np.array([[prob for _, prob in doc] for doc in lda_features])

In [40]:

sym_embedding_re = symptom_embeddings.reshape(lda_array.shape[0], -1)
print(sym_embedding_re.shape)
print(lda_array.shape)
combined_features_sym = np.hstack((lda_array, sym_embedding_re))

NameError: name 'symptom_embeddings' is not defined

In [31]:
lda_features_c = [lda_model_control.get_document_topics(doc, minimum_probability=0) for doc in corpus_control]
lda_array_c = np.array([[prob for _, prob in doc] for doc in lda_features_c])

ctrl_embedding_re = control_embeddings.reshape(lda_array_c.shape[0], -1)

combined_features_c = np.hstack((lda_array_c, ctrl_embedding_re))

In [32]:
X = np.vstack((combined_features_sym, combined_features_c))
y = np.concatenate((np.ones(len(combined_features_sym)), np.zeros(len(combined_features_c))))
X.tofile('drive/MyDrive/X.npy')
y.tofile('drive/MyDrive/y.npy')

In [None]:
X = np.fromfile('drive/MyDrive/X.npy')
y = np.fromfile('drive/MyDrive/y.npy')
main(X,y)

In [37]:
si = 0
lda_features = [lda_model.get_document_topics(doc, minimum_probability=0) for doc in corpus]
lda_array = np.array([[prob for _, prob in doc] for doc in lda_features])
sym_embedding_re = symptom_embeddings.reshape(lda_array.shape[0], -1)
print(sym_embedding_re.shape)
print(lda_array.shape)
combined_features_sym = np.hstack((lda_array, sym_embedding_re))


(98883, 584)
(98883,)


In [38]:
X = np.vstack((combined_features_sym, combined_features_c))
y = np.concatenate((np.ones(len(combined_features_sym)), np.zeros(len(combined_features_c))))
main(X,y)

Running 5-fold cross-validation...


Cross-validation folds:   0%|          | 0/5 [00:00<?, ?it/s][Parallel(n_jobs=1)]: Done  49 tasks      | elapsed: 11.0min
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    2.1s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.5s
Cross-validation folds:  20%|██        | 1/5 [32:44<2:10:57, 1964.39s/it]


KeyboardInterrupt: 

In [22]:

symptom_features = combine_features(lda_model_symptom, dictionary_symptom, corpus_symptom, symptom_embeddings)
control_features = combine_features(lda_model_control, dictionary_control, corpus_control, control_embeddings)

NameError: name 'corpus' is not defined