# Generate Word Features


## Imports

In [1]:
# Standard library
import csv
import logging
import os
import pickle
import random
import re
from io import FileIO
from pathlib import Path

# Third-party: data handling, parallelism, plotting, progress bars
import dask.dataframe as dd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from dask.multiprocessing import get
from tqdm import tqdm

# Third-party: LDA topic modelling
from gensim import corpora, models
from gensim.models import CoherenceModel, LdaModel, LdaMulticore
from gensim.models.callbacks import PerplexityMetric, ConvergenceMetric, CoherenceMetric

# Project-local: DB connection and lemmatization helpers
from scraping import create_connection
import lemmatize_all as la

## Functions

## File Locations

In [2]:
# Base directory for every path below: two levels above the current working directory.
p = Path.cwd()
path_parent = p.parents[1]

In [3]:
# database
path_db = str(path_parent / "database" / "netmums-merged.db")
path_clean_data = path_parent / "clean_data" / "netmums"
# data to load
# The {0}, {1}, ... placeholders in the templates below are filled in with str.format() where they are used.
path_lemma_pkl = str(path_clean_data / "lemmatized_text_{0}_{1}_{2}.pkl")
path_corpus_pkl = str(path_clean_data / "corpus_{0}_{1}_{2}.pkl")
path_dictionary_gensim = str(path_clean_data / "dictionary_{0}_{1}_{2}.gensim")
# model saving
path_tune_models = str(path_clean_data / "lda_tune_{0}_{1}_{2}_{3}_{4}.gensim")
path_ntopic_models = str(path_clean_data / "lda_ntopics_{0}_{1}_{2}_{3}.gensim")
# path_coherence = str(path_parent / "clean_data" / "coherence_{}.csv")
# NOTE(review): path_log and path_log_iterations are the same template, so both
# resolve to the same file for the same arguments — confirm whether that is intended.
path_log = str(path_clean_data / "logging_{0}_{1}_{2}_{3}.log")
path_log_iterations = str(path_clean_data / "logging_{0}_{1}_{2}_{3}.log")
# dominant topic
path_dom_topic = str(path_clean_data / "dominant_topic_{0}_{1}_{2}_{3}.csv")

In [4]:
# dataframes
# Pickle locations of the intermediate "daily_*" dataframes written and read by the cells below.
path_topics_pkl = str(path_clean_data / "daily_topics.pkl")
path_text_pkl = str(path_clean_data / "daily_clean_text.pkl")
path_days_since_pkl = str(path_clean_data / "daily_days_since.pkl")
path_subforums_pkl = str(path_clean_data / "daily_subforums.pkl")
# Template: {} is filled with the shard number when the emotion pickles are loaded.
path_emote_pkl = str(path_clean_data / "daily_emote_processed_{}.pkl")
path_joined_pkl = str(path_clean_data / "daily_joined_df.pkl")
# Reuses the corpus template with the "daily_text_df" name (already a str; the outer str() is a no-op).
path_bigrams_pkl = str(path_corpus_pkl.format("all", "all", "daily_text_df"))

## Process all text for topics
Make lemmatized text, dictionary, and corpus for all text

In [None]:
# import lemmatize_all as la
# la.process_data(chunksize=1000000, n_chunks=1)

Make a corpus for each text chunk. The prior command didn't create individual corpora.

In [None]:
# n_chunks = 4
# for i in tqdm(range(16)):
#     text = pickle.load(open(path_lemma_pkl.format("all", "all", "thread_id_{}".format(i)), 'rb'))
#     corpus = la.text_to_corpus(text, n_chunks, dictionary)
#     pickle.dump(corpus, open(path_corpus_pkl.format("all", "all", "thread_id_{}".format(i)), 'wb'))

In [None]:
# Build and pickle one corpus per lemmatized "message_id" chunk (16 chunk files).
# NOTE(review): `dictionary` is only loaded in a later cell
# (corpora.Dictionary.load(...)); run that cell first or this loop raises NameError.
n_chunks = 10
for i in tqdm(range(16)):
    chunk_name = "message_id_{}".format(i)
    # Context managers close each pickle file even if load/dump raises.
    with open(path_lemma_pkl.format("all", "all", chunk_name), 'rb') as f_in:
        text = pickle.load(f_in)
    corpus = la.text_to_corpus(text, n_chunks, dictionary)
    with open(path_corpus_pkl.format("all", "all", chunk_name), 'wb') as f_out:
        pickle.dump(corpus, f_out)

Make single corpus using the whole dictionary. The process_data command didn't create a good corpus using the incremental approach.

In [None]:
# n_chunks = 4
# corpus = []
# for i in tqdm(range(16)):
#     text = pickle.load(open(path_lemma_pkl.format("all", "all", "thread_id_{}".format(i)), 'rb'))
#     corpus = corpus + la.text_to_corpus(text, n_chunks, dictionary)
# pickle.dump(corpus, open(path_corpus_pkl.format("all", "all", "thread_id_v2"), 'wb'))

Make 10% sample lemmatized text and corpus

In [5]:
import random
from tqdm import tqdm
import lemmatize_all as la

In [None]:
# Load the gensim dictionary saved under the "all"/"all"/"thread_id" name.
dictionary = corpora.Dictionary.load(path_dictionary_gensim.format("all", "all", "thread_id"))

In [None]:
# Draw a 10% sample of documents from each of the 16 lemmatized "thread_id"
# chunks, build the matching corpus with the loaded dictionary, and pickle both.
lemmatized_text = []
corpus = []
n_chunks = 1
# Dedicated, seeded generator (same seed value as the LDA random_state) so the
# sample can be regenerated without touching the global `random` state.
sampler = random.Random(1)
for i in tqdm(range(16)):
    with open(path_lemma_pkl.format("all", "all", "thread_id_{}".format(i)), 'rb') as f_in:
        text = pickle.load(f_in)
    list_len = len(text)
    list_10p = int(list_len * .1)
    text = sampler.sample(text, list_10p)
    # extend() appends in place; `a = a + b` re-copies the whole accumulated list each iteration.
    lemmatized_text.extend(text)
    corpus.extend(la.text_to_corpus(text, n_chunks, dictionary))
# Context managers guarantee the output files are flushed and closed.
with open(path_lemma_pkl.format("all", "all", "thread_id_10p"), 'wb') as f_out:
    pickle.dump(lemmatized_text, f_out)
with open(path_corpus_pkl.format("all", "all", "thread_id_10p"), 'wb') as f_out:
    pickle.dump(corpus, f_out)

## Create Topic Model for 10% sample

In [None]:
# Reload the dictionary plus the pickled 10% sample (corpus and lemmatized text).
dictionary = corpora.Dictionary.load(path_dictionary_gensim.format("all", "all", "thread_id"))
# Context managers close the pickle files after reading.
with open(path_corpus_pkl.format("all", "all", "thread_id_10p"), 'rb') as f_in:
    corpus = pickle.load(f_in)
# "thread_id_10p" has no placeholders, so it is passed as-is (no inner .format call).
with open(path_lemma_pkl.format("all", "all", "thread_id_10p"), 'rb') as f_in:
    lemmatized_text = pickle.load(f_in)

In [None]:
%%time
# Fit an LDA model with n_topics topics on the loaded corpus, then print its c_v coherence.
n_topics = 20
lda = LdaModel(
    corpus=corpus,
    num_topics=n_topics,
    id2word=dictionary,
    random_state=1,  # fixed seed for the model
    alpha="auto",
    eta="auto"
)
# Coherence is scored against the lemmatized texts (passed as `texts`), not the corpus.
coherence_model_lda = CoherenceModel(model=lda, texts=lemmatized_text, dictionary=dictionary, coherence='c_v')
print(coherence_model_lda.get_coherence())

In [None]:
# Persist the fitted model; the file name encodes the sample name and the topic count.
lda.save(path_ntopic_models.format("all", "all", "thread_id_10p", str(n_topics)))

## Daily text to corpus

In [None]:
import lemmatize_all as la

In [None]:
# Load the dictionary and the daily clean-text dataframe.
dictionary = corpora.Dictionary.load(path_dictionary_gensim.format("all", "all", "thread_id"))
clean_text = pd.read_pickle(path_text_pkl)
# Positional rename: the frame must have exactly four columns, in this order.
clean_text.columns = ['user_url', 'day', 'text_clean', 'sentiment']

In [None]:
%%time
# Build the daily-text corpus and its companion dataframe with the project helper.
# NOTE(review): the third argument (4) is presumably a chunk/worker count — confirm against lemmatize_all.make_corpus.
df_bigrams, corpus = la.make_corpus(clean_text, dictionary, 4)

In [None]:
# Written to the same location that path_bigrams_pkl points at (corpus template + "daily_text_df").
df_bigrams.to_pickle(path_corpus_pkl.format("all", "all", "daily_text_df"))

In [None]:
# Pickle the daily-text corpus; the context manager flushes and closes the file.
with open(path_corpus_pkl.format("all", "all", "daily_text"), 'wb') as f_out:
    pickle.dump(corpus, f_out)

In [None]:
# corpus = pickle.load(open(path_corpus_pkl.format("all", "all", "daily_text"), 'rb'))
# len(corpus)

## Daily text to topics

In [None]:
# Reload the LDA model saved for the 10% sample with this topic count.
n_topics = 20
lda = LdaModel.load(path_ntopic_models.format("all", "all", "thread_id_10p", str(n_topics)))

In [None]:
from gensim.matutils import corpus2csc
import numpy as np

In [None]:
# Topic distribution for every document in `corpus`; minimum_probability=0.0 is
# passed so that low-probability topics are not filtered out of the result.
all_topics = lda.get_document_topics(corpus, minimum_probability=0.0)
# Sparse matrix -> transposed dense array -> DataFrame.
# NOTE(review): presumably one row per document and one column per topic after the transpose — confirm.
all_topics_csr = corpus2csc(all_topics)
all_topics_numpy = all_topics_csr.T.toarray()
all_topics_df = pd.DataFrame(all_topics_numpy)
all_topics_df.head()

In [None]:
# Name the topic columns topic_0 ... topic_{k-1}. The count comes from the frame
# itself instead of a hard-coded 20, so it stays correct if the model's number
# of topics changes.
column_names = ["topic_{}".format(i) for i in range(all_topics_df.shape[1])]
all_topics_df.columns = column_names
all_topics_df.to_pickle(path_topics_pkl)

## Post Features

In [10]:
# One row per non-empty cleaned post text, with its author, creation timestamp
# and subforum name; posts by "Anonymous" are excluded.
# String literals use single quotes (standard SQL); double quotes denote
# identifiers and are only accepted as strings by SQLite's legacy fallback.
sql = """
    SELECT
        text.text_clean AS text_clean,
        s.name AS subforum_name,
        p.user_url AS user_url,
        p.date_created AS date_created
    FROM text
    LEFT JOIN posts AS p
    ON text.post_id = p.id
    LEFT JOIN threads AS t
    ON t.id=p.thread_id
    LEFT JOIN subforums AS s
    ON s.id=t.subforum_id
    LEFT JOIN forums AS f
    ON f.id=s.forum_id
    WHERE text.text_clean<>''
    AND p.user_url<>'Anonymous'
"""

In [11]:
# Run the post-features query; try/finally closes the connection even if the query raises.
conn = create_connection(path_db)
try:
    df = pd.read_sql_query(sql, conn)
finally:
    conn.close()

In [12]:
# Parse the creation timestamps and derive a calendar-day column, using dask
# with 200 partitions and the multiprocessing scheduler.
ddf = dd.from_pandas(df, npartitions=200)
ddf = ddf.assign(date_created=dd.to_datetime(ddf['date_created']))
ddf = ddf.assign(day=ddf['date_created'].dt.date)
df = ddf.compute(scheduler='processes')

In [None]:
# Number of distinct posting days per user.
ddf = dd.from_pandas(df, npartitions=200)
days_by_user = ddf[['user_url', 'day']].groupby(["user_url"])["day"]
ddf = days_by_user.nunique().reset_index(drop=False)
df_count = ddf.compute(scheduler='processes')
# Positional rename of the two resulting columns.
df_count.columns = ['user_url','n_unique_days']

In [16]:
# Number of posts per user and day (counts non-null subforum_name values).
ddf = dd.from_pandas(df, npartitions=200)
posts_by_user_day = ddf.groupby(["user_url", "day"])['subforum_name']
ddf = posts_by_user_day.count().reset_index(drop=False)
df_daily_count = ddf.compute(scheduler='processes')
# Positional rename of the three resulting columns.
df_daily_count.columns = ['user_url', 'day', 'n_posts']

In [20]:
# Turn each day into a 'YYYY-MM-DD' string, then export the per-day post counts as a Stata file.
df_daily_count['day'] = df_daily_count['day'].apply(lambda x: x.strftime('%Y-%m-%d'))
df_daily_count.to_stata(path_clean_data / "daily_panel_counts.dta")

In [None]:
# Keep only posts from users who posted on more than one distinct day.
posted_on_several_days = df_count['n_unique_days'] > 1
morethanone = df_count.loc[posted_on_several_days, 'user_url']
df = df[df['user_url'].isin(morethanone)]

In [None]:
# Renumber rows 0..n-1 in place after the filtering above, then preview.
df.reset_index(drop=True, inplace=True)
df.head()

In [None]:
# days since last post
# Count posts per (user_url, day) with dask and sort by user then day, so that
# each user's posting days sit in consecutive, ordered rows.
ddf = dd.from_pandas(df, npartitions=200)
ddf = ddf[['user_url', 'day']].groupby(["user_url", "day"])['day'].count().to_frame().rename(columns={'day':'daily_count'}).reset_index(drop=False).sort_values(['user_url', 'day'])
df_days_since = ddf.compute(scheduler='processes')
# Difference between consecutive days within each user; missing for a user's first row.
df_days_since['datediff'] = df_days_since[['user_url', 'day', 'daily_count']].groupby(['user_url'])['day'].diff()
# Default to 0, then overwrite with the gap in whole days wherever a difference exists.
df_days_since['days_since_last_post'] = 0
df_days_since.loc[df_days_since['datediff'].notna(), 'days_since_last_post'] = df_days_since.loc[df_days_since['datediff'].notna(), 'datediff'].apply(lambda x: x.days)
df_days_since = df_days_since.drop("datediff", axis=1)

In [None]:
# Cache the days-since-last-post frame for the join step.
df_days_since.to_pickle(path_days_since_pkl)

In [None]:
# Preview the first rows.
df_days_since.head()

In [None]:
# forum posted in
df_subforums = df[['user_url', 'day', 'subforum_name']].groupby(["user_url", "day", "subforum_name"])["subforum_name"].count().reset_index(name="count")
df_subforums = df_subforums.pivot(index=['user_url', 'day'], columns='subforum_name', values='count').reset_index(drop=False).fillna(0).reset_index(drop=True)
df_subforums.head()

In [None]:
# Cache the wide subforum-count frame for the join step.
df_subforums.to_pickle(path_subforums_pkl)

## Join Data

In [30]:
# Reload the cached feature frames so the join can run without recomputing them.
df_days_since = pd.read_pickle(path_days_since_pkl)
df_subforums = pd.read_pickle(path_subforums_pkl)

In [9]:
# Load the three emotion-score shards (0, 1, 2), drop their "scores" column,
# stack them, and average each emotion per user and calendar day.
emote_shards = [
    pd.read_pickle(path_emote_pkl.format(shard)).drop("scores", axis=1)
    for shard in range(3)
]
df_emote = pd.concat(emote_shards, axis=0).reset_index(drop=True)
df_emote['day'] = df_emote['day'].apply(lambda x: x.date())
# The string alias "mean" replaces np.mean: passing NumPy callables to agg is
# deprecated in recent pandas, and the alias gives the same result.
df_emote = df_emote.groupby(['user_url', 'day']).agg(anger=("anger", "mean"),
                                          joy=("joy", "mean"),
                                          optimism=("optimism", "mean"),
                                          sadness=("sadness", "mean")).reset_index(drop=False)

In [None]:
# Load the daily text frame, discard the raw text, and expand the per-row
# sentiment dict into four numeric columns.
df_sentiment = pd.read_pickle(path_text_pkl).drop(columns="text")
sentiment_scores = df_sentiment['sentiment'].apply(pd.Series)
df_sentiment[['neg', 'neu', 'pos','compound']] = sentiment_scores
df_sentiment = df_sentiment.drop(columns="sentiment")
# Reduce each timestamp to its calendar date so it matches the other frames' `day` key.
df_sentiment['day'] = df_sentiment['day'].apply(lambda stamp: stamp.date())

Unnamed: 0,user_url,day,text,sentiment
0,150,2015-10-21,Hi I didn't get into carrying until ds 2 was a...,"{'neg': 0.0, 'neu': 0.774, 'pos': 0.226, 'comp..."
1,150,2016-05-20,I have low iron in normal life although convin...,"{'neg': 0.16, 'neu': 0.795, 'pos': 0.045, 'com..."
2,1st-time-mummy,2018-10-30,Hi Helen I just wondered how your little boy i...,"{'neg': 0.119, 'neu': 0.833, 'pos': 0.048, 'co..."
3,24h,2018-12-30,Did you get your positive opk Siobhan? Hi Clai...,"{'neg': 0.092, 'neu': 0.68, 'pos': 0.228, 'com..."
4,2557,2015-10-19,Sian Are you alright now? Hope everything is o...,"{'neg': 0.0, 'neu': 0.507, 'pos': 0.493, 'comp..."


In [27]:
# Attach the topic probabilities to their (user, day) keys: both frames are
# aligned purely by row position after their indexes are reset.
df_bigrams = pd.read_pickle(path_bigrams_pkl)
df_topics = pd.read_pickle(path_topics_pkl)
side_by_side = [df_bigrams.reset_index(drop=True), df_topics.reset_index(drop=True)]
df_topics = pd.concat(side_by_side, axis=1)
df_topics = df_topics.drop('bigrams', axis=1)
df_topics['day'] = df_topics['day'].apply(lambda stamp: stamp.date())

In [31]:
# Inner-join every feature frame onto the sentiment frame by (user_url, day),
# in the order emotion -> subforums -> days-since -> topics, then cache the result.
df = df_sentiment
for feature_frame in (df_emote, df_subforums, df_days_since, df_topics):
    df = df.merge(feature_frame, how="inner", on=["user_url","day"])
df.to_pickle(path_joined_pkl)

In [5]:
# Reload the cached joined frame.
df = pd.read_pickle(path_joined_pkl)

In [9]:
# Spot check: the last 30 rows (ordered by day) for a single user.
df.loc[df['user_url']=='alrx1'].sort_values('day').tail(30)

Unnamed: 0,user_url,day,neg,neu,pos,compound,anger,joy,optimism,sadness,...,topic_17,topic_18,topic_19,sn_user,user_id,time_period,first_period,last_period,time_since_first_period,is_last_period
1180372,alrx1,2014-02-20,0.059,0.814,0.127,0.9707,0.260971,0.056864,0.072216,0.609949,...,0.000108,0.545847,0.000105,1,2183,3182,2957,4719,225,0
751159,alrx1,2014-03-25,0.0,0.61,0.39,0.9821,0.009462,0.964755,0.014913,0.01087,...,0.000359,0.015341,0.000348,1,2183,3215,2957,4719,258,0
1159025,alrx1,2014-03-26,0.069,0.567,0.364,0.973,0.013523,0.924859,0.047491,0.014127,...,0.000474,0.248984,0.000458,1,2183,3216,2957,4719,259,0
1394373,alrx1,2014-03-27,0.214,0.588,0.198,-0.3267,0.022793,0.03569,0.012705,0.928812,...,0.000114,0.005351,0.00011,1,2183,3217,2957,4719,260,0
429723,alrx1,2014-04-07,0.0,0.625,0.375,0.9451,0.007536,0.964907,0.015495,0.012063,...,0.000643,0.025142,0.000623,1,2183,3228,2957,4719,271,0
1919442,alrx1,2014-04-19,0.074,0.763,0.164,0.9495,0.062314,0.085534,0.0612,0.790952,...,0.000153,0.006527,0.000148,1,2183,3240,2957,4719,283,0
1597890,alrx1,2014-05-02,0.0,0.803,0.197,0.89,0.015495,0.907554,0.050404,0.026547,...,0.000351,0.022017,0.00034,1,2183,3253,2957,4719,296,0
1726961,alrx1,2014-05-13,0.0,0.83,0.17,0.8645,0.051312,0.769607,0.090021,0.08906,...,0.000419,0.020138,0.000405,1,2183,3264,2957,4719,307,0
1126868,alrx1,2014-05-22,0.103,0.786,0.111,0.764,0.236875,0.118734,0.083988,0.560404,...,0.00015,0.588875,0.000145,1,2183,3273,2957,4719,316,0
1105453,alrx1,2014-05-26,0.0,0.645,0.355,0.9214,0.009733,0.969973,0.012901,0.007393,...,0.000496,0.019199,0.00048,1,2183,3277,2957,4719,320,0


In [34]:
# Drop every row that has at least one missing value.
df = df.dropna(axis=0)

In [35]:
# Overwrite the cached joined frame with the current `df`.
df.to_pickle(path_joined_pkl)

In [5]:
# Every post with its author and the id of the forum it belongs to
# (post -> thread -> subforum -> forum, all via LEFT JOINs).
posts_sql = '''
    SELECT
        p.id AS post_id,
        p.user_url,
        f.id AS forum_id
    FROM posts AS p
    LEFT JOIN threads AS t
    ON t.id=p.thread_id
    LEFT JOIN subforums AS s
    ON s.id=t.subforum_id
    LEFT JOIN forums AS f
    ON f.id=s.forum_id
'''

In [6]:
# Run the posts/forum query; try/finally closes the connection even if the query raises.
conn = create_connection(path_db)
try:
    sn_users = pd.read_sql_query(posts_sql, conn)
finally:
    conn.close()

In [38]:
# Unique, non-anonymous users with at least one post in forum 24.
# NOTE(review): forum id 24 is presumably the "sn" forum — confirm against the forums table.
posted_in_forum_24 = sn_users['forum_id'] == 24
sn_users = (
    sn_users[posted_in_forum_24]
    .drop_duplicates('user_url')[['user_url']]
    .loc[lambda users: users['user_url'] != "Anonymous"]
    .reset_index(drop=True)
)

In [39]:
# 1 when the row's user appears in sn_users, otherwise 0.
df['sn_user'] = df['user_url'].isin(sn_users['user_url']).astype('int64')

In [40]:
# Integer ids for users and for calendar days (groups are numbered in sorted key order).
df['user_id'] = df.groupby('user_url').ngroup()
df['time_period'] = df.sort_values(['day']).groupby(['day']).ngroup()
# First and last observed period of each user, broadcast to all of that user's rows.
period_by_user = df.groupby(['user_url'])['time_period']
first_period = period_by_user.transform('min')
last_period = period_by_user.transform('max')
df['first_period'] = first_period
df['last_period'] = last_period
df['time_since_first_period'] = df['time_period'] - df['first_period']
# 1 on each user's final observed period, otherwise 0.
df['is_last_period'] = (df['time_period'] == df['last_period']).astype('int64')

In [41]:
# Overwrite the cached joined frame, now including the user/period columns.
df.to_pickle(path_joined_pkl)

In [42]:
# Snapshot of the column names in their current order; later cells select from this list by position.
all_cols = list(df.columns)

In [10]:
# Earliest post (by date_created) of every user in forum 24: rows are numbered
# per user in ascending date order and only row 1 is kept.
first_sn_post_sql = '''
WITH added_row_number AS (
    SELECT
        p.id AS post_id,
        p.user_url,
        p.date_created,
        ROW_NUMBER() OVER(PARTITION BY p.user_url ORDER BY p.date_created ASC) AS row_number
    FROM posts AS p
    LEFT JOIN threads AS t
    ON t.id=p.thread_id
    LEFT JOIN subforums AS s
    ON s.id=t.subforum_id
    WHERE s.forum_id=24
)
SELECT
  *
FROM added_row_number
WHERE row_number = 1;
'''
conn = create_connection(path_db)
# try/finally closes the connection even if the query raises.
try:
    first_sn_post = pd.read_sql_query(first_sn_post_sql, conn)
finally:
    conn.close()

In [25]:
# Reduce the first-post timestamp to a calendar date.
first_sn_post['day'] = pd.to_datetime(first_sn_post['date_created']).dt.date

In [26]:
# Give the date column a name specific to this frame.
first_sn_post = first_sn_post.rename(columns={'day':'first_sn_day'})

In [27]:
# Keep only the user key and the first-post date.
first_sn_post = first_sn_post[['user_url', 'first_sn_day']]

In [34]:
# Export without the index column.
first_sn_post.to_csv(path_clean_data / "first_sn_day.csv", index=False)

In [35]:
# Spot check: all raw `posts` rows for one user.
# Note: this rebinds the notebook-level name `sql` used by the post-features query.
sql = """
SELECT *
FROM posts
WHERE user_url = 'aarthi-s-3'
"""
conn = create_connection(path_db)
# try/finally closes the connection even if the query raises.
try:
    test = pd.read_sql_query(sql, conn)
finally:
    conn.close()
test.head()

Unnamed: 0,id,thread_id,post_count,post_id,user_url,date_created,date_recorded,body,version
0,15295652,1237966,8,19717478,aarthi-s-3,2020-01-27 08:28PM,2021-03-07 02:35:12,Hi i am desperate for some good advice to get ...,1


### Panel Data

In [43]:
# Columns kept for the panel dataset, listed by name instead of by position in
# the column list, so the selection does not silently shift when columns are
# added to, removed from, or reordered in `df`.
cols_panel = [
    'user_url',
    'day',
    'anger',
    'joy',
    'optimism',
    'sadness',
    'sn_user',
    'user_id',
    'time_period',
    'first_period',
    'last_period',
    'time_since_first_period',
    'is_last_period',
]
cols_panel

['user_url',
 'day',
 'anger',
 'joy',
 'optimism',
 'sadness',
 'sn_user',
 'user_id',
 'time_period',
 'first_period',
 'last_period',
 'time_since_first_period',
 'is_last_period']

In [44]:
# Independent copy, so edits to the panel frame do not touch `df`.
df_panel = df[cols_panel].copy()

In [45]:
# Calendar year and month of each row's day (the date index is built once and reused).
panel_days = pd.DatetimeIndex(df_panel['day'])
df_panel['year'] = panel_days.year
df_panel['month'] = panel_days.month

In [46]:
# Preview the panel ordered by user and day.
df_panel.sort_values(['user_url', 'day']).head()

Unnamed: 0,user_url,day,anger,joy,optimism,sadness,sn_user,user_id,time_period,first_period,last_period,time_since_first_period,is_last_period,year,month
546615,0407nc,2017-02-25,0.027597,0.010135,0.014064,0.948204,0,0,4283,4283,4364,0,0,2017,2
365103,0407nc,2017-02-28,0.031126,0.009913,0.006925,0.952036,0,0,4286,4283,4364,3,0,2017,2
1554764,0407nc,2017-03-01,0.010955,0.012001,0.006985,0.97006,0,0,4287,4283,4364,4,0,2017,3
976701,0407nc,2017-03-02,0.023654,0.808639,0.116215,0.051492,0,0,4288,4283,4364,5,0,2017,3
1812262,0407nc,2017-03-14,0.017647,0.397049,0.031483,0.553821,0,0,4300,4283,4364,17,0,2017,3


In [47]:
# Convert each day to a 'YYYY-MM-DD' string.
# NOTE(review): presumably needed so the column can be written by to_stata — confirm.
df_panel['day'] = df_panel['day'].apply(lambda x: x.strftime('%Y-%m-%d'))

In [48]:
# Export the panel dataset as a Stata file.
df_panel.to_stata(path_clean_data / "daily_panel.dta")

### Cox Data

In [49]:
# Export the full joined frame (all columns) as CSV, without the index.
df.to_csv(path_clean_data / "daily_all.csv", index=False)

In [None]:
# NOTE(review): `cols_cox` is not defined in the visible part of this notebook;
# unless it is defined elsewhere, this cell raises NameError on a fresh run —
# define the column list before this line (TODO confirm the intended columns).
df_cox = df[cols_cox].copy()