In [50]:
import pandas as pd
import os
from datetime import datetime

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import seaborn as sns
# Use seaborn style defaults and set the default figure size
sns.set(rc={'figure.figsize':(11, 4)})

from fbprophet import Prophet
import nltk
from nltk.stem.wordnet import WordNetLemmatizer

from collections import defaultdict
from tqdm import tqdm

# Data Aggregation

## Load in data

In [3]:
PATH="data/subreddits/BabyBumps/submissions/"
orig_df = pd.DataFrame()
for entry in tqdm(os.listdir(PATH)):
    if not entry.endswith(".json.gz"):
        continue
    orig_df = pd.concat([orig_df, pd.read_json(PATH+entry, compression='infer')], axis=0)
orig_df.shape, datetime.utcfromtimestamp(min(orig_df['created_utc'])), datetime.utcfromtimestamp(max(orig_df['created_utc']))

100%|██████████| 535/535 [01:44<00:00,  5.10it/s]


((314472, 41),
 datetime.datetime(2011, 1, 4, 18, 25, 57),
 datetime.datetime(2021, 3, 15, 19, 23, 34))

In [4]:
orig_df = orig_df.drop_duplicates(['id','created_utc', 'author'],keep='last')
orig_df.shape

(314472, 41)

In [5]:
combined_df = orig_df

In [None]:
# earlier_df = pd.DataFrame()
# for entry in tqdm(os.listdir(PATH+"original/")):
#     earlier_df = pd.concat([earlier_df, pd.read_json(PATH+"original/"+entry, lines=True, compression='gzip')], axis=0)
    
# earlier_df.shape, min(earlier_df['created_utc']), max(earlier_df['created_utc']) 

**Combine examples into one dataframe**

In [None]:
# keys_intersect = list(orig_df.keys().intersection(earlier_df.keys()))
# combined_df = pd.concat([earlier_df[keys_intersect], orig_df[keys_intersect]]).reset_index(drop=True)
# combined_df.shape

Remove duplicates

In [None]:
# combined_df = combined_df.drop_duplicates(['id','created_utc', 'author'],keep='last')
# combined_df.shape

### Filtering 

Include only examples with "birth story" or "graduat" in the title

In [6]:
combined_df['is_birth_story'] = combined_df['title'].map(lambda x: "birth story" in x.lower() or 'graduat' in x.lower())
print(combined_df['is_birth_story'].value_counts())
df = combined_df[combined_df['is_birth_story']]
df.shape

False    306346
True       8126
Name: is_birth_story, dtype: int64


(8126, 42)

Apply `pre-covid` indicator

In [8]:
# 1583020801 - march 1st 2020
# 1584244800 - march 15th 2020
df['pre-covid'] = df['created_utc'].map(lambda x: True if x < 1584244800 else False) #1583020801 is unix timestamp for March 1st 2020 at 12:00:01 am
df['pre-covid'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


True     7147
False     979
Name: pre-covid, dtype: int64

In [10]:
df.to_json("birth_narratives.jsonl.gz", lines=True, compression="gzip", orient="records")

In [71]:
df =pd.read_json("birth_narratives.jsonl.gz", lines=True, compression="gzip", orient="records")

In [72]:
df.shape

(6386, 43)

**Aligning empty stories with corresponding comments**

In [53]:
missing_text_df = df[df['selftext'].map(lambda x: not x)]
missing_id_author_df = missing_text_df[['id', 'author', 'pre-covid']]
missing_id_author_df

Unnamed: 0,id,author,pre-covid
0,i1ula0,GhxstCxt,False
3,i2l1y8,bloop_bloop_bloooooo,False
4,i2q6ph,Watchingpornwithcas,False
7,i33b32,DashOfLiz,False
9,i3awn7,fluorescentpuffin,False
...,...,...,...
8105,7di89o,Duckyes,True
8107,112c2k,fillie,True
8108,117yty,derpitydooda,True
8109,1181qn,chancesofconception,True


In [54]:
missing_id_author_df['pre-covid'].value_counts()

True     2077
False     574
Name: pre-covid, dtype: int64

In [55]:
def get_first_comment(row):
    curr_id, author = row.id, row.author
    if not os.path.exists(f"data/subreddits/BabyBumps/comments/{curr_id}.json.gz"):
        return 
    comments_df = pd.read_json(f"data/subreddits/BabyBumps/comments/{curr_id}.json.gz", compression='gzip')
    if comments_df.shape[0] == 0:
        return
    match_df = comments_df[(comments_df['parent_id'].map(lambda x: curr_id in x)) & (comments_df['author'] == author)].sort_values('created_utc',ascending=True)
    if match_df.shape[0] == 0:
        return 
    return match_df.iloc[0]['body']

missing_id_author_df['body'] = missing_id_author_df.apply(get_first_comment, axis=1)
missing_id_author_df['body'].map(lambda x: x == None).value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


True     1740
False     911
Name: body, dtype: int64

In [56]:
missing_id_author_df[missing_id_author_df['body'] == None]

Unnamed: 0,id,author,pre-covid,body


***Add check for pre-covid examples***

**Update missing selftext's** 

In [57]:
print(df['selftext'].map(lambda x: not x).value_counts())
for idx, row in missing_id_author_df.iterrows():
    df.at[idx, 'selftext'] = row.body
    #if row.body:
    #    df.loc[row.index]['selftext'] = row.body
df['selftext'].map(lambda x: not x).value_counts()

False    5475
True     2651
Name: selftext, dtype: int64


False    6386
True     1740
Name: selftext, dtype: int64

In [58]:
df['selftext'].map(lambda x: x != None).value_counts()

True     6386
False    1740
Name: selftext, dtype: int64

In [61]:
df[df['selftext'].map(lambda x: not not x)]['selftext'].shape

(6386,)

In [62]:
df = df[df['selftext'].map(lambda x: not not x)]
df.shape

(6386, 43)

In [73]:
df['selftext'].map(lambda x: x != '[removed]' or x != '[deleted]').value_counts()

True     6317
False      69
Name: selftext, dtype: int64

In [76]:
df = df[df['selftext'] != '[removed]']
df = df[df['selftext'] != '[deleted]']
df.shape

(6119, 43)

In [77]:
df.to_json("birth_narratives.jsonl.gz", lines=True, compression="gzip", orient="records")

## Tokenization

In [None]:
df['title_tokens'] = df['title'].map(lambda x: nltk.word_tokenize(x.lower()))
df['selftext_tokens'] = df['selftext'].map(lambda x: nltk.word_tokenize(x.lower()))

In [None]:
lmtzr = WordNetLemmatizer()
df['selftext_lemmas'] = df['selftext_tokens'].map(lambda x, lmtzr=lmtzr: [lmtzr.lemmatize(tok) for tok in x])
df['title_lemmas'] = df['title_tokens'].map(lambda x, lmtzr=lmtzr: [lmtzr.lemmatize(tok) for tok in x])

**Remove stories with less than 500 words**

In [None]:
print(pd.Series(df['selftext_tokens'].map(lambda x: len(x) >= 500).value_counts()))
df = df[df['selftext_tokens'].map(lambda x: len(x) >= 500)]
df.shape

In [None]:
min(orig_df['created_utc']), max(orig_df['created_utc'])


orig_df['timestamp'] = orig_df['created_utc'].map(lambda x: pd.to_datetime(datetime.utcfromtimestamp(int(x)).strftime('%Y-%m-%d %H:%M:%S'))) 

# if you encounter a "year is out of range" error the timestamp
# may be in milliseconds, try `ts /= 1000` in that case
print(datetime.utcfromtimestamp(int(min(df['created_utc']))).strftime('%Y-%m-%d %H:%M:%S'))
print(datetime.utcfromtimestamp(int(max(df['created_utc']))).strftime('%Y-%m-%d %H:%M:%S'))
orig_df = orig_df.sort_values('timestamp')

In [None]:
df['timestamp'] = df['created_utc'].map(lambda x: pd.to_datetime(datetime.utcfromtimestamp(int(x)).strftime('%Y-%m-%d %H:%M:%S'))) 

In [None]:
bins = np.arange(0, 5000, 500)
bins = np.insert(bins, 1, 1)
df['selftext'].map(lambda x: 0 if not x else len(x.split())).hist(bins=bins)

**Length of `stories`**

In [None]:
bins = np.arange(500, 5000, 100)
df['selftext_tokens'].map(lambda x: len(x)).hist(bins=bins)

##### Quantity of Submissions per year

In [None]:
ax = df['timestamp'].map(lambda x: x.year).value_counts().sort_index().plot(kind="bar", rot=45)
#ax.xaxis.set_major_formatter(mdates.DateFormatter())
#df['timestamp'].map(lambda x: "-".join(x.split("-")[0:2])).value_counts().sort_index().plot()

## Saving Dataframe

In [None]:
df.keys(), df.shape

In [None]:
df.to_json("babybumps.jsonl.gz", lines=True, compression='gzip', orient='records')

**Test loading dataframe**

In [None]:
tmp_df = pd.read_json("babybumps.jsonl.gz", lines=True, compression='gzip')

In [None]:
tmp_df.keys() == df.keys()

All posts

### Time projections

In [None]:
pre_covid_vc = df[df['pre-covid']]['timestamp'].map(lambda x: pd.to_datetime(f"{x.year}-{x.month}")).value_counts().sort_index()
day_count_df = pd.DataFrame()
day_count_df['ds'] = pre_covid_vc.map(lambda x: str(x))

In [None]:
day_count_df['y'] = day_count_df['ds']
day_count_df['ds'] = day_count_df.index
day_count_df

In [None]:
m = Prophet()
m.fit(day_count_df)

In [None]:
future = m.make_future_dataframe(periods=365)
future.tail()

In [None]:
forecast = m.predict(future)
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()

In [None]:
fig1 = m.plot(forecast)

In [None]:
fig2 = m.plot_components(forecast)

In [None]:
anxious_text_df_long = anxious_text_df[anxious_text_df.selftext.str.split().map(lambda x: len(x) > 99)]
anxious_text_df_long.shape

In [None]:
" ".join(anxious_text_df.selftext_tokens.iloc[5])

In [None]:
import nltk

## LDA Topic Modeling

In [None]:
# Tokenize the documents.
from nltk.tokenize import RegexpTokenizer

# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
anxious_text_df_long['tokens'] = anxious_text_df_long.selftext.str.lower().map(lambda x, tokenizer=tokenizer: tokenizer.tokenize(x))

# Remove numbers, but not words that contain numbers.
anxious_text_df_long['tokens'] = anxious_text_df_long['tokens'].map(lambda x: [token for token in x if not token.isnumeric()])

# Remove words that are only one character.
anxious_text_df_long['tokens'] = anxious_text_df_long['tokens'].map(lambda x: [token for token in x if len(token) > 1])

# Remove stop words
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english')) 
anxious_text_df_long['tokens'] = anxious_text_df_long['tokens'].map(lambda x: [token for token in x if token not in stop_words])

# Lemmatize the documents.
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
anxious_text_df_long['tokens'] = anxious_text_df_long['tokens'].map(lambda x: [lemmatizer.lemmatize(token) for token in x])

In [None]:
docs = list(anxious_text_df_long['tokens'])


# Compute bigrams.
from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)

In [None]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 10 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=5, no_above=0.10)

# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

In [None]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 10
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

top_topics = model.top_topics(corpus) #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

def topics_to_df(top_topics):
    topic_id2words = {}
    for idx, topic in enumerate(top_topics):
        topic_id2words[idx] = []
        for word in topic[0][:10]:
            topic_id2words[idx].append(word[1])
    return pd.DataFrame.from_dict(topic_id2words).T

topics_to_df(top_topics)