# Generate Word Features


## Imports

In [1]:
# LDA
from gensim import corpora, models
from gensim.models import CoherenceModel, LdaModel, LdaMulticore
from gensim.models.callbacks import PerplexityMetric, ConvergenceMetric, CoherenceMetric
# Managing data
import pandas as pd
import numpy as np
import re
# DB connection
from scraping import create_connection
# Files & I/O
import pickle
import csv
import os
from pathlib import Path
from io import FileIO
# For logging
import logging
# Plotting
import matplotlib.pyplot as plt
# Random
import random
# Parallelizing
import dask.dataframe as dd
from dask.multiprocessing import get

## Functions

## File Locations

In [2]:
# Resolve the project root relative to where the notebook is launched:
# parents[1] is two directory levels above the current working directory.
# NOTE(review): this depends on the kernel's cwd — confirm the notebook is
# always started from its own folder.
p = Path.cwd()
path_parent = p.parents[1]

In [3]:
# Path templates used throughout the notebook. The {0}, {1}, ... placeholders
# are filled with str.format(); for the data files they receive
# (forum, group, id_type), and the n-topics model template takes the topic
# count as a fourth value.
# database
path_db = str(path_parent / "database" / "netmums-merged.db")
path_clean_data = path_parent / "clean_data" / "netmums"
# data to load
path_lemma_pkl = str(path_clean_data / "lemmatized_text_{0}_{1}_{2}.pkl")
path_corpus_pkl = str(path_clean_data / "corpus_{0}_{1}_{2}.pkl")
path_dictionary_gensim = str(path_clean_data / "dictionary_{0}_{1}_{2}.gensim")
# model saving
path_tune_models = str(path_clean_data / "lda_tune_{0}_{1}_{2}_{3}_{4}.gensim")
path_ntopic_models = str(path_clean_data / "lda_ntopics_{0}_{1}_{2}_{3}.gensim")
# path_coherence = str(path_parent / "clean_data" / "coherence_{}.csv")
path_log = str(path_clean_data / "logging_{0}_{1}_{2}_{3}.log")
# NOTE(review): identical template to path_log above, so both names resolve to
# the same file — confirm whether a distinct file name was intended.
path_log_iterations = str(path_clean_data / "logging_{0}_{1}_{2}_{3}.log")
# dominant topic
path_dom_topic = str(path_clean_data / "dominant_topic_{0}_{1}_{2}_{3}.csv")

In [4]:
# dataframes
# Fixed (non-templated) file locations:
#   path_topics_pkl - per-row topic weights written by this notebook
#   path_text_pkl   - cleaned daily text read as input
#   path_topics     - CSV that receives the topic keyword rows
path_topics_pkl = str(path_clean_data / "daily_topics_sn.pkl")
path_text_pkl = str(path_clean_data / "daily_clean_text.pkl")
path_topics = str(path_clean_data / "lda_topics_sn.csv")
# path_days_since_pkl = str(path_clean_data / "daily_days_since.pkl")
# path_subforums_pkl = str(path_clean_data / "daily_subforums.pkl")
# path_emote_pkl = str(path_clean_data / "daily_emote_processed_{}.pkl")
# path_joined_pkl = str(path_clean_data / "daily_joined_df.pkl")
# path_bigrams_pkl = str(path_corpus_pkl.format("all", "all", "daily_text_df"))

## Create Topic Model

In [5]:
# Values substituted into the {0}/{1}/{2} placeholders of the path templates
# to select which dictionary, corpus and model files are read and written.
forum = "special-needs"
group = "all"
id_type = "family_id"

In [6]:
# Load the gensim dictionary, the corpus and the lemmatized documents that
# were saved for this forum/group/id_type combination.
dictionary = corpora.Dictionary.load(path_dictionary_gensim.format(forum, group, id_type))
# NOTE(security): pickle.load can execute arbitrary code — only load files
# that this project produced itself.
# `with` closes each file handle as soon as its contents have been read,
# instead of leaving it open until garbage collection.
with open(path_corpus_pkl.format(forum, group, id_type), 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
with open(path_lemma_pkl.format(forum, group, id_type), 'rb') as lemma_file:
    lemmatized_text = pickle.load(lemma_file)

In [7]:
import time

In [8]:
# Train the final LDA model and report its c_v coherence.
# 15 topics has the highest coherence
n_topics = 15
start = time.time()
# random_state is fixed so the run is repeatable; alpha and eta are both set
# to "auto" rather than given fixed values.
lda = LdaModel(
    corpus=corpus,
    num_topics=n_topics,
    id2word=dictionary,
    random_state=238,
    alpha="auto",
    eta="auto"
)
# Elapsed times are printed in minutes (seconds / 60).
print("model finished time: ", (time.time() - start) / 60)
coherence_model_lda = CoherenceModel(model=lda, texts=lemmatized_text, dictionary=dictionary, coherence='c_v')
print(coherence_model_lda.get_coherence())
# Cumulative time since `start`, i.e. training plus the coherence computation.
print("working time: ", (time.time() - start) / 60)

model finished time:  1.5097419102986653
0.43178274529929883
working time:  1.7702297886212668


In [9]:
# Persist the trained model; the topic count becomes the fourth component of the file name.
lda.save(path_ntopic_models.format(forum, group, id_type, str(n_topics)))

## Daily text to corpus

In [6]:
import lemmatize_all as la

In [7]:
# Reload the training dictionary and read the cleaned daily text frame.
dictionary = corpora.Dictionary.load(path_dictionary_gensim.format(forum, group, id_type))
clean_text = pd.read_pickle(path_text_pkl)
# Rename the columns positionally; pandas raises if the pickled frame does
# not have exactly four columns.
# NOTE(review): assumes the stored column order matches these names — confirm.
clean_text.columns = ['user_url', 'day', 'text_clean', 'sentiment']

In [8]:
# Collect the distinct user_urls of everyone who posted in a subforum that
# belongs to forum_id 24.
# NOTE(review): 24 is presumably the special-needs forum id — confirm against
# the database.
# Because the WHERE clause filters on s.forum_id, rows for which the LEFT
# JOINs found no match are discarded, so the joins act like inner joins.
sn_users_sql = """
    SELECT DISTINCT p.user_url AS user_url
    FROM posts AS p
    LEFT JOIN threads AS t
    ON t.id=p.thread_id
    LEFT JOIN subforums AS s
    ON s.id=t.subforum_id
    WHERE s.forum_id=24
"""
conn = create_connection(path_db)
try:
    sn_user_df = pd.read_sql_query(sn_users_sql, conn)
finally:
    # Release the connection even when the query raises.
    conn.close()

In [9]:
# Keep only the rows whose user_url appears in sn_user_df (inner join on user_url).
# Note that this rebinds clean_text to the filtered frame.
clean_text = pd.merge(clean_text, sn_user_df, on="user_url", how="inner")

In [14]:
%%time
# Build the bigram frame and the corpus for the daily text.
# This rebinds `corpus`, which previously held the training corpus.
# The recorded run below took roughly 40 minutes of wall time.
# NOTE(review): the meaning of the third argument (4) is defined in
# lemmatize_all.make_corpus — presumably a worker/partition count; confirm.
df_bigrams, corpus = la.make_corpus(clean_text, dictionary, 4)

making bigrams
time elapsed: 39.28419262568156
bigrams to corpus
time elapsed: 0.5153856674830118
CPU times: user 45.3 s, sys: 5.76 s, total: 51.1 s
Wall time: 39min 48s


In [15]:
# Cache the bigram frame. It reuses the corpus path template, with
# "daily_text_df" in the id_type slot to keep it distinct from the corpus file.
df_bigrams.to_pickle(path_corpus_pkl.format("special-needs", "all", "daily_text_df"))

In [16]:
# Cache the daily-text corpus. The `with` block guarantees the file is flushed
# and closed once the dump finishes, so a later load never sees a partial file.
with open(path_corpus_pkl.format("special-needs", "all", "daily_text"), 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)

In [17]:
# corpus = pickle.load(open(path_corpus_pkl.format("special-needs", "all", "daily_text"), 'rb'))
# len(corpus)

## Daily text to topics

In [11]:
# Reload the saved 15-topic model so this section can run without retraining.
n_topics = 15
lda = LdaModel.load(path_ntopic_models.format(forum, group, id_type, str(n_topics)))

In [19]:
from gensim.matutils import corpus2csc
import numpy as np

In [20]:
# Infer a topic distribution for every document in `corpus`.
# minimum_probability=0.0 asks for every topic to be reported, which is what
# gives each row the full set of topic columns (see the 15 columns below).
all_topics = lda.get_document_topics(corpus, minimum_probability=0.0)
# NOTE: despite the `_csr` name, corpus2csc returns a CSC sparse matrix.
all_topics_csr = corpus2csc(all_topics)
# Transpose before densifying so that rows are documents and columns are topics.
all_topics_numpy = all_topics_csr.T.toarray()
all_topics_df = pd.DataFrame(all_topics_numpy)
all_topics_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.000685,0.000344,0.069275,0.316988,0.000616,0.067188,0.000316,0.000862,0.005149,0.000224,0.347239,0.016753,0.05536,0.11876,0.000242
1,0.011825,0.000363,0.001532,0.001431,0.000649,0.046162,0.000333,0.000909,0.468074,0.000236,0.216788,0.002439,0.218197,0.030807,0.000256
2,0.001114,0.067358,0.002385,0.00221,0.001002,0.114225,0.029597,0.237355,0.361407,0.000364,0.068411,0.092779,0.002688,0.01871,0.000395
3,0.163373,0.048176,0.002696,0.002527,0.001147,0.332729,0.000588,0.14479,0.102098,0.000417,0.051512,0.143494,0.00306,0.002943,0.000452
4,0.1376,0.000267,0.001129,0.0243,0.064738,0.374783,0.000245,0.000669,0.193862,0.000174,0.189324,0.001724,0.009772,0.001225,0.000188


In [21]:
# Label the topic columns topic_0 ... topic_{n_topics - 1}, then cache the frame.
column_names = list(map("topic_{}".format, range(n_topics)))
all_topics_df.columns = column_names
all_topics_df.to_pickle(path_topics_pkl)

In [23]:
# Inspect df_bigrams' row count and columns before it is joined to the topic weights.
df_bigrams.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 644587 entries, 0 to 644665
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype         
---  ------    --------------   -----         
 0   user_url  644587 non-null  object        
 1   day       644587 non-null  datetime64[ns]
 2   bigrams   644587 non-null  object        
dtypes: datetime64[ns](1), object(2)
memory usage: 19.7+ MB


In [24]:
# Inspect the topic frame; its row count must equal df_bigrams' for the positional join below.
all_topics_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 644587 entries, 0 to 644586
Data columns (total 15 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   topic_0   644587 non-null  float64
 1   topic_1   644587 non-null  float64
 2   topic_2   644587 non-null  float64
 3   topic_3   644587 non-null  float64
 4   topic_4   644587 non-null  float64
 5   topic_5   644587 non-null  float64
 6   topic_6   644587 non-null  float64
 7   topic_7   644587 non-null  float64
 8   topic_8   644587 non-null  float64
 9   topic_9   644587 non-null  float64
 10  topic_10  644587 non-null  float64
 11  topic_11  644587 non-null  float64
 12  topic_12  644587 non-null  float64
 13  topic_13  644587 non-null  float64
 14  topic_14  644587 non-null  float64
dtypes: float64(15)
memory usage: 73.8 MB


In [34]:
# Attach the topic weights to their (user_url, day) rows. The join is purely
# positional (both indexes are reset), so the two frames must have the same
# number of rows; otherwise pd.concat would silently pad the shorter one with
# NaN and misalign the data.
assert len(df_bigrams) == len(all_topics_df), (
    "df_bigrams and all_topics_df must have the same number of rows "
    "for a positional join"
)
all_topics_df = pd.concat([df_bigrams.reset_index(drop=True), all_topics_df.reset_index(drop=True)], axis=1)

In [36]:
# Drop the 'bigrams' column so only the identifiers and topic weights remain.
# Re-running this cell after the column is gone raises a KeyError.
all_topics_df = all_topics_df.drop('bigrams', axis=1)

In [38]:
# Save the joined frame as both pickle and CSV. The pickle overwrites the
# file written earlier at the same path (path_topics_pkl).
all_topics_df.to_pickle(path_topics_pkl)
all_topics_df.to_csv(str(path_clean_data / "daily_topics_sn.csv"), index=False)

In [12]:
# Reload the cached topic frame so the sections below can run without recomputing it.
all_topics_df = pd.read_pickle(path_topics_pkl)

## Get Topic Keywords

In [16]:
def write_list(fn, results):
    """Append one row to a CSV file.

    Parameters
    ----------
    fn : str or path-like
        Path of the CSV file. It is opened in append mode, so the file is
        created if missing and existing rows are kept.
    results : iterable
        Field values written as a single CSV row.
    """
    # newline='' lets the csv module control line endings itself, as its
    # documentation requires. Without it, platforms that translate '\n' emit
    # '\r\r\n' and the file shows a blank line after every row.
    with open(fn, 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(results)

In [14]:
# Reload the saved 15-topic model (the same load as in "Daily text to topics")
# so this section can be run on its own.
n_topics = 15
lda = LdaModel.load(path_ntopic_models.format(forum, group, id_type, str(n_topics)))

In [18]:
# Write the top n_words keywords of each topic to path_topics, one CSV row per
# topic: the first field is topic[0] and the second is topic[1].
# NOTE(review): write_list opens the file in append mode, so re-running this
# cell adds the same rows again — delete the CSV first when regenerating.
n_words = 20
topics = lda.print_topics(num_topics=n_topics, num_words=n_words)
for topic in topics:
    write_list(path_topics, [topic[0], topic[1]])

## Save Dominant Topic and Text

In [10]:
# Preview the cleaned text frame.
# NOTE(review): the rendered output contains user handles and post text —
# clear it before sharing the notebook.
clean_text.head()

Unnamed: 0,user_url,day,text_clean,sentiment
0,abbey-h-18,2014-11-07,"Hi, My little boy who just turned 4 past Septe...","{'neg': 0.042, 'neu': 0.855, 'pos': 0.102, 'co..."
1,abbey-h-18,2015-07-08,"Hi Kaye, I can relate to you, my boy is starti...","{'neg': 0.109, 'neu': 0.737, 'pos': 0.154, 'co..."
2,abbey-h-18,2017-05-11,"Hi Ladies, Just catching up on thread. My cram...","{'neg': 0.033, 'neu': 0.828, 'pos': 0.139, 'co..."
3,abbey-h-18,2017-06-16,yes me!! Im 9+5 and been suffering with bleedi...,"{'neg': 0.05, 'neu': 0.87, 'pos': 0.08, 'compo..."
4,abbey-h-18,2014-08-31,We used to be £100-£120 per week around £400 p...,"{'neg': 0.024, 'neu': 0.875, 'pos': 0.101, 'co..."


In [13]:
# Preview the per-row topic weights alongside their user_url/day keys.
all_topics_df.head()

Unnamed: 0,user_url,day,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,topic_11,topic_12,topic_13,topic_14
0,abbey-h-18,2014-11-07,0.000685,0.000344,0.069275,0.316988,0.000616,0.067188,0.000316,0.000862,0.005149,0.000224,0.347239,0.016753,0.05536,0.11876,0.000242
1,abbey-h-18,2015-07-08,0.011825,0.000363,0.001532,0.001431,0.000649,0.046162,0.000333,0.000909,0.468074,0.000236,0.216788,0.002439,0.218197,0.030807,0.000256
2,abbey-h-18,2017-05-11,0.001114,0.067358,0.002385,0.00221,0.001002,0.114225,0.029597,0.237355,0.361407,0.000364,0.068411,0.092779,0.002688,0.01871,0.000395
3,abbey-h-18,2017-06-16,0.163373,0.048176,0.002696,0.002527,0.001147,0.332729,0.000588,0.14479,0.102098,0.000417,0.051512,0.143494,0.00306,0.002943,0.000452
4,abbey-h-18,2014-08-31,0.1376,0.000267,0.001129,0.0243,0.064738,0.374783,0.000245,0.000669,0.193862,0.000174,0.189324,0.001724,0.009772,0.001225,0.000188


In [14]:
# List the column names to confirm which ones hold topic weights.
all_topics_df.columns

Index(['user_url', 'day', 'topic_0', 'topic_1', 'topic_2', 'topic_3',
       'topic_4', 'topic_5', 'topic_6', 'topic_7', 'topic_8', 'topic_9',
       'topic_10', 'topic_11', 'topic_12', 'topic_13', 'topic_14'],
      dtype='object')

In [15]:
# Dominant topic = name of the topic_N column with the largest weight in each row.
# The topic columns are selected by pattern instead of a hard-coded list of 15
# names, so the cell keeps working if the number of topics changes. The regex
# does not match 'dominant_topic' itself, so re-running the cell is safe.
# DataFrame.filter keeps the original column order, so ties resolve as before.
all_topics_df['dominant_topic'] = all_topics_df.filter(regex=r"^topic_\d+$").idxmax(axis=1)

In [16]:
# Count how many rows each topic dominates, most frequent first.
all_topics_df['dominant_topic'].value_counts()

topic_8     264693
topic_10    194879
topic_5      64503
topic_11     46937
topic_7      22433
topic_2      14501
topic_3      14017
topic_0       8029
topic_12      6642
topic_13      5163
topic_4       2182
topic_1        352
topic_6        160
topic_14        61
topic_9         35
Name: dominant_topic, dtype: int64

In [17]:
# Attach the cleaned text to every topic row that shares its (user_url, day) key.
# NOTE(review): an inner merge repeats rows when a key pair occurs more than
# once on either side — confirm (user_url, day) is unique in both frames.
all_topics_text = all_topics_df.merge(
    clean_text[['user_url', 'day', 'text_clean']],
    how="inner",
    on=['user_url', 'day'],
)

In [19]:
# Export topic weights, dominant topic and text to CSV without the index column.
all_topics_text.to_csv(str(path_clean_data / "sn_topics_and_text.csv"), index=False)