In [1]:
import pickle
from io import FileIO
from pathlib import Path

import numpy as np
import pandas as pd
import pyLDAvis.gensim
from gensim import corpora, models
from gensim.models import LdaModel, CoherenceModel

from lemmatize import *
from scraping import create_connection

In [2]:
import warnings

# Hide DeprecationWarning messages for the rest of the session so they do not
# clutter the cell outputs.
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

  and should_run_async(code)


In [3]:
def load_data(forum="all", group="all", id_type="family_id"):
    """Load the pre-computed text, corpus and dictionary for one forum subset.

    The three arguments are substituted, in order, into the module-level path
    templates ``path_lemma_pkl``, ``path_corpus_pkl`` and
    ``path_dictionary_gensim``.

    Parameters
    ----------
    forum, group, id_type : str
        Values filling placeholders ``{0}``, ``{1}`` and ``{2}`` of the
        path templates.

    Returns
    -------
    tuple
        ``(lemmatized_text, corpus, dictionary)``: the two unpickled objects
        and the loaded ``gensim.corpora.Dictionary``.

    Raises
    ------
    OSError
        If one of the pickle files cannot be opened.
    """
    # NOTE: pickle.load can execute arbitrary code; only load files that were
    # produced by this project.
    # Context managers close the file handles deterministically; the previous
    # ``pickle.load(open(...))`` form left them open.
    with open(path_lemma_pkl.format(forum, group, id_type), 'rb') as lemma_file:
        lemmatized_text = pickle.load(lemma_file)
    with open(path_corpus_pkl.format(forum, group, id_type), 'rb') as corpus_file:
        corpus = pickle.load(corpus_file)
    dictionary = corpora.Dictionary.load(path_dictionary_gensim.format(forum, group, id_type))
    return lemmatized_text, corpus, dictionary

In [4]:
def gen_cols(n):
    """Return the column names ``topic_00``, ``topic_01``, ... for ``n`` topics.

    Topic indices are zero-padded to at least two digits.
    """
    names = []
    for idx in range(n):
        names.append("topic_{:02d}".format(idx))
    return names

In [5]:
# Paths are resolved relative to the directory the notebook is run from.
p = Path.cwd()
# Parent of the working directory; the database/ and clean_data/ folders used
# by the path templates are looked up underneath it.
path_parent = p.parents[0]

In [6]:
# Every file except the database lives in <path_parent>/clean_data.
dir_clean_data = path_parent / "clean_data"

# database
path_db = str(path_parent / "database" / "youbemom-merged.db")

# data to load -- placeholders: {0} forum, {1} group, {2} id_type
path_lemma_pkl = str(dir_clean_data / "lemmatized_text_{0}_{1}_{2}.pkl")
path_corpus_pkl = str(dir_clean_data / "corpus_{0}_{1}_{2}.pkl")
path_dictionary_gensim = str(dir_clean_data / "dictionary_{0}_{1}_{2}.gensim")

# model saving -- placeholders: {0} forum, {1} group, {2} id_type, {3} number of topics
path_ntopic_models = str(dir_clean_data / "lda_ntopics_{0}_{1}_{2}_{3}.gensim")

# clean text -- placeholders: {0} forum, {1} group
path_clean_text = str(dir_clean_data / "clean_text_{0}_{1}.csv")

# topic distribution -- placeholders: {0} forum, {1} group, {2} free-form label
path_topic_counts = str(dir_clean_data / "topics_{0}_{1}_{2}.csv")

In [7]:
# Per-forum settings keyed by forum name. Each entry holds 'n_passes',
# 'n_iterations' and the list of candidate topic counts 'n_topics'.
# NOTE(review): presumably the LDA training settings used when the saved models
# were built; this dict is not consumed in the visible part of the notebook.
_TOPICS_COARSE = (5, 10, 15, 20, 25, 30, 40, 50)
_TOPICS_FINE = (2, 3, 4, 5, 7, 10, 15, 20, 25, 30)
_TOPICS_FINE_EXTENDED = _TOPICS_FINE + (40, 50)

args = {
    forum: {
        'n_passes': n_passes,
        'n_iterations': n_iterations,
        # fresh list per forum so entries never share a mutable object
        'n_topics': list(n_topics),
    }
    for forum, n_passes, n_iterations, n_topics in [
        ('special-needs', 100, 200, _TOPICS_COARSE),
        ('tween-teen', 120, 400, _TOPICS_FINE_EXTENDED),
        ('preschool', 100, 400, _TOPICS_FINE),
        ('elementary', 130, 400, _TOPICS_FINE),
        ('new-york-city', 150, 400, _TOPICS_FINE_EXTENDED),
        ('school', 200, 400, _TOPICS_FINE_EXTENDED),
        ('toddler', 35, 200, _TOPICS_COARSE),
    ]
}

In [8]:
# Values substituted for the group and id_type placeholders of the path
# templates in every load/save below.
group, id_type = 'all', 'family_id'

In [9]:
# special-needs forum: lemmatized documents, corpus, dictionary and the saved
# LDA model with n_sn topics
sf = "special-needs"
n_sn = 20
lemmatized_text_sn, corpus_sn, dictionary_sn = load_data(forum=sf, group=group, id_type=id_type)
mod_sn = LdaModel.load(path_ntopic_models.format(sf, group, id_type, n_sn))

In [10]:
# One row per family_id for the special-needs forum: all of its cleaned posts
# joined into a single space-separated string.
# The forum name is spelled out rather than read from the global `sf`, which is
# reassigned to "toddler" further down; re-running this cell afterwards would
# otherwise silently load the wrong forum.
ct_sn = (
    pd.read_csv(path_clean_text.format("special-needs", group))
    .groupby(['family_id'])['text_clean']
    .apply(' '.join)
    # back to a two-column frame (family_id, text_clean) with a fresh RangeIndex
    .reset_index()
)

In [11]:
# toddler forum: lemmatized documents, corpus, dictionary and the saved
# LDA model with n_td topics
sf = "toddler"
n_td = 15
lemmatized_text_td, corpus_td, dictionary_td = load_data(forum=sf, group=group, id_type=id_type)
mod_td = LdaModel.load(path_ntopic_models.format(sf, group, id_type, n_td))

In [12]:
# One row per family_id for the toddler forum: all of its cleaned posts joined
# into a single space-separated string.
# The forum name is spelled out rather than read from the global `sf`, which is
# reassigned between cells; the result then no longer depends on which cell
# last set it.
ct_td = (
    pd.read_csv(path_clean_text.format("toddler", group))
    .groupby(['family_id'])['text_clean']
    .apply(' '.join)
    # back to a two-column frame (family_id, text_clean) with a fresh RangeIndex
    .reset_index()
)

In [13]:
# Toddler-model topic distribution on the special-needs documents.
# The special-needs documents are re-encoded with the *toddler* dictionary so
# the word ids match what the toddler LDA model expects.
corpus_sn_using_td = list(map(dictionary_td.doc2bow, lemmatized_text_sn))

# one row per document, one column per toddler topic; topics the model does not
# report for a document stay at 0
# NOTE(review): get_document_topics is called with its default probability
# threshold, so low-probability topics are left at 0 -- confirm this is intended
out_sn = np.zeros((len(corpus_sn_using_td), n_td))
for row, bow in enumerate(corpus_sn_using_td):
    for topic_id, prob in mod_td.get_document_topics(bow):
        out_sn[row, topic_id] = prob

In [19]:
# Attach the toddler-model topic scores to the special-needs documents and save.
# pd.concat(axis=1) aligns on the index and silently pads with NaN when the two
# frames differ in length (e.g. when ct_sn has been rebound to the raw,
# un-grouped posts by a later cell), so check the row counts first.
# NOTE(review): this also assumes the rows of ct_sn are in the same order as
# lemmatized_text_sn -- TODO confirm.
if len(ct_sn) != len(out_sn):
    raise ValueError(
        "ct_sn has {} rows but out_sn has {}; re-run the cell that groups "
        "ct_sn by family_id before building topic_df_sn".format(len(ct_sn), len(out_sn))
    )
topic_df_sn = pd.concat([ct_sn, pd.DataFrame(out_sn, columns=gen_cols(n_td))], axis=1)
# the file label is derived from n_td so it always matches the model that was used
topic_df_sn.to_csv(path_topic_counts.format("special-needs", group, "using_td_{}".format(n_td)))

In [15]:
# Share (in percent) of the summed topic scores that falls on each toddler
# topic across the special-needs documents.
df = pd.DataFrame(out_sn)
colsums = df.sum(axis=0)
totalsum = colsums.sum()
colsums.mul(100).div(totalsum)

0      0.216413
1      0.735056
2     51.257636
3      0.148091
4      2.125724
5      2.988512
6      3.425553
7     10.825894
8      0.176394
9      0.648541
10     0.531523
11     3.281950
12    23.235781
13     0.254778
14     0.148152
dtype: float64

In [16]:
# Special-needs-model topic distribution on the toddler documents.
# The toddler documents are re-encoded with the *special-needs* dictionary so
# the word ids match what the special-needs LDA model expects.
corpus_td_using_sn = list(map(dictionary_sn.doc2bow, lemmatized_text_td))

# one row per document, one column per special-needs topic; topics the model
# does not report for a document stay at 0
# NOTE(review): get_document_topics is called with its default probability
# threshold, so low-probability topics are left at 0 -- confirm this is intended
out_td = np.zeros((len(corpus_td_using_sn), n_sn))
for row, bow in enumerate(corpus_td_using_sn):
    for topic_id, prob in mod_sn.get_document_topics(bow):
        out_td[row, topic_id] = prob

In [20]:
# Attach the special-needs-model topic scores to the toddler documents and save.
# pd.concat(axis=1) aligns on the index and silently pads with NaN when the two
# frames differ in length (e.g. when ct_td has been rebound to the raw,
# un-grouped posts by a later cell), so check the row counts first.
# NOTE(review): this also assumes the rows of ct_td are in the same order as
# lemmatized_text_td -- TODO confirm.
if len(ct_td) != len(out_td):
    raise ValueError(
        "ct_td has {} rows but out_td has {}; re-run the cell that groups "
        "ct_td by family_id before building topic_df_td".format(len(ct_td), len(out_td))
    )
topic_df_td = pd.concat([ct_td, pd.DataFrame(out_td, columns=gen_cols(n_sn))], axis=1)
# the file label is derived from n_sn so it always matches the model that was used
topic_df_td.to_csv(path_topic_counts.format("toddler", group, "using_sn_{}".format(n_sn)))

In [18]:
# Share (in percent) of the summed topic scores that falls on each
# special-needs topic across the toddler documents.
df = pd.DataFrame(out_td)
colsums = df.sum(axis=0)
totalsum = colsums.sum()
colsums.mul(100).div(totalsum)

0      1.050317
1      0.370314
2      5.466853
3      0.432776
4      0.551899
5      2.679955
6     11.084513
7      2.983304
8      0.112575
9     28.373404
10     3.541574
11     0.460947
12     0.155415
13     0.235904
14     1.319802
15     1.779726
16     0.677381
17     0.337585
18    38.019294
19     0.366460
dtype: float64

## Word Frequency

In [21]:
# find word frequency
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

In [36]:
# Reload the raw clean-text tables (one row per CSV record, not grouped).
# NOTE: this rebinds ct_sn / ct_td, which earlier cells use for the
# per-family_id frames.
ct_sn, ct_td = (
    pd.read_csv(path_clean_text.format(forum, group))
    for forum in ("special-needs", "toddler")
)

In [38]:
# Number of rows in each clean-text table, captured when this cell runs.
LEN_SN = len(ct_sn)
LEN_TD = len(ct_td)


def token_counts(word):
    """Print how often a pattern occurs per row in each forum.

    ``word`` is handed to ``Series.str.count`` on the ``text_clean`` column of
    ``ct_sn`` and ``ct_td``; the summed match count is divided by the row count
    of the respective table (``LEN_SN`` / ``LEN_TD``). Nothing is returned.
    """
    tables = ((ct_sn, LEN_SN), (ct_td, LEN_TD))
    # both frequencies are computed before anything is printed
    sn_freq, td_freq = [
        table['text_clean'].str.count(word).sum() / n_rows
        for table, n_rows in tables
    ]
    print("special-needs:", sn_freq)
    print("toddler:      ", td_freq)

In [51]:
# Phrase "is autistic": word boundary before "is", one whitespace character
# between the words. Written as a raw string because "\s" is an invalid escape
# sequence in a normal string literal; the pattern itself is unchanged.
token_counts(r"\bis\sautistic")

special-needs: 0.00025304284015283785
toddler:       1.6795080947750958e-05


In [50]:
# Phrase "has autism": word boundary before "has", one whitespace character
# between the words. Written as a raw string because "\s" is an invalid escape
# sequence in a normal string literal; the pattern itself is unchanged.
token_counts(r"\bhas\sautism")

special-needs: 0.0009290287131325619
toddler:       2.9050950828542194e-05


In [52]:
# toddler forum: frequency of "has autism" divided by frequency of "is autistic"
# (both numbers are copied from the token_counts outputs printed above)
2.9050950828542194e-05 / 1.6795080947750958e-05

1.7297297297297296

In [53]:
# special-needs forum: frequency of "has autism" divided by frequency of
# "is autistic" (both numbers are copied from the token_counts outputs printed above)
0.0009290287131325619 / 0.00025304284015283785

3.6714285714285717

In [54]:
# "retard" followed by a word boundary; there is no leading boundary, so the
# match may start in the middle of a longer word
token_counts(r"retard\b")

special-needs: 5.78383634635058e-05
toddler:       8.624501027223464e-05


In [55]:
# "retarded" followed by a word boundary; there is no leading boundary, so the
# match may start in the middle of a longer word
token_counts(r"retarded\b")

special-needs: 0.0002277385561375541
toddler:       0.00015932630845028612


In [56]:
# the stand-alone token "sn" (word boundary on both sides)
token_counts(r"\bsn\b")

special-needs: 0.07336434915574064
toddler:       0.0006733919617834742


In [59]:
# Phrase "need help" as whole words, one whitespace character between them.
# Written as a raw string because "\s" is an invalid escape sequence in a
# normal string literal; the pattern itself is unchanged.
token_counts(r"\bneed\shelp\b")

special-needs: 0.0011025438035230793
toddler:       0.0002589619913700519


In [27]:
# Document-term matrices for the special-needs forum: raw counts and tf-idf.
countvectorizer = CountVectorizer()
tfidfvectorizer = TfidfVectorizer()
# the vectorizers take one string per document, so re-join the lemmatized tokens
l_sn = [" ".join(tokens) for tokens in lemmatized_text_sn]
countvec_sn = countvectorizer.fit_transform(l_sn)
tfidfvec_sn = tfidfvectorizer.fit_transform(l_sn)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# The fitted vocabulary_ maps each term to its column index, so sorting the
# terms by that index gives the same list on every scikit-learn version.
count_tokens_sn = sorted(countvectorizer.vocabulary_, key=countvectorizer.vocabulary_.get)
tfidf_tokens_sn = sorted(tfidfvectorizer.vocabulary_, key=tfidfvectorizer.vocabulary_.get)
# NOTE(review): toarray() builds a dense documents x vocabulary array, which can
# be very large -- confirm it fits in memory for this forum.
df_countvec_sn = pd.DataFrame(data=countvec_sn.toarray(), columns=count_tokens_sn)
df_tfidfvec_sn = pd.DataFrame(data=tfidfvec_sn.toarray(), columns=tfidf_tokens_sn)

In [None]:
# Document-term matrices for the toddler forum: raw counts and tf-idf.
# Fresh vectorizers are created here, rebinding countvectorizer / tfidfvectorizer.
countvectorizer = CountVectorizer()
tfidfvectorizer = TfidfVectorizer()
# the vectorizers take one string per document, so re-join the lemmatized tokens
l_td = [" ".join(tokens) for tokens in lemmatized_text_td]
countvec_td = countvectorizer.fit_transform(l_td)
tfidfvec_td = tfidfvectorizer.fit_transform(l_td)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# The fitted vocabulary_ maps each term to its column index, so sorting the
# terms by that index gives the same list on every scikit-learn version.
count_tokens_td = sorted(countvectorizer.vocabulary_, key=countvectorizer.vocabulary_.get)
tfidf_tokens_td = sorted(tfidfvectorizer.vocabulary_, key=tfidfvectorizer.vocabulary_.get)
# NOTE(review): toarray() builds a dense documents x vocabulary array, which can
# be very large -- confirm it fits in memory for this forum.
df_countvec_td = pd.DataFrame(data=countvec_td.toarray(), columns=count_tokens_td)
df_tfidfvec_td = pd.DataFrame(data=tfidfvec_td.toarray(), columns=tfidf_tokens_td)