In [1]:
import pickle
from io import FileIO
from pathlib import Path

import numpy as np
import pandas as pd
import pyLDAvis.gensim
from gensim import corpora, models
from gensim.models import LdaModel, CoherenceModel

from lemmatize import *
from scraping import create_connection

In [2]:
# Silence every DeprecationWarning for the rest of the session.
import warnings

warnings.simplefilter("ignore", DeprecationWarning)

  and should_run_async(code)


In [3]:
def load_data(forum="all", group="all", id_type="family_id"):
    """Load the lemmatized text, corpus and dictionary saved for one selection.

    The three file names come from the module-level templates
    ``path_lemma_pkl``, ``path_corpus_pkl`` and ``path_dictionary_gensim``,
    each formatted with ``(forum, group, id_type)``.

    Parameters
    ----------
    forum, group, id_type : str
        Values substituted into the ``{0}``, ``{1}`` and ``{2}`` placeholders
        of the path templates.

    Returns
    -------
    tuple
        ``(lemmatized_text, corpus, dictionary)`` where the first two are
        whatever objects were pickled and the third is the result of
        ``corpora.Dictionary.load``.

    Raises
    ------
    OSError
        If one of the pickle files cannot be opened.
    """
    # SECURITY: pickle.load can execute arbitrary code; only use files this
    # project produced itself.
    # Context managers make sure both handles are closed, even on error.
    with open(path_lemma_pkl.format(forum, group, id_type), 'rb') as lemma_file:
        lemmatized_text = pickle.load(lemma_file)
    with open(path_corpus_pkl.format(forum, group, id_type), 'rb') as corpus_file:
        corpus = pickle.load(corpus_file)
    dictionary = corpora.Dictionary.load(path_dictionary_gensim.format(forum, group, id_type))
    return lemmatized_text, corpus, dictionary

In [4]:
def gen_cols(n):
    """Return the n column names ``topic_00``, ``topic_01``, ... in order.

    The index is zero-padded to at least two digits.
    """
    return ["topic_{:02d}".format(idx) for idx in range(n)]

In [5]:
# Current working directory and the directory directly above it.
# The path templates defined further down are all built from path_parent,
# so results depend on where the notebook is started from.
p = Path.cwd()
path_parent = p.parents[0]

In [6]:
# File locations, all relative to path_parent. The {0}, {1}, ... placeholders
# are filled in later with str.format.

# database file
path_db = str(path_parent.joinpath("database", "youbemom-merged.db"))

# inputs to load
path_lemma_pkl = str(path_parent.joinpath("clean_data", "lemmatized_text_{0}_{1}_{2}.pkl"))
path_corpus_pkl = str(path_parent.joinpath("clean_data", "corpus_{0}_{1}_{2}.pkl"))
path_dictionary_gensim = str(path_parent.joinpath("clean_data", "dictionary_{0}_{1}_{2}.gensim"))

# saved models ({3} is the number of topics)
path_ntopic_models = str(path_parent.joinpath("clean_data", "lda_ntopics_{0}_{1}_{2}_{3}.gensim"))

# cleaned text
path_clean_text = str(path_parent.joinpath("clean_data", "clean_text_{0}_{1}.csv"))

# per-document topic distribution output
path_topic_counts = str(path_parent.joinpath("clean_data", "topics_{0}_{1}_{2}.csv"))

In [7]:
# Per-board settings: number of passes, number of iterations and the list of
# topic counts. NOTE(review): presumably the values used when the saved LDA
# models were trained -- nothing in this notebook reads `args`; confirm.
args = {
    'special-needs': dict(
        n_passes=100, n_iterations=200,
        n_topics=[5, 10, 15, 20, 25, 30, 40, 50],
    ),
    'tween-teen': dict(
        n_passes=120, n_iterations=400,
        n_topics=[2, 3, 4, 5, 7, 10, 15, 20, 25, 30, 40, 50],
    ),
    'preschool': dict(
        n_passes=100, n_iterations=400,
        n_topics=[2, 3, 4, 5, 7, 10, 15, 20, 25, 30],
    ),
    'elementary': dict(
        n_passes=130, n_iterations=400,
        n_topics=[2, 3, 4, 5, 7, 10, 15, 20, 25, 30],
    ),
    'new-york-city': dict(
        n_passes=150, n_iterations=400,
        n_topics=[2, 3, 4, 5, 7, 10, 15, 20, 25, 30, 40, 50],
    ),
    'school': dict(
        n_passes=200, n_iterations=400,
        n_topics=[2, 3, 4, 5, 7, 10, 15, 20, 25, 30, 40, 50],
    ),
    'toddler': dict(
        n_passes=35, n_iterations=200,
        n_topics=[5, 10, 15, 20, 25, 30, 40, 50],
    ),
}

In [8]:
# Values substituted for the group / id_type parts of every file name below.
group, id_type = 'all', 'family_id'

In [9]:
# Special-needs board: load its saved inputs and the LDA model stored for n_sn topics.
sf = "special-needs"
n_sn = 20
lemmatized_text_sn, corpus_sn, dictionary_sn = load_data(forum=sf, group=group, id_type=id_type)
mod_sn = LdaModel.load(path_ntopic_models.format(sf, group, id_type, n_sn))

In [None]:
# One row per family_id for the special-needs board: the text_clean values of
# each family_id joined with single spaces.
# The board name is written out instead of read from the global `sf`, which is
# reassigned by other cells, so this cell gives the same result whichever
# model-loading cell ran last.
# NOTE(review): ' '.join raises TypeError if a text_clean value is missing (NaN);
# confirm the CSV has no empty texts.
ct_sn = (
    pd.read_csv(path_clean_text.format("special-needs", group))[['family_id', 'text_clean']]
    .groupby(['family_id'])['text_clean']
    .apply(' '.join)
    .reset_index()
)

In [None]:
# Toddler board: load its saved inputs and the LDA model stored for n_td topics.
sf = "toddler"
n_td = 15
lemmatized_text_td, corpus_td, dictionary_td = load_data(forum=sf, group=group, id_type=id_type)
mod_td = LdaModel.load(path_ntopic_models.format(sf, group, id_type, n_td))

In [None]:
# One row per family_id for the toddler board: the text_clean values of each
# family_id joined with single spaces.
# The board name is written out instead of read from the global `sf`, which is
# reassigned by other cells, so this cell gives the same result whichever
# model-loading cell ran last.
# NOTE(review): ' '.join raises TypeError if a text_clean value is missing (NaN);
# confirm the CSV has no empty texts.
ct_td = (
    pd.read_csv(path_clean_text.format("toddler", group))[['family_id', 'text_clean']]
    .groupby(['family_id'])['text_clean']
    .apply(' '.join)
    .reset_index()
)

In [None]:
# Toddler-model topic distribution for the special-needs documents.
# The special-needs token lists are re-encoded with the toddler dictionary so
# that the word ids are the ones the toddler LDA model matches words with.
corpus_sn_using_td = [dictionary_td.doc2bow(tokens) for tokens in lemmatized_text_sn]
# One row per document, one column per toddler topic; topics the model does
# not return for a document stay at 0.
# NOTE(review): get_document_topics presumably omits low-probability topics by
# default, so rows may not sum to 1 -- confirm whether that is intended.
out_sn = np.zeros((len(corpus_sn_using_td), n_td))
for row, bow in enumerate(corpus_sn_using_td):
    for topic_id, weight in mod_td.get_document_topics(bow):
        out_sn[row, topic_id] = weight

In [None]:
# Put the toddler-model topic scores next to the special-needs documents and save them.
# NOTE(review): concat(axis=1) lines rows up by index, so this assumes ct_sn
# has one row per entry of out_sn, in the same order -- confirm.
topic_df_sn = pd.concat([ct_sn, pd.DataFrame(out_sn, columns=gen_cols(n_td))], axis=1)
# The file-name suffix is derived from n_td so it cannot disagree with the
# model that was actually used (it is "using_td_15" when n_td is 15).
topic_df_sn.to_csv(path_topic_counts.format("special-needs", group, "using_td_{}".format(n_td)))

In [None]:
# Percentage of the total weight in out_sn that falls in each topic column.
df = pd.DataFrame(out_sn)
colsums = df.sum(axis=0)
totalsum = colsums.sum()
colsums.mul(100).div(totalsum)

In [None]:
# Special-needs-model topic distribution for the toddler documents.
# The toddler token lists are re-encoded with the special-needs dictionary
# before being passed to the special-needs LDA model.
corpus_td_using_sn = [dictionary_sn.doc2bow(tokens) for tokens in lemmatized_text_td]
# One row per document, one column per special-needs topic; topics the model
# does not return for a document stay at 0.
# NOTE(review): get_document_topics presumably omits low-probability topics by
# default, so rows may not sum to 1 -- confirm whether that is intended.
out_td = np.zeros((len(corpus_td_using_sn), n_sn))
for row, bow in enumerate(corpus_td_using_sn):
    for topic_id, weight in mod_sn.get_document_topics(bow):
        out_td[row, topic_id] = weight

In [None]:
# Put the special-needs-model topic scores next to the toddler documents and save them.
# NOTE(review): concat(axis=1) lines rows up by index, so this assumes ct_td
# has one row per entry of out_td, in the same order -- confirm.
topic_df_td = pd.concat([ct_td, pd.DataFrame(out_td, columns=gen_cols(n_sn))], axis=1)
# The file-name suffix is derived from n_sn so it cannot disagree with the
# model that was actually used (it is "using_sn_20" when n_sn is 20).
topic_df_td.to_csv(path_topic_counts.format("toddler", group, "using_sn_{}".format(n_sn)))

In [None]:
# Percentage of the total weight in out_td that falls in each topic column.
df = pd.DataFrame(out_td)
colsums = df.sum(axis=0)
totalsum = colsums.sum()
colsums.mul(100).div(totalsum)

In [None]:
# school-age corpus
# Score the school board's documents with the special-needs LDA model and save
# the per-document topic scores next to the documents' text.
sf = "school"
lemmatized_text_sa, corpus_sa, dictionary_sa = load_data(sf, group, id_type)

# Re-encode the school token lists with the special-needs dictionary before
# passing them to the special-needs model.
corpus_sa_using_sn = [dictionary_sn.doc2bow(t) for t in lemmatized_text_sa]

# One row per document, one column per special-needs topic; topics the model
# does not return for a document stay at 0.
out_sa = np.zeros((len(corpus_sa_using_sn), n_sn), dtype=float, order='C')
for i, doc in enumerate(corpus_sa_using_sn):
    topics = mod_sn.get_document_topics(doc)
    for j, score in topics:
        out_sa[i, j] = score

# One row per family_id: its text_clean values joined with single spaces.
ct_sa = (
    pd.read_csv(path_clean_text.format(sf, group))[['family_id', 'text_clean']]
    .groupby(['family_id'])['text_clean']
    .apply(' '.join)
    .reset_index()
)

# NOTE(review): concat(axis=1) lines rows up by index, so this assumes ct_sa
# has one row per entry of out_sa, in the same order -- confirm.
topic_df_sa = pd.concat([ct_sa, pd.DataFrame(out_sa, columns=gen_cols(n_sn))], axis=1)
# Board and suffix come from sf and n_sn so the file name cannot disagree with
# what was computed (it ends in "using_sn_20" when n_sn is 20).
topic_df_sa.to_csv(path_topic_counts.format(sf, group, "using_sn_{}".format(n_sn)))

In [None]:
# Preview the first five rows of the school-board result.
topic_df_sa.head(n=5)

## Word Frequency

In [10]:
# find word frequency
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

In [11]:
# Clean-text CSV of each board, one frame per board, read in this order.
# Note: these rebind ct_sn / ct_td / ct_sa, which other cells also assign.
ct_sn, ct_td, ct_sa = (
    pd.read_csv(path_clean_text.format(board, group))
    for board in ("special-needs", "toddler", "school")
)

In [13]:
# Row counts of the three clean-text frames at the time this cell runs.
# Kept for any code that reads them; token_freq no longer depends on them.
LEN_SN = ct_sn.shape[0]
LEN_TD = ct_td.shape[0]
LEN_SA = ct_sa.shape[0]


def token_freq(word):
    """Return, for each board, the number of pattern matches per row.

    For each of ``ct_sn``, ``ct_td`` and ``ct_sa`` this counts the matches of
    ``word`` in the ``text_clean`` column with ``Series.str.count`` and
    divides the total by that frame's number of rows.

    The row count is taken from the frame when the function is called, not
    from the ``LEN_*`` constants, so the rate stays consistent if a ``ct_*``
    frame is reloaded or reassigned after this cell was run.

    Parameters
    ----------
    word : str
        Pattern handed to ``Series.str.count``; callers in this notebook pass
        regular expressions such as ``\\bhard\\b``.

    Returns
    -------
    dict
        Keys ``'special-needs'``, ``'toddler'`` and ``'school'`` (in that
        order), each mapped to matches per row for that board.
    """
    boards = {'special-needs': ct_sn, 'toddler': ct_td, 'school': ct_sa}
    return {
        board: frame['text_clean'].str.count(word).sum() / frame.shape[0]
        for board, frame in boards.items()
    }

In [14]:
def token_rates(token1, token2):
    """Print the per-board frequencies of two patterns and their ratio.

    Both patterns are looked up with ``token_freq``. Three sections are
    printed -- "word 1", "word 2" and "relative rate" -- each with one line
    per board, where the relative rate is the frequency of ``token1`` divided
    by that of ``token2``. Nothing is returned.
    """
    boards = ('special-needs', 'toddler', 'school')
    labels = ("special-needs: ", "toddler:       ", "school:        ")

    freq1 = token_freq(token1)
    freq2 = token_freq(token2)
    # All ratios are computed before anything is printed.
    rates = {board: freq1[board] / freq2[board] for board in boards}

    sections = (("word 1", freq1), ("word 2", freq2), ("relative rate", rates))
    for heading, values in sections:
        print(heading)
        for label, board in zip(labels, boards):
            print(label, values[board])

In [15]:
def board_rates(word):
    """Print a pattern's frequency on each board and two board-to-board ratios.

    Frequencies come from ``token_freq``. The special-needs frequency is then
    compared with the toddler and the school frequency; each comparison is
    printed as whichever of the two ratios is at least 1 (special-needs on
    top when the two are equal). Nothing is returned.
    """
    freq = token_freq(word)
    sn, td, sa = freq['special-needs'], freq['toddler'], freq['school']

    print("word:          ", word)
    print("special-needs: ", sn)
    print("toddler:       ", td)
    print("school:        ", sa)

    # (other board's frequency, label when sn is on top, label when it is not)
    comparisons = (
        (td, "sn / td:       ", "td / sn:       "),
        (sa, "sn / sa:       ", "sa / sn:       "),
    )
    for other, sn_on_top, other_on_top in comparisons:
        if sn / other >= 1:
            print(sn_on_top, sn / other)
        else:
            print(other_on_top, other / sn)

In [17]:
# Board comparison for the whole word "easy".
board_rates(r"\beasy\b")

word:           \beasy\b
special-needs:  0.005371738006673101
toddler:        0.003559422358156463
school:         0.0042467948717948715
sn / td:        1.5091600451302705
sn / sa:        1.2648922702505718


In [18]:
# Board comparison for the whole word "hard".
board_rates(r"\bhard\b")

word:           \bhard\b
special-needs:  0.029465031286939735
toddler:        0.008981736938193194
school:         0.015865384615384615
sn / td:        3.2805493513893818
sn / sa:        1.8571898508131712


In [20]:
# Frequency of the whole word "hard" relative to the whole word "easy", per board.
token_rates(r"\bhard\b", r"\beasy\b")

word 1
special-needs:  0.029465031286939735
toddler:        0.008981736938193194
school:         0.015865384615384615
word 2
special-needs:  0.005371738006673101
toddler:        0.003559422358156463
school:         0.0042467948717948715
relative rate
special-needs:  5.485195154777927
toddler:        2.5233692533316328
school:         3.7358490566037736


In [None]:
# "has disab..." / "has a disab..." relative to "is disab...", per board.
# Fixes to the patterns:
#   * the optional "a " is a non-capturing group; the earlier [...] was a
#     character class and matched only ONE character (whitespace, "a" or "|");
#   * the trailing "*" is dropped: in a regex it made the "b" optional, so
#     "disab*" also matched words such as "disadvantage";
#   * raw strings avoid the invalid "\s" escape in a normal string literal.
token_rates(r"\bhas\s(?:a\s)?disab", r"\bis\sdisab")

In [None]:
# "has add" / "has adhd" relative to "is add" / "is adhd", per board.
# The alternatives are written as a group. The earlier [add|adhd] was a
# character class matching a single character out of a, d, h or "|", so the
# patterns were really counting phrases like "has a" and "is a".
token_rates(r"\bhas\s(?:add|adhd)\b", r"\bis\s(?:add|adhd)\b")

In [None]:
# "has autis..." relative to "is autis...", per board.
# A regex needs no wildcard to match a prefix; the earlier trailing "*" was a
# quantifier that made the final "s" optional (so "has auti" alone matched).
token_rates(r"\bhas\sautis", r"\bis\sautis")

In [None]:
# Text ending a word with "retarded" relative to text ending a word with "retard", per board.
# There is no leading \b, so longer words with these endings are counted too.
token_rates(r"retarded\b", r"retard\b")

In [None]:
# Board comparison for "special need" + one more character, with a space or
# hyphen between the two words.
# Written as a raw string: "\s" inside a normal string literal is an invalid
# escape sequence, which Python reports with a warning. The pattern itself is
# unchanged.
# NOTE(review): "need." requires exactly one extra character of any kind --
# if the intent is "need" or "needs", "needs?" would be the pattern; confirm.
board_rates(r"\bspecial[\s-]need.\b")

In [None]:
# Board comparison for text ending a word with "retarded" (no leading \b).
board_rates(r"retarded\b")

In [None]:
# Board comparison for text ending a word with "retard" (no leading \b).
board_rates(r"retard\b")

In [None]:
# Board comparison for the whole word "moron".
board_rates(r"\bmoron\b")

In [None]:
# Board comparison for the whole word "imbecile".
board_rates(r"\bimbecile\b")

In [None]:
# Board comparison for the whole word "handicapped".
board_rates(r"\bhandicapped\b")

In [None]:
# Board comparison for the whole word "disabled".
board_rates(r"\bdisabled\b")

In [None]:
# Board comparison for the whole word "idiot".
board_rates(r"\bidiot\b")

In [None]:
# Per-board frequency of the whole word "sn".
token_freq(r"\bsn\b")

In [None]:
# Per-board frequency of the phrase "need help".
# Written as a raw string: "\s" inside a normal string literal is an invalid
# escape sequence, which Python reports with a warning. The pattern itself is
# unchanged.
token_freq(r"\bneed\shelp\b")

In [None]:
# Per-board frequency of the phrase "help me".
# Written as a raw string: "\s" inside a normal string literal is an invalid
# escape sequence, which Python reports with a warning. The pattern itself is
# unchanged.
token_freq(r"\bhelp\sme\b")

In [None]:
# Per-board frequency of the whole word "iep".
token_freq(r"\biep\b")

In [None]:
# Per-board frequency of the standalone token "504".
token_freq(r"\b504\b")

In [None]:
# Per-board frequency of the whole word "ds".
token_freq(r"\bds\b")

In [None]:
# Per-board frequency of the whole word "dd".
token_freq(r"\bdd\b")

In [None]:
# Per-board frequency of the whole word "dh".
token_freq(r"\bdh\b")

In [None]:
# Per-board frequency of the whole word "school".
token_freq(r"\bschool\b")

In [None]:
# Per-board frequency of the whole word "doctor".
token_freq(r"\bdoctor\b")

In [None]:
# Per-board frequency of the whole word "therapist".
token_freq(r"\btherapist\b")

In [None]:
# Per-board frequency of the whole word "asd".
token_freq(r"\basd\b")

In [None]:
# Count and TF-IDF document-term tables for the special-needs documents.
countvectorizer = CountVectorizer()
tfidfvectorizer = TfidfVectorizer()

# The vectorizers are fed one string per document, so each token list is
# joined back into a space-separated string.
l_sn = [" ".join(l) for l in lemmatized_text_sn]
countvec_sn = countvectorizer.fit_transform(l_sn)
tfidfvec_sn = tfidfvectorizer.fit_transform(l_sn)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new method when it exists and fall back to
# the old one otherwise, so the cell runs on both old and new versions.
count_tokens_sn = list(
    countvectorizer.get_feature_names_out()
    if hasattr(countvectorizer, "get_feature_names_out")
    else countvectorizer.get_feature_names()
)
tfidf_tokens_sn = list(
    tfidfvectorizer.get_feature_names_out()
    if hasattr(tfidfvectorizer, "get_feature_names_out")
    else tfidfvectorizer.get_feature_names()
)

# NOTE(review): toarray() builds a dense documents x vocabulary array, which
# can be very large -- confirm it fits in memory for this corpus.
df_countvec_sn = pd.DataFrame(data=countvec_sn.toarray(), columns=count_tokens_sn)
df_tfidfvec_sn = pd.DataFrame(data=tfidfvec_sn.toarray(), columns=tfidf_tokens_sn)

In [None]:
# Count and TF-IDF document-term tables for the toddler documents.
countvectorizer = CountVectorizer()
tfidfvectorizer = TfidfVectorizer()

# The vectorizers are fed one string per document, so each token list is
# joined back into a space-separated string.
l_td = [" ".join(l) for l in lemmatized_text_td]
countvec_td = countvectorizer.fit_transform(l_td)
tfidfvec_td = tfidfvectorizer.fit_transform(l_td)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new method when it exists and fall back to
# the old one otherwise, so the cell runs on both old and new versions.
count_tokens_td = list(
    countvectorizer.get_feature_names_out()
    if hasattr(countvectorizer, "get_feature_names_out")
    else countvectorizer.get_feature_names()
)
tfidf_tokens_td = list(
    tfidfvectorizer.get_feature_names_out()
    if hasattr(tfidfvectorizer, "get_feature_names_out")
    else tfidfvectorizer.get_feature_names()
)

# NOTE(review): toarray() builds a dense documents x vocabulary array, which
# can be very large -- confirm it fits in memory for this corpus.
df_countvec_td = pd.DataFrame(data=countvec_td.toarray(), columns=count_tokens_td)
df_tfidfvec_td = pd.DataFrame(data=tfidfvec_td.toarray(), columns=tfidf_tokens_td)