# Lemmatize Forum Text
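Cleans, tokenizes, and lemmatizes the forum post text, then saves the lemmatized text, a gensim dictionary, and a bag-of-words corpus for topic modelling.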

## Data Sources
- youbemom-merged.db (scraped with 1-Scrape_Forum.ipynb)


## Changes
- 2020-09-14: Created
- 2020-09-15: Added functions for accessing database, cleaning/tokenizing text
- 2020-09-16: Generated and saved corpus and dictionary
- 2020-12-13: Updated for new data

## TODO
- Tutorial: https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21
- Add actual database structure

## Imports

In [1]:
import sqlite3
import pandas as pd
import numpy as np
from datetime import datetime
from pathlib import Path
from io import FileIO
import re
import string
# tokenizing
from nltk.tokenize import RegexpTokenizer #, word_tokenize
# saving the corpus and dictionary
from gensim import corpora, models
import pickle
# my functions
from youbemom import create_connection
from lemmatize import clean_text, remove_stopwords, get_lemma, lemmatize

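Prerequisite: the NLTK `wordnet` and `stopwords` data must be downloaded once (uncomment and run the cell below if they are missing):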

In [2]:
# import nltk
# nltk.download('wordnet')
# nltk.download('stopwords')

## Functions

For loading data

In [19]:
def gen_sql(subforum="special-needs", group="parent"):
    """Build the query that selects URL-stripped post text for one subforum.

    The query joins ``sentiment`` to ``posts`` on ``message_id`` and returns
    a single ``text_no_url`` column.

    Args:
        subforum: Value compared against ``posts.subforum``.
        group: Which posts to include:
            ``"parent"`` keeps rows whose ``parent_id`` is the empty string,
            ``"child"`` keeps rows whose ``parent_id`` is not the empty string,
            ``"all"`` applies no ``parent_id`` filter.

    Returns:
        The SQL text with ``subforum`` substituted in.

    Raises:
        ValueError: If ``group`` is not ``"parent"``, ``"child"`` or ``"all"``.
    """
    # Extra WHERE condition contributed by each supported group.
    parent_filters = {
        "parent": ' AND p.parent_id=""',
        "all": '',
        "child": ' AND p.parent_id<>""',
    }
    if group not in parent_filters:
        # An unknown group used to fall through every branch and fail with an
        # UnboundLocalError on the return statement; fail with a clear message.
        raise ValueError(
            "group must be one of {}, got {!r}".format(
                sorted(parent_filters), group
            )
        )
    # The layout below reproduces the text of the original per-group queries.
    template = '''
            SELECT s.text_no_url AS text_no_url
            FROM sentiment AS s
            JOIN posts AS p
            ON s.message_id = p.message_id
            WHERE p.subforum="{subforum}"{parent_filter}
        '''
    # NOTE(review): subforum is interpolated straight into the SQL text. That
    # is fine for the hard-coded names used in this notebook, but it must not
    # be fed untrusted input without switching to a parameterized query.
    return template.format(
        subforum=subforum, parent_filter=parent_filters[group]
    )

In [9]:
def load_data(conn, sql):
    """Run ``sql`` on the open connection ``conn`` and return the rows as a DataFrame."""
    frame = pd.read_sql_query(sql=sql, con=conn)
    return frame

For processing the data

In [10]:
def clean_data(df):
    """Convert the ``text_no_url`` column of ``df`` into lemmatized tokens.

    Every entry is passed, in order, through ``clean_text``, a
    ``RegexpTokenizer`` built from the word-character pattern,
    ``remove_stopwords`` and ``lemmatize``.

    Returns:
        The result of the last ``apply`` call (one entry per input row).
    """
    word_tokenizer = RegexpTokenizer(r'\w+')
    # Order matters: each step consumes the previous step's output.
    steps = (
        clean_text,
        word_tokenizer.tokenize,
        remove_stopwords,
        lemmatize,
    )
    column = df['text_no_url']
    for step in steps:
        column = column.apply(step)
    return column

In [11]:
def process_data(subforum="special-needs", group="parent"):
    """Load, clean and save the text for one subforum/group combination.

    Opens the database at ``path_db``, runs the query built by ``gen_sql``,
    cleans the result with ``clean_data`` and writes the outputs with
    ``save_data``.

    Args:
        subforum: Subforum name, forwarded to ``gen_sql`` and ``save_data``.
        group: Post group, forwarded to ``gen_sql`` and ``save_data``.
    """
    conn = create_connection(path_db)
    try:
        df = load_data(conn, gen_sql(subforum, group))
    finally:
        # Close the connection even when building or running the query fails,
        # and do not keep it open during the cleaning and saving steps, which
        # only need the already-loaded DataFrame.
        conn.close()
    text = clean_data(df)
    save_data(text, subforum, group)

For saving the data

In [12]:
def save_data(text, subforum="special-needs", group="parent"):
    """Write the lemmatized text, its gensim dictionary and its corpus to disk.

    The three output locations come from the module-level templates
    ``path_lemma_pkl``, ``path_dictionary_gensim`` and ``path_corpus_pkl``,
    each formatted with ``subforum`` and ``group``.

    Args:
        text: Iterable of token lists, one per document.
        subforum: First value substituted into the path templates.
        group: Second value substituted into the path templates.
    """
    # Every handle is opened in a ``with`` block so it is flushed and closed
    # deterministically instead of whenever the garbage collector gets to it.
    with open(path_lemma_pkl.format(subforum, group), 'wb') as lemma_file:
        pickle.dump(text, lemma_file)

    dictionary = corpora.Dictionary(text)
    # The dictionary is still saved through a FileIO handle, as before, so the
    # way gensim writes it is unchanged.
    with FileIO(path_dictionary_gensim.format(subforum, group), "wb") as dict_file:
        dictionary.save(dict_file)

    # Bag-of-words representation of each document.
    corpus = [dictionary.doc2bow(t) for t in text]
    with open(path_corpus_pkl.format(subforum, group), 'wb') as corpus_file:
        pickle.dump(corpus, corpus_file)

## File Locations

In [13]:
# Directory the notebook is executed from.
p = Path.cwd()
# One level above the working directory; the database/ and clean_data/ paths
# defined below are built from it.
path_parent = p.parents[0]

In [14]:
# All locations are stored as plain strings. The {0} and {1} placeholders in
# the clean_data file names are filled with the subforum and group via
# str.format when the files are written.
path_db = str(path_parent / "database" / "youbemom-merged.db")
path_lemma_pkl = str(path_parent / "clean_data" / "lemmatized_text_{0}_{1}.pkl")
path_corpus_pkl = str(path_parent / "clean_data" / "corpus_{0}_{1}.pkl")
path_dictionary_gensim = str(path_parent / "clean_data" / "dictionary_{0}_{1}.gensim")

## Process Data

In [15]:
# Top-level (parent) posts of the special-needs subforum.
process_data(subforum="special-needs", group="parent")

In [16]:
# Top-level (parent) posts of the toddler subforum.
process_data(subforum="toddler", group="parent")

In [20]:
# Replies (posts with a non-empty parent_id) in the special-needs subforum.
process_data(subforum="special-needs", group="child")

In [21]:
# Every post in the special-needs subforum, parents and replies alike.
process_data(subforum="special-needs", group="all")