# Lemmatize Forum Text
Cleans, tokenizes, and lemmatizes the forum post text, then builds the corpus and dictionary used for topic modelling.

## Data Sources
- youbemom-merged.db (scraped with 1-Scrape_Forum.ipynb)


## Changes
- 2020-09-14: Created
- 2020-09-15: Added functions for accessing database, cleaning/tokenizing text
- 2020-09-16: Generated and saved corpus and dictionary
- 2020-12-13: Updated for new data

## TODO
- Tutorial: https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21
- Add actual database structure

## Imports

In [1]:
import sqlite3
import pandas as pd
import numpy as np
from datetime import datetime
from pathlib import Path
from io import FileIO
import re
import string
# tokenizing
from nltk.corpus import wordnet as wn
from nltk.tokenize import RegexpTokenizer #, word_tokenize
from nltk.corpus import stopwords
# saving the corpus and dictionary
from gensim import corpora, models
import pickle
from youbemom import create_connection

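Prerequisite (run once if the NLTK WordNet corpus is not installed; `remove_stopwords` additionally needs the NLTK `stopwords` corpus, i.e. `nltk.download('stopwords')`):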

In [2]:
#import nltk
#nltk.download('wordnet')

## Functions

For formatting the data

In [3]:
def clean_text(text):
    """ strips punctuation from the input text, collapses
        runs of spaces, trims the ends, and lower-cases it
    :param text: text (= title + body here)
    :return clean: clean text
    """
    # keep every character that is not listed in string.punctuation
    kept = (ch for ch in text if ch not in string.punctuation)
    # squeeze repeated spaces down to a single space
    squeezed = re.sub(" +", " ", "".join(kept))
    return squeezed.strip().lower()

In [4]:
# cache for the English stopword set; filled on first use so the
# nltk corpus is read once instead of once per token
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """ returns the nltk English stopwords as a frozenset,
        loading them from nltk.corpus only on the first call
    :return: frozenset of stopwords
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def remove_stopwords(text):
    """ remove all stop words from the text
        using stopwords from nltk.corpus
    :param text: iterable of tokens, possibly containing stopwords
    :return words: list of tokens without stopwords
    """
    tokens = list(text)
    if not tokens:
        # nothing to filter, so the stopword corpus is not needed
        return []
    # set membership is O(1) per token; the original rebuilt and
    # scanned the full stopword list for every single token
    stop = _english_stopwords()
    words = [w for w in tokens if w not in stop]
    return words

In [5]:
def encode_utf8(text):
    """ encodes each token with its .encode() method
        (UTF-8 is the default encoding for str.encode)
    :param text: iterable of tokens
    :return words: list of encoded tokens
    """
    encoded = []
    for token in text:
        encoded.append(token.encode())
    return encoded

In [6]:
def get_lemma(word):
    """ looks up the base form of a word with wn.morphy,
        falling back to the word itself when morphy returns None
    :param word: a single token
    :return: the lemma if one was found, otherwise the original word
    """
    found = wn.morphy(word)
    return word if found is None else found

In [7]:
def lemmatize(text):
    """ replaces every token with the result of get_lemma
    :param text: iterable of tokens
    :return lemmas: list of lemmas in the same order as the input
    """
    result = []
    for token in text:
        result.append(get_lemma(token))
    return result

## File Locations

In [8]:
# Current working directory of the notebook
p = Path.cwd()
# Its parent directory; the database/ and clean_data/ paths are built from it
path_parent = p.parents[0]

In [15]:
# Input database and output locations, all relative to path_parent and
# stored as plain strings. The "{}" placeholders are filled in later via
# str.format with the subforum name.
path_db = str(path_parent / "database" / "youbemom-merged.db")
path_lemma_pkl = str(path_parent / "clean_data" / "lemmatized_text_{}.pkl")
path_corpus_pkl = str(path_parent / "clean_data" / "corpus_{}.pkl")
path_dictionary_gensim = str(path_parent / "clean_data" / "dictionary_{}.gensim")

## Load Data

In [10]:
# Queries selecting the post text of one subforum each.
# String values use single quotes: in SQL double quotes denote identifiers,
# and SQLite only treats "..." as a string through a legacy fallback.
sn_sql = """ SELECT text FROM posts WHERE subforum='special-needs' """
td_sql = """ SELECT text FROM posts WHERE subforum='toddler' """

In [None]:
# Open the forum database through the project's youbemom helper
# (presumably returns a sqlite3 connection -- confirm in that module)
conn = create_connection(path_db)

In [None]:
# Load the text column of the special-needs posts into a DataFrame
sn = pd.read_sql_query(sn_sql, conn)

In [12]:
# Load the text column of the toddler posts into a DataFrame
td = pd.read_sql_query(td_sql, conn)

## Tokenize/Lemmatize Text
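For each subforum: clean the text (strip punctuation, collapse spaces, lowercase), tokenize it, remove stopwords, and lemmatize the remaining words. Then build a dictionary and a bag-of-words corpus from the lemmatized text and save all three to disk.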

In [16]:
# Special-needs subforum: clean -> tokenize -> drop stopwords -> lemmatize
text = sn['text']
text = text.apply(clean_text)
# \w+ turns each run of word characters into one token
tokenizer = RegexpTokenizer(r'\w+')
text = text.apply(tokenizer.tokenize)
text = text.apply(remove_stopwords)
text = text.apply(lemmatize)
# the context manager closes the pickle file even if dump raises
with open(path_lemma_pkl.format("special-needs"), 'wb') as pkl_file:
    pickle.dump(text, pkl_file)

In [17]:
# Build the token -> id dictionary from the lemmatized special-needs posts
dictionary = corpora.Dictionary(text)
# save() is given an open handle; the context manager makes sure it is
# flushed and closed afterwards
with FileIO(path_dictionary_gensim.format("special-needs"), "wb") as dict_file:
    dictionary.save(dict_file)

In [18]:
# Convert each lemmatized post to its bag-of-words representation
corpus = [dictionary.doc2bow(t) for t in text]
# the context manager closes the pickle file even if dump raises
with open(path_corpus_pkl.format("special-needs"), 'wb') as pkl_file:
    pickle.dump(corpus, pkl_file)

In [19]:
# Toddler subforum: clean -> tokenize -> drop stopwords -> lemmatize
# NOTE(review): the saved output of this cell is a MemoryError; presumably
# the toddler data is too large for this in-memory pipeline -- confirm and
# consider processing it in chunks.
text = td['text']
text = text.apply(clean_text)
# \w+ turns each run of word characters into one token
tokenizer = RegexpTokenizer(r'\w+')
text = text.apply(tokenizer.tokenize)
text = text.apply(remove_stopwords)
text = text.apply(lemmatize)
# the context manager closes the pickle file even if dump raises
with open(path_lemma_pkl.format("toddler"), 'wb') as pkl_file:
    pickle.dump(text, pkl_file)

MemoryError: 

In [None]:
# Build the token -> id dictionary from the lemmatized toddler posts
dictionary = corpora.Dictionary(text)
# save() is given an open handle; the context manager makes sure it is
# flushed and closed afterwards
with FileIO(path_dictionary_gensim.format("toddler"), "wb") as dict_file:
    dictionary.save(dict_file)

In [None]:
# Convert each lemmatized post to its bag-of-words representation
corpus = [dictionary.doc2bow(t) for t in text]
# the context manager closes the pickle file even if dump raises
with open(path_corpus_pkl.format("toddler"), 'wb') as pkl_file:
    pickle.dump(corpus, pkl_file)