# Filter Spam from Data
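Cleans the post text for topic modelling: strips "sent from my ..." signatures and link placeholders, drops posts that are left empty, and writes the result to the `text` table.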


## Imports

In [1]:
import sqlite3
from pathlib import Path
from scraping import create_connection
import pandas as pd
import numpy as np
from math import floor
from tqdm.notebook import tqdm
import re

## File Locations

In [2]:
# File name of the database.
db = "netmums-merged.db"
# The database is looked up in the "database" folder that sits next to the
# current working directory (i.e. under the working directory's parent).
p = Path.cwd()
path_parent = p.parents[0]
path_db = str(path_parent.joinpath("database", db))

## Regex Patterns

In [3]:
# Matches signatures of the form "sent from my <word> using <word>", where
# each <word> is a run of non-whitespace characters.
sent_from_pattern = r'sent from my [^\s]+ using [^\s]+'

In [4]:
# Matches link placeholders of the form "::link_<digits>::" (the digits may
# be absent).
link_pattern = r'::link_[0-9]*::'

## Functions

In [5]:
def get_size(conn):
    """ gets the size of the data set in number of rows
    :param conn: connection to the db
    :return size: result of COUNT(id) on the posts table, as an int
    :raises SystemExit: if the count query returns no row
    """
    cursor = conn.cursor()
    cursor.execute(''' SELECT COUNT(id) FROM posts ''')
    row = cursor.fetchone()
    # guard clause: bail out when the query produced nothing to read
    if not row:
        raise SystemExit("No size found")
    return int(row[0])

In [6]:
def remove_sent_from(df, pattern=None):
    """ removes "sent from my ... using ..." signatures from the clean text
        strings and strips the surrounding whitespace
    :param df: data frame with a 'text_clean' column (updated in place)
    :param pattern: regex source string to remove, matched case-insensitively;
        defaults to the module-level sent_from_pattern
    :return df: the same data frame with the cleaned 'text_clean' column
    """
    if pattern is None:
        pattern = sent_from_pattern
    regex_pat = re.compile(pattern, flags=re.IGNORECASE)
    # regex=True has to be explicit: since pandas 2.0 str.replace defaults to
    # literal matching and raises ValueError for a compiled pattern.
    cleaned = df['text_clean'].str.replace(regex_pat, "", regex=True)
    df['text_clean'] = cleaned.str.strip()
    return df

In [7]:
def remove_links(df, pattern=None):
    """ removes link substitutions from the clean text strings and strips
        the surrounding whitespace
    :param df: data frame with a 'text_clean' column (updated in place)
    :param pattern: regex source string to remove, matched case-insensitively;
        defaults to the module-level link_pattern
    :return df: the same data frame with the cleaned 'text_clean' column
    """
    if pattern is None:
        pattern = link_pattern
    regex_pat = re.compile(pattern, flags=re.IGNORECASE)
    # regex=True has to be explicit: since pandas 2.0 str.replace defaults to
    # literal matching and raises ValueError for a compiled pattern.
    cleaned = df['text_clean'].str.replace(regex_pat, "", regex=True)
    df['text_clean'] = cleaned.str.strip()
    return df

In [8]:
def drop_emptys(df):
    """ drops the rows whose clean text is an empty string or missing
    :param df: data frame with a 'text_clean' column (updated in place)
    :return df: the same data frame without the empty rows
    """
    # Assign the replaced column back instead of calling replace(inplace=True)
    # on df['text_clean']: under pandas Copy-on-Write that chained in-place
    # call works on a temporary and leaves the frame unchanged.
    df['text_clean'] = df['text_clean'].replace('', np.nan)
    df.dropna(subset=['text_clean'], inplace=True)
    return df

In [9]:
def process_text(df):
    """ prepares one chunk of posts for the text table
    Renames id -> post_id and body -> text_clean, then applies
    remove_sent_from, remove_links and drop_emptys in that order.
    :param df: data frame with 'id' and 'body' columns
    :return df: renamed and cleaned data frame
    """
    renamed = df.rename(columns={"id": "post_id", "body": "text_clean"})
    return (
        renamed
        .pipe(remove_sent_from)
        .pipe(remove_links)
        .pipe(drop_emptys)
    )

In [10]:
def process_data(chunksize):
    """ read data in chunks from the posts table, format the text,
        and write to the text table
    Reads and writes through the module-level connection conn. The first
    chunk replaces an existing text table; later chunks are appended.
    :param chunksize: size of chunks (rows per read)
    """
    query = ''' SELECT id, body FROM posts '''
    chunks = pd.read_sql_query(query, conn, chunksize=chunksize)
    for chunk_no, chunk in enumerate(tqdm(chunks)):
        cleaned = process_text(chunk)
        # only the first chunk may overwrite the table; the rest extend it
        mode = 'replace' if chunk_no == 0 else 'append'
        cleaned.to_sql('text', conn, if_exists=mode, index=False)

## Create Clean Text

In [11]:
# Open the database connection used by get_size and process_data.
# NOTE(review): create_connection comes from the project's scraping module;
# presumably it returns a sqlite3 connection -- confirm.
conn = create_connection(path_db)

In [12]:
# Split the posts table into roughly nchunks reads.
size = get_size(conn)
nchunks = 100
# Clamp to at least 1: with fewer than nchunks rows the floor division gives
# 0, which is not a meaningful chunk size for a chunked read.
chunksize = max(1, floor(size / nchunks))

In [13]:
# Run the cleaning over the posts table and write the text table.
# NOTE(review): process_data has no return statement, so df is None here.
df = process_data(chunksize)

|          | 0/? [00:00<?, ?it/s]

In [14]:
# Commit the connection's pending changes, then close the connection.
conn.commit()
conn.close()