# Filter Spam from Data
Cleans the data for topic modelling

## Data Sources
- youbemom-merged.db (scraped with 1-Scrape_Forum.ipynb)


## Changes
- 2020-12-23: Created
- 2021-01-18: Updated spam detection
- 2021-01-25: Filtering spam words 

## TODO
- 

## Imports

In [4]:
import warnings
warnings.simplefilter('ignore')

In [1]:
import sqlite3
import pandas as pd
from datetime import datetime
from pathlib import Path
from youbemom import create_connection
import re
from math import floor
from tqdm.notebook import tqdm
from langdetect import detect
import pandas as pd
import numpy as np
from io import FileIO
# saving the corpus and dictionary
from gensim import corpora, models
import pickle
# topic models
import pyLDAvis.gensim
from gensim.models import CoherenceModel, LdaModel, LdaMulticore
# my functions
from scraping import create_connection
from lemmatize import *

## Regex Patterns

In [5]:
# Suppress all warnings from here on (the regex-based pandas string calls
# below are noisy otherwise).
warnings.simplefilter('ignore')

In [4]:
# Matches either
#   * a URL with an explicit http/ftp/https scheme, a dotted host and an
#     optional run of path/query characters, or
#   * a bare domain (optionally prefixed by "www.") ending in
#     com, be, io, org or net, with the same optional trailing characters.
# old pattern = r'(http|ftp|https):\/\/[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
url_pattern = r'''((http|ftp|https):\/\/)[-a-zA-Z0-9:%\._\+~#=]{1,256}\.[a-zA-Z0-9\(\)]{1,8}\b([-a-zA-Z0-9<>\*\^\(\)@:%\!,\[\]\{\}\|'"_\+\.~#\?&/=]*)|(www\.)*[-a-zA-Z0-9@:%\._\+~#=]{1,256}\.(com|be|io|org|net)\b([-a-zA-Z0-9<>\*\^\(\)@:%\!,\[\]\{\}\|'"_\+\.~#\?&/=]*)'''

In [5]:
# Matches a scheme (http/ftp/https) followed by 1-256 host characters, without
# requiring a dot or top-level domain, so truncated ("cut-off") urls are caught.
url_cutoff_pattern = r'''((http|ftp|https):\/\/)[-a-zA-Z0-9:%\._\+~#=]{1,256}'''

In [6]:
# Matches local-part@domain.rest, where each part is letters, digits and a
# small set of punctuation characters.
email_pattern = r'([-a-zA-Z0-9_\.\+]+@[-a-zA-Z0-9]+\.[-a-zA-Z0-9\.]+)'

In [7]:
# Matches a token ending in a run of at least nine digits, optionally preceded
# by +, -, x or further digits. The negative lookbehind rejects numbers that
# end in seven zeros.
large_number_pattern = r'\b[\+\-x0-9]*\d{9,}(?<!0{7})\b'

In [8]:
# Literal placeholder text that is stripped from the cleaned text.
subject_pattern = r'- no subject -'

In [9]:
# Matches any single ASCII letter.
alpha_pattern = r'[a-zA-Z]'

In [10]:
# Matches a string that consists of digits only.
lonely_number_pattern = r'^[0-9]+$'

## Functions

In [11]:
def get_size(conn):
    """ counts the posts in the data set
    :param conn: open connection to the db
    :return size: number of rows in the posts table that have a message_id
    :raises SystemExit: if the count query returns no row
    """
    count_sql = ''' SELECT COUNT(message_id) FROM posts '''
    row = conn.cursor().execute(count_sql).fetchone()
    if not row:
        raise SystemExit("No size found")
    return int(row[0])

In [12]:
def process_data(chunksize):
    """ read the posts table in chunks, build the raw and cleaned
        text, flag probable spam, and write each chunk to the
        text table
    :param chunksize: number of rows to read per chunk
    """
    sql = ''' SELECT message_id, title, body FROM posts '''
    # uses the module-level connection `conn`, which must be open
    reader = pd.read_sql_query(sql,
                               conn,
                               chunksize=chunksize)
    keep = ['message_id', 'text', 'text_clean', 'probable_spam']
    for i, df in enumerate(tqdm(reader)):
        df = process_text(df)
        df = add_spam_dummies(df)
        df = probable_spam(df)
        # the first chunk recreates the table, later chunks extend it
        mode = 'replace' if i == 0 else 'append'
        df[keep].to_sql('text', conn, if_exists=mode, index=False)

Process text

In [13]:
def create_text(df):
    """ creates text column from
        title and body
    :param df: data frame with title and body columns
    :return df: the same data frame with the deleted-post notice
        removed from title and a new text column
    """
    # Treat a missing title/body as empty so that text is always a string;
    # a NaN here would propagate into every downstream indicator column.
    title = df['title'].fillna('')
    body = df['body'].fillna('')
    # raw string: '\.' in a normal string literal is an invalid escape
    df['title'] = title.replace(r'This post has been deleted\.', '', regex=True)
    df['text'] = df['title'] + " " + body
    return df

In [14]:
def has_url(df):
    """ flags rows whose text contains a url or a cut-off url
    :param df: data frame with a text column
    :return df: same data frame with has_url and has_cutoff_url columns
    """
    targets = (
        ('has_url', url_pattern),
        ('has_cutoff_url', url_cutoff_pattern),
    )
    for column, pattern in targets:
        matcher = re.compile(pattern, flags=re.IGNORECASE)
        df[column] = df['text'].str.contains(matcher)
    return df

In [15]:
def remove_urls(df):
    """ removes urls and cutoff urls from text strings and creates
        new column of text without urls
    :param df: data frame with a text column
    :return df: same data frame with a stripped text_clean column
    """
    cleaned = df['text']
    # full urls first, then whatever cut-off url fragments remain
    for pattern in (url_pattern, url_cutoff_pattern):
        regex_pat = re.compile(pattern, flags=re.IGNORECASE)
        # regex=True is required: pandas >= 2.0 defaults to regex=False and
        # rejects a compiled pattern in that mode
        cleaned = cleaned.str.replace(regex_pat, "", regex=True)
    df['text_clean'] = cleaned.str.strip()
    return df

In [16]:
def remove_no_subject(df):
    """ removes - no subject - from clean text strings
    :param df: data frame with a text_clean column
    :return df: same data frame with text_clean updated and stripped
    """
    regex_pat = re.compile(subject_pattern, flags=re.IGNORECASE)
    # regex=True is required: pandas >= 2.0 defaults to regex=False and
    # rejects a compiled pattern in that mode
    cleaned = df['text_clean'].str.replace(regex_pat, "", regex=True)
    df['text_clean'] = cleaned.str.strip()
    return df

In [17]:
def has_email(df):
    """ flags rows whose text contains an email address
    :param df: data frame with a text column
    :return df: same data frame with a has_email column
    """
    matcher = re.compile(email_pattern, re.IGNORECASE)
    texts = df['text']
    df['has_email'] = texts.str.contains(matcher)
    return df

In [18]:
def has_large_number(df):
    """ flags rows whose cleaned text contains a long number
    :param df: data frame with a text_clean column
    :return df: same data frame with a has_large_number column
    """
    matcher = re.compile(large_number_pattern, re.IGNORECASE)
    cleaned = df['text_clean']
    df['has_large_number'] = cleaned.str.contains(matcher)
    return df

In [19]:
def has_alpha(df):
    """ flags rows whose cleaned text contains at least one letter
    :param df: data frame with a text_clean column
    :return df: same data frame with a has_alpha column
    """
    matcher = re.compile(alpha_pattern, re.IGNORECASE)
    cleaned = df['text_clean']
    df['has_alpha'] = cleaned.str.contains(matcher)
    return df

In [20]:
def replace_lonely_numbers(df):
    """ blanks out cleaned text that consists of digits only
    :param df: data frame with a text_clean column
    :return df: same data frame with text_clean updated and stripped
    """
    regex_pat = re.compile(lonely_number_pattern, flags=re.IGNORECASE)
    # regex=True is required: pandas >= 2.0 defaults to regex=False and
    # rejects a compiled pattern in that mode
    cleaned = df['text_clean'].str.replace(regex_pat, "", regex=True)
    df['text_clean'] = cleaned.str.strip()
    return df

In [21]:
def drop_emptys(df):
    """ drops rows with empty or missing cleaned text and removes
        the title and body columns
    :param df: data frame with text_clean, title and body columns
    :return df: the same data frame, modified in place
    """
    # Assign the result back instead of calling replace(..., inplace=True) on
    # the column selection: under copy-on-write that selection is a temporary
    # object and the data frame itself would stay unchanged.
    df['text_clean'] = df['text_clean'].replace('', np.nan)
    df.dropna(subset=['text_clean'], inplace=True)
    df.drop(columns=['title', 'body'], inplace=True)
    return df

In [22]:
def count_non_punctuation(df):
    """ counts the symbols in the cleaned text that are not word
        characters, whitespace or common punctuation
    :param df: data frame with a text_clean column
    :return df: same data frame with an n_symbols column
    """
    pattern = r'[-\w\s\.,/:;!\?\'\"’]'
    regex_pat = re.compile(pattern, flags=re.IGNORECASE)
    # Strip every "ordinary" character; the length of what is left is the
    # number of unusual symbols. regex=True is required: pandas >= 2.0
    # defaults to regex=False and rejects a compiled pattern in that mode.
    leftovers = df['text_clean'].str.replace(regex_pat, "", regex=True)
    df['n_symbols'] = leftovers.str.len()
    return df

In [23]:
def has_word(df, word, name=None, ignorecase=True):
    """ flags rows whose text matches a regular expression
    :param df: data frame with a text column
    :param word: regular expression to search for
    :param name: name of the new column; defaults to word itself
    :param ignorecase: match case-insensitively when True
    :return df: same data frame with the new indicator column
    """
    # Do not lower-case the pattern: that would turn escapes such as \S, \B,
    # \D or \W into their opposites. IGNORECASE already covers letter case.
    flags = re.IGNORECASE if ignorecase else 0
    regex_pat = re.compile(word, flags=flags)
    column = name if name else word
    df[column] = df['text'].str.contains(regex_pat)
    return df

In [24]:
def process_text(df):
    """ runs the text preparation steps in order and records the
        length of the raw and cleaned text
    :param df: data frame with title and body columns
    :return df: data frame with text, text_clean, the url/email/number
        indicators, n_symbols, text_length and text_clean_length
    """
    # order matters: later steps read columns created by earlier ones
    steps = (
        create_text,
        has_url,
        remove_urls,
        has_email,
        has_large_number,
        remove_no_subject,
        count_non_punctuation,
    )
    for step in steps:
        df = step(df)
    for source, target in (('text', 'text_length'),
                           ('text_clean', 'text_clean_length')):
        df[target] = df[source].str.len()
    return df

In [25]:
def add_spam_dummies(df):
    """ adds the indicator columns that probable_spam reads
    :param df: data frame with a text column
    :return df: same data frame with one column per spam word plus
        bracket_url, Http, s.t.r.e.a.m, has_dd, has_dh and has_ds
    """
    # one column per entry of the module-level spam word list
    for term in spam:
        df = has_word(df, term)
    # (pattern, column name, ignore case)
    extra_indicators = (
        (r'\[url', "bracket_url", True),
        (r'^Http', "Http", False),
        (r's\.t\.r\.e\.a\.m', "s.t.r.e.a.m", True),
        (r'''\bdd['s]*\b''', "has_dd", False),
        (r'''\bdh['s]*\b''', "has_dh", False),
        (r'''\bds['s]*\b''', "has_ds", False),
    )
    for pattern, column, ignorecase in extra_indicators:
        df = has_word(df, pattern, name=column, ignorecase=ignorecase)
    return df

In [26]:
def probable_spam(df):
    """ combines the indicator columns into a probable_spam flag
    :param df: data frame holding the boolean indicator columns,
        text_length and n_symbols
    :return df: same data frame with a boolean probable_spam column
    """
    url = df['has_url']
    big_number = df['has_large_number']
    length = df['text_length']

    # Comparisons must be parenthesised: `&` binds tighter than `>`, so
    # `a & b & length > 900` would compare the whole `&` chain with 900.

    # Without a url: a long number plus one further signal.
    no_url_spam = (~url) & big_number & (
        (length > 900)
        | (df['n_symbols'] > 10)
        | df['problem.solution']
    )

    # With a url: any one of the following signals.
    long_ad = (df['keto'] | df['supplement'] | df['pills']) & (length > 320)
    url_spam = url & (
        (df['vs'] & df['stream'])
        | df['s.t.r.e.a.m']
        | big_number
        | df['visit.here']
        | df['visit.at']
        | df['amino.app']
        | df['male.enhancement']
        | df['testosterone']
        | df['visit.us.at']
        | df['cbd.oil']
        | df['Http']
        | df['bracket_url']
        | long_ad
    )

    # Rows that mention dd / dh / ds are never flagged.
    mentions_family = df['has_dd'] | df['has_dh'] | df['has_ds']

    df['probable_spam'] = (
        (df['vashikaran'] | no_url_spam | url_spam) & ~mentions_family
    )
    return df

Get random sample of posts

In [27]:
def get_sample(count, seed):
    """ draws a reproducible random sample of non-deleted posts
    :param count: number of posts to sample
    :param seed: random state passed to DataFrame.sample
    :return samp: data frame with message_id, title and body
        of the sampled posts
    """
    sql = ''' SELECT message_id
        FROM posts
        WHERE deleted=0
    '''
    temp_table_sql = ''' 
        DROP TABLE IF EXISTS temp;
        CREATE TEMPORARY TABLE
            temp(id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, message_id INTEGER);
    '''
    select_sql = '''
        SELECT
            p.message_id AS message_id,
            p.title AS title,
            p.body AS body
        FROM posts AS p
        WHERE p.message_id IN (SELECT message_id FROM temp)
    '''
    conn = create_connection(path_db)
    try:
        ids = pd.read_sql_query(sql, conn)
        # sampling without replacement fails if count exceeds the number of ids
        ids = ids.sample(n=count, random_state=seed)
        cur = conn.cursor()
        cur.executescript(temp_table_sql)
        # NOTE(review): confirm how if_exists='replace' interacts with the
        # TEMPORARY table created just above (which `temp` the ids land in).
        ids.to_sql('temp', conn, if_exists='replace', index=False)
        samp = pd.read_sql_query(select_sql, conn)
    finally:
        # release the connection even when sampling or a query fails
        conn.close()
    return samp

## File Locations

In [6]:
# Current working directory and the directory one level above it; the
# data paths below are built relative to that parent directory.
p = Path.cwd()
path_parent = p.parents[0]

In [8]:
# sqlite database with the scraped posts
path_db = str(path_parent.joinpath("database", "youbemom-merged.db"))
# spam files; the "{}" placeholders are filled in later with str.format
_clean_data_dir = path_parent.joinpath("clean_data")
path_spam_sample = str(_clean_data_dir / "spam_sample_{}.txt")
path_spam_words = str(_clean_data_dir / "spam_words.csv")
path_spam_forum = str(_clean_data_dir / "{}_spam.csv")

## Load Sample of Data
Code on training data set and test on test data set. Randomly select data based on seed.

In [None]:
# number of posts to sample and the random seed that makes the draw repeatable
count, seed = 1000000, 546

In [None]:
# Draw `count` random non-deleted posts (message_id, title, body).
sample = get_sample(count, seed)

In [None]:
# Build text / text_clean and the url, email, number and symbol indicators.
sample = process_text(sample)

## Read Spam Words

In [None]:
# list of spam words taken from the `words` column of the spam words file
spam = pd.read_csv(path_spam_words)['words'].tolist()

In [None]:
# Add one indicator column per spam word; each column is named after the word.
for s in spam:
    sample = has_word(sample, s)

In [None]:
# Extra indicators: "[url" markup, text that starts with a capitalised "Http"
# (case-sensitive), and the dotted spelling "s.t.r.e.a.m".
sample = has_word(sample, r'\[url', name="bracket_url")
sample = has_word(sample, r'^Http', name="Http", ignorecase=False)
sample = has_word(sample, r's\.t\.r\.e\.a\.m', name="s.t.r.e.a.m")

In [None]:
# Case-sensitive checks for the lowercase tokens dd / dh / ds, optionally
# followed by apostrophes or the letter s. probable_spam uses these columns
# to exempt rows from being flagged.
sample = has_word(sample, r'''\bdd['s]*\b''', name="has_dd", ignorecase=False)
sample = has_word(sample, r'''\bdh['s]*\b''', name="has_dh", ignorecase=False)
sample = has_word(sample, r'''\bds['s]*\b''', name="has_ds", ignorecase=False)

In [None]:
# sample.to_csv(path_spam_sample.format(str(seed)), sep ='\t', index=False)

Loaded the spam sample into Excel and hand-coded spam. There is minimal spam without a url, and it is always accompanied by some other indicator of spam (a long number that is probably a phone number, many non-punctuation symbols, or a specific word). I coded all urls in the first 100,000 messages in the sample, finding common key words and other indicators. I checked this against the remaining urls in 1.2.5-Clean_Data-Identify_Spam.R, validating the spam indicators. This was used to create the probable_spam function.

## Probable Spam

In [None]:
# Flag probable spam in the sample and tally flagged vs. unflagged rows.
sample = probable_spam(sample)
sample['probable_spam'].value_counts()

## Process Data
Loop through the posts table in chunks, writing message_id, text, text_clean, and probable_spam to the text table in the database.

In [30]:
# only needed when the "Read Spam Words" section above was skipped:
# list of spam words taken from the `words` column of the spam words file
spam = pd.read_csv(path_spam_words)['words'].tolist()

In [31]:
# Module-level connection used by get_size and process_data below.
conn = create_connection(path_db)

In [32]:
size = get_size(conn)
nchunks = 100
# Integer division, floored at 1: a table with fewer than `nchunks` rows
# would otherwise produce a chunk size of 0, which is not a meaningful
# value for a chunked read.
chunksize = max(1, size // nchunks)

In [33]:
# Rebuild the `text` table from `posts`, one chunk at a time.
process_data(chunksize)

|          | 0/? [00:00<?, ?it/s]

In [34]:
# Persist the written chunks and release the connection.
conn.commit()
conn.close()

## Marked Spam Lists

In [20]:
# Looks up the message ids in `posts` for one family_id (bound as parameter).
sql_select = '''
    SELECT message_id
    FROM posts
    WHERE family_id=?
'''

In [29]:
# Marks one message in the `text` table as probable spam.
sql_update = '''
    UPDATE text
    SET probable_spam=1
    WHERE message_id=?
'''

In [22]:
# Fresh connection and cursor for applying the hand-marked spam lists.
conn = create_connection(path_db)
cur = conn.cursor()

In [23]:
# forum whose hand-marked spam list is loaded
sf = "school"
# missing marks count as 0 (not spam)
marked = pd.read_csv(path_spam_forum.format(sf)).fillna(0)
# NOTE: from here on `spam` holds family ids, no longer the spam word list
spam = marked.loc[marked['is_spam'] > 0.0, 'family_id'].tolist()

In [30]:
# For every hand-marked family, flag each of its messages as probable spam.
for family_id in spam:
    # fetch all ids first so the cursor can be reused for the updates
    rows = cur.execute(sql_select, (family_id,)).fetchall()
    for (message_id,) in rows:
        print(family_id, message_id)
        cur.execute(sql_update, (message_id,))

46885 105409987
401749 109324345
439951 109779679
442217 109803051
443595 109820163
449521 109886994
449522 109886995
457062 109973871
457063 109973872
464794 110059424
477287 110186461
522972 110714127
522974 110714129
522975 110714130
522978 110714133
522992 110714178
538073 110889261
559821 111142462
559822 111142463
559823 111142464
559825 111142467
560882 111153302
573921 111302444
599786 111599416
602615 111630852
603678 111644283
613610 111763722
624442 111882255
625564 111895167
628509 111929042
632697 111973913
656728 112245593
656730 112245605
656734 112245634
680863 112512368
689182 112605962
698058 112705908
702866 112759595
719980 112950118
726478 113012350
737191 113131723
740985 113174052
743806 113206455
748262 113254198
749461 113268221
750530 113279586
754110 113322308
754217 113322963
755360 113336372
758029 113366989
759599 113385067
760708 113398483
761953 113412219
761956 113412228
763258 113426759
765460 113452269
768715 113489239
769999 113504486
771265 11351848

In [31]:
# Persist the probable_spam updates and release the connection.
conn.commit()
conn.close()