# Filter Spam from Data
Cleans the data for topic modelling

## Data Sources
- youbemom-merged.db (scraped with 1-Scrape_Forum.ipynb)


## Changes
- 2020-12-23: Created
- 2020-01-18: Updated spam detection

## TODO
- 

## Imports

In [1]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
import sqlite3
import pandas as pd
from datetime import datetime
from pathlib import Path
from youbemom import create_connection
import re
from math import floor
from tqdm.notebook import tqdm
from langdetect import detect
import numpy as np
from io import FileIO
# saving the corpus and dictionary
from gensim import corpora, models
import pickle
# topic models
import pyLDAvis.gensim
from gensim.models import CoherenceModel, LdaModel, LdaMulticore
# my functions
from youbemom import create_connection
from lemmatize import *

## Regex Patterns

In [3]:
# old pattern = r'(http|ftp|https):\/\/[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
url_pattern = r'''((http|ftp|https):\/\/)[-a-zA-Z0-9@:%\._\+~#=]{1,256}\.[a-zA-Z0-9\(\)]{1,8}\b([-a-zA-Z0-9\(\)@:%\!,\[\]\{\}\|'"_\+\.~#\?&/=]*)|(www\.)*[-a-zA-Z0-9@:%\._\+~#=]{1,256}\.(com|be|io|org|net)\b([-a-zA-Z0-9\(\)@:%\!,\[\]\{\}\|'"_\+\.~#\?&/=]*)'''

  and should_run_async(code)


In [4]:
email_pattern = r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)'

  and should_run_async(code)


In [5]:
large_number_pattern = r'\b[\+\-x0-9]*\d{9,}(?<!0{7})\b'

  and should_run_async(code)


In [6]:
subject_pattern = r'- no subject -'

  and should_run_async(code)


In [7]:
alpha_pattern = r'[a-zA-Z]'

  and should_run_async(code)


In [8]:
lonely_number_pattern = r'^[0-9]+$'

  and should_run_async(code)


## Functions

In [9]:
def format_topics_sentences(ldamodel, corpus):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    return(sent_topics_df)

  and should_run_async(code)


In [10]:
def get_size(conn):
    """ gets the size of the data set in number of rows
    :param conn: connection the the db
    :return size: size of the posts table
    """
    cur = conn.cursor()
    cur.execute(''' SELECT COUNT(message_id) FROM posts ''')
    size = cur.fetchone()
    if size:
        return int(size[0])
    raise SystemExit("No size found")

  and should_run_async(code)


In [11]:
def process_data(sql, chunksize, analyzer):
    """ read data in chunks from the table, format the text,
        apply the sentiemnt analyzer, and write chunks to 
        the sentiment table
    :param sql: selects columns of the posts table
    :param chunksize: size of chunks
    :param analyzer: the VADER sentiment analyzer
    """
    # number of symbols
    # has url
    # non-english words (can I do this quickly?)
    # has email xxx@yyy
    # has a spam word/phrase
    reader = pd.read_sql_query(sql,
                               conn,
                               chunksize=chunksize)
    for i, df in enumerate(tqdm(reader)):
        df = format_data(df)
        df = remove_urls(df)
        df = gen_sentiment(df, 'text', 'all', analyzer)
        df = gen_sentiment(df, 'text_clean', 'clean', analyzer)
        df.drop('title', axis=1, inplace=True)
        df.drop('body', axis=1, inplace=True)
        if i == 0:
            df.to_sql('sentiment', conn, if_exists='replace', index=False)
        else:
            df.to_sql('sentiment', conn, if_exists='append', index=False)

  and should_run_async(code)


In [12]:
def create_text(df):
    """ creates text column from
        title and body
    :param df: data frame
    :return df: formatted data frame
    """
    df['title'] = df['title'].replace('This post has been deleted\.', '', regex=True)
    df['text'] = df['title'] + " " + df['body']
    return df

  and should_run_async(code)
  df['title'] = df['title'].replace('This post has been deleted\.', '', regex=True)


In [13]:
def has_url(df):
    """ finds urls in text strings and creates
        new column of whether text has a url
    :param df: data frame
    :return df: formatted data frame
    """
    regex_pat = re.compile(url_pattern, flags=re.IGNORECASE)
    df['has_url'] = df['text'].str.contains(regex_pat)
    return df

  and should_run_async(code)


In [14]:
def remove_urls(df):
    """ removes urls from text strings and creates
        new column of text without urls
    :param df: data frame
    :return df: formatted data frame
    """
    regex_pat = re.compile(url_pattern, flags=re.IGNORECASE)
    df['text_clean'] = df['text'].str.replace(regex_pat, "")
    df['text_clean'] = df['text_clean'].str.strip()
    return df

  and should_run_async(code)


In [15]:
def remove_no_subject(df):
    """ removes - no subject - from clean text strings
    :param df: data frame
    :return df: formatted data frame
    """
    regex_pat = re.compile(subject_pattern, flags=re.IGNORECASE)
    df['text_clean'] = df['text_clean'].str.replace(regex_pat, "")
    df['text_clean'] = df['text_clean'].str.strip()
    return df

  and should_run_async(code)


In [16]:
def has_email(df):
    regex_pat = re.compile(email_pattern, flags=re.IGNORECASE)
    df['has_email'] = df['text'].str.contains(regex_pat)
    return df

  and should_run_async(code)


In [17]:
def has_large_number(df):
    regex_pat = re.compile(large_number_pattern, flags=re.IGNORECASE)
    df['has_large_number'] = df['text_clean'].str.contains(regex_pat)
    return df

  and should_run_async(code)


In [18]:
def has_alpha(df):
    regex_pat = re.compile(alpha_pattern, flags=re.IGNORECASE)
    df['has_alpha'] = df['text_clean'].str.contains(regex_pat)
    return df

  and should_run_async(code)


In [19]:
def replace_lonely_numbers(df):
    regex_pat = re.compile(lonely_number_pattern, flags=re.IGNORECASE)
    df['text_clean'] = df['text_clean'].str.replace(regex_pat, "")
    df['text_clean'] = df['text_clean'].str.strip()
    return df

  and should_run_async(code)


In [20]:
def drop_emptys(df):
    df['text_clean'].replace('', np.nan, inplace=True)
    df.dropna(subset=['text_clean'], inplace=True)
    df.drop('title', axis=1, inplace=True)
    df.drop('body', axis=1, inplace=True)
    return df

  and should_run_async(code)


In [21]:
def count_non_punctuation(df):
    pattern = r'[-\w\s\.,/:;!\?\'\"’]'
    regex_pat = re.compile(pattern, flags=re.IGNORECASE)
    df['n_symbols'] = df['text_clean'].str.replace(regex_pat, "").str.len()
    return df

  and should_run_async(code)


In [22]:
def has_word(df, word):
    regex_pat = re.compile(word.lower(), flags=re.IGNORECASE)
    df[word] = df['text'].str.contains(regex_pat)
    return df

  and should_run_async(code)


## File Locations

In [23]:
p = Path.cwd()
path_parent = p.parents[0]

  and should_run_async(code)


In [24]:
# database
path_db = str(path_parent / "database" / "youbemom-merged.db")
# spam data
path_spam_sample = str(path_parent / "clean_data" / "spam_sample.csv")
path_spam_words = str(path_parent / "clean_data" / "spam_words.csv")
# topic model data
path_lemma_pkl = str(path_parent / "clean_data" / "lemmatized_text_spam_sample.pkl")
path_corpus_pkl = str(path_parent / "clean_data" / "corpus_spam_sample.pkl")
path_dictionary_gensim = str(path_parent / "clean_data" / "dictionary_spam_sample.gensim")
# model
path_model = str(path_parent / "clean_data" / "spam_model.gensim")

  and should_run_async(code)


## Load Sample of Data

In [25]:
sql = ''' SELECT message_id
    FROM posts
    WHERE deleted=0
'''

  and should_run_async(code)


In [26]:
conn = create_connection(path_db)

  and should_run_async(code)


In [27]:
ids = pd.read_sql_query(sql, conn)

  and should_run_async(code)


In [28]:
ids = ids.sample(n = 1000000, random_state = 391)

  and should_run_async(code)


In [29]:
temp_table_sql = ''' 
    DROP TABLE IF EXISTS temp;
    CREATE TEMPORARY TABLE
        temp(id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, message_id INTEGER);
    '''

  and should_run_async(code)


In [30]:
cur = conn.cursor()
cur.executescript(temp_table_sql)
ids.to_sql('temp', conn, if_exists='replace', index=False)

  and should_run_async(code)


In [31]:
select_sql = '''
    SELECT
        p.message_id AS message_id,
        p.title AS title,
        p.body AS body
    FROM posts AS p
    WHERE p.message_id IN (SELECT message_id FROM temp)
'''

  and should_run_async(code)


In [32]:
samp = pd.read_sql_query(select_sql, conn)

  and should_run_async(code)


In [33]:
conn.close()

  and should_run_async(code)


In [34]:
samp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 3 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   message_id  1000000 non-null  object
 1   title       1000000 non-null  object
 2   body        1000000 non-null  object
dtypes: object(3)
memory usage: 22.9+ MB


  and should_run_async(code)


In [35]:
samp = create_text(samp)

  and should_run_async(code)


In [36]:
samp = has_url(samp)
samp = remove_urls(samp)
samp = has_email(samp)
samp = has_large_number(samp)
samp = remove_no_subject(samp)
samp = replace_lonely_numbers(samp)
samp = count_non_punctuation(samp)
samp = drop_emptys(samp)

  and should_run_async(code)
  return func(self, *args, **kwargs)


In [37]:
samp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 996295 entries, 0 to 999999
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   message_id        996295 non-null  object
 1   text              996295 non-null  object
 2   has_url           996295 non-null  bool  
 3   text_clean        996295 non-null  object
 4   has_email         996295 non-null  bool  
 5   has_large_number  996295 non-null  bool  
 6   n_symbols         996295 non-null  int64 
dtypes: bool(3), int64(1), object(3)
memory usage: 40.9+ MB


  and should_run_async(code)


## Read Spam Words

In [38]:
spam = pd.read_csv(path_spam_words)
spam = spam['words'].tolist()

  and should_run_async(code)


In [39]:
for s in spam:
    samp = has_word(samp, s)
    print(s)
    print(samp[s].value_counts())

  and should_run_async(code)


black magic
False    996287
True          8
Name: black magic, dtype: int64
918728849451
False    996293
True          2
Name: 918728849451, dtype: int64
vashikaran
False    996284
True         11
Name: vashikaran, dtype: int64
enlargement
False    996289
True          6
Name: enlargement, dtype: int64
pills
False    995988
True        307
Name: pills, dtype: int64
creams
False    996150
True        145
Name: creams, dtype: int64
27835121053
False    996294
True          1
Name: 27835121053, dtype: int64
bollywood
False    996292
True          3
Name: bollywood, dtype: int64
9999136878
False    996295
Name: 9999136878, dtype: int64
botcho cream
False    996294
True          1
Name: botcho cream, dtype: int64
yodi pills
False    996294
True          1
Name: yodi pills, dtype: int64
enlarge
False    996269
True         26
Name: enlarge, dtype: int64
penis
False    995816
True        479
Name: penis, dtype: int64
stamina
False    996246
True         49
Name: stamina, dtype: int64
semen
Fa

In [40]:
samp.to_csv(path_spam_sample, sep ='\t', index=False)

  and should_run_async(code)


## Identify Topic Model

In [41]:
text = clean_data(samp)

  and should_run_async(code)


In [42]:
pickle.dump(text, open(path_lemma_pkl, 'wb'))
dictionary = corpora.Dictionary(text)
dictionary.save(FileIO(path_dictionary_gensim, "wb"))
corpus = [dictionary.doc2bow(t) for t in text]
pickle.dump(corpus, open(path_corpus_pkl, 'wb'))

  and should_run_async(code)


In [43]:
w = 15
n_topics = 10
NUM_WORDS = 10
n_iterations = 50

  and should_run_async(code)


In [44]:
%time spammod = LdaMulticore(corpus, num_topics = n_topics, id2word=dictionary, passes=n_iterations, workers=w)

  and should_run_async(code)


CPU times: user 32min 48s, sys: 3min 56s, total: 36min 45s
Wall time: 33min 30s


In [45]:
topics = spammod.print_topics(num_words=NUM_WORDS)
coherence_model = CoherenceModel(model=spammod, texts=text, dictionary=dictionary, coherence='c_v')
coherence = coherence_model.get_coherence()

  and should_run_async(code)


In [46]:
spammod.save(path_model)

  and should_run_async(code)


In [47]:
pyLDAvis.display(pyLDAvis.gensim.prepare(spammod, corpus, dictionary, sort_topics=False))

  and should_run_async(code)


In [48]:
%time samp_topic_sents_keywords = format_topics_sentences(ldamodel=spammod, corpus=corpus)

  and should_run_async(code)


CPU times: user 3h 46min 1s, sys: 4.46 s, total: 3h 46min 5s
Wall time: 3h 46min 5s


In [None]:
samp['dom_topic'] = samp_topic_sents_keywords['Dominant_Topic']
samp['per_contr'] = samp_topic_sents_keywords['Perc_Contribution']
samp['keywords'] = samp_topic_sents_keywords['Topic_Keywords']

In [None]:
samp['dom_topic'].value_counts()

In [None]:
for i in range(100000):
#     if samp['dom_topic'].iloc[i]==19.0:
#         print("topic 20: " + samp['text_clean'].iloc[i])

## Clean Up Dataframe

In [None]:
samp.drop('title', axis=1, inplace=True)
samp.drop('body', axis=1, inplace=True)

In [None]:
samp = has_url(samp)

In [None]:
samp['has_url'].value_counts()

In [None]:
samp['text'].loc[samp['has_url']].head(50)

In [None]:
s = samp['text'].loc[3278]
pattern = r'''((http|ftp|https):\/\/)[-a-zA-Z0-9@:%\._\+~#=]{1,256}\.[a-zA-Z0-9\(\)]{1,8}\b([-a-zA-Z0-9\(\)@:%\!,\[\]\{\}\|'"_\+\.~#\?&/=]*)|(www\.)*[-a-zA-Z0-9@:%\._\+~#=]{1,256}\.(com|be|io|org|net)\b([-a-zA-Z0-9\(\)@:%\!,\[\]\{\}\|'"_\+\.~#\?&/=]*)'''
regex_pat = re.compile(pattern, flags=re.IGNORECASE)
print(s)
print(re.sub(pattern, "", s))
print(s.contains(pattern))

In [None]:
samp.info(verbose=True)

In [None]:
# samp.to_csv(path_spam_sample, columns=['message_id', 'has_url', 'n_symbols', 'text'], index=False)
samp.to_csv(path_spam_sample, sep ='\t', index=False)

In [None]:
samp.info()

## Process Data

In [None]:
conn = create_connection(path_db)

In [None]:
size = get_size(conn)
nchunks = 1
chunksize = floor(size / nchunks)

In [None]:
sql = ''' SELECT message_id, title, body FROM posts '''

In [None]:
process_data(sql, chunksize, analyzer)