## Requirements

* Detect predetorial patterns in other side chat messages, alert parents / block chat
    - Per message
    - Sequence
    - Media
* Detect and warn / block personal information giveaway by own side of chat (Child)
    - text
    - media
* Support 2 party / multiple party chats
* Block known predators from past chats




## Flow control

In [24]:
'''
Options:
Each element can be either 'Process' or 'Load'
'''

# Dataframe creation from raw XMLs
CREATE_FULL_PJ_CONVERSATIONS_DATAFRAME = 'Load'
CREATE_FULL_PAN12_DATAFRAME = 'Load'

# Dataframe preprocessing
PREPROCESS_FULL_PAN12_DATAFRAME = 'Load'
PREPROCESS_FULL_PJ_DATAFRAME = 'Load'

# LDA topic model classifier on Pan12 - single manual steps
PAN12_TOPIC_MODEL_CORPUS = 'Process'

# LDA topic model classifier on Pan12 - Automatic grid search
GRID_SEARCH_LDA_RFC = 'Process'
GRID_SEARCH_LDA_GB = 'Process'


##########################################################
# Obsolete - code under backup section
PAN12_TOPIC_MODEL = 'Process'
PAN12_TOPIC_MODEL_RF = 'Process'
CREATE_FULL_PJ_DATAFRAME_SENTENCE_LEVEL = 'Load'


## General - imports paths etc.

In [2]:
%pip install pyspellchecker
!python -m spacy download en_core_web_sm
%pip install pyLDAvis
%pip install altair


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 590 kB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Imports

In [3]:
import numpy as np
import pandas as pd
from pathlib import Path
import pickle
import random
import re
import string

from tqdm.notebook import tqdm as tqdm
tqdm.pandas()
from ipywidgets import IntProgress

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer

# from sklearn.decomposition import PCA
# from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer
import altair

# from sklearn.cluster import MiniBatchKMeans

import matplotlib.pyplot as plt
import matplotlib.cm as cm

from torch.utils.data import Dataset

import gensim
from gensim.models import Word2Vec
import gensim.corpora as corpora
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

import spacy
from spellchecker import SpellChecker

import xml.etree.ElementTree as ET 
from xml.etree.ElementTree import ParseError

import csv

from typing import Dict, Callable, List, Dict, Set, Any
import logging


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

logger = logging.getLogger(__name__)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


  from collections import Iterable


### Env control and folders

In [4]:
ENV = 'Colab'
# ENV = 'Local'


In [5]:
# Folders
if ENV=='Local':
  PROJECT_ROOT = Path('./')

elif ENV=='Colab':
  from google.colab import drive
  drive.mount('/content/drive')
  PROJECT_ROOT = Path('/content/drive/MyDrive/colab_data/cyber2/')
  

PJ_DATA_FOLDER = PROJECT_ROOT / Path('customer_data')
PAN12_TEST_DATA_FILE = PROJECT_ROOT / Path('ref_data/pan12_corpus/pan12-sexual-predator-identification-test-corpus-2012-05-21/pan12-sexual-predator-identification-test-corpus-2012-05-17.xml')
PAN12_LINE_LABELS_FILE = PROJECT_ROOT / Path('ref_data/pan12_corpus/pan12-sexual-predator-identification-test-corpus-2012-05-21/pan12-sexual-predator-identification-groundtruth-problem2.txt')
PAN12_USER_LABELS_FILE = PROJECT_ROOT / Path('ref_data/pan12_corpus/pan12-sexual-predator-identification-test-corpus-2012-05-21/pan12-sexual-predator-identification-groundtruth-problem1.txt')

PAN12_TRAIN_DATA_FILE = PROJECT_ROOT / Path('ref_data/pan12_corpus/pan12-sexual-predator-identification-training-corpus-2012-05-01/pan12-sexual-predator-identification-training-corpus-2012-05-01.xml')
PAN12_TRAIN_USER_LABELS_FILE = PROJECT_ROOT / Path('ref_data/pan12_corpus/pan12-sexual-predator-identification-training-corpus-2012-05-01/pan12-sexual-predator-identification-training-corpus-predators-2012-05-01.txt')


OUTPUT_FOLDER = PROJECT_ROOT / Path('output')

if not PAN12_TEST_DATA_FILE.exists():
    raise FileNotFoundError('File not found!')

if not PAN12_LINE_LABELS_FILE.exists():
    raise FileNotFoundError('File not found!')  

if not PAN12_USER_LABELS_FILE.exists():
    raise FileNotFoundError('File not found!') 

if not PJ_DATA_FOLDER.is_dir():
    raise FileNotFoundError('Directry not found!') 

if not OUTPUT_FOLDER.is_dir():
    print(f'creating output folder: {OUTPUT_FOLDER}')
    OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)
  

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Utility functions

In [6]:
# Define datasets with texts and labels

def list_files_in_dir(folder: Path, extension='*') -> List:
    
    file_list = [f for f in folder.glob(f'**/*.{extension}') if f.is_file()]
    return file_list


In [7]:
def unify_csv_dataframes_to_one_sorted(csv_parts_folder: Path, target_csv_path=None)-> pd.DataFrame:
    '''
    Gets:
        csv_parts_folder - Path of folder with partial df CSV files
        target_csv_path - (optional) - path to save unified CSV

    Returns:
        Unified PD dataframe with all csvs concatenated on axis 0
    '''

    file_list = list_files_in_dir(csv_parts_folder, extension='csv')
    ordered_filenames = sorted([str(filename) for filename in file_list])
    ordered_file_list = [Path(filename) for filename in ordered_filenames]
    print(f'Found {len(ordered_file_list)} files to unify')

    unified_df = None
    for file in ordered_file_list:
        part_df = pd.read_csv(file, header=0, index_col=0)

        if unified_df is not None:
            unified_df = pd.concat([unified_df, part_df], axis=0)
        else:
            unified_df = part_df

    if target_csv_path is not None:
        unified_df.to_csv(target_csv_path)
    
    return unified_df

### Load word lists

In [8]:
# Load word lists
SEX_WL_PATH = PROJECT_ROOT / Path(r'sex_words.txt')
with open(SEX_WL_PATH, 'rt') as handle:
    sex_word_list = handle.read().split('\n')

MEETING_WL_PATH = SEX_WL_PATH = PROJECT_ROOT / Path(r'meeting_words.txt')
with open(MEETING_WL_PATH, 'rt') as handle:
    meeting_word_list = handle.read().split('\n')

FAMILY_WL_PATH = SEX_WL_PATH = PROJECT_ROOT / Path(r'family_words.txt')
with open(FAMILY_WL_PATH, 'rt') as handle:
    family_word_list = handle.read().split('\n')

CHAT_SLANG_PATH = SEX_WL_PATH = PROJECT_ROOT / Path(r'chat_slang.txt')
with open(CHAT_SLANG_PATH, mode='rt') as handle:
    csv_reader = csv.reader(handle, delimiter='\t')
    chat_slang = {rows[0]:rows[1] for rows in csv_reader}

EMOTICONS_PATH = SEX_WL_PATH = PROJECT_ROOT / Path(r'emoticons.txt')
with open(EMOTICONS_PATH, mode='rt', encoding="utf8") as handle:
    csv_reader = csv.reader(handle, delimiter='\t')
    emoticons = {rows[0]:rows[1] for rows in csv_reader}


## Preprocessing

### Chat text preprocess

In [9]:

def remove_stopwords(text: str, words_to_remove: List[str])-> str:
    '''
    Gets string, returns it without stopwords
    '''
    return " ".join([word for word in str(text).split() if word not in words_to_remove])


def stem_text(text: str, stemmer: Any)-> str:
    '''
    stem text string
    '''
    return " ".join([stemmer.stem(word) for word in text.split()])


def remove_emoji(text: str) -> str:
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)


def remove_emoticons(text: str, emoticons: Dict) -> str:
    emoticon_pattern = re.compile(u'(' + u'|'.join(k for k in emoticons) + u')')
    return emoticon_pattern.sub(r'', text)


def replace_pornsites_with_string(text:str, replacement_string:str='porn')->str:
    pornsite_pattern = re.compile(r'\S+xnxx\.co\S+' + r'|\S+pornhub\.co\S+' + r'|\S+nude\.co\S+' + r'|\S+sex\.co\S+')
    return pornsite_pattern.sub(replacement_string, text)

def remove_urls(text:str)-> str:
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)


def remove_special_characters(text:str)-> str:
    special_chars_pattern = re.compile(r'[^A-Za-z0-9 ]+')
    return special_chars_pattern.sub(r' ', text)


def replace_chat_slang(text: str, chat_slang: Dict[str, str])-> str:
    new_text = []
    for w in text.split():
        if w.upper() in chat_slang.keys():
            new_text.append(chat_slang[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)


def correct_spellings(text: str, speller: Callable) -> str:
    corrected_text = []
    misspelled_words = speller.unknown(text.split())
    for word in text.split():
        if word in misspelled_words:
            corrected_text.append(speller.correction(word))
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)


def lemmation(text:str, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    nlp_lem = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    result = nlp_lem(text)
    result = [token.lemma_ for token in result if token.pos_ in allowed_postags]
    return  " ".join(result)


def contains_words_from_list(text: str, word_list: List[str])-> bool:
    text_words = re.sub("[^\w]", " ",  text).split()
    if any(word in word_list for word in text_words):
        return True
    else:
        return False


def preprocess_string_for_bow(text: str, stemmer: Callable=None, speller: Callable=None, words_to_remove:List[str]=None, emoticons: Dict[str, str]=None, chat_slang: Dict[str, str]=None)-> str:
    try:
        # text = remove_emoji(text)
        # text = remove_emoticons(text, emoticons)
        text = text.lower()
        text = replace_chat_slang(text, chat_slang)
        text = replace_pornsites_with_string(text)
        # text = remove_urls(text)
        text = remove_special_characters(text)
        text = correct_spellings(text, speller)
        text = lemmation(text)
        # text = remove_stopwords(text, words_to_remove)
        # text = stem_text(text, stemmer)
    except(TypeError):
        print(f'Problematic string: {text}')
        text = ''
    return text


def preprocess_df_for_bow(df: pd.DataFrame, text_col: str, output_col_name='preprocessed_bow', stemmer=None, speller=None, words_to_remove=None, emoticons=None, chat_slang=None)-> pd.DataFrame:
    '''
    Gets a PD dataframe and a text column name
    returns the same dataframe with additional column called 'posts_preprocessed_bow'
    '''
    df[output_col_name] = df[text_col].progress_apply(lambda text: preprocess_string_for_bow(text, stemmer=stemmer, speller=speller, words_to_remove=words_to_remove, emoticons=emoticons, chat_slang=chat_slang))
    return df


  text_words = re.sub("[^\w]", " ",  text).split()


In [10]:
# test
preprocess_args = {'stemmer': PorterStemmer(),
                    'speller': SpellChecker(),
                    'words_to_remove': set(stopwords.words('english')),
                    'emoticons': emoticons,
                    'chat_slang': chat_slang,
                    }

text = 'r u going to www.google.com http://xnxx.com im walking LOL ths is not &amp;right im caming flight now u r right brb and fu :-)'
# text = 'yeah--well I just want to see you before I go in the apt--cause one of my friends :) lol :X) got arrested for doing the same thing with a 16 year old--it was a set-up type thing'

preprocess_string_for_bow(text, **preprocess_args)

  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):


'go com porn walk laugh loud amp right come flight now right right back'

## Datasets

### PJ dataset

#### PJ Convesation level dataset

In [11]:

def load_one_chat_as_df_pj(file_path: Path) -> Dict[str, pd.DataFrame]:
    '''
    Gets an path to a PJ XML file
    returns a dict with three dataframes:
        - victim data
        - predator data
        - conversation posts
    '''
    parser = ET.XMLParser(encoding="utf-8")
    try:
        doc_tree = ET.parse(file_path, parser=parser)
    except(ParseError):
        print(f'failed to parse {str(file_path)}')
        return None
        
    doc_root = doc_tree.getroot()
    
    posts_df = pd.DataFrame(columns = ['USERNAME', 'DATETIME', 'BODY', 'COMMENT', 'CODING'], dtype=str)
    predator_df = pd.DataFrame(columns = ['FIRSTNAME', 'LASTNAME', 'STATEDNAME', 'STATEDAGE', 'GENDER', 'RACE', 'CITY', 'STATE', 'REPEATOFFENDER', 'ADMITGUILT', 'TRUTHFULNAME', 'SCREENNAME'], dtype=str)
    victim_df = pd.DataFrame(columns = ['FIRSTNAME', 'LASTNAME', 'STATEDNAME', 'STATEDAGE', 'GENDER', 'RACE', 'CITY', 'STATE', 'PREVIOUSVICTIMIZATION', 'ADMITGUILT', 'SCREENNAME'], dtype=str)

    for post in doc_root.findall('POST'):
        post_dict = {}
        for field in post:
            post_dict[field.tag] = field.text

        posts_df = posts_df.append(post_dict, ignore_index=True)
    posts_df = posts_df.astype('string')


    for predator in doc_root.findall('PREDATOR'):
        predator_dict = {}
        for field in predator:
            if field.tag == 'SCREENNAME':
                for field2 in field:
                    predator_dict[field2.tag] = field2.text
            predator_dict[field.tag] = field.text

        predator_df = predator_df.append(predator_dict, ignore_index=True)   
    predator_df = predator_df.astype('string')

    for victim in doc_root.findall('VICTIM'):
        victim_dict = {}
        for field in victim:
            victim_dict[field.tag] = field.text

        victim_df = victim_df.append(victim_dict, ignore_index=True)  
    victim_df = victim_df.astype('string')

    return {'predator': predator_df, 'victim': victim_df, 'conversation': posts_df, 'conversation_id': str(file_path.parts[-1])}


#----------------------------------------------------------
# Test XML parse functions:
# file_path = PJ_DATA_FOLDER / Path('ArmySgt1961.xml')
# chat_dict = load_one_chat_as_df_pj(file_path)
# chat_dict['victim'].head()
# chat_dict['predator'].head()
# chat_dict['conversation'].head(10)
# chat_dict['conversation_id']


#### Load entire PJ dataframe as list of one-sided conversations

In [12]:

class PjConversationsDataset(Dataset):
    """
    Wrapper around Torch Dataset.
    Prepares an indexed list of PJ conversation in a folder, returns conversations per index (like an array)
    Load is lazy - loads conversation from disk on request.
    Uses load_one_chat_as_df_pj() for conversation loading
    """

    def __init__(self, data_folder: Path):
        """
        Args:
          data_folder - folder with PJ XML files
          df_preprocess_fn - function that gets a dataframe and adds preprocesed text column based on given text column

        """
        self.file_list = list_files_in_dir(data_folder, extension='xml')
        self.TEXT_COLUMN_NAME = 'BODY'

        
    def __len__(self) -> int:
        """
        Returns:
            int: length of the dataset
        """
        return len(self.file_list)

    def __getitem__(self, idx):
        """Gets element of the dataset

        Args:
            index (int): index of the element in the dataset
        Returns:
            Single element by index
        """        
        sample = load_one_chat_as_df_pj(self.file_list[idx])
        return sample

    

In [13]:
PJ_CONVERSATIONS_RAW_CSV = OUTPUT_FOLDER / Path('pj_conversations_raw_full.csv')

if CREATE_FULL_PJ_CONVERSATIONS_DATAFRAME == 'Process':
    print('Running process...')

    pj_ds = PjConversationsDataset(PJ_DATA_FOLDER)
    predator_texts = []
    innocent_texts = []

    for conversation_dict in tqdm(pj_ds):
      # try:
        conversation_df = conversation_dict['conversation'][~conversation_dict['conversation'].USERNAME.isnull() & ~conversation_dict['conversation'].BODY.isnull()]
        authors = conversation_df['USERNAME'].unique()
        for author in authors:
          author_side_text = ' '.join(conversation_df[conversation_df.USERNAME == author]['BODY'])
          if author in conversation_dict['predator'].iloc[0].USERNAME:
              predator_texts.append(author_side_text)
          else:
              innocent_texts.append(author_side_text)
      # except:
      #   print(f'failed to parse conversation: {conversation_dict["conversation_id"]}')
      #   print(f'author: {author}, predator: {conversation_dict["predator"].iloc[0].USERNAME}')

    predator_texts_df = pd.DataFrame(predator_texts, columns=['text'])
    predator_texts_df['predator'] = np.ones(len(predator_texts_df))
    innocent_texts_df = pd.DataFrame(innocent_texts, columns=['text'])
    innocent_texts_df['predator'] = np.zeros(len(innocent_texts_df))

    pj_conversations_df = pd.concat([predator_texts_df, innocent_texts_df]).reset_index(drop=True)
    pj_conversations_df.to_csv(PJ_CONVERSATIONS_RAW_CSV, header=True)

elif CREATE_FULL_PJ_CONVERSATIONS_DATAFRAME == 'Load':
    print('Loading dataset...')
    pj_conversations_df = pd.read_csv(PJ_CONVERSATIONS_RAW_CSV)


# Remove conversations with leq than 10 words
pj_conversations_df['text_len'] = pj_conversations_df['text'].progress_apply(lambda text: len(text.split()))
pj_conversations_df = pj_conversations_df[pj_conversations_df.text_len > 15]

print(f'Number of predator texts: {len(pj_conversations_df[pj_conversations_df.predator == True])}')
print(f'Number of innocent texts: {len(pj_conversations_df[pj_conversations_df.predator == False])}')

Loading dataset...


  0%|          | 0/123 [00:00<?, ?it/s]

Number of predator texts: 57
Number of innocent texts: 65


In [14]:
pj_conversations_df

Unnamed: 0.1,Unnamed: 0,text,predator,text_len
0,0,hi very pretty pic im david hope i didnt bothe...,1.0,5738
1,1,hi jason here 8-) nice to meet you tony hows i...,1.0,166
2,2,hi me too how are you whrerr are you exactly r...,1.0,150
3,3,:-) hi 19 m fox valley do u want me to cum? ca...,1.0,307
4,4,yim? yes? i am in instant messenger cool...you...,1.0,411
...,...,...,...,...
118,118,ohmy that's funny ok ok lol that's funny i've ...,0.0,9551
119,119,"hi hi renee,are you there >:d< yes your beauti...",0.0,239
120,120,"hi are you there hi, luv bug watching tv and u...",0.0,181
121,121,hi hi hi lol good i was just messin with u wha...,0.0,13135


#### Preprocess PJ dataframe

In [15]:
import warnings
warnings.filterwarnings('ignore')

PJ_PREPROCESSED_CSV = OUTPUT_FOLDER / Path('pj_preprocessed_full.csv')

if PREPROCESS_FULL_PJ_DATAFRAME == 'Process':
    print('Running preprocess...')

    preprocess_args = {'stemmer': PorterStemmer(),
                        'speller': SpellChecker(),
                        'words_to_remove': set(stopwords.words('english')),
                        'emoticons': emoticons,
                        'chat_slang': chat_slang,
                        }

    pj_preprocessed_df = preprocess_df_for_bow(pj_conversations_df, 'text', output_col_name='preprocessed_bow', **preprocess_args)
    pj_preprocessed_df.to_csv(PJ_PREPROCESSED_CSV)        

elif PREPROCESS_FULL_PJ_DATAFRAME == 'Load':
    print('Loading preprocessed data...')
    pj_preprocessed_df = pd.read_csv(PJ_PREPROCESSED_CSV, index_col=0)


pj_preprocessed_df = pj_preprocessed_df[pj_preprocessed_df.preprocessed_bow.notna()].reset_index(drop=True)
pj_preprocessed_df

Loading preprocessed data...


Unnamed: 0,text,predator,text_len,preprocessed_bow
0,hi very pretty pic im david hope i didnt bothe...,1.0,5738,very pretty pic bother how old again sure read...
1,hi jason here 8-) nice to meet you tony hows i...,1.0,166,here nice meet how s go bro lookin fun long te...
2,hi me too how are you whrerr are you exactly r...,1.0,150,too how where exactly right so do eve day here...
3,:-) hi 19 m fox valley do u want me to cum? ca...,1.0,307,want cum cause watchin cam so where fall about...
4,yim? yes? i am in instant messenger cool...you...,1.0,411,instant messenger cool look damn sexy male chi...
...,...,...,...,...
116,ohmy that's funny ok ok lol that's funny i've ...,0.0,9551,ofmy funny laugh loud funny ve see on here lik...
117,"hi hi renee,are you there >:d< yes your beauti...",0.0,239,renee there beautiful want sex want first woul...
118,"hi are you there hi, luv bug watching tv and u...",0.0,181,there bug watch really watch too now watch hap...
119,hi hi hi lol good i was just messin with u wha...,0.0,13135,laugh loud good just messin s way look lot ode...


### Pan12 dataset

#### Pan12 Conversation  level dataset for train data (No line labels)

In [16]:
class Pan12Dataset(Dataset):
    '''
    Wrapper around Torch Dataset.
    Prepares an indexed list of Pan12 conversation in a folder, returns conversations per index (like an array)
    Load is lazy - loads conversation from disk on request.
    Uses load_one_chat_as_df_pj() for conversation loading
    '''

    def __init__(self, chat_data_file: Path, user_labels_file: Path=None):
        """
        Args:
            chat_data_file: path to chat xml file
            conversation_labels:
            line_labels:  
        """
        self.chat_data_file = chat_data_file
        self.conversations = self._get_conversation_roots(chat_data_file)
        self.preprocess_args = preprocess_args
        self.user_labels_file = user_labels_file
        self.TEXT_COLUMN_NAME = 'text'

        # Create sets of problematic lines and authors for labels
        user_labels = pd.read_csv(user_labels_file, header=None)
        self.perverted_authors = set(user_labels[0])

                
    def __len__(self) -> int:
        """
        Returns:
            int: length of the dataset
        """
        return len(self.conversations)

    def __getitem__(self, idx) -> Dict[str, pd.DataFrame]:
        """Gets element of the dataset

        Args:
            index (int): index of the element in the dataset
        Returns:
            Single element by index
        """        

        conversation = self.conversations[idx]
        conversation_id = conversation.attrib['id']
        conversation_list = []

        for message in conversation.findall('message'):
            message_list = [message.attrib['line']]
            for field in message:
                message_list.append(field.text)
            
            conversation_list.append(message_list)

        conversation_df = pd.DataFrame(conversation_list, columns = ['line', 'author', 'time', 'text'])
        conversation_df = conversation_df.dropna()
        
        if self.user_labels_file is not None:
            chat_predetors = [author for author in conversation_df.author.unique() if author in self.perverted_authors]
            result = {'conversation_id': conversation_id, 'conversation': conversation_df, 'predators': chat_predetors}
        else:
            result = {'conversation_id': conversation_id, 'conversation': conversation_df}

        return result
    
    def _get_conversation_roots(self, file_path):
        doc_tree = ET.parse(file_path)
        conversation_roots = doc_tree.getroot().findall('conversation')
        return conversation_roots


#### Pan12 TRAIN converter

This section creates a dataset of single sided chat strings and tags them as predator or innocent.
I.e. - each 2-party conversation will create two samples - one of each party, and each party will be tagged separately.
The samples of predators are not necessary predatory in every sample, sinnce the label is on the person and not on the specific chat.

In [17]:
PAN12_TRAIN_RAW_CSV = OUTPUT_FOLDER / Path('pan12_raw_full.csv')

if CREATE_FULL_PAN12_DATAFRAME == 'Process':
    print('Running process...')


    pan12_train_ds = Pan12Dataset(PAN12_TRAIN_DATA_FILE, PAN12_TRAIN_USER_LABELS_FILE)
    predator_texts = []
    innocent_texts = []

    for conversation_dict in tqdm(pan12_train_ds):
        conversation_df = conversation_dict['conversation']
        authors = conversation_df['author'].unique()
        for author in authors:
            author_side_text = ' '.join(conversation_df[conversation_df.author == author]['text'])
            if author in conversation_dict['predators']:
                predator_texts.append(author_side_text)
            else:
                innocent_texts.append(author_side_text)

    predator_texts_df = pd.DataFrame(predator_texts, columns=['text'])
    predator_texts_df['predator'] = np.ones(len(predator_texts_df))
    innocent_texts_df = pd.DataFrame(innocent_texts, columns=['text'])
    innocent_texts_df['predator'] = np.zeros(len(innocent_texts_df))

    pan12_train_df = pd.concat([predator_texts_df, innocent_texts_df]).reset_index(drop=True)
    pan12_train_df.to_csv(PAN12_TRAIN_RAW_CSV, header=True)

elif CREATE_FULL_PAN12_DATAFRAME == 'Load':
    print('Loading dataset...')
    pan12_train_df = pd.read_csv(PAN12_TRAIN_RAW_CSV)


# Remove conversations with leq than 10 words
pan12_train_df['text_len'] = pan12_train_df['text'].progress_apply(lambda text: len(text.split()))
pan12_train_df = pan12_train_df[pan12_train_df.text_len > 15]

print(f'Number of predator texts: {len(pan12_train_df[pan12_train_df.predator == True])}')
print(f'Number of innocent texts: {len(pan12_train_df[pan12_train_df.predator == False])}')

Loading dataset...


  0%|          | 0/99502 [00:00<?, ?it/s]

Number of predator texts: 753
Number of innocent texts: 38250


In [18]:
import warnings
warnings.filterwarnings('ignore')

PAN12_TRAIN_PREPROCESSED_CSV = OUTPUT_FOLDER / Path('pan12_preprocessed_full.csv')

if PREPROCESS_FULL_PAN12_DATAFRAME == 'Process':
    print('Running preprocess...')

    preprocess_args = {'stemmer': PorterStemmer(),
                        'speller': SpellChecker(),
                        'words_to_remove': set(stopwords.words('english')),
                        'emoticons': emoticons,
                        'chat_slang': chat_slang,
                        }

    for i in range(0, 40):
      print(f'current batch: {i * 1000} - {(i+1) * 1000}')
      df = preprocess_df_for_bow(pan12_train_df.iloc[i * 1000:((i+1) * 1000), :], 'text', output_col_name='preprocessed_bow', **preprocess_args)
      PAN12_TRAIN_PREPROCESSED_CSV = OUTPUT_FOLDER / Path(f'pan12_preprocessed_full_{i}_{i+1}.csv')
      df.to_csv(PAN12_TRAIN_PREPROCESSED_CSV, header=True)
    
    # join and update Pan12 preprocessed csv parts - Required because the preprocessing was ran on several computers in parts
    PAN12_TRAIN_PREPROCESSED_CSV = OUTPUT_FOLDER / Path('pan12_preprocessed_full.csv')
    PAN12_TRAIN_PREPROCESSED_CSV_PARTS_FOLDER = OUTPUT_FOLDER / Path('pan12_preprocessed_parts')

    unified_df = unify_csv_dataframes_to_one_sorted(PAN12_TRAIN_PREPROCESSED_CSV_PARTS_FOLDER)
    # unified_df.preprocessed_bow = unified_df.preprocessed_bow.apply(lambda text: ''.join(ch for ch in text if ch not in ["'", "[", "]", ","]))

    unified_df.to_csv(PAN12_TRAIN_PREPROCESSED_CSV)        

elif PREPROCESS_FULL_PAN12_DATAFRAME == 'Load':
    print('Loading preprocessed data...')
    pan12_train_df = pd.read_csv(PAN12_TRAIN_PREPROCESSED_CSV, index_col=0)


pan12_train_df = pan12_train_df[pan12_train_df.preprocessed_bow.notna()].reset_index(drop=True)
pan12_train_df

Loading preprocessed data...


Unnamed: 0,text,predator,text_len,preprocessed_bow
0,hey!! a little better what are u doing? yea i ...,1.0,320,little better do yea think just wake check see...
1,hello hey where r u from in nj?? same here cnj...,1.0,468,where same here co so bring room work old here...
2,hi liz nothing much...how have u been doin any...,1.0,84,much how do good ok wrong finish homework good...
3,Hello Whatcha doin? Oh yeah hows brad and darl...,1.0,45,do how s loud laugh loud ink school kinda bori...
4,i came to tonapah and you were't on left at 6 ...,1.0,24,come on leave back talk later
...,...,...,...,...
35917,65325d50b2e25aca54bc871b89758c9c: For the poll...,0.0,49,poll vote member vote represent understanding ...
35918,"Ooh, impressive, 3 out of 3 of the times liste...",0.0,29,impressive time list new reminder email correc...
35919,hello asl 21m uk u wasuup omg me too lol rape ...,0.0,221,uk omg too laugh loud rape just start jude lau...
35920,heyy asl? 17 f australia&apos; mehh not much b...,0.0,177,much blast nirvana bedroom laugh haha love buz...


### Unify datasets - entire pan + positives from pj

In [19]:
pan_pj_unified_df = pd.concat([pj_preprocessed_df[pj_preprocessed_df.predator == 1], pan12_train_df])

In [20]:
pan_pj_unified_df

Unnamed: 0,text,predator,text_len,preprocessed_bow
0,hi very pretty pic im david hope i didnt bothe...,1.0,5738,very pretty pic bother how old again sure read...
1,hi jason here 8-) nice to meet you tony hows i...,1.0,166,here nice meet how s go bro lookin fun long te...
2,hi me too how are you whrerr are you exactly r...,1.0,150,too how where exactly right so do eve day here...
3,:-) hi 19 m fox valley do u want me to cum? ca...,1.0,307,want cum cause watchin cam so where fall about...
4,yim? yes? i am in instant messenger cool...you...,1.0,411,instant messenger cool look damn sexy male chi...
...,...,...,...,...
35917,65325d50b2e25aca54bc871b89758c9c: For the poll...,0.0,49,poll vote member vote represent understanding ...
35918,"Ooh, impressive, 3 out of 3 of the times liste...",0.0,29,impressive time list new reminder email correc...
35919,hello asl 21m uk u wasuup omg me too lol rape ...,0.0,221,uk omg too laugh loud rape just start jude lau...
35920,heyy asl? 17 f australia&apos; mehh not much b...,0.0,177,much blast nirvana bedroom laugh haha love buz...


## Feature engineering

### Word-list based features

In [21]:
def contains_words_from_list(text: str, word_list: List[str])-> bool:
    text_words = re.sub("[^\w]", " ",  text).split()
    if any(word in word_list for word in text_words):
        return True
    else:
        return False


def count_words_from_list(text: str, word_list: List[str])-> int:
    text_words = text.split()
    word_counts = pd.value_counts(np.array(text_words))
    count = word_counts[set(word_counts.index) & set(word_list)].sum()
    return count


def add_wordlist_features(df: pd.DataFrame, text_column: str, sex_word_list, family_word_list, meeting_word_list):
    df['num_sex_words'] = df[text_column].progress_apply(lambda text: count_words_from_list(text, sex_word_list))
    df['num_family_words'] = df[text_column].progress_apply(lambda text: count_words_from_list(text, family_word_list))
    df['num_meeting_words'] = df[text_column].progress_apply(lambda text: count_words_from_list(text, meeting_word_list))
    return df


In [26]:
pan_pj_unified_df = add_wordlist_features(pan_pj_unified_df, 'preprocessed_bow', sex_word_list, family_word_list, meeting_word_list)
pan_pj_unified_df

Unnamed: 0,text,predator,text_len,preprocessed_bow,num_sex_words,num_family_words,num_meeting_words
0,hi very pretty pic im david hope i didnt bothe...,1.0,5738,very pretty pic bother how old again sure read...,97,7,15
1,hi jason here 8-) nice to meet you tony hows i...,1.0,166,here nice meet how s go bro lookin fun long te...,0,2,3
2,hi me too how are you whrerr are you exactly r...,1.0,150,too how where exactly right so do eve day here...,5,0,1
3,:-) hi 19 m fox valley do u want me to cum? ca...,1.0,307,want cum cause watchin cam so where fall about...,14,1,4
4,yim? yes? i am in instant messenger cool...you...,1.0,411,instant messenger cool look damn sexy male chi...,11,0,1
...,...,...,...,...,...,...,...
35917,65325d50b2e25aca54bc871b89758c9c: For the poll...,0.0,49,poll vote member vote represent understanding ...,0,0,0
35918,"Ooh, impressive, 3 out of 3 of the times liste...",0.0,29,impressive time list new reminder email correc...,0,0,0
35919,hello asl 21m uk u wasuup omg me too lol rape ...,0.0,221,uk omg too laugh loud rape just start jude lau...,3,0,0
35920,heyy asl? 17 f australia&apos; mehh not much b...,0.0,177,much blast nirvana bedroom laugh haha love buz...,1,0,0


## Models

### LDA topic model

In [27]:
# BOW and corpush preparation

ID2WORD_MODEL_PATH = OUTPUT_FOLDER / Path('lda_id2word.bin')
CORPUS_PATH = OUTPUT_FOLDER / Path('lda_corpus.bin')

# Create bag of words list from chat sentences
def create_bow_from_text(text):
    bow_words = []
    text_word_list = gensim.utils.simple_preprocess(text, deacc=True)
    bow_words.append(text_word_list)
    return bow_words[0]


def create_bow_from_text_list(text_list):
    bow_words = []
    for i, text in enumerate(text_list):
        # text_word_list = gensim.utils.simple_preprocess(text, deacc=True)
        # bow_words.append(text_word_list)
        bow_words.append(create_bow_from_text(text))
        
    return bow_words


# Process the Pan12 dataset
if PAN12_TOPIC_MODEL_CORPUS == 'Process':
  print('Calculating BOW and corpus...')

  bow_words = create_bow_from_text_list(pan_pj_unified_df['preprocessed_bow'])

  # create word indexes and frequencies from chat sentences bow
  id2word = corpora.Dictionary(bow_words)
  corpus = []
  for word in bow_words:
      corpus.append(id2word.doc2bow(word))
    
  # Save models to disk
  id2word.save(str(ID2WORD_MODEL_PATH))
  with open(CORPUS_PATH, 'wb') as handle:
    pickle.dump(corpus, handle)

elif PAN12_TOPIC_MODEL_CORPUS == 'Load':
  print('Loading BOW and corpus from disk...')
  id2word = corpora.Dictionary.load(str(ID2WORD_MODEL_PATH))
  with open(CORPUS_PATH, 'rb') as handle:
    corpus = pickle.load(handle)

Calculating BOW and corpus...


In [28]:
# Actual LDA topic model

def create_lda_model(corpus, id2word, num_topics, save_to_disk=True, save_model_path=None):
  print(f'Calculating LDA model with {num_topics} topics...')
  # create LDA topic model
  lda_model = gensim.models.LdaMulticore(workers=3, corpus=corpus,
  id2word=id2word,
  num_topics=num_topics, 
  chunksize=100,
  passes=10)

  # lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
  # id2word=id2word,
  # num_topics=num_topics, 
  # update_every=1,
  # chunksize=100,
  # passes=10,
  # alpha='auto')

  # Save models to disk
  if save_to_disk:
    lda_model.save(str(save_model_path))

  return lda_model


# NUM_LDA_TOPICS = 10
# LDA_MODEL_PATH = OUTPUT_FOLDER / Path(f'lda_model_{NUM_LDA_TOPICS}.bin')

# if PAN12_TOPIC_MODEL == 'Process':
#   lda_model = create_lda_model(corpus, id2word, NUM_LDA_TOPICS, save_to_disk=True)
    
 
# elif PAN12_TOPIC_MODEL == 'Load':
#   print('Loading LDA model from disk...')
#   lda_model = gensim.models.ldamodel.LdaModel.load(str(LDA_MODEL_PATH))

#   NUM_LDA_TOPICS = len(lda_model.get_topics())
#   print(f'Loaded model has {NUM_LDA_TOPICS} topics')


In [29]:
# # visualize
# vis = gensimvis.prepare(lda_model, corpus, id2word, mds='mmds', R=30)

In [30]:
# Topics inference function
def get_text_topics(text, id2word, lda_model):
  text_vector = id2word.doc2bow(text.split())
  text_topics = lda_model.get_document_topics(text_vector)
  return text_topics

In [31]:
# for test only: show topics for some sample sentences:
preprocess_args = {'stemmer': PorterStemmer(),
                    'speller': SpellChecker(),
                    'words_to_remove': set(stopwords.words('english')),
                    'emoticons': emoticons,
                    'chat_slang': chat_slang,
                    }


def display_text_topics(text: str, preprocess_args):
  text_preprocessed  = preprocess_string_for_bow(text, **preprocess_args)
  text_topics = get_text_topics(text_preprocessed, id2word, lda_model)
  print(text)
  print(text_topics)
  print()


# sample = 'Tell your mom she is a cunt with a nice ass fuck her and suck you too'
# display_text_topics(sample, preprocess_args)

# sample = 'I would like to go to the cinema and have popcorn'
# display_text_topics(sample, preprocess_args)

# sample = pan_pj_unified_df['preprocessed_bow'][100]
# display_text_topics(sample, preprocess_args)

# sample = pan_pj_unified_df['preprocessed_bow'][1000]
# display_text_topics(sample, preprocess_args)


### Data prep for classifier

#### Add topics to data

In [32]:
def add_topics_to_df(df, id2word, lda_model):
  num_lda_topics = len(lda_model.get_topics())
  topic_scores = np.zeros([len(df), num_lda_topics])

  for row_idx, row in tqdm(df.iterrows()):
    row_topics = get_text_topics(row.preprocessed_bow, id2word, lda_model)
    for topic in row_topics:
      topic_scores[row_idx, topic[0]] = topic[1]
  
  for topic_idx in range(num_lda_topics):
    df[f'topic_{topic_idx}'] = topic_scores[:, topic_idx]
  
  return df


# pan_pj_unified_df = add_topics_to_df(df, id2word, lda_model)
# pan_pj_unified_df


#### Train / Test split

In [33]:
# train test split
def dataprep_for_classifier(df, precent_train, num_lda_topics, reandom_seed=17):

  random.seed(reandom_seed)
  x_columns = [f'topic_{i}' for i in range(num_lda_topics)] + ['num_sex_words', 'num_family_words', 'num_meeting_words']
  y_column = ['predator']
  data_columns = x_columns + y_column

  # Split Train / Test
  num_samples_positive = len(df[df.predator == 1])
  num_samples_negative = len(df) - num_samples_positive

  num_train_positive_samples = int(precent_train * num_samples_positive)
  num_train_negative_samples = int(precent_train * num_samples_negative)

  train_pos_index = random.sample(range(num_samples_positive),  num_train_positive_samples)
  test_pos_index = list(set(range(num_samples_positive)).difference(train_pos_index))
  print(f'testing: {num_samples_positive} = {len(train_pos_index) + len(test_pos_index)}')

  train_neg_index = random.sample(range(num_samples_negative),  num_train_negative_samples)
  test_neg_index = list(set(range(num_samples_negative)).difference(train_neg_index))
  print(f'testing: {num_samples_negative} = {len(train_neg_index) + len(test_neg_index)}')

  data_train_pos = df[df.predator == 1].iloc[train_pos_index][data_columns]
  data_train_neg = df[df.predator == 0].iloc[train_neg_index][data_columns]
  data_train = pd.concat([data_train_pos, data_train_neg], axis=0).sample(frac=1)
  data_train.predator = data_train.predator.astype('int')

  data_test_pos = df[df.predator == 1].iloc[test_pos_index][data_columns]
  data_test_neg = df[df.predator == 0].iloc[test_neg_index][data_columns]
  data_test = pd.concat([data_test_pos, data_test_neg], axis=0).sample(frac=1)
  data_test.predator = data_test.predator.astype('int')

  x_train = data_train[x_columns]
  y_train = data_train[y_column]
  x_test = data_test[x_columns]
  y_test = data_test[y_column]

  return x_train, y_train, x_test, y_test



# PERCENT_TRAIN = 0.8
# x_train, y_train, x_test, y_test = dataprep_for_classifier(pan_pj_unified_df, PERCENT_TRAIN, NUM_LDA_TOPICS)
# print(x_train.head(), '\n', y_train.head())
# print(x_test.head(), '\n', y_test.head())

### Random forest classifier on topic model

#### RF model calculation

In [34]:
from sklearn.ensemble import RandomForestClassifier

def calc_random_forest(x_train, y_train, n_estimators, max_depth, class_weight=None, save_to_file=True, output_path=None, random_state=17):
  # Random Forest train
  # print('Processing random forest model...')

  # Setting random state to get reproducible results
  rfc = RandomForestClassifier(n_jobs=4, n_estimators=n_estimators, max_depth=max_depth, class_weight=class_weight, random_state=random_state)
  rfc.fit(x_train, y_train)

  if save_to_file:
    # Save model to disk
    with open(output_path, 'wb') as file_handler:
        pickle.dump(rfc, file_handler)
  
  return rfc



# N_ESTIMATORS = 50
# MAX_DEPTH = 4
# CLASS_WEIGHT='balanced'

# LDA_MODEL_RF_PATH = OUTPUT_FOLDER / Path(f'lda_rf_model_{NUM_LDA_TOPICS}_{N_ESTIMATORS}_{MAX_DEPTH}_{CLASS_WEIGHT}.bin')

# # Random Forest train
# if PAN12_TOPIC_MODEL_RF == 'Process':
#   print('Processing random forest model...')
#   rfc = calc_random_forest(x_train, y_train, N_ESTIMATORS, MAX_DEPTH, class_weight=CLASS_WEIGHT, save_to_file=True, output_path=LDA_MODEL_RF_PATH, random_state=17)


# elif PAN12_TOPIC_MODEL_RF == 'Load':
#   print('Loading random forest model from disk...')
#   with open(LDA_MODEL_RF_PATH, 'rb') as file_handler:
#     rfc = pickle.load(file_handler)

In [35]:
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

def display_roc_curve_and_get_perfs(y_test, y_test_pred, plot=True):
  # Compute fpr, tpr, thresholds and roc auc
  REQUIRED_FPR = 0.05

  test_fpr, test_tpr, thresholds = roc_curve(y_test, y_test_pred, drop_intermediate=False)
  test_roc_auc = auc(test_fpr, test_tpr)

  fpr_idx = np.argmax(test_fpr >= REQUIRED_FPR)
  required_tpr = test_tpr[fpr_idx]

  if plot:
    # Plot ROC curve
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(test_fpr, test_tpr, label='Test ROC curve (area = %0.3f)' % test_roc_auc)
    ax.vlines(REQUIRED_FPR, 0, required_tpr, color="r", lw=2)
    ax.hlines(required_tpr, REQUIRED_FPR, 0, color="r", lw=2, label=f'FPR {REQUIRED_FPR}, TPR {required_tpr}')

    ax.set_xlim([0, 1.0])
    ax.set_ylim([0.0, 1.0])
    ax.set_yticks(np.arange(22) / 20.0)
    ax.set_xlim([4e-5, 1.0])

    ax.set_xlabel("False positive rate")
    ax.set_ylabel("True positive rate")
    ax.set_title('Receiver Operating Characteristic')
    ax.legend(loc="lower right")
    ax.set_xscale("log")
    plt.gca().grid(True)
    plt.show()

  return test_roc_auc, required_tpr


# y_test_pred = rfc.predict_proba(x_test)[:, 1]
# test_roc_auc, tpr = display_roc_curve_and_get_perfs(y_test, y_test_pred)


#### Grid search best hyperparams

In [None]:
# Train or load the random forest model

RFC_TRAIN_RESULTS_CSV_PATH = OUTPUT_FOLDER / 'rfc_train_results.csv'
CLASS_WEIGHT='balanced'

if GRID_SEARCH_LDA_RFC == 'Process':
  PERCENT_TRAIN = 0.8

  # Grid search parmaeters
  train_results = pd.DataFrame(columns=['topics', 'trees', 'depth', 'auc', 'tpr'])

  topics = [9, 10, 20, 35, 50]
  for num_topics in tqdm(topics):
    # Build LDA model
    LDA_MODEL_PATH = OUTPUT_FOLDER / Path(f'lda_model_{int(num_topics)}.bin')
    if LDA_MODEL_PATH.is_file():
      lda_model = gensim.models.ldamodel.LdaModel.load(str(LDA_MODEL_PATH))
    else:
      lda_model = create_lda_model(corpus, id2word, num_topics, save_to_disk=True, save_model_path=LDA_MODEL_PATH)
    
    # update dataframe with topics
    df = add_topics_to_df(pan_pj_unified_df, id2word, lda_model).copy()

    # dataprep
    x_train, y_train, x_test, y_test = dataprep_for_classifier(df, PERCENT_TRAIN, num_topics)

    min_depth = 3
    max_depth = int(num_topics * 2/3)
    num_depths = min(max_depth - min_depth, 5)
    depths_for_num_topics = np.linspace(min_depth, max_depth, num_depths, dtype=int)

    for depth in tqdm(depths_for_num_topics):
      min_trees=50
      max_trees = 300
      trees = [num for num in np.linspace(min_trees, max_trees, 5, dtype=int)]

      for num_trees in tqdm(trees):
        # build RFC
        LDA_MODEL_RF_PATH = OUTPUT_FOLDER / Path(f'lda_rf_model_{int(num_topics)}_{int(num_trees)}_{int(depth)}_{CLASS_WEIGHT}.bin')
        if LDA_MODEL_RF_PATH.is_file():
          with open(LDA_MODEL_RF_PATH, 'rb') as file_handler:
            rfc = pickle.load(file_handler)
        else:
          rfc = calc_random_forest(x_train, y_train, num_trees, depth, class_weight=CLASS_WEIGHT, save_to_file=True, output_path=LDA_MODEL_RF_PATH, random_state=17)
        # Test
        y_test_pred = rfc.predict_proba(x_test)[:, 1]
        test_roc_auc, tpr = display_roc_curve_and_get_perfs(y_test, y_test_pred, plot=False)

        train_results = train_results.append({'topics': num_topics, 'trees': num_trees, 'depth': depth, 'auc': test_roc_auc, 'tpr': tpr}, ignore_index=True)
    train_results.to_csv(RFC_TRAIN_RESULTS_CSV_PATH)

elif GRID_SEARCH_LDA_RFC == 'Load':
  print('Loading train results from disk...')
  train_results = pd.read_csv(RFC_TRAIN_RESULTS_CSV_PATH, index_col=0)

print('10 best results based on TPR:')
print(train_results.sort_values('tpr', ascending=False).head(10))
print()

# Load best configuration back to memory
print('Loading best model set from disk...')
best_config = train_results.sort_values('tpr', ascending=False).iloc[0, :].to_dict()

LDA_MODEL_PATH = OUTPUT_FOLDER / Path(f'lda_model_{int(best_config["topics"])}.bin')
LDA_MODEL_RF_PATH = OUTPUT_FOLDER / Path(f'lda_rf_model_{int(best_config["topics"])}_{int(best_config["trees"])}_{int(best_config["depth"])}_{CLASS_WEIGHT}.bin')

print(f'Loading model {str(LDA_MODEL_PATH)}...', end='')
if LDA_MODEL_PATH.is_file():
  lda_model = gensim.models.LdaMulticore.load(str(LDA_MODEL_PATH))
  print('Done!')


print(f'Loading model {str(LDA_MODEL_RF_PATH)}...', end='')
if LDA_MODEL_RF_PATH.is_file():
  with open(LDA_MODEL_RF_PATH, 'rb') as file_handler:
    rfc = pickle.load(file_handler)
  print('Done!')


  0%|          | 0/5 [00:00<?, ?it/s]

Calculating LDA model with 9 topics...


### Gradient boost on topic model

#### XGB model calculation

In [None]:
from xgboost import XGBClassifier

def calc_grad_boost(x_train, y_train, n_estimators, max_depth, save_to_file=True, output_path=None, random_state=17):
  # Gradient boost train

  xgb = XGBClassifier(n_estimators=n_estimators, max_depth=max_depth)
  xgb.fit(x_train, y_train)

  if save_to_file:
    # Save model to disk
    with open(output_path, 'wb') as file_handler:
        pickle.dump(xgb, file_handler)
  
  return xgb

#### Grid search best hyperparameters

In [None]:
# Train or load the Gradient boost model

GB_TRAIN_RESULTS_CSV_PATH = OUTPUT_FOLDER / 'gb_train_results.csv'

if GRID_SEARCH_LDA_GB == 'Process':
  PERCENT_TRAIN = 0.8

  # Grid search parmaeters
  train_results = pd.DataFrame(columns=['topics', 'trees', 'depth', 'auc', 'tpr'])

  topics = [9, 10, 20, 35, 50]
  for num_topics in tqdm(topics):
    # Build LDA model
    LDA_MODEL_PATH = OUTPUT_FOLDER / Path(f'lda_model_{int(num_topics)}.bin')
    if LDA_MODEL_PATH.is_file():
      lda_model = gensim.models.ldamodel.LdaModel.load(str(LDA_MODEL_PATH))
    else:
      lda_model = create_lda_model(corpus, id2word, num_topics, save_to_disk=True, save_model_path=LDA_MODEL_PATH)
    
    # update dataframe with topics
    df = add_topics_to_df(pan_pj_unified_df, id2word, lda_model).copy()
    # dataprep
    x_train, y_train, x_test, y_test = dataprep_for_classifier(df, PERCENT_TRAIN, num_topics)

    min_depth = 3
    max_depth = int(num_topics * 2/3)
    num_depths = min(max_depth - min_depth, 5)
    depths_for_num_topics = np.linspace(min_depth, max_depth, num_depths, dtype=int)

    for depth in tqdm(depths_for_num_topics):
      min_trees=50
      max_trees = 300
      trees = [num for num in np.linspace(min_trees, max_trees, 5, dtype=int)]

      for num_trees in tqdm(trees):
        # build GB
        LDA_MODEL_GB_PATH = OUTPUT_FOLDER / Path(f'lda_gb_model_{int(num_topics)}_{int(num_trees)}_{int(depth)}.bin')
        if LDA_MODEL_GB_PATH.is_file():
          with open(LDA_MODEL_GB_PATH, 'rb') as file_handler:
            gb = pickle.load(file_handler)
        else:
          gb = calc_grad_boost(x_train, y_train, num_trees, depth, save_to_file=True, output_path=LDA_MODEL_GB_PATH, random_state=17)
        # Test
        y_test_pred = gb.predict_proba(x_test)[:, 1]
        test_roc_auc, tpr = display_roc_curve_and_get_perfs(y_test, y_test_pred, plot=False)

        train_results = train_results.append({'topics': num_topics, 'trees': num_trees, 'depth': depth, 'auc': test_roc_auc, 'tpr': tpr}, ignore_index=True)
    train_results.to_csv(GB_TRAIN_RESULTS_CSV_PATH)

elif GRID_SEARCH_LDA_GB == 'Load':
  print('Loading train results from disk...')
  train_results = pd.read_csv(GB_TRAIN_RESULTS_CSV_PATH, index_col=0)

print('10 best results based on TPR:')
print(train_results.sort_values('tpr', ascending=False).head(10))
print()

# Load best configuration back to memory
print('Loading best model set from disk...')
best_config = train_results.sort_values('tpr', ascending=False).iloc[0, :].to_dict()

LDA_MODEL_PATH = OUTPUT_FOLDER / Path(f'lda_model_{int(best_config["topics"])}.bin')
LDA_MODEL_GB_PATH = OUTPUT_FOLDER / Path(f'lda_gb_model_{int(best_config["topics"])}_{int(best_config["trees"])}_{int(best_config["depth"])}.bin')

print(f'Loading model {str(LDA_MODEL_PATH)}...', end='')
if LDA_MODEL_PATH.is_file():
  lda_model = gensim.models.LdaMulticore.load(str(LDA_MODEL_PATH))
  print('Done!')


print(f'Loading model {str(LDA_MODEL_GB_PATH)}...', end='')
if LDA_MODEL_GB_PATH.is_file():
  with open(LDA_MODEL_GB_PATH, 'rb') as file_handler:
    gb = pickle.load(file_handler)
  print('Done!')


### Test final model

#### Visualize toppic model on train data

In [None]:
# visualize topic model
vis = gensimvis.prepare(lda_model, corpus, id2word, mds='mmds', R=30)
vis

In [None]:
# text_vector = id2word.doc2bow(['fuck'])
# # print(lda_model.get_document_topics(text_vector))

# f = plt.figure()
# f.set_figwidth(20)
# f.set_figheight(20)

# # plt.imshow(WordCloud().fit_words(dict(lda_model.show_topic(24, 300))), )
# lda_model.show_topic(23, 300)

#### Test model on entire dataset

In [None]:
# dataprep
df = add_topics_to_df(pan_pj_unified_df, id2word, lda_model)

num_lda_topics = len(lda_model.get_topics())
x_columns = [f'topic_{i}' for i in range(num_lda_topics)]
y_column = ['predator']

x_unified_full = pan_pj_unified_df[x_columns]
y_unified_full = pan_pj_unified_df[y_column]

y_unified_full_pred = rfc.predict_proba(x_unified_full)[:, 1]

display_roc_curve_and_get_perfs(y_unified_full, y_unified_full_pred, plot=True)



#### Test podel on Customer data only

In [None]:
# update dataframe with topics
pj_preprocessed_df = add_topics_to_df(pj_preprocessed_df, id2word, lda_model)

# dataprep
num_lda_topics = len(lda_model.get_topics())
x_columns = [f'topic_{i}' for i in range(num_lda_topics)]
y_column = ['predator']

x_customer = pj_preprocessed_df[x_columns]
y_customer = pj_preprocessed_df[y_column]

y_customer_pred = rfc.predict_proba(x_customer)[:, 1]

display_roc_curve_and_get_perfs(y_customer, y_customer_pred, plot=True)

## Backup - not useful currently

#### TF/IDF - **Not used**

In [None]:
# Train TF/IDF model
tfidf_vectorizer = TfidfVectorizer(min_df = 5, max_df = 0.95)
tfidf_vectorizer.fit(pan12_train_df['preprocessed_bow'])

# Transform chat messages to vocabulary vectors
vectorized_data = tfidf_vectorizer.transform(pan12_train_df['preprocessed_bow'])
print(f'Vectorized data shape: {vectorized_data.shape}')

print(pan12_train_df['preprocessed_bow'][6])
print(vectorized_data[6])


In [None]:
# create dataframe of vectors
tfidf_df = pd.DataFrame(vectorized_data.toarray(), columns=tfidf_vectorizer.get_feature_names())
# tfidf_df[['sex', 'babe', 'young', 'age', 'dick']].sort_values('dick', ascending=False)


In [None]:
# https://melaniewalsh.github.io/Intro-Cultural-Analytics/05-Text-Analysis/03-TF-IDF-Scikit-Learn.html
#Visualize TF/IDF


#### pan12 line level dataloader **Not used**

In [None]:
class Pan12LineLevelDataloader():  
    """
    Wrapper around Torch Dataset to perform text classification
    """

    def __init__(self, chat_data_file: Path, user_labels_file: Path=None, line_labels_file: Path=None, preprocess_fn=None, preprocess_args:Dict=None):
        """
        Args:
            chat_data_file: path to chat xml file
            conversation_labels:
            line_labels:  
        """
       
        self.chat_data_file = chat_data_file
        self.conversations = self._get_conversation_roots(chat_data_file)
        self.preprocess_fn = preprocess_fn
        self.preprocess_args = preprocess_args

        self.user_labels_file = user_labels_file
        self.line_labels_file = line_labels_file
        self.TEXT_COLUMN_NAME = 'text'

        self.length = self._get_ds_length()
        self.num_conversations = len(self.conversations)

        # Initiate queue
        self.message_list = None
        self.current_conversation_id = None
        self.next_conversation_idx = 0
        self.next_message_idx = 0

        # Create sets of problematic lines and authors for labels
        user_labels = pd.read_csv(user_labels_file, delimiter='\t', header=None)
        self.perverted_authors = set(user_labels[0])

        line_labels = pd.read_csv(line_labels_file, delimiter='\t', header=None)
        line_labels['concat'] = line_labels[0] + '_' + line_labels[1].astype(str)
        self.pervert_lines = set(line_labels['concat'])

        self.load_next_conversation_to_list()
                       
    def __iter__(self):
        return self

    def __len__(self) -> int:
        """
        Returns:
            int: length of the dataset
        """
        return self.length

    def __next__(self) -> Dict[str, pd.DataFrame]:
        """Gets element of the dataset

        Args:
            index (int): index of the element in the dataset
        Returns:
            Single element by index
        """        
        message_dict = {}
        try:
            message = self.message_list[self.next_message_idx]
        except(IndexError):
            self.load_next_conversation_to_list()
            message = self.message_list[self.next_message_idx]

        message_dict['conversation_id'] = self.current_conversation_id
        self.next_message_idx += 1
        
        message_dict['line'] = message.attrib['line']  

        for field in message:
            message_dict[field.tag] = field.text
        
        if self.preprocess_fn is not None:
            message_dict['text'] = self.preprocess_fn(message_dict['text'], **self.preprocess_args)
        
        message_dict['author_label'] = 1 if message_dict['author'] in self.perverted_authors else 0
        message_dict['line_label'] = 1 if message_dict['conversation_id'] + '_' + message_dict['line'] in self.pervert_lines else 0

        return message_dict
    
    def _get_conversation_roots(self, file_path):
        doc_tree = ET.parse(file_path)
        conversation_roots = doc_tree.getroot().findall('conversation')
        return conversation_roots

    def _get_ds_length(self):
        number_messages = 0
        for conversation in self.conversations:
            number_messages += len(conversation.findall('message'))
        
        return number_messages

    def load_next_conversation_to_list(self):
        try:
            conversation = self.conversations[self.next_conversation_idx] 
            self.current_conversation_id = conversation.attrib['id']  
        except(IndexError):
            raise StopIteration()

        self.next_conversation_idx += 1
        self.message_list = [m for m in conversation.findall('message')]
        self.next_message_idx = 0

In [None]:
# # Test dataset
# preprocess_args = {'stemmer': PorterStemmer(),
#                     'speller': SpellChecker(),
#                     'words_to_remove': set(stopwords.words('english')),
#                     'emoticons': emoticons,
#                     'chat_slang': chat_slang
#                     }

# pan12_ds = Pan12LineLevelDataloader(PAN12_TEST_DATA_FILE, user_labels_file=PAN12_USER_LABELS_FILE, line_labels_file=PAN12_LINE_LABELS_FILE, preprocess_fn=preprocess_string_for_bow, preprocess_args=preprocess_args)
# print(len(pan12_ds))

# for i, m in enumerate(pan12_ds):
#     print(i, m) 
#     if i==50:
#         break

#### Convert Pan12 to labeled datafreame for use later as Train data - **Not used**

In [None]:
# class Pan12converterToDF():

#     # Pan12 converter for TEST dataset - with line labels!
    
#     """
#     Wrapper around Torch Dataset to perform text classification
#     """

#     def __init__(self, chat_data_file: Path, user_labels_file: Path=None, line_labels_file: Path=None):
#         """
#         Args:
#             chat_data_file: path to chat xml file
#             conversation_labels:
#             line_labels:
#             mode:   full - all data 
#                     positive_lines - Only lines labeled as problematic
#         """
       
#         self.chat_data_file = chat_data_file
#         self.conversations = self._get_conversation_roots(chat_data_file)

#         self.user_labels_file = user_labels_file
#         self.line_labels_file = line_labels_file
#         self.TEXT_COLUMN_NAME = 'text'

#         self.length = self._get_ds_length()
#         self.num_conversations = len(self.conversations)

#         # Initiate queue
#         self.message_list = None
#         self.current_conversation_id = None
#         self.next_conversation_idx = 0
#         self.next_message_idx = 0

#         # Create sets of problematic lines and authors for labels
#         user_labels = pd.read_csv(user_labels_file, delimiter='\t', header=None)
#         self.perverted_authors = set(user_labels[0])

#         line_labels = pd.read_csv(line_labels_file, delimiter='\t', header=None)
#         line_labels['concat'] = line_labels[0] + '_' + line_labels[1].astype(str)
#         self.perverted_conversations = set(line_labels[0].unique())
#         self.pervert_lines = set(line_labels['concat'])


#     def __iter__(self):
#         return self

#     def __len__(self) -> int:
#         """
#         Returns:
#             int: length of the dataset
#         """
#         return self.length

#     def convert(self, filename:Path, save_every=2000, mode: str='full') -> pd.DataFrame:
#         """Gets element of the dataset

#         Args:
#             index (int): index of the element in the dataset
#         Returns:
#             Single element by index
#         """        
#         pan12_df = pd.DataFrame(columns=['conversation_id', 'line', 'author', 'time', 'text', 'line_label', 'author_label'])

#         self._load_next_conversation_to_list(mode) 

#         if(mode == 'full'):
#             iter_len = self.length
#         elif(mode == 'positive_lines'):
#             iter_len = len(self.pervert_lines)
        
#         for i in tqdm(range(iter_len)):
#             message_dict = {}
#             try:
#                 message = self.message_list[self.next_message_idx]
#             except(IndexError):
#                 self._load_next_conversation_to_list(mode)
#                 message = self.message_list[self.next_message_idx]
            
#             message_dict['conversation_id'] = self.current_conversation_id
#             self.next_message_idx += 1
            
#             message_dict['line'] = message.attrib['line']  
#             for field in message:
#                 message_dict[field.tag] = field.text
            
#             message_dict['author_label'] = 1 if message_dict['author'] in self.perverted_authors else 0
#             message_dict['line_label'] = 1 if message_dict['conversation_id'] + '_' + message_dict['line'] in self.pervert_lines else 0
            
#             pan12_df = pan12_df.append(message_dict, ignore_index=True)
#             if i % save_every == 0:
#                 pan12_df.to_csv(filename)
#                 print('.', end='')

#             # #######
#             # if i == 1001:
#             #     print(pan12_df.head(2001))
#             #     break
#             # ######
#         pan12_df.to_csv(filename)
#         return pan12_df
    
#     def _get_conversation_roots(self, file_path):
#         doc_tree = ET.parse(file_path)
#         conversation_roots = doc_tree.getroot().findall('conversation')
#         return conversation_roots

#     def _get_ds_length(self):
#         number_messages = 0
#         for conversation in self.conversations:
#             number_messages += len(conversation.findall('message'))
        
#         return number_messages

#     def _load_next_conversation_to_list(self, mode):
#         try:
#             conversation = self.conversations[self.next_conversation_idx] 
#             self.next_conversation_idx += 1
#             self.current_conversation_id = conversation.attrib['id']  

#             if mode == 'positive_lines':
#                 while self.current_conversation_id not in self.perverted_conversations:
#                     conversation = self.conversations[self.next_conversation_idx] 
#                     self.next_conversation_idx += 1
#                     self.current_conversation_id = conversation.attrib['id']  
     
#         except(IndexError):
#             raise StopIteration()

#         if mode == 'positive_lines':
#             self.message_list = [m for m in conversation.findall('message') if (self.current_conversation_id + '_' + m.attrib['line'] in self.pervert_lines)]
#         else:
#             self.message_list = [m for m in conversation.findall('message')]
#         self.next_message_idx = 0


In [None]:
# # PAN12_PERVERTED_LINES_CSV = OUTPUT_FOLDER / Path('pan12_perverted_lines_preprocessed.csv')
# PAN12_PERVERTED_LINES_CSV = OUTPUT_FOLDER / Path('pan12_full_lines_preprocessed.csv')

# PAN12_FULL_RAW_CSV = OUTPUT_FOLDER / Path('pan12_raw_full.csv')

# if CREATE_FULL_PAN12_DATAFRAME == 'Process':
#     # Create a dataframe of all pan12 test perverted lines
#     pan12_converter = Pan12converterToDF(PAN12_TEST_DATA_FILE, user_labels_file=PAN12_USER_LABELS_FILE, line_labels_file=PAN12_LINE_LABELS_FILE)
#     print(len(pan12_converter))
#     # pan12_df = pan12_converter.convert(PAN12_FULL_RAW_CSV, mode='positive_lines')
#     pan12_df = pan12_converter.convert(PAN12_FULL_RAW_CSV, mode='full')
#     print(f'lines in pan12_df: {len(pan12_df)}')

#     # Preprocess pan12 perverted lines only and save to csv
#     preprocess_args = {'stemmer': PorterStemmer(),
#                         'speller': SpellChecker(),
#                         'words_to_remove': set(stopwords.words('english')),
#                         'emoticons': emoticons,
#                         'chat_slang': chat_slang
#                         }

#     pan12_df = preprocess_df_for_bow(pan12_df, 'text', **preprocess_args)
#     pan12_df.to_csv(PAN12_PERVERTED_LINES_CSV)

#     # add features to pan12 df
#     pan12_df = add_wordlist_features(pan12_df, 'preprocessed_bow', sex_word_list, family_word_list, meeting_word_list)
#     pan12_df.to_csv(PAN12_PERVERTED_LINES_CSV)

# elif CREATE_FULL_PAN12_DATAFRAME == 'Load':
#     pan12_df = pd.read_csv(PAN12_PERVERTED_LINES_CSV)

# pan12_df = pan12_df.dropna()
# pan12_df

#### Load entire PJ dataset as single dataframe of chat lines - **Not used**

In [None]:

# class PjSentencesDataset(Dataset):
#     """
#     Wrapper around Torch Dataset.
#     Prepares an indexed list of PJ conversation in a folder, returns conversations per index (like an array)
#     Load is lazy - loads conversation from disk on request.
#     Uses load_one_chat_as_df_pj() for conversation loading
#     """

#     def __init__(self, data_folder: Path, df_preprocess_fn=None, df_preprocess_args:Dict=None):
#         """
#         Args:
#           data_folder - folder with PJ XML files
#           df_preprocess_fn - function that gets a dataframe and adds preprocesed text column based on given text column

#         """
       
#         self.file_list = list_files_in_dir(data_folder, extension='xml')
#         self.df_preprocess_fn = df_preprocess_fn
#         self.df_preprocess_args = df_preprocess_args
#         self.TEXT_COLUMN_NAME = 'BODY'

        
#     def __len__(self) -> int:
#         """
#         Returns:
#             int: length of the dataset
#         """
#         return len(self.file_list)

#     def __getitem__(self, idx):
#         """Gets element of the dataset

#         Args:
#             index (int): index of the element in the dataset
#         Returns:
#             Single element by index
#         """        
#         sample = load_one_chat_as_df_pj(self.file_list[idx])
#         if (self.df_preprocess_fn is not None) and (sample is not None):
#             sample['conversation'] = self.df_preprocess_fn(sample['conversation'], self.TEXT_COLUMN_NAME, **self.df_preprocess_args)

#         return sample

    

In [None]:
# # Test the dataset
# preprocess_args = {'stemmer': PorterStemmer(),
#                     'speller': SpellChecker(),
#                     'words_to_remove': set(stopwords.words('english')),
#                     'emoticons': emoticons,
#                     'chat_slang': chat_slang,
#                     }
                    
# pj_ds = PjSentencesDataset(PJ_DATA_FOLDER, df_preprocess_fn=preprocess_df_for_bow, df_preprocess_args=preprocess_args)
# print(len(pj_ds))
# print(pj_ds[1]['conversation_id'])
# pj_ds[1]['conversation'].head()

In [None]:
# # Create full dataframe, no preprocessing yet

# def load_pj_dataset(data_folder:Path):
#     pj_df = None                    
#     pj_ds = PjSentencesDataset(data_folder)

#     for i in tqdm(range(len(pj_ds))):
#         conversation_dict = pj_ds[i]
#         if not conversation_dict is None:
#             conversation = conversation_dict['conversation']
#             conversation['conversation_id'] = conversation_dict['conversation_id']

#             if not pj_df is None:
#                 pj_df = pj_df.append(conversation)
#             else:
#                 pj_df = conversation.copy()
    
#     return pj_df

In [None]:
# PJ_PREPROCESSED_CSV_PATH = OUTPUT_FOLDER / Path('pj_preprocessed_dataframe.csv')
# PJ_FULL_RAW_CSV = OUTPUT_FOLDER / Path('pj2_raw_full.csv')

# if CREATE_FULL_PJ_DATAFRAME_SENTENCE_LEVEL == 'Process':
#     # load original dataset
#     pj_sentences_df = load_pj_dataset(PJ_DATA_FOLDER)
#     pj_sentences_df.to_csv(PJ_FULL_RAW_CSV)

#     # preprocess and add features
#     preprocess_args = {'stemmer': PorterStemmer(),
#                         'speller': SpellChecker(),
#                         'words_to_remove': set(stopwords.words('english')),
#                         'emoticons': emoticons,
#                         'chat_slang': chat_slang,
#                         }

#     pj_sentences_df = preprocess_df_for_bow(pj_sentences_df, 'BODY', **preprocess_args)
#     pj_sentences_df = add_wordlist_features(pj_sentences_df, 'preprocessed_bow', sex_word_list, family_word_list, meeting_word_list)
#     pj_sentences_df.to_csv(PJ_PREPROCESSED_CSV_PATH)

# elif CREATE_FULL_PJ_DATAFRAME_SENTENCE_LEVEL == 'Load':
#     pj_sentences_df = pd.read_csv(PJ_PREPROCESSED_CSV_PATH, index_col=0)


# pj_sentences_df = pj_sentences_df[pj_sentences_df['preprocessed_bow'].notna()]
# pj_sentences_df.head()

In [None]:
# pj_sentences_df.groupby(['conversation_id']).sum()

## some thoughts
Bag of words - sexual words, fear, trust, family, approach (Location, transport) , other categories - DrouinBoydHancockJames2017
Good article: file:///D:/docs/DSML_IDC/Semester%204/Cyber/Tasks/Task2/ref%20docs/Early%20Text%20Classification%20using%20Multi-Resolution%20Concept%20Representations.pdf
Ensamble and preprocessing: file:///D:/docs/DSML_IDC/Semester%204/Cyber/Tasks/Task2/ref%20docs/PredatoryConversationDetection.pdf
file:///D:/docs/DSML_IDC/Semester%204/Cyber/Tasks/Task2/ref%20docs/Analyzing_Chat_Conversations_of_Pedophil.pdf
