## Requirements

* Detect predetorial patterns in other side chat messages, alert parents / block chat
    - Per message
    - Sequence
    - Media
* Detect and warn / block personal information giveaway by own side of chat (Child)
    - text
    - media
* Support 2 party / multiple party chats
* Block known predators from past chats



## General - imports paths etc.

## Flow control

In [1]:
CREATE_FULL_PAN12_DATAFRAME = 'Load'
CREATE_FULL_PJ_DATAFRAME = 'Load'

In [2]:
# %pip install pyspellchecker
# %python -m spacy download en_core_web_sm
# %pip install pyLDAvis
# %pip install altair


### Imports

In [31]:
import numpy as np
import pandas as pd
from pathlib import Path

import re
import string

from tqdm.notebook import tqdm as tqdm
tqdm.pandas()
from ipywidgets import IntProgress

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer

# from sklearn.decomposition import PCA
# from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer
import altair

# from sklearn.cluster import MiniBatchKMeans

import matplotlib.pyplot as plt
import matplotlib.cm as cm

from torch.utils.data import Dataset

import gensim
from gensim.models import Word2Vec
import gensim.corpora as corpora
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

import spacy
from spellchecker import SpellChecker

import xml.etree.ElementTree as ET 
from xml.etree.ElementTree import ParseError

import csv

from typing import Dict, Callable, List, Dict, Set, Any
import logging


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

logger = logging.getLogger(__name__)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mryan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  from imp import reload


### Env control and folders

In [4]:
# ENV = 'Colab'
ENV = 'Local'


In [5]:
# Folders
if ENV=='Local':
  PROJECT_ROOT = Path('./')

elif ENV=='Colab':
  from google.colab import drive
  drive.mount('/content/drive')
  PROJECT_ROOT = Path('/content/drive/MyDrive/colab_data/cyber2/')
  

PJ_DATA_FOLDER = PROJECT_ROOT / Path('customer_data')
PAN12_DATA_FILE = PROJECT_ROOT / Path('ref_data/pan12_corpus/pan12-sexual-predator-identification-test-corpus-2012-05-21/pan12-sexual-predator-identification-test-corpus-2012-05-17.xml')
PAN12_LINE_LABELS_FILE = PROJECT_ROOT / Path('ref_data/pan12_corpus/pan12-sexual-predator-identification-test-corpus-2012-05-21/pan12-sexual-predator-identification-groundtruth-problem2.txt')
PAN12_USER_LABELS_FILE = PROJECT_ROOT / Path('ref_data/pan12_corpus/pan12-sexual-predator-identification-test-corpus-2012-05-21/pan12-sexual-predator-identification-groundtruth-problem1.txt')
OUTPUT_FOLDER = PROJECT_ROOT / Path('output')

if not PAN12_DATA_FILE.exists():
    raise FileNotFoundError('File not found!')

if not PAN12_LINE_LABELS_FILE.exists():
    raise FileNotFoundError('File not found!')  

if not PAN12_USER_LABELS_FILE.exists():
    raise FileNotFoundError('File not found!') 

if not PJ_DATA_FOLDER.is_dir():
    raise FileNotFoundError('Directry not found!') 

if not OUTPUT_FOLDER.is_dir():
    print(f'creating output folder: {OUTPUT_FOLDER}')
    OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)
  

### Utility functions

In [6]:
# Define datasets with texts and labels

def list_files_in_dir(folder: Path, extension='*') -> List:
    
    file_list = [f for f in folder.glob(f'**/*.{extension}') if f.is_file()]
    return file_list

## Test funcion
# list_files_in_dir(DATA_FOLDER, 'dtd')

### Load word lists

In [7]:
# Load word lists
SEX_WL_PATH = PROJECT_ROOT / Path(r'sex_words.txt')
with open(SEX_WL_PATH, 'rt') as handle:
    sex_word_list = handle.read().split('\n')

MEETING_WL_PATH = SEX_WL_PATH = PROJECT_ROOT / Path(r'meeting_words.txt')
with open(MEETING_WL_PATH, 'rt') as handle:
    meeting_word_list = handle.read().split('\n')

FAMILY_WL_PATH = SEX_WL_PATH = PROJECT_ROOT / Path(r'family_words.txt')
with open(FAMILY_WL_PATH, 'rt') as handle:
    family_word_list = handle.read().split('\n')

CHAT_SLANG_PATH = SEX_WL_PATH = PROJECT_ROOT / Path(r'chat_slang.txt')
with open(CHAT_SLANG_PATH, mode='rt') as handle:
    csv_reader = csv.reader(handle, delimiter='\t')
    chat_slang = {rows[0]:rows[1] for rows in csv_reader}

EMOTICONS_PATH = SEX_WL_PATH = PROJECT_ROOT / Path(r'emoticons.txt')
with open(EMOTICONS_PATH, mode='rt', encoding="utf8") as handle:
    csv_reader = csv.reader(handle, delimiter='\t')
    emoticons = {rows[0]:rows[1] for rows in csv_reader}


## Preprocessing

### Chat text preprocess

In [8]:

def remove_stopwords(text: str, words_to_remove: List[str])-> str:
    '''
    Gets string, returns it without stopwords
    '''
    return " ".join([word for word in str(text).split() if word not in words_to_remove])


def stem_text(text: str, stemmer: Any)-> str:
    '''
    stem text string
    '''
    return " ".join([stemmer.stem(word) for word in text.split()])


def remove_emoji(text: str) -> str:
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)


def remove_emoticons(text: str, emoticons: Dict) -> str:
    emoticon_pattern = re.compile(u'(' + u'|'.join(k for k in emoticons) + u')')
    return emoticon_pattern.sub(r'', text)


def replace_pornsites_with_string(text:str, replacement_string:str='porn')->str:
    pornsite_pattern = re.compile(r'\S+xnxx\.co\S+' + r'|\S+pornhub\.co\S+' + r'|\S+nude\.co\S+' + r'|\S+sex\.co\S+')
    return pornsite_pattern.sub(replacement_string, text)

def remove_urls(text:str)-> str:
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)


def remove_special_characters(text:str)-> str:
    special_chars_pattern = re.compile(r'[^A-Za-z0-9 ]+')
    return special_chars_pattern.sub(r' ', text)


def replace_chat_slang(text: str, chat_slang: Dict[str, str])-> str:
    new_text = []
    for w in text.split():
        if w.upper() in chat_slang.keys():
            new_text.append(chat_slang[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)


def correct_spellings(text: str, speller: Callable) -> str:
    corrected_text = []
    misspelled_words = speller.unknown(text.split())
    for word in text.split():
        if word in misspelled_words:
            corrected_text.append(speller.correction(word))
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)


def lemmation(text:str, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    nlp_lem = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    result = nlp_lem(text)
    result = [token.lemma_ for token in result if token.pos_ in allowed_postags]
    result = str(result)
    return result


def contains_words_from_list(text: str, word_list: List[str])-> bool:
    text_words = re.sub("[^\w]", " ",  text).split()
    if any(word in word_list for word in text_words):
        return True
    else:
        return False


def preprocess_string_for_bow(text: str, stemmer: Callable=None, speller: Callable=None, words_to_remove:List[str]=None, emoticons: Dict[str, str]=None, chat_slang: Dict[str, str]=None)-> str:
    try:
        text = remove_emoji(text)
        text = remove_emoticons(text, emoticons)
        text = replace_chat_slang(text, chat_slang)
        text = text.lower()
        text = replace_pornsites_with_string(text)
        text = remove_urls(text)
        text = remove_special_characters(text)
        text = correct_spellings(text, speller)
        # text = remove_stopwords(text, words_to_remove)
        text = lemmation(text)
        # text = stem_text(text, stemmer)
    except(TypeError):
        print(f'Problematic string: {text}')
        text = ''
    return text


def preprocess_df_for_bow(df: pd.DataFrame, text_col: str, output_col_name='preprocessed_bow', stemmer=None, speller=None, words_to_remove=None, emoticons=None, chat_slang=None)-> pd.DataFrame:
    '''
    Gets a PD dataframe and a text column name
    returns the same dataframe with additional column called 'posts_preprocessed_bow'
    '''
    df[output_col_name] = df[text_col].progress_apply(lambda text: preprocess_string_for_bow(text, stemmer=stemmer, speller=speller, words_to_remove=words_to_remove, emoticons=emoticons, chat_slang=chat_slang))
    return df


In [9]:
# test
preprocess_args = {'stemmer': PorterStemmer(),
                    'speller': SpellChecker(),
                    'words_to_remove': set(stopwords.words('english')),
                    'emoticons': emoticons,
                    'chat_slang': chat_slang,
                    }

text = 'r u going to www.google.com http://xnxx.com im walking LOL ths is not &amp;right im caming flight now u r right brb and fu :-)'
text = 'yeah--well I just want to see you before I go in the apt--cause one of my friends got arrested for doing the same thing with a 16 year old--it was a set-up type thing'

preprocess_string_for_bow(text, **preprocess_args)

"['just', 'want', 'see', 'go', 'friend', 'get', 'arrest', 'do', 'same', 'thing', 'year', 'old', 'set', 'type', 'thing']"

## Feature engineering

### Word-list based features

In [10]:
def contains_words_from_list(text: str, word_list: List[str])-> bool:
    text_words = re.sub("[^\w]", " ",  text).split()
    if any(word in word_list for word in text_words):
        return True
    else:
        return False


def add_wordlist_features(df: pd.DataFrame, text_column: str, sex_word_list, family_word_list, meeting_word_list):
    df['contains_sex_words'] = df[text_column].apply(lambda text: contains_words_from_list(text, sex_word_list))
    df['contains_family_words'] = df[text_column].apply(lambda text: contains_words_from_list(text, family_word_list))
    df['contains_meeting_words'] = df[text_column].apply(lambda text: contains_words_from_list(text, meeting_word_list))
    return df


## Datasets

### PJ dataset

#### PJ Convesation level dataset

In [11]:

def load_one_chat_as_df_pj(file_path: Path) -> Dict[str, pd.DataFrame]:
    '''
    Gets an path to a PJ XML file
    returns a dict with three dataframes:
        - victim data
        - predator data
        - conversation posts
    '''
    parser = ET.XMLParser(encoding="utf-8")
    try:
        doc_tree = ET.parse(file_path, parser=parser)
    except(ParseError):
        print(f'failed to parse {str(file_path)}')
        return None
        
    doc_root = doc_tree.getroot()
    
    posts_df = pd.DataFrame(columns = ['USERNAME', 'DATETIME', 'BODY', 'COMMENT', 'CODING'], dtype=str)
    predator_df = pd.DataFrame(columns = ['FIRSTNAME', 'LASTNAME', 'STATEDNAME', 'STATEDAGE', 'GENDER', 'RACE', 'CITY', 'STATE', 'REPEATOFFENDER', 'ADMITGUILT', 'TRUTHFULNAME', 'SCREENNAME'], dtype=str)
    victim_df = pd.DataFrame(columns = ['FIRSTNAME', 'LASTNAME', 'STATEDNAME', 'STATEDAGE', 'GENDER', 'RACE', 'CITY', 'STATE', 'PREVIOUSVICTIMIZATION', 'ADMITGUILT', 'SCREENNAME'], dtype=str)

    for post in doc_root.findall('POST'):
        post_dict = {}
        for field in post:
            post_dict[field.tag] = field.text

        posts_df = posts_df.append(post_dict, ignore_index=True)
    posts_df = posts_df.astype('string')


    for predator in doc_root.findall('PREDATOR'):
        predator_dict = {}
        for field in predator:
            predator_dict[field.tag] = field.text

        predator_df = predator_df.append(predator_dict, ignore_index=True)   
    predator_df = predator_df.astype('string')

    for victim in doc_root.findall('VICTIM'):
        victim_dict = {}
        for field in victim:
            victim_dict[field.tag] = field.text

        victim_df = victim_df.append(victim_dict, ignore_index=True)  
    victim_df = victim_df.astype('string')

    return {'predator': predator_df, 'victim': victim_df, 'conversation': posts_df, 'conversation_id': str(file_path.parts[-1])}


#----------------------------------------------------------
# Test XML parse functions:
file_path = PJ_DATA_FOLDER / Path('ArmySgt1961.xml')
chat_dict = load_one_chat_as_df_pj(file_path)
chat_dict['victim'].head()
chat_dict['predator'].head()
chat_dict['conversation'].head(10)
# chat_dict['conversation_id']


Unnamed: 0,USERNAME,DATETIME,BODY,COMMENT,CODING
0,armysgt1961,(7:02:01 pm),im dennis us army soldier from cincinnati,,
1,peekaboo1293,(7:02:30 pm),hi im becky from ky,,
2,armysgt1961,(7:02:35 pm),how old ru,,
3,peekaboo1293,(7:02:42 pm),13 how old ru,(age stated and he didn't bat an eye),
4,armysgt1961,(7:02:44 pm),u single,,
5,peekaboo1293,(7:02:51 pm),yeah,,
6,peekaboo1293,(7:03:03 pm),i had a bf but we broke up when i moved here,,
7,armysgt1961,(7:03:11 pm),ok u have sex at 13,(he obviously knows my age),
8,peekaboo1293,(7:03:28 pm),u mean did i ever,,
9,armysgt1961,(7:03:32 pm),yeah,,


In [12]:

class PjSentencesDataset(Dataset):
    """
    Wrapper around Torch Dataset.
    Prepares an indexed list of PJ conversation in a folder, returns conversations per index (like an array)
    Load is lazy - loads conversation from disk on request.
    Uses load_one_chat_as_df_pj() for conversation loading
    """

    def __init__(self, data_folder: Path, df_preprocess_fn=None, df_preprocess_args:Dict=None):
        """
        Args:
          data_folder - folder with PJ XML files
          df_preprocess_fn - function that gets a dataframe and adds preprocesed text column based on given text column

        """
       
        self.file_list = list_files_in_dir(data_folder, extension='xml')
        self.df_preprocess_fn = df_preprocess_fn
        self.df_preprocess_args = df_preprocess_args
        self.TEXT_COLUMN_NAME = 'BODY'

        
    def __len__(self) -> int:
        """
        Returns:
            int: length of the dataset
        """
        return len(self.file_list)

    def __getitem__(self, idx):
        """Gets element of the dataset

        Args:
            index (int): index of the element in the dataset
        Returns:
            Single element by index
        """        
        sample = load_one_chat_as_df_pj(self.file_list[idx])
        if (self.df_preprocess_fn is not None) and (sample is not None):
            sample['conversation'] = self.df_preprocess_fn(sample['conversation'], self.TEXT_COLUMN_NAME, **self.df_preprocess_args)

        return sample

    

In [13]:
# # Test the dataset
# preprocess_args = {'stemmer': PorterStemmer(),
#                     'speller': SpellChecker(),
#                     'words_to_remove': set(stopwords.words('english')),
#                     'emoticons': emoticons,
#                     'chat_slang': chat_slang,
#                     }
                    
# pj_ds = PjSentencesDataset(PJ_DATA_FOLDER, df_preprocess_fn=preprocess_df_for_bow, df_preprocess_args=preprocess_args)
# print(len(pj_ds))
# print(pj_ds[1]['conversation_id'])
# pj_ds[1]['conversation'].head()

### Load entire PJ dataset as single dataframe

In [14]:
# Create full dataframe, no preprocessing yet

def load_pj_dataset(data_folder:Path):
    pj_df = None                    
    pj_ds = PjSentencesDataset(data_folder)

    for i in tqdm(range(len(pj_ds))):
        conversation_dict = pj_ds[i]
        if not conversation_dict is None:
            conversation = conversation_dict['conversation']
            conversation['conversation_id'] = conversation_dict['conversation_id']

            if not pj_df is None:
                pj_df = pj_df.append(conversation)
            else:
                pj_df = conversation.copy()
    
    return pj_df

In [15]:
PJ_PREPROCESSED_CSV_PATH = OUTPUT_FOLDER / Path('pj_preprocessed_dataframe.csv')
PJ_FULL_RAW_CSV = OUTPUT_FOLDER / Path('pan12_raw_full.csv')

if CREATE_FULL_PJ_DATAFRAME == 'Process':
    # load original dataset
    pj_df = load_pj_dataset(PJ_DATA_FOLDER)
    pj_df.to_csv(PJ_FULL_RAW_CSV)

    # preprocess and add features
    preprocess_args = {'stemmer': PorterStemmer(),
                        'speller': SpellChecker(),
                        'words_to_remove': set(stopwords.words('english')),
                        'emoticons': emoticons,
                        'chat_slang': chat_slang,
                        }

    pj_df = preprocess_df_for_bow(pj_df, 'BODY', **preprocess_args)
    pj_df = add_wordlist_features(pj_df, 'preprocessed_bow', sex_word_list, family_word_list, meeting_word_list)
    pj_df.to_csv(PJ_PREPROCESSED_CSV_PATH)

elif CREATE_FULL_PJ_DATAFRAME == 'Load':
    pj_df = pd.read_csv(PJ_PREPROCESSED_CSV_PATH)

pj_df.head()

Unnamed: 0.1,Unnamed: 0,USERNAME,DATETIME,BODY,COMMENT,CODING,conversation_id,preprocessed_bow,contains_sex_words,contains_family_words,contains_meeting_words
0,0,tunnels12000,07/19/06 7:48:24 PM,hi,,,tunnels12000.xml,[],False,False,False
1,1,tracy_in_xcess,(07/19/06 7:49:06 PM),hi,,,tunnels12000.xml,[],False,False,False
2,2,tunnels12000,07/19/06 7:49:09 PM,very pretty pic,,,tunnels12000.xml,"['very', 'pretty', 'pic']",False,False,False
3,3,tunnels12000,07/19/06 7:49:19 PM,im david hope i didnt bother u,,\n,tunnels12000.xml,['bother'],False,False,False
4,4,tracy_in_xcess,07/19/06 7:49:48 PM,no thats ok,,,tunnels12000.xml,['s'],False,False,False


In [16]:
pj_df.groupby(['conversation_id']).sum()

Unnamed: 0_level_0,Unnamed: 0,contains_sex_words,contains_family_words,contains_meeting_words
conversation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ArmySgt1961.xml,4560,5,6,4
arthinice.xml,574056,43,7,8
aticloose.xml,13861,16,3,6
corazon23456partio23456.xml,97020,6,13,8
crazytrini85.xml,22366,11,4,3
flxnonya.xml,6903,13,4,1
fotophix.xml,5253,1,2,4
ghost27_73.xml,3692403,60,36,44
hiexcitement.xml,580503,12,3,18
i_8u_raw.xml,1119756,15,19,51


### Convert Pan12 to labeled datafreame for use later as Train data

In [17]:
class Pan12converterToDF():

    # Pan12 converter for TEST dataset - with line labels!
    
    """
    Wrapper around Torch Dataset to perform text classification
    """

    def __init__(self, chat_data_file: Path, user_labels_file: Path=None, line_labels_file: Path=None):
        """
        Args:
            chat_data_file: path to chat xml file
            conversation_labels:
            line_labels:
            mode:   full - all data 
                    positive_lines - Only lines labeled as problematic
        """
       
        self.chat_data_file = chat_data_file
        self.conversations = self._get_conversation_roots(chat_data_file)

        self.user_labels_file = user_labels_file
        self.line_labels_file = line_labels_file
        self.TEXT_COLUMN_NAME = 'text'

        self.length = self._get_ds_length()
        self.num_conversations = len(self.conversations)

        # Initiate queue
        self.message_list = None
        self.current_conversation_id = None
        self.next_conversation_idx = 0
        self.next_message_idx = 0

        # Create sets of problematic lines and authors for labels
        user_labels = pd.read_csv(user_labels_file, delimiter='\t', header=None)
        self.perverted_authors = set(user_labels[0])

        line_labels = pd.read_csv(line_labels_file, delimiter='\t', header=None)
        line_labels['concat'] = line_labels[0] + '_' + line_labels[1].astype(str)
        self.perverted_conversations = set(line_labels[0].unique())
        self.pervert_lines = set(line_labels['concat'])


    def __iter__(self):
        return self

    def __len__(self) -> int:
        """
        Returns:
            int: length of the dataset
        """
        return self.length

    def convert(self, filename:Path, save_every=2000, mode: str='full') -> pd.DataFrame:
        """Gets element of the dataset

        Args:
            index (int): index of the element in the dataset
        Returns:
            Single element by index
        """        
        pan12_df = pd.DataFrame(columns=['conversation_id', 'line', 'author', 'time', 'text', 'line_label', 'author_label'])

        self._load_next_conversation_to_list(mode) 

        if(mode == 'full'):
            iter_len = self.length
        elif(mode == 'positive_lines'):
            iter_len = len(self.pervert_lines)
        
        for i in tqdm(range(iter_len)):
            message_dict = {}
            try:
                message = self.message_list[self.next_message_idx]
            except(IndexError):
                self._load_next_conversation_to_list(mode)
                message = self.message_list[self.next_message_idx]
            
            message_dict['conversation_id'] = self.current_conversation_id
            self.next_message_idx += 1
            
            message_dict['line'] = message.attrib['line']  
            for field in message:
                message_dict[field.tag] = field.text
            
            message_dict['author_label'] = 1 if message_dict['author'] in self.perverted_authors else 0
            message_dict['line_label'] = 1 if message_dict['conversation_id'] + '_' + message_dict['line'] in self.pervert_lines else 0
            
            pan12_df = pan12_df.append(message_dict, ignore_index=True)
            if i % save_every == 0:
                pan12_df.to_csv(filename)
                print('.', end='')

            # #######
            # if i == 1001:
            #     print(pan12_df.head(2001))
            #     break
            # ######
        pan12_df.to_csv(filename)
        return pan12_df
    
    def _get_conversation_roots(self, file_path):
        doc_tree = ET.parse(file_path)
        conversation_roots = doc_tree.getroot().findall('conversation')
        return conversation_roots

    def _get_ds_length(self):
        number_messages = 0
        for conversation in self.conversations:
            number_messages += len(conversation.findall('message'))
        
        return number_messages

    def _load_next_conversation_to_list(self, mode):
        try:
            conversation = self.conversations[self.next_conversation_idx] 
            self.next_conversation_idx += 1
            self.current_conversation_id = conversation.attrib['id']  

            if mode == 'positive_lines':
                while self.current_conversation_id not in self.perverted_conversations:
                    conversation = self.conversations[self.next_conversation_idx] 
                    self.next_conversation_idx += 1
                    self.current_conversation_id = conversation.attrib['id']  
     
        except(IndexError):
            raise StopIteration()

        if mode == 'positive_lines':
            self.message_list = [m for m in conversation.findall('message') if (self.current_conversation_id + '_' + m.attrib['line'] in self.pervert_lines)]
        else:
            self.message_list = [m for m in conversation.findall('message')]
        self.next_message_idx = 0


In [18]:
# PAN12_PERVERTED_LINES_CSV = OUTPUT_FOLDER / Path('pan12_perverted_lines_preprocessed.csv')
PAN12_PERVERTED_LINES_CSV = OUTPUT_FOLDER / Path('pan12_full_lines_preprocessed.csv')

PAN12_FULL_RAW_CSV = OUTPUT_FOLDER / Path('pan12_raw_full.csv')

if CREATE_FULL_PAN12_DATAFRAME == 'Process':
    # Create a dataframe of all pan12 test perverted lines
    pan12_converter = Pan12converterToDF(PAN12_DATA_FILE, user_labels_file=PAN12_USER_LABELS_FILE, line_labels_file=PAN12_LINE_LABELS_FILE)
    print(len(pan12_converter))
    # pan12_df = pan12_converter.convert(PAN12_FULL_RAW_CSV, mode='positive_lines')
    pan12_df = pan12_converter.convert(PAN12_FULL_RAW_CSV, mode='full')
    print(f'lines in pan12_df: {len(pan12_df)}')

    # Preprocess pan12 perverted lines only and save to csv
    preprocess_args = {'stemmer': PorterStemmer(),
                        'speller': SpellChecker(),
                        'words_to_remove': set(stopwords.words('english')),
                        'emoticons': emoticons,
                        'chat_slang': chat_slang
                        }

    pan12_df = preprocess_df_for_bow(pan12_df, 'text', **preprocess_args)
    pan12_df.to_csv(PAN12_PERVERTED_LINES_CSV)

    # add features to pan12 df
    pan12_df = add_wordlist_features(pan12_df, 'preprocessed_bow', sex_word_list, family_word_list, meeting_word_list)
    pan12_df.to_csv(PAN12_PERVERTED_LINES_CSV)

elif CREATE_FULL_PAN12_DATAFRAME == 'Load':
    pan12_df = pd.read_csv(PAN12_PERVERTED_LINES_CSV)

pan12_df = pan12_df.dropna()
pan12_df

FileNotFoundError: [Errno 2] No such file or directory: 'output\\pan12_full_lines_preprocessed.csv'

## Temp section

In [40]:
pan12_df = pd.read_csv(PAN12_FULL_RAW_CSV, index_col=0)
pan12_df = pan12_df.dropna()
pan12_df.text = pan12_df.text.astype('string')
pan12_df.dtypes

conversation_id    object
line                int64
author             object
time               object
text               string
line_label          int64
author_label        int64
dtype: object

In [41]:
# Create bag f words list from chat sentences
def create_bow_from_text(text):
    bow_words = []
    text_word_list = gensim.utils.simple_preprocess(text, deacc=True)
    bow_words.append(text_word_list)
    return bow_words[0]


def create_bow_from_text_list(text_list):
    bow_words = []
    for text in text_list:
        # text_word_list = gensim.utils.simple_preprocess(text, deacc=True)
        # bow_words.append(text_word_list)
        bow_words.append(create_bow_from_text(text))
    
    return bow_words


bow_words = create_bow_from_text_list(pan12_df['text'])
bow_words


[['bugmail',
  'bug',
  'new',
  'mark',
  'eof',
  'terminated',
  'script',
  'elements',
  'as',
  'malformed',
  'lt',
  'http',
  'lists',
  'org',
  'archives',
  'public',
  'public',
  'html',
  'bugzilla',
  'may',
  'html',
  'gt'],
 ['henri', 'can', 'ask', 'you', 'firefox', 'build', 'question', 'windows'],
 ['cfda', 'sure', 'but', 'probably', 'don', 'know', 'the', 'answer'],
 ['it',
  'appears',
  'the',
  'build',
  'runs',
  'through',
  'it',
  'creates',
  'firefox',
  'exe',
  'in',
  'dist',
  'bin'],
 ['when',
  'start',
  'it',
  'get',
  'my',
  'standard',
  'install',
  'of',
  'ff',
  'instead'],
 ['same',
  'if',
  'make',
  'package',
  'unzip',
  'it',
  'and',
  'start',
  'from',
  'there'],
 ['cfda', 'do', 'you', 'already', 'have', 'the', 'usual', 'firefox', 'open'],
 ['likely'],
 ['so', 'do', 'need', 'to', 'close', 'all', 'instances'],
 ['other'],
 ['cfda',
  'at',
  'least',
  'with',
  'the',
  'linux',
  'version',
  'you',
  'need',
  'to'],
 ['unless'

In [42]:
# create word indexes and frequencies from chat sentences bow
id2word = corpora.Dictionary(bow_words)
corpus = []
for word in bow_words:
    corpus.append(id2word.doc2bow(word))


print(pan12_df['text'][7])
print(corpus[7])
print(id2word[31])

Likely
[(67, 1)]
don


In [43]:
# create LDA topic model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
id2word=id2word,
num_topics=10, 
update_every=1,
chunksize=100,
passes=10,
alpha='auto')

In [None]:
# visualize
vis = gensimvis.prepare(lda_model, corpus, id2word, mds='mmds', R=30)
vis

## Classification

### LDA topic model - Bad performance due to sexual term sparsity and diversity

In [None]:
# Create bag f words list from chat sentences
def create_bow_from_text(text):
    bow_words = []
    text_word_list = gensim.utils.simple_preprocess(text, deacc=True)
    bow_words.append(text_word_list)
    return bow_words[0]


def create_bow_from_text_list(text_list):
    bow_words = []
    for text in text_list:
        # text_word_list = gensim.utils.simple_preprocess(text, deacc=True)
        # bow_words.append(text_word_list)
        bow_words.append(create_bow_from_text(text))
    
    return bow_words


bow_words = create_bow_from_text_list(pan12_df['preprocessed_bow'])
bow_words

# bow_words.append(sex_word_list)

In [None]:
# create word indexes and frequencies from chat sentences bow
id2word = corpora.Dictionary(bow_words)
corpus = []
for word in bow_words:
    corpus.append(id2word.doc2bow(word))


print(pan12_df['preprocessed_bow'][7])
print(corpus[7])
print(id2word[31])

you ever have enough to shave it
[(1, 1), (5, 1), (31, 1), (34, 1), (36, 1), (37, 1), (38, 1)]
shave


In [None]:
# create LDA topic model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
id2word=id2word,
num_topics=10, 
update_every=1,
chunksize=100,
passes=10,
alpha='auto')

In [None]:
# visualize
vis = gensimvis.prepare(lda_model, corpus, id2word, mds='mmds', R=30)
vis

### TF/IDF

In [None]:
# Train TF/IDF model
tfidf_vectorizer = TfidfVectorizer(min_df = 5, max_df = 0.95)
tfidf_vectorizer.fit(pan12_df['preprocessed_bow'])

# Transform chat messages to vocabulary vectors
vectorized_data = tfidf_vectorizer.transform(pan12_df['preprocessed_bow'])
print(f'Vectorized data shape: {vectorized_data.shape}')

print(pan12_df['preprocessed_bow'][6])
print(vectorized_data[6])


Vectorized data shape: (6476, 873)
you have hair down there
  (0, 863)	0.14431819925711445
  (0, 714)	0.4574372173172504
  (0, 310)	0.3281164897432353
  (0, 297)	0.6278215402777263
  (0, 195)	0.5177867633477271


In [None]:
# create dataframe of vectors
tfidf_df = pd.DataFrame(vectorized_data.toarray(), columns=tfidf_vectorizer.get_feature_names())
# tfidf_df[['sex', 'babe', 'young', 'age', 'dick']].sort_values('dick', ascending=False)


Unnamed: 0,sex,babe,young,age,dick
952,0.0,0.0,0.0,0.0,0.847736
4753,0.0,0.0,0.0,0.0,0.847736
4455,0.0,0.0,0.0,0.0,0.847736
5624,0.0,0.0,0.0,0.0,0.722405
3287,0.0,0.0,0.0,0.0,0.679020
...,...,...,...,...,...
2178,0.0,0.0,0.0,0.0,0.000000
2177,0.0,0.0,0.0,0.0,0.000000
2174,0.0,0.0,0.0,0.0,0.000000
2173,0.0,0.0,0.0,0.0,0.000000


In [None]:
# https://melaniewalsh.github.io/Intro-Cultural-Analytics/05-Text-Analysis/03-TF-IDF-Scikit-Learn.html
#Visualize TF/IDF


## Backup - not useful currently

### Pan12 dataloader and dataset

#### Pan12 convesation level dataset 

In [None]:
# class Pan12Dataset(Dataset):
#     '''
#     Wrapper around Torch Dataset.
#     Prepares an indexed list of Pan12 conversation in a folder, returns conversations per index (like an array)
#     Load is lazy - loads conversation from disk on request.
#     Uses load_one_chat_as_df_pj() for conversation loading
#     '''

#     def __init__(self, chat_data_file: Path, conversation_labels: Path=None, line_labels: Path=None, preprocess_fn=None, preprocess_args=None):
#         """
#         Args:
#             chat_data_file: path to chat xml file
#             conversation_labels:
#             line_labels:  
#         """
       
#         self.chat_data_file = chat_data_file
#         self.conversations = self._get_conversation_roots(chat_data_file)
#         self.preprocess_fn = preprocess_fn
#         self.preprocess_args = preprocess_args

#         self.conversation_labels = conversation_labels
#         self.line_labels = line_labels

#         self.TEXT_COLUMN_NAME = 'text'

                
#     def __len__(self) -> int:
#         """
#         Returns:
#             int: length of the dataset
#         """
#         return len(self.conversations)

#     def __getitem__(self, idx) -> Dict[str, pd.DataFrame]:
#         """Gets element of the dataset

#         Args:
#             index (int): index of the element in the dataset
#         Returns:
#             Single element by index
#         """        

#         conversation = self.conversations[idx]
#         conversation_id = conversation.attrib['id']
#         conversation_df = pd.DataFrame(columns = ['author', 'line', 'time', 'text'], dtype=str)

#         for message in conversation.findall('message'):
#             message_dict = {}
#             message_dict['line'] = message.attrib['line']
#             for field in message:
#                 message_dict[field.tag] = field.text

#             conversation_df = conversation_df.append(message_dict, ignore_index=True)
                
#         if self.preprocess_fn is not None:
#             conversation_df = self.preprocess_fn(conversation_df, self.TEXT_COLUMN_NAME, **self.preprocess_args)

#         return {'conversation_id': conversation_id, 'conversation': conversation_df}
    
#     def _get_conversation_roots(self, file_path):
#         doc_tree = ET.parse(file_path)
#         conversation_roots = doc_tree.getroot().findall('conversation')
#         return conversation_roots


In [None]:
# preprocess_args = {'stemmer': PorterStemmer(),
#                     'speller': SpellChecker(),
#                     'words_to_remove': set(stopwords.words('english')),
#                     'emoticons': emoticons,
#                     'chat_slang': chat_slang
#                     }

# pan12_ds = Pan12Dataset(PAN12_DATA_FILE, preprocess_fn=preprocess_df_for_bow, preprocess_args=preprocess_args)
# pan12_ds[34]['conversation']

#### pan12 line level dataloader

In [None]:
# class Pan12LineLevelDataloader():  
#     """
#     Wrapper around Torch Dataset to perform text classification
#     """

#     def __init__(self, chat_data_file: Path, user_labels_file: Path=None, line_labels_file: Path=None, preprocess_fn=None, preprocess_args:Dict=None):
#         """
#         Args:
#             chat_data_file: path to chat xml file
#             conversation_labels:
#             line_labels:  
#         """
       
#         self.chat_data_file = chat_data_file
#         self.conversations = self._get_conversation_roots(chat_data_file)
#         self.preprocess_fn = preprocess_fn
#         self.preprocess_args = preprocess_args

#         self.user_labels_file = user_labels_file
#         self.line_labels_file = line_labels_file
#         self.TEXT_COLUMN_NAME = 'text'

#         self.length = self._get_ds_length()
#         self.num_conversations = len(self.conversations)

#         # Initiate queue
#         self.message_list = None
#         self.current_conversation_id = None
#         self.next_conversation_idx = 0
#         self.next_message_idx = 0

#         # Create sets of problematic lines and authors for labels
#         user_labels = pd.read_csv(user_labels_file, delimiter='\t', header=None)
#         self.perverted_authors = set(user_labels[0])

#         line_labels = pd.read_csv(line_labels_file, delimiter='\t', header=None)
#         line_labels['concat'] = line_labels[0] + '_' + line_labels[1].astype(str)
#         self.pervert_lines = set(line_labels['concat'])

#         self.load_next_conversation_to_list()
                       
#     def __iter__(self):
#         return self

#     def __len__(self) -> int:
#         """
#         Returns:
#             int: length of the dataset
#         """
#         return self.length

#     def __next__(self) -> Dict[str, pd.DataFrame]:
#         """Gets element of the dataset

#         Args:
#             index (int): index of the element in the dataset
#         Returns:
#             Single element by index
#         """        
#         message_dict = {}
#         try:
#             message = self.message_list[self.next_message_idx]
#         except(IndexError):
#             self.load_next_conversation_to_list()
#             message = self.message_list[self.next_message_idx]

#         message_dict['conversation_id'] = self.current_conversation_id
#         self.next_message_idx += 1
        
#         message_dict['line'] = message.attrib['line']  

#         for field in message:
#             message_dict[field.tag] = field.text
        
#         if self.preprocess_fn is not None:
#             message_dict['text'] = self.preprocess_fn(message_dict['text'], **self.preprocess_args)
        
#         message_dict['author_label'] = 1 if message_dict['author'] in self.perverted_authors else 0
#         message_dict['line_label'] = 1 if message_dict['conversation_id'] + '_' + message_dict['line'] in self.pervert_lines else 0

#         return message_dict
    
#     def _get_conversation_roots(self, file_path):
#         doc_tree = ET.parse(file_path)
#         conversation_roots = doc_tree.getroot().findall('conversation')
#         return conversation_roots

#     def _get_ds_length(self):
#         number_messages = 0
#         for conversation in self.conversations:
#             number_messages += len(conversation.findall('message'))
        
#         return number_messages

#     def load_next_conversation_to_list(self):
#         try:
#             conversation = self.conversations[self.next_conversation_idx] 
#             self.current_conversation_id = conversation.attrib['id']  
#         except(IndexError):
#             raise StopIteration()

#         self.next_conversation_idx += 1
#         self.message_list = [m for m in conversation.findall('message')]
#         self.next_message_idx = 0

In [None]:
# # Test dataset
# preprocess_args = {'stemmer': PorterStemmer(),
#                     'speller': SpellChecker(),
#                     'words_to_remove': set(stopwords.words('english')),
#                     'emoticons': emoticons,
#                     'chat_slang': chat_slang
#                     }

# pan12_ds = Pan12LineLevelDataloader(PAN12_DATA_FILE, user_labels_file=PAN12_USER_LABELS_FILE, line_labels_file=PAN12_LINE_LABELS_FILE, preprocess_fn=preprocess_string_for_bow, preprocess_args=preprocess_args)
# print(len(pan12_ds))

# for i, m in enumerate(pan12_ds):
#     print(i, m) 
#     if i==50:
#         break

## some thoughts
Bag of words - sexual words, fear, trust, family, approach (Location, transport) , other categories - DrouinBoydHancockJames2017
Good article: file:///D:/docs/DSML_IDC/Semester%204/Cyber/Tasks/Task2/ref%20docs/Early%20Text%20Classification%20using%20Multi-Resolution%20Concept%20Representations.pdf
Ensamble and preprocessing: file:///D:/docs/DSML_IDC/Semester%204/Cyber/Tasks/Task2/ref%20docs/PredatoryConversationDetection.pdf
file:///D:/docs/DSML_IDC/Semester%204/Cyber/Tasks/Task2/ref%20docs/Analyzing_Chat_Conversations_of_Pedophil.pdf
