## General - imports paths etc.

In [None]:
# %pip install pyspellchecker

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

import re
import string

from tqdm.notebook import tqdm as tqdm
from ipywidgets import IntProgress

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer

import torch
from torch.utils.data import Dataset

import gensim
from gensim.models import Word2Vec

import spacy
from spellchecker import SpellChecker

import xml.etree.ElementTree as ET 
import csv

from typing import Dict, Callable, List, Dict, Set, Any
import logging


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

logger = logging.getLogger(__name__)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mryan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Env control

In [2]:
# Runtime environment switch read by the folder-setup cell below:
# 'Local' resolves paths relative to the working directory,
# 'Colab' mounts Google Drive first.
# ENV = 'Colab'
ENV = 'Local'


In [3]:
# Folders
if ENV == 'Local':
    PROJECT_ROOT = Path('./')
elif ENV == 'Colab':
    # Imported lazily: google.colab only exists inside a Colab runtime.
    from google.colab import drive
    drive.mount('/content/drive')
    PROJECT_ROOT = Path('/content/drive/MyDrive/colab_data/cyber2/')
else:
    # Fail here instead of with a NameError on PROJECT_ROOT a few lines below.
    raise ValueError(f"Unknown ENV {ENV!r}; expected 'Local' or 'Colab'")

# All PAN12 inputs live in the same corpus folder.
PAN12_CORPUS_FOLDER = PROJECT_ROOT / Path('ref_data/pan12_corpus/pan12-sexual-predator-identification-test-corpus-2012-05-21')

PJ_DATA_FOLDER = PROJECT_ROOT / Path('customer_data')
PAN12_DATA_FILE = PAN12_CORPUS_FOLDER / Path('pan12-sexual-predator-identification-test-corpus-2012-05-17.xml')
PAN12_LINE_LABELS_FILE = PAN12_CORPUS_FOLDER / Path('pan12-sexual-predator-identification-groundtruth-problem2.txt')
PAN12_USER_LABELS_FILE = PAN12_CORPUS_FOLDER / Path('pan12-sexual-predator-identification-groundtruth-problem1.txt')
OUTPUT_FOLDER = PROJECT_ROOT

# Validate every input up front and name the offending path in the error,
# so a misconfigured environment is diagnosable from the traceback alone.
for required_file in (PAN12_DATA_FILE, PAN12_LINE_LABELS_FILE, PAN12_USER_LABELS_FILE):
    if not required_file.exists():
        raise FileNotFoundError(f'File not found: {required_file}')

for required_dir in (PJ_DATA_FOLDER, OUTPUT_FOLDER):
    if not required_dir.is_dir():
        raise FileNotFoundError(f'Directory not found: {required_dir}')
  

#### Utility functions

In [4]:
# Define datasets with texts and labels

def list_files_in_dir(folder: Path, extension='*') -> List:
    '''
    Recursively collect the regular files under `folder` whose name matches `*.<extension>`.

    Args:
        folder: root directory to search (sub-directories included).
        extension: file extension without the dot; the default '*' matches any extension.
    Returns:
        List of Path objects, in the order produced by Path.glob.
    '''
    pattern = f'**/*.{extension}'
    matches = []
    for candidate in folder.glob(pattern):
        # glob can also yield directories whose name matches the pattern.
        if candidate.is_file():
            matches.append(candidate)
    return matches

## Test function
# list_files_in_dir(DATA_FOLDER, 'dtd')

### Word lists

In [40]:
# Load word lists
def _read_word_list(path: Path) -> List[str]:
    '''Read a one-word-per-line text file into a list (split on newlines, as-is).'''
    with open(path, 'rt') as handle:
        return handle.read().split('\n')


def _read_tsv_mapping(path: Path, encoding: str = None) -> Dict[str, str]:
    '''Read a tab-separated file into a {first column: second column} dict.'''
    with open(path, mode='rt', encoding=encoding) as handle:
        csv_reader = csv.reader(handle, delimiter='\t')
        # Rows with fewer than two columns (e.g. a trailing blank line) are skipped
        # instead of aborting the whole load with an IndexError.
        return {row[0]: row[1] for row in csv_reader if len(row) >= 2}


# Each path constant is assigned exactly once; SEX_WL_PATH keeps pointing at the
# sex word list instead of being overwritten by every later assignment.
SEX_WL_PATH = PROJECT_ROOT / Path(r'sex_words.txt')
sex_word_list = _read_word_list(SEX_WL_PATH)

MEETING_WL_PATH = PROJECT_ROOT / Path(r'meeting_words.txt')
meeting_word_list = _read_word_list(MEETING_WL_PATH)

FAMILY_WL_PATH = PROJECT_ROOT / Path(r'family_words.txt')
family_word_list = _read_word_list(FAMILY_WL_PATH)

CHAT_SLANG_PATH = PROJECT_ROOT / Path(r'chat_slang.txt')
chat_slang = _read_tsv_mapping(CHAT_SLANG_PATH)

EMOTICONS_PATH = PROJECT_ROOT / Path(r'emoticons.txt')
# The emoticon file is the only one read explicitly as UTF-8.
emoticons = _read_tsv_mapping(EMOTICONS_PATH, encoding="utf8")


## Preprocessing

### Chat text preprocess

In [41]:

def remove_stopwords(text: str, words_to_remove: List[str])-> str:
    '''
    Drop every whitespace-separated token of `text` that appears in `words_to_remove`.

    `text` is coerced with str() first; the surviving tokens are re-joined with single spaces.
    '''
    kept_tokens = []
    for token in str(text).split():
        if token in words_to_remove:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)


def stem_text(text: str, stemmer: Any)-> str:
    '''
    Stem each whitespace-separated token of `text` with `stemmer.stem`
    and re-join the results with single spaces.
    '''
    tokens = text.split()
    for position, token in enumerate(tokens):
        tokens[position] = stemmer.stem(token)
    return " ".join(tokens)


def remove_emoji(text: str) -> str:
    '''
    Delete every run of characters that falls inside the unicode ranges listed below.
    '''
    emoji_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
    )
    return re.sub("[" + emoji_ranges + "]+", r'', text, flags=re.UNICODE)


def remove_emoticons(text: str, emoticons: Dict) -> str:
    '''
    Delete every match of any emoticon key from `text`.

    The keys of `emoticons` are inserted into the regular expression as-is (no escaping),
    so they must already be valid regex fragments.
    '''
    alternatives = u'|'.join(emoticons)
    emoticon_regex = u'(' + alternatives + u')'
    return re.sub(emoticon_regex, r'', text)


def replace_pornsites_with_string(text:str, replacement_string:str='porn')->str:
    '''
    Replace whole non-whitespace tokens that embed one of the listed site names with `replacement_string`.

    A token only matches when at least one non-whitespace character precedes the site name
    and at least one follows the ".co" part.
    '''
    site_regex = (
        r'\S+xnxx\.co\S+'
        r'|\S+pornhub\.co\S+'
        r'|\S+nude\.co\S+'
        r'|\S+sex\.co\S+'
    )
    return re.sub(site_regex, replacement_string, text)

def remove_urls(text:str)-> str:
    '''
    Delete every http(s):// link and every token starting with "www." from `text`.
    '''
    return re.sub(r'https?://\S+|www\.\S+', r'', text)


def remove_special_characters(text:str)-> str:
    '''
    Replace each run of characters other than ASCII letters, digits and spaces with one space.
    '''
    return re.sub(r'[^A-Za-z0-9 ]+', r' ', text)


def replace_chat_slang(text: str, chat_slang: Dict[str, str])-> str:
    '''
    Expand chat slang: each token whose upper-cased form is a key of `chat_slang`
    is replaced by the mapped phrase; other tokens are kept unchanged.
    Tokens are re-joined with single spaces.
    '''
    return " ".join(chat_slang.get(token.upper(), token) for token in text.split())


def correct_spellings(text: str, speller: Callable) -> str:
    '''
    Replace every word the spell checker does not know with its best correction.

    Args:
        text: whitespace-separated text.
        speller: object exposing `unknown(words)` and `correction(word)`
                 (the notebook passes a pyspellchecker `SpellChecker`).
    Returns:
        The text with unknown words replaced, tokens joined by single spaces.
        A word is kept unchanged when the speller has no suggestion for it.
    '''
    words = text.split()
    misspelled_words = speller.unknown(words)

    # Cache per distinct word: correction() is the expensive call and chat text repeats words.
    corrections = {}
    corrected_text = []
    for word in words:
        if word in misspelled_words:
            if word not in corrections:
                suggestion = speller.correction(word)
                # correction() may return None when no candidate exists; joining None
                # would raise TypeError and make the caller discard the whole message.
                corrections[word] = word if suggestion is None else suggestion
            corrected_text.append(corrections[word])
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)


def preprocess_string_for_bow(text: str, stemmer: Callable=None, speller: Callable=None, words_to_remove:List[str]=None, emoticons: Dict[str, str]=None, chat_slang: Dict[str, str]=None)-> str:
    '''
    Normalise one chat message for bag-of-words style features.

    Steps, in order: strip emoji, strip emoticons, expand chat slang, lower-case,
    replace porn-site links with a marker word, strip remaining URLs, replace
    special characters with spaces, spell-correct.

    Args:
        text: raw message. Non-string input (e.g. None for an empty XML field)
              is reported and turned into ''.
        stemmer, words_to_remove: accepted for interface compatibility; the
              stop-word and stemming steps are currently disabled.
        speller: spell checker passed to correct_spellings(); step skipped when None.
        emoticons: regex-pattern keyed dict passed to remove_emoticons(); step skipped when None.
        chat_slang: upper-case keyed dict passed to replace_chat_slang(); step skipped when None.
    Returns:
        The cleaned text, or '' when `text` could not be processed.
    '''
    try:
        text = remove_emoji(text)
        # Optional resources default to None: skip the corresponding step instead of
        # failing (iterating or dereferencing None used to blank or crash the message).
        if emoticons is not None:
            text = remove_emoticons(text, emoticons)
        if chat_slang is not None:
            text = replace_chat_slang(text, chat_slang)
        text = text.lower()
        text = replace_pornsites_with_string(text)
        text = remove_urls(text)
        text = remove_special_characters(text)
        if speller is not None:
            text = correct_spellings(text, speller)
        # text = remove_stopwords(text, words_to_remove)
        # text = stem_text(text, stemmer)
    except(TypeError):
        print(f'Problematic string: {text}')
        text = ''
    return text


def preprocess_df_for_bow(df: pd.DataFrame, text_col: str, output_col_name='preprocessed_bow', stemmer=None, speller=None, words_to_remove=None, emoticons=None, chat_slang=None)-> pd.DataFrame:
    '''
    Run preprocess_string_for_bow() over one text column of a dataframe.

    Args:
        df: dataframe to process; it is modified in place.
        text_col: name of the column holding the raw text.
        output_col_name: name of the column that receives the cleaned text.
        stemmer, speller, words_to_remove, emoticons, chat_slang: forwarded
            unchanged to preprocess_string_for_bow().
    Returns:
        The same dataframe object, with `output_col_name` added (or overwritten).
    '''
    pipeline_kwargs = {
        'stemmer': stemmer,
        'speller': speller,
        'words_to_remove': words_to_remove,
        'emoticons': emoticons,
        'chat_slang': chat_slang,
    }

    def _clean(raw_text):
        return preprocess_string_for_bow(raw_text, **pipeline_kwargs)

    df[output_col_name] = df[text_col]
    df[output_col_name] = df[output_col_name].apply(_clean)
    return df


# test
# Keyword arguments for the preprocessing pipeline. 'stemmer' and 'words_to_remove'
# are passed along, but the stemming / stop-word steps are commented out in
# preprocess_string_for_bow, so they have no effect on the result here.
preprocess_args = {'stemmer': PorterStemmer(),
                    'speller': SpellChecker(),
                    'words_to_remove': set(stopwords.words('english')),
                    'emoticons': emoticons,
                    'chat_slang': chat_slang
                    }

# NOTE: the first sample is overwritten by the second one before it is used,
# so only the second sample is actually preprocessed and displayed.
text = 'r u going to www.google.com http://xnxx.com im walking LOL ths is not &amp;right im caming flight now u r right brb and fu :-)'
text = 'yeah--well I just want to see you before I go in the apt--cause one of my friends got arrested for doing the same thing with a 16 year old--it was a set-up type thing'

# Last expression of the cell: its value is what the notebook renders.
preprocess_string_for_bow(text, **preprocess_args)

'yeah well i just want to see you before i go in the apt cause one of my friends got arrested for doing the same thing with a 16 year old it was a set up type thing'

## Datasets

### PJ dataset

#### PJ Conversation level dataset

In [None]:
def _pj_elements_to_df(doc_root, tag: str, base_columns: List[str]) -> pd.DataFrame:
    '''Turn every direct `tag` child of `doc_root` into one row of a string-typed dataframe.'''
    # One dict per element: {child tag: child text}.
    records = [{field.tag: field.text for field in element} for element in doc_root.findall(tag)]

    # Expected columns come first; tags not in the expected list are appended
    # in order of first appearance so no data from the file is dropped.
    columns = list(base_columns)
    for record in records:
        for key in record:
            if key not in columns:
                columns.append(key)

    # Build the frame once from all records: growing it row by row is quadratic,
    # and DataFrame.append no longer exists in pandas 2.x.
    return pd.DataFrame(records, columns=columns).astype('string')


def load_one_chat_as_df_pj(file_path: Path) -> Dict[str, pd.DataFrame]:
    '''
    Parse one PJ XML file.

    Args:
        file_path: path to the PJ XML file (parsed as UTF-8).
    Returns:
        A dict with three string-typed dataframes:
            - 'victim': one row per VICTIM element
            - 'predator': one row per PREDATOR element
            - 'conversation': one row per POST element
        Fields missing from an element are <NA>.
    '''
    parser = ET.XMLParser(encoding="utf-8")
    doc_tree = ET.parse(file_path, parser=parser)
    doc_root = doc_tree.getroot()

    posts_df = _pj_elements_to_df(
        doc_root, 'POST',
        ['USERNAME', 'DATETIME', 'BODY', 'COMMENT', 'CODING'])
    predator_df = _pj_elements_to_df(
        doc_root, 'PREDATOR',
        ['FIRSTNAME', 'LASTNAME', 'STATEDNAME', 'STATEDAGE', 'GENDER', 'RACE', 'CITY', 'STATE', 'REPEATOFFENDER', 'ADMITGUILT', 'TRUTHFULNAME', 'SCREENNAME'])
    victim_df = _pj_elements_to_df(
        doc_root, 'VICTIM',
        ['FIRSTNAME', 'LASTNAME', 'STATEDNAME', 'STATEDAGE', 'GENDER', 'RACE', 'CITY', 'STATE', 'PREVIOUSVICTIMIZATION', 'ADMITGUILT', 'SCREENNAME'])

    return {'predator': predator_df, 'victim': victim_df, 'conversation': posts_df}


#----------------------------------------------------------
# Test XML parse functions:
# Parses a single PJ file and previews the three resulting dataframes.
file_path = PJ_DATA_FOLDER / Path('ArmySgt1961.xml')
chat_dict = load_one_chat_as_df_pj(file_path)
# NOTE: a notebook cell renders only its last expression, so the victim and
# predator previews are evaluated but not displayed.
chat_dict['victim'].head()
chat_dict['predator'].head()
chat_dict['conversation'].head(10)


In [None]:

class PjDataset(Dataset):
    """
    Map-style torch Dataset over a folder of PJ conversation XML files.

    The file list is built once in the constructor; each conversation is read
    from disk only when it is indexed (lazy loading) via load_one_chat_as_df_pj().
    """

    def __init__(self, data_folder: Path, df_preprocess_fn=None, df_preprocess_args:Dict=None):
        """
        Args:
          data_folder - folder with PJ XML files (searched recursively)
          df_preprocess_fn - optional function called as fn(df, text_column_name, **df_preprocess_args)
                             that returns the dataframe with a preprocessed text column added
          df_preprocess_args - optional keyword arguments forwarded to df_preprocess_fn
        """
        self.file_list = list_files_in_dir(data_folder, extension='xml')
        self.df_preprocess_fn = df_preprocess_fn
        # Normalise None to an empty dict so `**self.df_preprocess_args` is valid
        # when a preprocess function is supplied without extra arguments.
        self.df_preprocess_args = df_preprocess_args if df_preprocess_args is not None else {}
        self.TEXT_COLUMN_NAME = 'BODY'

    def __len__(self) -> int:
        """
        Returns:
            int: number of conversation files found in the folder
        """
        return len(self.file_list)

    def __getitem__(self, idx):
        """Load one conversation.

        Args:
            idx (int): position of the conversation file in self.file_list
        Returns:
            The 'conversation' dataframe of that file, passed through
            df_preprocess_fn when one was given.
        """
        sample = load_one_chat_as_df_pj(self.file_list[idx])['conversation']
        if self.df_preprocess_fn is not None:
            sample = self.df_preprocess_fn(sample, self.TEXT_COLUMN_NAME, **self.df_preprocess_args)

        return sample

    

In [None]:
# Test the dataset
# Same preprocessing resources as in the earlier preprocessing test cell.
preprocess_args = {'stemmer': PorterStemmer(),
                    'speller': SpellChecker(),
                    'words_to_remove': set(stopwords.words('english')),
                    'emoticons': emoticons,
                    'chat_slang': chat_slang
                    }
                    
# Indexing the dataset loads and preprocesses one conversation on demand.
pj_ds = PjDataset(PJ_DATA_FOLDER, df_preprocess_fn=preprocess_df_for_bow, df_preprocess_args=preprocess_args)
print(len(pj_ds))
print(pj_ds.file_list[1])
pj_ds[1].head(100)

### Pan12 dataloader and dataset

#### Pan12 conversation level dataset

In [None]:
class Pan12Dataset(Dataset):
    '''
    Map-style torch Dataset over the conversations of one PAN12 corpus XML file.

    The XML file is parsed once in the constructor and the <conversation>
    elements are kept in memory; the dataframe of a conversation is built only
    when that conversation is indexed.
    '''

    def __init__(self, chat_data_file: Path, conversation_labels: Path=None, line_labels: Path=None, preprocess_fn=None, preprocess_args=None):
        """
        Args:
            chat_data_file: path to chat xml file
            conversation_labels: stored on the instance, not used by this class
            line_labels: stored on the instance, not used by this class
            preprocess_fn: optional function called as fn(df, text_column_name, **preprocess_args)
            preprocess_args: optional keyword arguments forwarded to preprocess_fn
        """
        self.chat_data_file = chat_data_file
        self.conversations = self._get_conversation_roots(chat_data_file)
        self.preprocess_fn = preprocess_fn
        # Normalise None to an empty dict so `**self.preprocess_args` is always valid.
        self.preprocess_args = preprocess_args if preprocess_args is not None else {}

        self.conversation_labels = conversation_labels
        self.line_labels = line_labels

        self.TEXT_COLUMN_NAME = 'text'

    def __len__(self) -> int:
        """
        Returns:
            int: number of conversations in the file
        """
        return len(self.conversations)

    def __getitem__(self, idx) -> Dict[str, pd.DataFrame]:
        """Build the dataframe of one conversation.

        Args:
            idx (int): position of the conversation in the file
        Returns:
            dict with 'conversation_id' (the id attribute) and 'conversation'
            (one row per message with columns author, line, time, text; the
            output of preprocess_fn when one was given).
        """
        conversation = self.conversations[idx]
        conversation_id = conversation.attrib['id']

        columns = ['author', 'line', 'time', 'text']
        records = []
        for message in conversation.findall('message'):
            message_dict = {'line': message.attrib['line']}
            for field in message:
                message_dict[field.tag] = field.text
                # Keep unexpected child tags as extra trailing columns.
                if field.tag not in columns:
                    columns.append(field.tag)
            records.append(message_dict)

        # Build the frame once: DataFrame.append was removed in pandas 2.x and
        # appending row by row is quadratic in the conversation length.
        conversation_df = pd.DataFrame(records, columns=columns)

        if self.preprocess_fn is not None:
            conversation_df = self.preprocess_fn(conversation_df, self.TEXT_COLUMN_NAME, **self.preprocess_args)

        return {'conversation_id': conversation_id, 'conversation': conversation_df}

    def _get_conversation_roots(self, file_path):
        """Parse the XML file and return the <conversation> elements directly under the root."""
        doc_tree = ET.parse(file_path)
        conversation_roots = doc_tree.getroot().findall('conversation')
        return conversation_roots


In [None]:
# Same preprocessing resources as in the earlier test cells.
preprocess_args = {'stemmer': PorterStemmer(),
                    'speller': SpellChecker(),
                    'words_to_remove': set(stopwords.words('english')),
                    'emoticons': emoticons,
                    'chat_slang': chat_slang
                    }

# Smoke test: build the conversation-level dataset and display one preprocessed conversation.
pan12_ds = Pan12Dataset(PAN12_DATA_FILE, preprocess_fn=preprocess_df_for_bow, preprocess_args=preprocess_args)
pan12_ds[34]['conversation']

#### pan12 line level dataloader

In [None]:
class Pan12LineLevelDataloader():
    """
    Single-pass iterator over every message (chat line) of a PAN12 corpus XML file.

    Each step yields one message as a dict, labelled from the ground-truth files.
    The object is its own iterator: once exhausted it does not restart.
    """

    def __init__(self, chat_data_file: Path, user_labels_file: Path=None, line_labels_file: Path=None, preprocess_fn=None, preprocess_args:Dict=None):
        """
        Args:
            chat_data_file: path to chat xml file
            user_labels_file: tab-separated file without header; column 0 holds the
                author ids to label 1. When None, every author_label is 0.
            line_labels_file: tab-separated file without header; column 0 is a
                conversation id, column 1 a line number. When None, every line_label is 0.
            preprocess_fn: optional function called as fn(text, **preprocess_args)
            preprocess_args: optional keyword arguments forwarded to preprocess_fn
        """
        self.chat_data_file = chat_data_file
        self.conversations = self._get_conversation_roots(chat_data_file)
        self.preprocess_fn = preprocess_fn
        # Normalise None to an empty dict so `**self.preprocess_args` is always valid.
        self.preprocess_args = preprocess_args if preprocess_args is not None else {}

        self.user_labels_file = user_labels_file
        self.line_labels_file = line_labels_file
        self.TEXT_COLUMN_NAME = 'text'

        self.length = self._get_ds_length()
        self.num_conversations = len(self.conversations)

        # Initiate queue
        self.message_list = None
        self.current_conversation_id = None
        self.next_conversation_idx = 0
        self.next_message_idx = 0

        # Create sets of problematic lines and authors for labels.
        # The label files are optional (their parameters default to None).
        self.perverted_authors = set()
        if user_labels_file is not None:
            user_labels = pd.read_csv(user_labels_file, delimiter='\t', header=None)
            self.perverted_authors = set(user_labels[0])

        self.pervert_lines = set()
        if line_labels_file is not None:
            line_labels = pd.read_csv(line_labels_file, delimiter='\t', header=None)
            # Key format "<conversation id>_<line number>", matched in __next__.
            line_labels['concat'] = line_labels[0] + '_' + line_labels[1].astype(str)
            self.pervert_lines = set(line_labels['concat'])

        # Pre-load the first conversation; guarded so that an empty corpus does not
        # leak StopIteration out of the constructor.
        if self.conversations:
            self.load_next_conversation_to_list()

    def __iter__(self):
        return self

    def __len__(self) -> int:
        """
        Returns:
            int: total number of messages in the file
        """
        return self.length

    def __next__(self) -> Dict[str, Any]:
        """Return the next message of the corpus.

        Returns:
            dict with 'conversation_id', 'line', one entry per child tag of the
            message element (author, time, text in the corpus), plus
            'author_label' and 'line_label' (0/1).
        Raises:
            StopIteration: when all conversations have been consumed.
        """
        # Advance past exhausted conversations. A loop (not a single retry) is
        # needed because a conversation may contain no messages at all.
        while self.message_list is None or self.next_message_idx >= len(self.message_list):
            self.load_next_conversation_to_list()

        message = self.message_list[self.next_message_idx]
        self.next_message_idx += 1

        message_dict = {}
        message_dict['conversation_id'] = self.current_conversation_id
        message_dict['line'] = message.attrib['line']

        for field in message:
            message_dict[field.tag] = field.text

        if self.preprocess_fn is not None:
            message_dict[self.TEXT_COLUMN_NAME] = self.preprocess_fn(message_dict.get(self.TEXT_COLUMN_NAME), **self.preprocess_args)

        message_dict['author_label'] = 1 if message_dict.get('author') in self.perverted_authors else 0
        message_dict['line_label'] = 1 if message_dict['conversation_id'] + '_' + message_dict['line'] in self.pervert_lines else 0

        return message_dict

    def _get_conversation_roots(self, file_path):
        """Parse the XML file and return the <conversation> elements directly under the root."""
        doc_tree = ET.parse(file_path)
        conversation_roots = doc_tree.getroot().findall('conversation')
        return conversation_roots

    def _get_ds_length(self):
        """Count the <message> elements over all conversations."""
        number_messages = 0
        for conversation in self.conversations:
            number_messages += len(conversation.findall('message'))

        return number_messages

    def load_next_conversation_to_list(self):
        """Make the next conversation current and reset the message cursor.

        Raises:
            StopIteration: when there is no further conversation.
        """
        try:
            conversation = self.conversations[self.next_conversation_idx]
            self.current_conversation_id = conversation.attrib['id']
        except(IndexError):
            raise StopIteration()

        self.next_conversation_idx += 1
        self.message_list = [m for m in conversation.findall('message')]
        self.next_message_idx = 0

In [None]:
# Test dataset
# Same preprocessing resources as in the earlier test cells.
preprocess_args = {'stemmer': PorterStemmer(),
                    'speller': SpellChecker(),
                    'words_to_remove': set(stopwords.words('english')),
                    'emoticons': emoticons,
                    'chat_slang': chat_slang
                    }

# The line-level loader preprocesses single strings, hence preprocess_string_for_bow
# (not the dataframe variant) is passed here.
pan12_ds = Pan12LineLevelDataloader(PAN12_DATA_FILE, user_labels_file=PAN12_USER_LABELS_FILE, line_labels_file=PAN12_LINE_LABELS_FILE, preprocess_fn=preprocess_string_for_bow, preprocess_args=preprocess_args)
print(len(pan12_ds))

# Print the first 51 messages only (indices 0..50).
for i, m in enumerate(pan12_ds):
    print(i, m) 
    if i==50:
        break

### Convert Pan12 to a labeled dataframe for later use as training data

In [10]:
class Pan12converterToDF():
    """
    Flattens a PAN12 corpus XML file into one labelled dataframe row per message
    and writes the result to disk.
    """

    # TODO: add labels!

    def __init__(self, chat_data_file: Path, user_labels_file: Path=None, line_labels_file: Path=None):
        """
        Args:
            chat_data_file: path to chat xml file
            user_labels_file: tab-separated file without header; column 0 holds the
                author ids to label 1. When None, every author_label is 0.
            line_labels_file: tab-separated file without header; column 0 is a
                conversation id, column 1 a line number. When None, no line is labelled.
        """
        self.chat_data_file = chat_data_file
        self.conversations = self._get_conversation_roots(chat_data_file)

        self.user_labels_file = user_labels_file
        self.line_labels_file = line_labels_file
        self.TEXT_COLUMN_NAME = 'text'

        self.length = self._get_ds_length()
        self.num_conversations = len(self.conversations)

        # Initiate queue
        self._reset_cursor()

        # Create sets of problematic lines and authors for labels.
        # The label files are optional (their parameters default to None).
        self.perverted_authors = set()
        if user_labels_file is not None:
            user_labels = pd.read_csv(user_labels_file, delimiter='\t', header=None)
            self.perverted_authors = set(user_labels[0])

        self.perverted_conversations = set()
        self.pervert_lines = set()
        if line_labels_file is not None:
            line_labels = pd.read_csv(line_labels_file, delimiter='\t', header=None)
            # Key format "<conversation id>_<line number>", matched in convert().
            line_labels['concat'] = line_labels[0] + '_' + line_labels[1].astype(str)
            self.perverted_conversations = set(line_labels[0].unique())
            self.pervert_lines = set(line_labels['concat'])

    def __iter__(self):
        # NOTE(review): kept for interface compatibility; the class defines no
        # __next__, so the object cannot actually be iterated. Use convert().
        return self

    def __len__(self) -> int:
        """
        Returns:
            int: total number of messages in the file
        """
        return self.length

    def convert(self, filename:Path, save_every=2000, mode: str='full') -> pd.DataFrame:
        """Build the labelled dataframe and write it to `filename` as CSV.

        Args:
            filename: output path handed to DataFrame.to_csv.
            save_every: a checkpoint of the rows collected so far is written every
                `save_every` messages (a '.' is printed each time); 0/None disables it.
            mode: 'full' - all messages
                  'positive_lines' - only messages listed in the line labels file
        Returns:
            Dataframe with columns conversation_id, line, author, time, text,
            line_label, author_label (plus any unexpected message child tags).
        Raises:
            ValueError: for an unknown `mode`.
        """
        if mode == 'full':
            iter_len = self.length
        elif mode == 'positive_lines':
            iter_len = len(self.pervert_lines)
        else:
            raise ValueError(f"Unknown mode {mode!r}; expected 'full' or 'positive_lines'")

        # Always start from the first conversation so that convert() can be
        # called more than once on the same object.
        self._reset_cursor()

        columns = ['conversation_id', 'line', 'author', 'time', 'text', 'line_label', 'author_label']
        # Rows are collected in a list and turned into a dataframe in one go:
        # DataFrame.append was removed in pandas 2.x and is quadratic when used per row.
        records = []

        for i in tqdm(range(iter_len)):
            message = self._next_message(mode)
            if message is None:
                # Fewer messages found than expected (e.g. labelled lines that do
                # not exist in the XML): stop cleanly and keep what was collected.
                break

            message_dict = {'conversation_id': self.current_conversation_id,
                            'line': message.attrib['line']}
            for field in message:
                message_dict[field.tag] = field.text

            message_dict['author_label'] = 1 if message_dict.get('author') in self.perverted_authors else 0
            message_dict['line_label'] = 1 if message_dict['conversation_id'] + '_' + message_dict['line'] in self.pervert_lines else 0

            # Keep unexpected child tags as extra trailing columns.
            for key in message_dict:
                if key not in columns:
                    columns.append(key)
            records.append(message_dict)

            if save_every and i % save_every == 0:
                pd.DataFrame(records, columns=columns).to_csv(filename)
                print('.', end='')

        pan12_df = pd.DataFrame(records, columns=columns)
        pan12_df.to_csv(filename)
        return pan12_df

    def _reset_cursor(self):
        """Rewind the conversation/message cursor to the start of the corpus."""
        self.message_list = None
        self.current_conversation_id = None
        self.next_conversation_idx = 0
        self.next_message_idx = 0

    def _next_message(self, mode):
        """Return the next message element for `mode`, or None when the corpus is exhausted."""
        try:
            # Loop (not a single retry): the current conversation may contribute
            # no messages at all, in which case the following one is tried.
            while self.message_list is None or self.next_message_idx >= len(self.message_list):
                self._load_next_conversation_to_list(mode)
        except StopIteration:
            return None

        message = self.message_list[self.next_message_idx]
        self.next_message_idx += 1
        return message

    def _get_conversation_roots(self, file_path):
        """Parse the XML file and return the <conversation> elements directly under the root."""
        doc_tree = ET.parse(file_path)
        conversation_roots = doc_tree.getroot().findall('conversation')
        return conversation_roots

    def _get_ds_length(self):
        """Count the <message> elements over all conversations."""
        number_messages = 0
        for conversation in self.conversations:
            number_messages += len(conversation.findall('message'))

        return number_messages

    def _load_next_conversation_to_list(self, mode):
        """Make the next relevant conversation current and reset the message cursor.

        In 'positive_lines' mode conversations without any labelled line are
        skipped and only the labelled messages are kept.

        Raises:
            StopIteration: when there is no further conversation.
        """
        while True:
            if self.next_conversation_idx >= len(self.conversations):
                raise StopIteration()
            conversation = self.conversations[self.next_conversation_idx]
            self.next_conversation_idx += 1
            self.current_conversation_id = conversation.attrib['id']
            if mode != 'positive_lines' or self.current_conversation_id in self.perverted_conversations:
                break

        if mode == 'positive_lines':
            self.message_list = [m for m in conversation.findall('message') if (self.current_conversation_id + '_' + m.attrib['line'] in self.pervert_lines)]
        else:
            self.message_list = [m for m in conversation.findall('message')]
        self.next_message_idx = 0


In [42]:
# Create a dataframe of all pan12 test perverted lines
# mode='positive_lines' keeps only the messages listed in the line labels file;
# the result is also written to pan12_csv.zip in OUTPUT_FOLDER.

pan12_converter = Pan12converterToDF(PAN12_DATA_FILE, user_labels_file=PAN12_USER_LABELS_FILE, line_labels_file=PAN12_LINE_LABELS_FILE)
print(len(pan12_converter))
pan12_df = pan12_converter.convert(OUTPUT_FOLDER / Path('pan12_csv.zip'), mode='positive_lines')
print(len(pan12_df))
pan12_df.head(100)



2058781


  0%|          | 0/6478 [00:00<?, ?it/s]

....6478


Unnamed: 0,conversation_id,line,author,time,text,line_label,author_label
0,3763edf61689c00884dba353dba5352e,27,a8e6e3985a82dfde8ee95b5f099ec606,21:20,i wanna work inside with u,1,1
1,8c5582b1fa2190b69e51e7154d246bbb,8,54b595f1920b5b1988e907ea693303b4,00:02,we could've had sex,1,1
2,8c5582b1fa2190b69e51e7154d246bbb,9,54b595f1920b5b1988e907ea693303b4,00:02,kidding bout that,1,1
3,6eab795c5f6a9d822d25a2b153736799,33,2eba3cbb71e6ea5af3ede4d7b898f99d,18:28,what do u ussually say when ur going to be gon...,1,1
4,6eab795c5f6a9d822d25a2b153736799,35,2eba3cbb71e6ea5af3ede4d7b898f99d,18:28,what does he say to that?,1,1
...,...,...,...,...,...,...,...
95,cc5fba01f752fae4846aed8f26731b7b,109,b8931a8b614fb54f6051ffc75f39db29,14:40,i will teach you,1,1
96,cc5fba01f752fae4846aed8f26731b7b,111,b8931a8b614fb54f6051ffc75f39db29,14:40,so what would yo udo to me first,1,1
97,cc5fba01f752fae4846aed8f26731b7b,114,b8931a8b614fb54f6051ffc75f39db29,14:41,you can stroke my cock or suck on my nipples,1,1
98,cc5fba01f752fae4846aed8f26731b7b,116,b8931a8b614fb54f6051ffc75f39db29,14:42,i would love to come to you now honey,1,1


In [43]:
# Preprocess pan12 perverted lines 
# Adds the 'preprocessed_bow' column to pan12_df (the dataframe is modified in
# place and re-bound to the same name) and saves the result.

preprocess_args = {'stemmer': PorterStemmer(),
                    'speller': SpellChecker(),
                    'words_to_remove': set(stopwords.words('english')),
                    'emoticons': emoticons,
                    'chat_slang': chat_slang
                    }

pan12_df = preprocess_df_for_bow(pan12_df, 'text', **preprocess_args)
pan12_df.to_csv(OUTPUT_FOLDER / Path('pan12_perverted_lines_preprocessed.zip'))

pan12_df

Unnamed: 0,conversation_id,line,author,time,text,line_label,author_label,preprocessed_bow
0,3763edf61689c00884dba353dba5352e,27,a8e6e3985a82dfde8ee95b5f099ec606,21:20,i wanna work inside with u,1,1,i want to work inside with you
1,8c5582b1fa2190b69e51e7154d246bbb,8,54b595f1920b5b1988e907ea693303b4,00:02,we could've had sex,1,1,we could ve had sex
2,8c5582b1fa2190b69e51e7154d246bbb,9,54b595f1920b5b1988e907ea693303b4,00:02,kidding bout that,1,1,kidding about that
3,6eab795c5f6a9d822d25a2b153736799,33,2eba3cbb71e6ea5af3ede4d7b898f99d,18:28,what do u ussually say when ur going to be gon...,1,1,what do you usually say when you are going to ...
4,6eab795c5f6a9d822d25a2b153736799,35,2eba3cbb71e6ea5af3ede4d7b898f99d,18:28,what does he say to that?,1,1,what does he say to that
...,...,...,...,...,...,...,...,...
6473,cf5d918cdd1601c608c62de1c0641dbd,1,b18ae7c450091f1f200e896d765cce6d,14:33,"Hey, i go to work most days now, so seeing me ...",1,1,hey i go to work most days now so seeing me on...
6474,ec391a65097a955029afaedc23d5fa81,36,2e0d170f2addfb0048f9424a2daa5a73,18:34,u like older guys?,1,1,you like older guys
6475,82add2c9da3c267a98b3981375b6c238,55,c5502c7c9bb5e28508a3e19ec869f6d2,09:32,just sitting here naked talking to you,1,1,just sitting here naked talking to you
6476,82add2c9da3c267a98b3981375b6c238,68,c5502c7c9bb5e28508a3e19ec869f6d2,09:36,"see you might be if i was there, and we could ...",1,1,see you might be if i was there and we could b...


## Classification

### Word list based classification

In [37]:
def contains_words_from_list(text: str, word_list: List[str])-> bool:
    '''
    Tell whether any whole word of `text` appears in `word_list`.

    Words are obtained by replacing every non-word character with a space and
    splitting on whitespace; the comparison is exact (case-sensitive).
    '''
    # Raw string: "\w" inside a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    text_words = re.sub(r"[^\w]", " ", text).split()
    return any(word in word_list for word in text_words)

# Flag every preprocessed line with three booleans, one per word list.
# These assignments add columns to pan12_df in place.
pan12_df['contains_sex_words'] = pan12_df['preprocessed_bow'].apply(lambda text: contains_words_from_list(text, sex_word_list))
pan12_df['contains_family_words'] = pan12_df['preprocessed_bow'].apply(lambda text: contains_words_from_list(text, family_word_list))
pan12_df['contains_meeting_words'] = pan12_df['preprocessed_bow'].apply(lambda text: contains_words_from_list(text, meeting_word_list))

# Written to the same file name that the preprocessing cell uses, so that
# archive is overwritten with the extended dataframe.
pan12_df.to_csv(OUTPUT_FOLDER / Path('pan12_perverted_lines_preprocessed.zip'))

# pan12_df[(pan12_df.contains_sex_words == False) & (pan12_df.contains_family_words == False) & (pan12_df.contains_meeting_words == False)]
# Displayed result: only the rows flagged as containing meeting words.
pan12_df[(pan12_df.contains_meeting_words == True)]

Unnamed: 0,conversation_id,line,author,time,text,line_label,author_label,preprocessed_bow,contains_sex_words,contains_family_words,contains_meeting_words
0,3763edf61689c00884dba353dba5352e,27,a8e6e3985a82dfde8ee95b5f099ec606,21:20,i wanna work inside with u,1,1,i want to work inside with you,False,False,False
2,8c5582b1fa2190b69e51e7154d246bbb,9,54b595f1920b5b1988e907ea693303b4,00:02,kidding bout that,1,1,kidding bout that,False,False,False
3,6eab795c5f6a9d822d25a2b153736799,33,2eba3cbb71e6ea5af3ede4d7b898f99d,18:28,what do u ussually say when ur going to be gon...,1,1,what do you usually say when you are going to ...,False,False,False
4,6eab795c5f6a9d822d25a2b153736799,35,2eba3cbb71e6ea5af3ede4d7b898f99d,18:28,what does he say to that?,1,1,what does he say to that,False,False,False
5,6eab795c5f6a9d822d25a2b153736799,65,2eba3cbb71e6ea5af3ede4d7b898f99d,18:43,how often do u shave ur legs?,1,1,how often do you shave you are legs,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
6470,ab42a85c6dd371e3bf98475d5642ac74,67,86acb75ad942a8df784694ad33c83068,03:05,have u ever had a guy lick you b4?,1,1,have you ever had a guy lick you be,False,False,False
6471,ab42a85c6dd371e3bf98475d5642ac74,69,86acb75ad942a8df784694ad33c83068,03:06,will u wanna lemme try it?,1,1,will you want to lemme try it,False,False,False
6472,ab42a85c6dd371e3bf98475d5642ac74,75,86acb75ad942a8df784694ad33c83068,03:09,sneak out tonite babe,1,1,sneak out tonite babe,False,False,False
6473,cf5d918cdd1601c608c62de1c0641dbd,1,b18ae7c450091f1f200e896d765cce6d,14:33,"Hey, i go to work most days now, so seeing me ...",1,1,hey i go to work most days now so seeing me on...,False,False,False


### TF/IDF - Not started

In [25]:
# NOTE(review): this cell is not runnable as-is. `TfidfVectorizer` is never
# imported in this notebook (the recorded output is a NameError) and `dataset`
# is not defined in any visible cell.
# Presumably this is scikit-learn's sklearn.feature_extraction.text.TfidfVectorizer
# and `dataset` should be an iterable of documents — TODO confirm before use.
tfIdfVectorizer=TfidfVectorizer(use_idf=True)
tfIdf = tfIdfVectorizer.fit_transform(dataset)
# Scores of the first document only (row 0), one row per vocabulary term.
df = pd.DataFrame(tfIdf[0].T.todense(), index=tfIdfVectorizer.get_feature_names(), columns=["TF-IDF"])
df = df.sort_values('TF-IDF', ascending=False)
print (df.head(25))

NameError: name 'TfidfVectorizer' is not defined

## some thoughts
Bag of words - sexual words, fear, trust, family, approach (Location, transport) , other categories - DrouinBoydHancockJames2017
Good article: file:///D:/docs/DSML_IDC/Semester%204/Cyber/Tasks/Task2/ref%20docs/Early%20Text%20Classification%20using%20Multi-Resolution%20Concept%20Representations.pdf
Ensemble and preprocessing: file:///D:/docs/DSML_IDC/Semester%204/Cyber/Tasks/Task2/ref%20docs/PredatoryConversationDetection.pdf
file:///D:/docs/DSML_IDC/Semester%204/Cyber/Tasks/Task2/ref%20docs/Analyzing_Chat_Conversations_of_Pedophil.pdf
