In [1]:
!pip install pyspellchecker



## General - imports, paths, etc.

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path

import re
import string

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer

import spacy
from spellchecker import SpellChecker

import xml.etree.ElementTree as ET 
import csv

from typing import Dict, Callable, List, Dict, Set, Any
import logging


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

logger = logging.getLogger(__name__)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mryan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Folders
# Paths are relative to the notebook's working directory. They are built from
# plain path components (no '.\' prefix) so that no backslash escape sequence
# ends up in a string literal and the same code works on every OS.
DATA_FOLDER = Path('customer_data')
file_path = DATA_FOLDER / 'ArmySgt1961.xml'

### Word lists

In [4]:
# Load word lists
def _load_word_list(path: Path) -> List[str]:
    '''
    Read a one-entry-per-line text file and return its lines as a list.
    Blank lines are skipped, so a trailing newline at the end of the file
    does not add an empty-string entry to the list.
    '''
    with open(path, 'rt') as handle:
        return [line for line in handle.read().split('\n') if line.strip()]


def _load_tsv_mapping(path: Path, encoding: str = None) -> Dict[str, str]:
    '''
    Read a tab-separated file and return {first column: second column}.
    Rows with fewer than two columns (e.g. blank lines) are skipped instead of
    raising IndexError. `encoding=None` keeps the platform default encoding.
    '''
    # newline='' is what the csv module expects from the file object it reads.
    with open(path, mode='rt', encoding=encoding, newline='') as handle:
        csv_reader = csv.reader(handle, delimiter='\t')
        return {row[0]: row[1] for row in csv_reader if len(row) >= 2}


# All resource files are looked up relative to the notebook's working directory.
SEX_WL_PATH = Path('sex_words.txt')
sex_word_list = _load_word_list(SEX_WL_PATH)

MEETING_WL_PATH = Path('meeting_words.txt')
meeting_word_list = _load_word_list(MEETING_WL_PATH)

FAMILY_WL_PATH = Path('family_words.txt')
family_word_list = _load_word_list(FAMILY_WL_PATH)

# chat_slang: first TSV column -> second TSV column
CHAT_SLANG_PATH = Path('chat_slang.txt')
chat_slang = _load_tsv_mapping(CHAT_SLANG_PATH)

# emoticons: first TSV column -> second TSV column (file is read as UTF-8)
EMOTICONS_PATH = Path('emoticons.txt')
emoticons = _load_tsv_mapping(EMOTICONS_PATH, encoding="utf8")


## Data loaders, data parsers, data sets


In [5]:
def _records_to_frame(doc_root: ET.Element, tag: str, base_columns: List[str]) -> pd.DataFrame:
    '''
    Turn every direct child element named `tag` of `doc_root` into one dataframe row.

    Each sub-element of a matched element becomes a cell: column = sub-element tag,
    value = sub-element text. The frame always has `base_columns` (in that order);
    tags not listed there are appended as extra columns in order of first appearance.
    All columns are converted to the pandas 'string' dtype, missing values become <NA>.
    '''
    records = [
        {field.tag: field.text for field in element}
        for element in doc_root.findall(tag)
    ]

    # Keep unexpected tags instead of silently dropping them.
    columns = list(base_columns)
    for record in records:
        for key in record:
            if key not in columns:
                columns.append(key)

    # Build the frame once from all records; growing it row by row is quadratic
    # and DataFrame.append no longer exists in pandas >= 2.0.
    return pd.DataFrame(records, columns=columns).astype('string')


def parse_chat_file_pj(file_path: Path) -> Dict[str, pd.DataFrame]:
    '''
    Parse one chat XML file (ElementTree parser).

    file_path: path of the XML file to read.

    Returns a dict of three all-string dataframes:
        - 'victim':   one row per <VICTIM> element under the root
        - 'predator': one row per <PREDATOR> element under the root
        - 'posts':    one row per <POST> element under the root

    Raises whatever ET.parse raises for a missing or malformed file.
    '''
    doc_root = ET.parse(file_path).getroot()

    posts_columns = ['USERNAME', 'DATETIME', 'BODY', 'COMMENT', 'CODING']
    predator_columns = ['FIRSTNAME', 'LASTNAME', 'STATEDNAME', 'STATEDAGE', 'GENDER', 'RACE', 'CITY', 'STATE', 'REPEATOFFENDER', 'ADMITGUILT', 'TRUTHFULNAME', 'SCREENNAME']
    victim_columns = ['FIRSTNAME', 'LASTNAME', 'STATEDNAME', 'STATEDAGE', 'GENDER', 'RACE', 'CITY', 'STATE', 'PREVIOUSVICTIMIZATION', 'ADMITGUILT', 'SCREENNAME']

    posts_df = _records_to_frame(doc_root, 'POST', posts_columns)
    predator_df = _records_to_frame(doc_root, 'PREDATOR', predator_columns)
    victim_df = _records_to_frame(doc_root, 'VICTIM', victim_columns)

    return {'predator': predator_df, 'victim': victim_df, 'posts': posts_df}


#----------------------------------------------------------
# Test XML parse functions:
# A notebook cell renders only its last bare expression, so every preview is
# passed to display() (available in Jupyter/IPython kernels) to show all three.
file_path = DATA_FOLDER / Path('ArmySgt1961.xml')
chat_dict =  parse_chat_file_pj(file_path)
display(chat_dict['victim'].head())
display(chat_dict['predator'].head())
display(chat_dict['posts'].head(10))


Unnamed: 0,USERNAME,DATETIME,BODY,COMMENT,CODING
0,armysgt1961,(7:02:01 pm),im dennis us army soldier from cincinnati,,
1,peekaboo1293,(7:02:30 pm),hi im becky from ky,,
2,armysgt1961,(7:02:35 pm),how old ru,,
3,peekaboo1293,(7:02:42 pm),13 how old ru,(age stated and he didn't bat an eye),
4,armysgt1961,(7:02:44 pm),u single,,
5,peekaboo1293,(7:02:51 pm),yeah,,
6,peekaboo1293,(7:03:03 pm),i had a bf but we broke up when i moved here,,
7,armysgt1961,(7:03:11 pm),ok u have sex at 13,(he obviously knows my age),
8,peekaboo1293,(7:03:28 pm),u mean did i ever,,
9,armysgt1961,(7:03:32 pm),yeah,,


## Preprocessing

### Chat text preprocess

In [13]:

def remove_stopwords(text: str, words_to_remove: List[str])-> str:
    '''
    Drop every whitespace-separated token of `text` that is contained in
    `words_to_remove` and re-join the remaining tokens with single spaces.
    `text` is passed through str() first, so non-string input is accepted.
    '''
    kept_tokens = []
    for token in str(text).split():
        if token in words_to_remove:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)


def stem_text(text: str, stemmer: Any)-> str:
    '''
    Apply `stemmer.stem` to each whitespace-separated token of `text`
    and return the stemmed tokens joined by single spaces.
    '''
    stemmed_tokens = map(stemmer.stem, text.split())
    return " ".join(stemmed_tokens)


# Compiled once at definition time and reused by remove_emoji().
# Every range is kept narrow on purpose: one wide range such as
# U+24C2..U+1F251 would also cover CJK, Hangul and most other non-Latin
# scripts and silently delete ordinary text.
_EMOJI_PATTERN = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
                           u"\U0001F000-\U0001F251"  # mahjong/cards, enclosed alphanumerics & ideographs
                           u"\U00002702-\U000027B0"  # dingbats
                           u"\U00002600-\U000026FF"  # miscellaneous symbols
                           u"\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
                           u"\U000024C2"             # circled latin capital letter M
                           u"\U0000FE0F"             # variation selector-16 (emoji presentation)
                           "]+", flags=re.UNICODE)


def remove_emoji(text: str) -> str:
    '''
    Return `text` with every run of emoji / pictographic symbols removed.
    Characters outside the emoji blocks (including non-Latin letters) are kept.
    '''
    return _EMOJI_PATTERN.sub(r'', text)


def remove_emoticons(text: str, emoticons: Dict) -> str:
    '''
    Delete every match of any key of `emoticons` from `text`.
    The keys are inserted into the regular expression as-is (not escaped),
    i.e. they are treated as regex fragments, tried in dict order.
    '''
    alternation = u'|'.join(emoticons)
    return re.sub(u'(' + alternation + u')', r'', text)


def remove_urls(text:str)-> str:
    '''
    Strip URLs from `text`: anything starting with http://, https:// or www.
    up to the next whitespace character is removed.
    '''
    return re.sub(r'https?://\S+|www\.\S+', r'', text)


def replace_chat_slang(text: str, chat_slang: Dict[str, str])-> str:
    '''
    Expand chat abbreviations: each whitespace-separated token whose upper-cased
    form is a key of `chat_slang` is replaced by the mapped value; all other
    tokens are kept unchanged. Tokens are re-joined with single spaces.
    '''
    expanded = (chat_slang.get(token.upper(), token) for token in text.split())
    return " ".join(expanded)


def correct_spellings(text: str, speller: Any) -> str:
    '''
    Replace misspelled tokens of `text` by the speller's best suggestion.

    text:    string to correct; it is split on whitespace.
    speller: object offering `unknown(words)` (returns the words it does not
             know) and `correction(word)` (returns a suggestion or None).

    Returns the tokens joined by single spaces. A token for which the speller
    has no suggestion (correction() returns None) is kept unchanged.
    '''
    tokens = text.split()
    misspelled_words = speller.unknown(tokens)

    # Look each distinct misspelling up only once.
    corrections = {}
    corrected_text = []
    for word in tokens:
        if word in misspelled_words:
            if word not in corrections:
                suggestion = speller.correction(word)
                # None would make the final join() raise TypeError.
                corrections[word] = word if suggestion is None else suggestion
            corrected_text.append(corrections[word])
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)


def preprocess_text_for_bow(text: str, stemmer: Any, speller: Any, words_to_remove:List[str], emoticons: Dict[str, str], slang_map: Dict[str, str] = None)-> str:
    '''
    Normalise one chat message for a bag-of-words representation.

    Steps, in order: remove emoji, remove emoticons, expand chat slang,
    lower-case, remove stopwords, correct spelling, stem.

    text:            message to process.
    stemmer:         object with a `stem(word)` method.
    speller:         object with `unknown(words)` and `correction(word)` methods.
    words_to_remove: stopwords to drop.
    emoticons:       mapping whose keys are the emoticon patterns to remove.
    slang_map:       abbreviation -> expansion mapping; when omitted, the
                     notebook-level `chat_slang` dict is used.
    '''
    if slang_map is None:
        # Default to the notebook-level slang table.
        slang_map = chat_slang

    text = remove_emoji(text)
    text = remove_emoticons(text, emoticons)
    # Slang lookup is done on the upper-cased token, so it runs before lower().
    text = replace_chat_slang(text, slang_map)
    text = text.lower()
    text = remove_stopwords(text, words_to_remove)
    text = correct_spellings(text, speller)
    text = stem_text(text, stemmer)
    return text


def preprocess_posts_for_bow(df: pd.DataFrame, text_col: str)-> pd.DataFrame:
    '''
    Gets a PD dataframe and a text column name.
    Adds a column called 'posts_preprocessed_bow' to that dataframe (in place),
    holding the bag-of-words preprocessed version of `text_col`, and returns
    the same dataframe.

    Missing values in `text_col` are treated as empty strings, so posts without
    a body yield '' instead of raising inside the text helpers.
    '''
    # Built once and shared by all rows.
    speller = SpellChecker()
    words_to_remove = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    df['posts_preprocessed_bow'] = df[text_col].fillna('').apply(
        lambda text: preprocess_text_for_bow(text, stemmer, speller, words_to_remove, emoticons)
    )
    return df




In [15]:
    # Ad-hoc check of preprocess_text_for_bow on a single hand-written message.
    # The helper objects are created the same way preprocess_posts_for_bow creates them.
    speller = SpellChecker()
    words_to_remove = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    # Sample message containing chat slang, misspellings and an emoticon.
    text = 'LOL ths is not right im caming right now u r right brb and fu :-)'

    # Last expression of the cell: its return value is what the notebook shows.
    # `emoticons` is the notebook-level dict loaded from the emoticons file.
    preprocess_text_for_bow(text, stemmer, speller, words_to_remove, emoticons)

'laugh loud the right come right right right back fuck'

## some thoughts
Chat Preprocess: https://www.kaggle.com/code/sudalairajkumar/getting-started-with-text-preprocessing/notebook
Bag of words - sexual words, fear, trust, family, approach (Location, transport) , other categories - DrouinBoydHancockJames2017
Good article: file:///D:/docs/DSML_IDC/Semester%204/Cyber/Tasks/Task2/ref%20docs/Early%20Text%20Classification%20using%20Multi-Resolution%20Concept%20Representations.pdf
Ensemble and preprocessing: file:///D:/docs/DSML_IDC/Semester%204/Cyber/Tasks/Task2/ref%20docs/PredatoryConversationDetection.pdf
file:///D:/docs/DSML_IDC/Semester%204/Cyber/Tasks/Task2/ref%20docs/Analyzing_Chat_Conversations_of_Pedophil.pdf
