In [1]:
# !pip install pyspellchecker
# !pip install gensim

## General - imports paths etc.

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path

import re
import string

from tqdm.notebook import tqdm as tqdm
from ipywidgets import IntProgress

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer

import torch
from torch.utils.data import Dataset

import gensim
from gensim.models import Word2Vec

import spacy
from spellchecker import SpellChecker

import xml.etree.ElementTree as ET 
import csv

from typing import Dict, Callable, List, Dict, Set, Any
import logging


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

logger = logging.getLogger(__name__)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mryan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
# Folders
PJ_DATA_FOLDER = Path('./customer_data')
PAN12_DATA_FILE = Path('./ref_data/pan12_corpus/pan12-sexual-predator-identification-test-corpus-2012-05-21/pan12-sexual-predator-identification-test-corpus-2012-05-17.xml')
PAN12_LINE_LABELS_FILE = Path('./ref_data/pan12_corpus/pan12-sexual-predator-identification-test-corpus-2012-05-21/pan12-sexual-predator-identification-groundtruth-problem2.txt')
PAN12_USER_LABELS_FILE = Path('./ref_data/pan12_corpus/pan12-sexual-predator-identification-test-corpus-2012-05-21/pan12-sexual-predator-identification-groundtruth-problem1.txt')
OUTPUT_FOLDER = Path('./')




#### Utility functions

In [4]:
# Define datasets with texts and labels

def list_files_in_dir(folder: Path, extension='*') -> List:
    
    file_list = [f for f in folder.glob(f'**/*.{extension}') if f.is_file()]
    return file_list

## Test funcion
# list_files_in_dir(DATA_FOLDER, 'dtd')

### Word lists

In [5]:
# Load word lists
SEX_WL_PATH = Path(r'.\sex_words.txt')
with open(SEX_WL_PATH, 'rt') as handle:
    sex_word_list = handle.read().split('\n')

MEETING_WL_PATH = Path(r'.\meeting_words.txt')
with open(MEETING_WL_PATH, 'rt') as handle:
    meeting_word_list = handle.read().split('\n')

FAMILY_WL_PATH = Path(r'.\family_words.txt')
with open(FAMILY_WL_PATH, 'rt') as handle:
    family_word_list = handle.read().split('\n')

CHAT_SLANG_PATH = Path(r'.\chat_slang.txt')
with open(CHAT_SLANG_PATH, mode='rt') as handle:
    csv_reader = csv.reader(handle, delimiter='\t')
    chat_slang = {rows[0]:rows[1] for rows in csv_reader}

EMOTICONS_PATH = Path(r'.\emoticons.txt')
with open(EMOTICONS_PATH, mode='rt', encoding="utf8") as handle:
    csv_reader = csv.reader(handle, delimiter='\t')
    emoticons = {rows[0]:rows[1] for rows in csv_reader}


## Preprocessing

### Chat text preprocess

In [6]:

def remove_stopwords(text: str, words_to_remove: List[str])-> str:
    '''
    Gets string, returns it without stopwords
    '''
    return " ".join([word for word in str(text).split() if word not in words_to_remove])


def stem_text(text: str, stemmer: Any)-> str:
    '''
    stem text string
    '''
    return " ".join([stemmer.stem(word) for word in text.split()])


def remove_emoji(text: str) -> str:
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)


def remove_emoticons(text: str, emoticons: Dict) -> str:
    emoticon_pattern = re.compile(u'(' + u'|'.join(k for k in emoticons) + u')')
    return emoticon_pattern.sub(r'', text)


def replace_pornsites_with_string(text:str)->str:
    pornsite_pattern = re.compile(r'\S+xnxx\.co\S+' + r'|\S+pornhub\.co\S+' + r'|\S+nude\.co\S+' + r'|\S+sex\.co\S+')
    return pornsite_pattern.sub(r'porn site', text)

def remove_urls(text:str)-> str:
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)


def remove_special_characters(text:str)-> str:
    special_chars_pattern = re.compile(r'[^A-Za-z0-9 ]+')
    return special_chars_pattern.sub(r'', text)


def replace_chat_slang(text: str, chat_slang: Dict[str, str])-> str:
    new_text = []
    for w in text.split():
        if w.upper() in chat_slang.keys():
            new_text.append(chat_slang[w.upper()])
        else:
            new_text.append(w)
    return " ".join(new_text)


def correct_spellings(text: str, speller: Callable) -> str:
    corrected_text = []
    misspelled_words = speller.unknown(text.split())
    for word in text.split():
        if word in misspelled_words:
            corrected_text.append(speller.correction(word))
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)


def preprocess_string_for_bow(text: str, stemmer: Callable=None, speller: Callable=None, words_to_remove:List[str]=None, emoticons: Dict[str, str]=None)-> str:
    # text = remove_emoji(text)
    # text = remove_emoticons(text, emoticons)
    text = replace_chat_slang(text, chat_slang)
    text = text.lower()
    text = replace_pornsites_with_string(text)
    text = remove_urls(text)
    text = remove_special_characters(text)
    text = remove_stopwords(text, words_to_remove)
    text = correct_spellings(text, speller)
    # text = stem_text(text, stemmer)
    return text


def preprocess_df_for_bow(df: pd.DataFrame, text_col: str, output_col_name='preprocessed_bow', stemmer=None, speller=None, words_to_remove=None, emoticons=None)-> pd.DataFrame:
    '''
    Gets a PD dataframe and a text column name
    returns the same dataframe with additional column called 'posts_preprocessed_bow'
    '''
    speller = SpellChecker()
    words_to_remove = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    df[output_col_name] = df[text_col]
    df[output_col_name] = df[output_col_name].apply(lambda text: preprocess_string_for_bow(text, stemmer=stemmer, speller=speller, words_to_remove=words_to_remove, emoticons=emoticons))
    return df


# test
speller = SpellChecker()
words_to_remove = set(stopwords.words('english'))
stemmer = PorterStemmer()
text = 'r u going to www.google.com http://xnxx.com im walking LOL ths is not &amp;right im caming flight now u r right brb and fu :-)'

preprocess_string_for_bow(text, stemmer=stemmer, speller=speller, words_to_remove=words_to_remove, emoticons=emoticons)

'going porn site walking laughing loud the alright coming flight right right back fuck'

## Datasets

### Convesation level datasets


#### PJ Convesation level dataset

In [7]:
def load_one_chat_pj(file_path: Path) -> Dict[str, pd.DataFrame]:
    '''
    Gets an XML doctree (ET parser)
    returns three dataframes:
        - victim
        - predator
        - chat posts
    '''
    doc_tree = ET.parse(file_path)
    doc_root = doc_tree.getroot()
    
    posts_df = pd.DataFrame(columns = ['USERNAME', 'DATETIME', 'BODY', 'COMMENT', 'CODING'], dtype=str)
    predator_df = pd.DataFrame(columns = ['FIRSTNAME', 'LASTNAME', 'STATEDNAME', 'STATEDAGE', 'GENDER', 'RACE', 'CITY', 'STATE', 'REPEATOFFENDER', 'ADMITGUILT', 'TRUTHFULNAME', 'SCREENNAME'], dtype=str)
    victim_df = pd.DataFrame(columns = ['FIRSTNAME', 'LASTNAME', 'STATEDNAME', 'STATEDAGE', 'GENDER', 'RACE', 'CITY', 'STATE', 'PREVIOUSVICTIMIZATION', 'ADMITGUILT', 'SCREENNAME'], dtype=str)

    for post in doc_root.findall('POST'):
        post_dict = {}
        for field in post:
            post_dict[field.tag] = field.text

        posts_df = posts_df.append(post_dict, ignore_index=True)
    posts_df = posts_df.astype('string')


    for predator in doc_root.findall('PREDATOR'):
        predator_dict = {}
        for field in predator:
            predator_dict[field.tag] = field.text

        predator_df = predator_df.append(predator_dict, ignore_index=True)   
    predator_df = predator_df.astype('string')


    for victim in doc_root.findall('VICTIM'):
        victim_dict = {}
        for field in victim:
            victim_dict[field.tag] = field.text

        victim_df = victim_df.append(victim_dict, ignore_index=True)  
    victim_df = victim_df.astype('string')

    return {'predator': predator_df, 'victim': victim_df, 'conversation': posts_df}


#----------------------------------------------------------
# Test XML parse functions:
file_path = PJ_DATA_FOLDER / Path('ArmySgt1961.xml')
chat_dict = load_one_chat_pj(file_path)
chat_dict['victim'].head()
chat_dict['predator'].head()
chat_dict['conversation'].head(10)


Unnamed: 0,USERNAME,DATETIME,BODY,COMMENT,CODING
0,armysgt1961,(7:02:01 pm),im dennis us army soldier from cincinnati,,
1,peekaboo1293,(7:02:30 pm),hi im becky from ky,,
2,armysgt1961,(7:02:35 pm),how old ru,,
3,peekaboo1293,(7:02:42 pm),13 how old ru,(age stated and he didn't bat an eye),
4,armysgt1961,(7:02:44 pm),u single,,
5,peekaboo1293,(7:02:51 pm),yeah,,
6,peekaboo1293,(7:03:03 pm),i had a bf but we broke up when i moved here,,
7,armysgt1961,(7:03:11 pm),ok u have sex at 13,(he obviously knows my age),
8,peekaboo1293,(7:03:28 pm),u mean did i ever,,
9,armysgt1961,(7:03:32 pm),yeah,,


In [8]:

class PjDataset(Dataset):
    """
    Wrapper around Torch Dataset to perform text classification
    """

    def __init__(self, data_folder: Path, preprocess_fn=None):
        """
        Args:
            texts (List[str]): a list with texts to classify or to train the
                classifier on
            labels List[str]: a list with classification labels (optional)
        """
       
        self.file_list = list_files_in_dir(data_folder, extension='xml')
        self.preprocess_fn = preprocess_fn
        self.TEXT_COLUMN_NAME = 'BODY'
        
    def __len__(self) -> int:
        """
        Returns:
            int: length of the dataset
        """
        return len(self.file_list)

    def __getitem__(self, idx):
        """Gets element of the dataset

        Args:
            index (int): index of the element in the dataset
        Returns:
            Single element by index
        """        
        sample = load_one_chat_pj(self.file_list[idx])['conversation']
        if self.preprocess_fn is not None:
            sample = self.preprocess_fn(sample, self.TEXT_COLUMN_NAME)

        return sample

    

In [9]:

# Test the dataset
pj_ds = PjDataset(PJ_DATA_FOLDER, preprocess_fn=preprocess_df_for_bow)
print(pj_ds.file_list[0])
pj_ds[0].head(100)


customer_data\ArmySgt1961.xml


Unnamed: 0,USERNAME,DATETIME,BODY,COMMENT,CODING,preprocessed_bow
0,armysgt1961,(7:02:01 pm),im dennis us army soldier from cincinnati,,,dennis us army soldier cincinnati
1,peekaboo1293,(7:02:30 pm),hi im becky from ky,,,hi becky ky
2,armysgt1961,(7:02:35 pm),how old ru,,,old
3,peekaboo1293,(7:02:42 pm),13 how old ru,(age stated and he didn't bat an eye),,13 old
4,armysgt1961,(7:02:44 pm),u single,,,single
...,...,...,...,...,...,...
91,peekaboo1293,7:27:10 PM,(incoming phone number given),,,incoming phone number given
92,armysgt1961,7:27:37 PM,name,,,name
93,peekaboo1293,7:27:58 PM,boo real name is becky,,,boo real name becky
94,peekaboo1293,8:38:52 PM,where r u,,,


#### pan12 convesation level dataset 

In [10]:
class Pan12Dataset(Dataset):
    """
    Wrapper around Torch Dataset to perform text classification
    """

    def __init__(self, chat_data_file: Path, conversation_labels: Path=None, line_labels: Path=None, preprocess_fn=None, preprocess_args=None):
        """
        Args:
            chat_data_file: path to chat xml file
            conversation_labels:
            line_labels:  
        """
       
        self.chat_data_file = chat_data_file
        self.conversations = self._get_conversation_roots(chat_data_file)
        self.preprocess_fn = preprocess_fn
        self.preprocess_args = preprocess_args

        self.conversation_labels = conversation_labels
        self.line_labels = line_labels

        self.TEXT_COLUMN_NAME = 'text'

                
    def __len__(self) -> int:
        """
        Returns:
            int: length of the dataset
        """
        return len(self.conversations)

    def __getitem__(self, idx) -> Dict[str, pd.DataFrame]:
        """Gets element of the dataset

        Args:
            index (int): index of the element in the dataset
        Returns:
            Single element by index
        """        

        conversation = self.conversations[idx]
        conversation_id = conversation.attrib['id']
        conversation_df = pd.DataFrame(columns = ['author', 'line', 'time', 'text'], dtype=str)

        for message in conversation.findall('message'):
            message_dict = {}
            message_dict['line'] = message.attrib['line']
            for field in message:
                message_dict[field.tag] = field.text

            conversation_df = conversation_df.append(message_dict, ignore_index=True)
                
        if self.preprocess_fn is not None:
            conversation_df = self.preprocess_fn(conversation_df, self.TEXT_COLUMN_NAME, *self.preprocess_args)

        return {'conversation_id': conversation_id, 'conversation': conversation_df}
    
    def _get_conversation_roots(self, file_path):
        doc_tree = ET.parse(file_path)
        conversation_roots = doc_tree.getroot().findall('conversation')
        return conversation_roots

    
PAN12_DATA_FILE = Path('./ref_data/pan12_corpus/pan12-sexual-predator-identification-training-corpus-2012-05-01/pan12-sexual-predator-identification-training-corpus-2012-05-01.xml')
PAN12_LINE_LABELS_FILE = Path('./ref_data/pan12_corpus/pan12-sexual-predator-identification-training-corpus-2012-05-01/pan12-sexual-predator-identification-diff.txt')

# preprocess_string_for_bow(text, stemmer=stemmer, speller=speller, words_to_remove=words_to_remove, emoticons=emoticons)
preprocess_args = {'stemmer':stemmer, 'speller':speller, 'words_to_remove':words_to_remove, 'emoticons':emoticons}
pan12_ds = Pan12Dataset(PAN12_DATA_FILE, preprocess_fn=preprocess_df_for_bow, preprocess_args=preprocess_args)


In [11]:
pan12_ds[45]['conversation']

Unnamed: 0,author,line,time,text,stemmer
0,0c8dce20967cf80665e60051b8ab2d3c,1,17:40,jrossi: how do I make my peft tests working on...,rossi make left tests working ie
1,0c8dce20967cf80665e60051b8ab2d3c,2,17:40,it doesn't give any error messages or anything...,doesnt give error messages anything
2,0c8dce20967cf80665e60051b8ab2d3c,3,17:40,most of my perf tests don't work on IE9,perf tests dont work ie
3,0c8dce20967cf80665e60051b8ab2d3c,4,17:41,one reason seems to be that textContent isn't ...,one reason seems textcontent isnt implemented


### Line level dataloaders

#### pan12 line level dataset

In [27]:
class Pan12LineLevelDataloader():

    # TODO: add labels!
    
    """
    Wrapper around Torch Dataset to perform text classification
    """

    def __init__(self, chat_data_file: Path, user_labels_file: Path=None, line_labels_file: Path=None, preprocess_fn=None, preprocess_args:Dict=None):
        """
        Args:
            chat_data_file: path to chat xml file
            conversation_labels:
            line_labels:  
        """
       
        self.chat_data_file = chat_data_file
        self.conversations = self._get_conversation_roots(chat_data_file)
        self.preprocess_fn = preprocess_fn
        self.preprocess_args = preprocess_args

        self.user_labels_file = user_labels_file
        self.line_labels_file = line_labels_file
        self.TEXT_COLUMN_NAME = 'text'

        self.length = self._get_ds_length()
        self.num_conversations = len(self.conversations)

        # Initiate queue
        self.message_list = None
        self.current_conversation_id = None
        self.next_conversation_idx = 0
        self.next_message_idx = 0

        # Create sets of problematic lines and authors for labels
        user_labels = pd.read_csv(user_labels_file, delimiter='\t', header=None)
        self.perverted_authors = set(user_labels[0])

        line_labels = pd.read_csv(line_labels_file, delimiter='\t', header=None)
        line_labels['concat'] = line_labels[0] + '_' + line_labels[1].astype(str)
        self.pervert_lines = set(line_labels['concat'])

        self.load_next_conversation_to_list()
                       
    def __iter__(self):
        return self

    def __len__(self) -> int:
        """
        Returns:
            int: length of the dataset
        """
        return self.length

    def __next__(self) -> Dict[str, pd.DataFrame]:
        """Gets element of the dataset

        Args:
            index (int): index of the element in the dataset
        Returns:
            Single element by index
        """        
        message_dict = {}
        try:
            message = self.message_list[self.next_message_idx]
        except(IndexError):
            self.load_next_conversation_to_list()
            message = self.message_list[self.next_message_idx]

        message_dict['conversation_id'] = self.current_conversation_id
        self.next_message_idx += 1
        
        message_dict['line'] = message.attrib['line']  

        for field in message:
            message_dict[field.tag] = field.text
        
        if self.preprocess_fn is not None:
            message_dict['text'] = self.preprocess_fn(message_dict['text'], **self.preprocess_args)
        
        message_dict['author_label'] = 1 if message_dict['author'] in self.perverted_authors else 0
        message_dict['line_label'] = 1 if message_dict['conversation_id'] + '_' + message_dict['line'] in self.pervert_lines else 0

        return message_dict
    
    def _get_conversation_roots(self, file_path):
        doc_tree = ET.parse(file_path)
        conversation_roots = doc_tree.getroot().findall('conversation')
        return conversation_roots

    def _get_ds_length(self):
        number_messages = 0
        for conversation in self.conversations:
            number_messages += len(conversation.findall('message'))
        
        return number_messages

    def load_next_conversation_to_list(self):
        try:
            conversation = self.conversations[self.next_conversation_idx] 
            self.current_conversation_id = conversation.attrib['id']  
        except(IndexError):
            raise StopIteration()

        self.next_conversation_idx += 1
        self.message_list = [m for m in conversation.findall('message')]
        self.next_message_idx = 0




In [28]:
# Test dataset
preprocess_args = {'stemmer':stemmer, 'speller':speller, 'words_to_remove':words_to_remove, 'emoticons':emoticons}
pan12_ds = Pan12LineLevelDataloader(PAN12_DATA_FILE, user_labels_file=PAN12_USER_LABELS_FILE, line_labels_file=PAN12_LINE_LABELS_FILE, preprocess_fn=preprocess_string_for_bow, preprocess_args=preprocess_args)
print(len(pan12_ds))

for i, m in enumerate(pan12_ds):
    print(i, m) 
    if i==100:
        break

2058781
0 {'conversation_id': 'affc2df0951b733d14ba92d19d9b7695', 'line': '1', 'author': '0a39f78bcb297ab0ebe8a29c28bfed89', 'time': '15:24', 'text': 'gmail bug 6978 new mark exterminated script elements malformed it', 'author_label': 0, 'line_label': 0}
1 {'conversation_id': 'affc2df0951b733d14ba92d19d9b7695', 'line': '2', 'author': '60659cfda992013e610f285c46692d28', 'time': '15:32', 'text': 'henri ask firebox build question windows', 'author_label': 0, 'line_label': 0}
2 {'conversation_id': 'affc2df0951b733d14ba92d19d9b7695', 'line': '3', 'author': 'b8810fee2f4a71f849f3f7409546d1d9', 'time': '15:34', 'text': '60659cfda992013e610f285c46692d28 sure probably dont know answer', 'author_label': 0, 'line_label': 0}
3 {'conversation_id': 'affc2df0951b733d14ba92d19d9b7695', 'line': '4', 'author': '60659cfda992013e610f285c46692d28', 'time': '15:35', 'text': 'appears build runs creates firefoxexe dustbin', 'author_label': 0, 'line_label': 0}
4 {'conversation_id': 'affc2df0951b733d14ba92d19d9b

### Convert Pan12 to labeled datafreame

In [35]:
class Pan12converterToDF():

    # TODO: add labels!
    
    """
    Wrapper around Torch Dataset to perform text classification
    """

    def __init__(self, chat_data_file: Path, user_labels_file: Path=None, line_labels_file: Path=None):
        """
        Args:
            chat_data_file: path to chat xml file
            conversation_labels:
            line_labels:  
        """
       
        self.chat_data_file = chat_data_file
        self.conversations = self._get_conversation_roots(chat_data_file)

        self.user_labels_file = user_labels_file
        self.line_labels_file = line_labels_file
        self.TEXT_COLUMN_NAME = 'text'

        self.length = self._get_ds_length()
        self.num_conversations = len(self.conversations)

        # Initiate queue
        self.message_list = None
        self.current_conversation_id = None
        self.next_conversation_idx = 0
        self.next_message_idx = 0

        # Create sets of problematic lines and authors for labels
        user_labels = pd.read_csv(user_labels_file, delimiter='\t', header=None)
        self.perverted_authors = set(user_labels[0])

        line_labels = pd.read_csv(line_labels_file, delimiter='\t', header=None)
        line_labels['concat'] = line_labels[0] + '_' + line_labels[1].astype(str)
        self.pervert_lines = set(line_labels['concat'])

        self.load_next_conversation_to_list() 


    def __iter__(self):
        return self

    def __len__(self) -> int:
        """
        Returns:
            int: length of the dataset
        """
        return self.length

    def convert(self, filename:Path) -> pd.DataFrame:
        """Gets element of the dataset

        Args:
            index (int): index of the element in the dataset
        Returns:
            Single element by index
        """        
        
        pan12_df = pd.DataFrame(columns=['conversation_id', 'line', 'author', 'time', 'text', 'line_label', 'author_label'])
        for i in tqdm(range(self.length)):
            message_dict = {}
            try:
                message = self.message_list[self.next_message_idx]
            except(IndexError):
                self.load_next_conversation_to_list()
                message = self.message_list[self.next_message_idx]
            
            message_dict['conversation_id'] = self.current_conversation_id
            self.next_message_idx += 1
            
            message_dict['line'] = message.attrib['line']  
            for field in message:
                message_dict[field.tag] = field.text
            
            message_dict['author_label'] = 1 if message_dict['author'] in self.perverted_authors else 0
            message_dict['line_label'] = 1 if message_dict['conversation_id'] + '_' + message_dict['line'] in self.pervert_lines else 0
            
            pan12_df = pan12_df.append(message_dict, ignore_index=True)
            if i % 1000 == 0:
                pan12_df.to_csv(filename)
                print('.', end='')

            # #######
            # if i == 2001:
            #     print(pan12_df.head(100))
            #     break
            # ######
        pan12_df.to_csv(filename)
        return pan12_df
    
    def _get_conversation_roots(self, file_path):
        doc_tree = ET.parse(file_path)
        conversation_roots = doc_tree.getroot().findall('conversation')
        return conversation_roots

    def _get_ds_length(self):
        number_messages = 0
        for conversation in self.conversations:
            number_messages += len(conversation.findall('message'))
        
        return number_messages

    def load_next_conversation_to_list(self):
        try:
            conversation = self.conversations[self.next_conversation_idx] 
            self.current_conversation_id = conversation.attrib['id']  
        except(IndexError):
            raise StopIteration()

        self.next_conversation_idx += 1
        self.message_list = [m for m in conversation.findall('message')]
        self.next_message_idx = 0


In [36]:
# # Test converter

# pan12_converter = Pan12converterToDF(PAN12_DATA_FILE, user_labels_file=PAN12_USER_LABELS_FILE, line_labels_file=PAN12_LINE_LABELS_FILE)
# print(len(pan12_converter))
# pan12_df = pan12_converter.convert(OUTPUT_FOLDER / Path('pan12_csv.zip'))
# print(len(pan12_df))
# pan12_df.head(100)


2058781


  0%|          | 0/2058781 [00:00<?, ?it/s]

...                     conversation_id line                            author  \
0   affc2df0951b733d14ba92d19d9b7695    1  0a39f78bcb297ab0ebe8a29c28bfed89   
1   affc2df0951b733d14ba92d19d9b7695    2  60659cfda992013e610f285c46692d28   
2   affc2df0951b733d14ba92d19d9b7695    3  b8810fee2f4a71f849f3f7409546d1d9   
3   affc2df0951b733d14ba92d19d9b7695    4  60659cfda992013e610f285c46692d28   
4   affc2df0951b733d14ba92d19d9b7695    5  60659cfda992013e610f285c46692d28   
..                               ...  ...                               ...   
95  17784c5a093477c1706b1a68cea7c802    4  fcfda042c76436b97eca32b6c0490d1d   
96  17784c5a093477c1706b1a68cea7c802    5  fcfda042c76436b97eca32b6c0490d1d   
97  17784c5a093477c1706b1a68cea7c802    6  fcfda042c76436b97eca32b6c0490d1d   
98  17784c5a093477c1706b1a68cea7c802    7  fcfda042c76436b97eca32b6c0490d1d   
99  17784c5a093477c1706b1a68cea7c802    8  fcfda042c76436b97eca32b6c0490d1d   

     time                                       

Unnamed: 0,conversation_id,line,author,time,text,line_label,author_label
0,affc2df0951b733d14ba92d19d9b7695,1,0a39f78bcb297ab0ebe8a29c28bfed89,15:24,bugmail: [Bug 6978] New: Mark eof-terminated s...,0,0
1,affc2df0951b733d14ba92d19d9b7695,2,60659cfda992013e610f285c46692d28,15:32,"Henri, can I ask you a Firefox build question ...",0,0
2,affc2df0951b733d14ba92d19d9b7695,3,b8810fee2f4a71f849f3f7409546d1d9,15:34,"60659cfda992013e610f285c46692d28: sure, but I ...",0,0
3,affc2df0951b733d14ba92d19d9b7695,4,60659cfda992013e610f285c46692d28,15:35,"It appears the build runs through, it creates ...",0,0
4,affc2df0951b733d14ba92d19d9b7695,5,60659cfda992013e610f285c46692d28,15:35,"when I start it, I get my standard install of ...",0,0
...,...,...,...,...,...,...,...
95,17784c5a093477c1706b1a68cea7c802,4,fcfda042c76436b97eca32b6c0490d1d,03:10,hi,0,0
96,17784c5a093477c1706b1a68cea7c802,5,fcfda042c76436b97eca32b6c0490d1d,03:10,sorry,0,0
97,17784c5a093477c1706b1a68cea7c802,6,fcfda042c76436b97eca32b6c0490d1d,03:10,i was just angry,0,0
98,17784c5a093477c1706b1a68cea7c802,7,fcfda042c76436b97eca32b6c0490d1d,03:10,what,0,0


## TF/IDF - Not started

In [None]:
tfIdfVectorizer=TfidfVectorizer(use_idf=True)
tfIdf = tfIdfVectorizer.fit_transform(dataset)
df = pd.DataFrame(tfIdf[0].T.todense(), index=tfIdfVectorizer.get_feature_names(), columns=["TF-IDF"])
df = df.sort_values('TF-IDF', ascending=False)
print (df.head(25))

## some thoughts
Bag of words - sexual words, fear, trust, family, approach (Location, transport) , other categories - DrouinBoydHancockJames2017
Good article: file:///D:/docs/DSML_IDC/Semester%204/Cyber/Tasks/Task2/ref%20docs/Early%20Text%20Classification%20using%20Multi-Resolution%20Concept%20Representations.pdf
Ensamble and preprocessing: file:///D:/docs/DSML_IDC/Semester%204/Cyber/Tasks/Task2/ref%20docs/PredatoryConversationDetection.pdf
file:///D:/docs/DSML_IDC/Semester%204/Cyber/Tasks/Task2/ref%20docs/Analyzing_Chat_Conversations_of_Pedophil.pdf
