# Generate the PAN 2012 training set as a CSV file

In [None]:
from google.colab import drive
from config import *

import pandas as pd
import xml.etree.ElementTree as et

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): DRIVE_PATH (from config) presumably points inside this mount — confirm.
drive.mount('/content/drive')

Mounted at /content/drive


## Converting the XML dataset to CSV

In [None]:
# Path of the PAN 2012 training corpus (XML), located directly under DRIVE_PATH.
dataset = f"{DRIVE_PATH}pan12-sexual-predator-identification-training-corpus-2012-05-01.xml"

# Parse the whole document and keep a handle on its root element,
# which the following cell iterates over.
tree = et.parse(dataset)
root = tree.getroot()

In [None]:
cols = ['conversation_id', 'line', 'author', 'time', 'message']

# Flatten the XML tree into one record per second-level element:
# each child of `root` supplies the `id` attribute, and each of its children
# supplies the `line` attribute plus the text of its <author>, <time> and
# <text> sub-elements (the text is None when the sub-element is empty).
rows = [
    {
        'conversation_id': conversation.attrib.get('id'),
        'line': entry.attrib.get('line'),
        'author': entry.find('author').text,
        'time': entry.find('time').text,
        'message': entry.find('text').text,
    }
    for conversation in root
    for entry in conversation
]

# Build the DataFrame in one shot, with the column order fixed by `cols`.
df = pd.DataFrame(rows, columns=cols)

In [None]:
# Preview the assembled DataFrame; as the cell's last expression it is rendered as a table.
df

Unnamed: 0,conversation_id,line,author,time,message
0,e621da5de598c9321a1d505ea95e6a2d,1,97964e7a9e8eb9cf78f2e4d7b2ff34c7,03:20,Hola.
1,e621da5de598c9321a1d505ea95e6a2d,2,0158d0d6781fc4d493f243d4caa49747,03:20,hi.
2,e621da5de598c9321a1d505ea95e6a2d,3,0158d0d6781fc4d493f243d4caa49747,03:20,whats up?
3,e621da5de598c9321a1d505ea95e6a2d,4,97964e7a9e8eb9cf78f2e4d7b2ff34c7,03:20,not a ton.
4,e621da5de598c9321a1d505ea95e6a2d,5,97964e7a9e8eb9cf78f2e4d7b2ff34c7,03:20,you?
...,...,...,...,...,...
903602,4ed6b02ae537fdfd6078597b706292a8,101,74bfc043bd5ce9c17b37ffae6e0ba2fa,22:36,oh ok
903603,4ed6b02ae537fdfd6078597b706292a8,102,8cd850ea4215ee7c4b94b6bcc0bae593,22:36,i will look for you tomorrow
903604,4ed6b02ae537fdfd6078597b706292a8,103,74bfc043bd5ce9c17b37ffae6e0ba2fa,22:36,ok
903605,4ed6b02ae537fdfd6078597b706292a8,104,8cd850ea4215ee7c4b94b6bcc0bae593,22:36,bye lissa


In [None]:
# Write the DataFrame under DRIVE_PATH as a ';'-separated CSV file,
# with a header row and without the index column.

df.to_csv(
    DRIVE_PATH + 'databases/training.csv',
    sep=';',
    header=True,
    index=False,
)

In [None]:
# Summarize the DataFrame: per-column dtype and non-null count, plus memory usage.
# show_counts=True explicitly requests the non-null counts.

df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 903607 entries, 0 to 903606
Data columns (total 5 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   conversation_id  903607 non-null  object
 1   line             903607 non-null  object
 2   author           903607 non-null  object
 3   time             903607 non-null  object
 4   message          900632 non-null  object
dtypes: object(5)
memory usage: 34.5+ MB


In [None]:
# Headline figures for the dataset. The message figure is the total number of
# rows, so rows whose message is null are included in it.
n_conversations = df['conversation_id'].nunique()
n_messages = len(df['message'])
n_authors = df['author'].nunique()

print('Number of conversations: ', n_conversations)
print('Number of messages: ', n_messages)
print('Number of authors: ', n_authors)

Number of conversations:  66927
Number of messages:  903607
Number of authors:  97689
