# Generate PAN 2012 test set in CSV

In [1]:
from google.colab import drive
from config import *

import pandas as pd
import xml.etree.ElementTree as et

In [2]:
# Mount Google Drive into the Colab runtime at /content/drive so that files
# stored on Drive can be read and written through the local filesystem.
drive.mount('/content/drive')

Mounted at /content/drive


## Turning XML dataset into CSV

In [3]:
# Full path of the PAN 2012 test corpus XML file.
# DRIVE_PATH is not defined in this notebook; presumably it comes from the
# `config` star-import and already ends with a path separator -- verify there.
dataset = f'{DRIVE_PATH}pan12-sexual-predator-identification-test-corpus-2012-05-17.xml'

# Parse the whole document into an in-memory element tree and keep a handle
# on its root element, which the following cells iterate over.
tree = et.parse(dataset)
root = tree.getroot()

In [4]:
# Column order of the resulting DataFrame (and of the CSV written later on).
cols = ['conversation_id', 'line', 'author', 'time', 'message']
rows = []

# Flatten the XML tree into one record per message: every child of the root
# is walked as a conversation, and every child of a conversation as a message.
for node in root:
    # The id is shared by every message of this conversation, so it is looked
    # up once per conversation instead of once per message.
    conversation_id = node.attrib.get('id')

    for element in node:
        rows.append({
            'conversation_id': conversation_id,
            # Attribute values are kept as the strings found in the XML.
            'line': element.attrib.get('line'),
            'author': element.find('author').text,
            'time': element.find('time').text,
            # `.text` is None when the <text> child has no content, which
            # shows up as a missing value in the `message` column.
            'message': element.find('text').text,
        })

# Build the frame in a single step from the collected records.
df = pd.DataFrame(rows, columns=cols)

In [5]:
# Leave the frame as the cell's last expression so the notebook renders a
# preview of the parsed rows.
df

Unnamed: 0,conversation_id,line,author,time,message
0,affc2df0951b733d14ba92d19d9b7695,1,0a39f78bcb297ab0ebe8a29c28bfed89,15:24,bugmail: [Bug 6978] New: Mark eof-terminated s...
1,affc2df0951b733d14ba92d19d9b7695,2,60659cfda992013e610f285c46692d28,15:32,"Henri, can I ask you a Firefox build question ..."
2,affc2df0951b733d14ba92d19d9b7695,3,b8810fee2f4a71f849f3f7409546d1d9,15:34,"60659cfda992013e610f285c46692d28: sure, but I ..."
3,affc2df0951b733d14ba92d19d9b7695,4,60659cfda992013e610f285c46692d28,15:35,"It appears the build runs through, it creates ..."
4,affc2df0951b733d14ba92d19d9b7695,5,60659cfda992013e610f285c46692d28,15:35,"when I start it, I get my standard install of ..."
...,...,...,...,...,...
2058776,8deed7c66340728e6863f3d931d4cca7,53,105ca6d0fd6c2c3e136980d3548f16ab,02:32,"Come the revolution, I'll worry about it"
2058777,8deed7c66340728e6863f3d931d4cca7,54,48a4fba2c4916bbc8aae694a3877dca3,02:33,no pasaran!
2058778,8deed7c66340728e6863f3d931d4cca7,55,48a4fba2c4916bbc8aae694a3877dca3,02:33,:)
2058779,8deed7c66340728e6863f3d931d4cca7,56,e1e3d026bb7ee9ee264316da10378aeb,02:35,25mhz is slow :(


In [6]:
# Write the DataFrame to a semicolon-separated CSV file under DRIVE_PATH,
# with a header row and without the positional index column.

df.to_csv(
    DRIVE_PATH + 'databases/test.csv',
    sep=';',
    header=True,
    index=False,
)

In [7]:
# Summarise the frame: per-column dtype, non-null count and memory usage.
# show_counts=True requests the non-null counts explicitly, since pandas may
# leave them out by default on large frames.

df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2058781 entries, 0 to 2058780
Data columns (total 5 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   conversation_id  2058781 non-null  object
 1   line             2058781 non-null  object
 2   author           2058781 non-null  object
 3   time             2058781 non-null  object
 4   message          2052328 non-null  object
dtypes: object(5)
memory usage: 78.5+ MB


In [8]:
# Headline figures for the parsed corpus.
# nunique() counts the distinct values directly, without first building the
# full frequency table that value_counts() would produce.
print('Number of conversations: ', df['conversation_id'].nunique())
# Every row is one message, so this also counts rows whose text is missing.
print('Number of messages: ', len(df))
print('Number of authors: ', df['author'].nunique())

Number of conversations:  155128
Number of messages:  2058781
Number of authors:  218702
