In [1]:
import os, re, unidecode
import pandas as pd
from gensim.parsing import preprocessing as pproc

In [2]:
# Global switch read by prepare_reuters() and prepare_webscope(): when True,
# each document's text is passed through process_text() before the CSV is written.
apply_preprocessing = False

In [3]:
"""
Apply standard pre-processing techniques to a text and return the normalized string.
"""
def process_text(string, remove_stopwords=True, stemming=False):
    
    string = unidecode.unidecode(string)
    string = string.lower()
    abbreviations = re.findall(r'(?:[a-z]\.)+', string)
    for abbr in abbreviations:
        string = string.replace(abbr, abbr.replace('.',''))
    string = pproc.strip_punctuation2(string)
    if remove_stopwords:
        string = pproc.remove_stopwords(string)
    if stemming:
        string = pproc.stem_text(string)
    string = string.strip()
    return string

In [4]:
def _parse_reuters_document(document):
    """Convert one ``<REUTERS ...>`` chunk into a row dict, or None if it is skipped.

    Stories whose ``<TEXT>`` tag has ``TYPE="BRIEF"`` or ``TYPE="UNPROC"`` are
    skipped. A story without a ``<BODY>...</BODY>`` pair yields an empty text.
    Reads the module-level ``apply_preprocessing`` flag.
    """
    if '<TEXT TYPE="BRIEF"' in document or '<TEXT TYPE="UNPROC"' in document:
        return None

    id_match = re.search(r'NEWID="(\d+)"', document)
    if id_match is None:
        raise ValueError('Reuters document without a numeric NEWID attribute')
    doc_id = int(id_match.group(1))

    # Only <D> entries inside <TOPICS> are labels; other tag groups also use <D>.
    topics_match = re.search(r'<TOPICS>(.*?)</TOPICS>', document, flags=re.DOTALL)
    if topics_match is None:
        topics = []
    else:
        topics = re.findall(r'<D>(.*?)</D>', topics_match.group(1), flags=re.DOTALL)

    # str.find-based slicing returned document[5:-1] when <BODY> was absent;
    # an explicit match makes the missing-body case an empty text instead.
    body_match = re.search(r'<BODY>(.*?)</BODY>', document, flags=re.DOTALL)
    body = body_match.group(1) if body_match is not None else ''
    body = body.replace('\n', ' ')
    body = re.sub(' +', ' ', body)
    body = body.replace(' Reuter &#3;', '')

    if apply_preprocessing:
        body = process_text(body)

    return {'doc_id': doc_id, 'labels': ','.join(topics), 'text': body}


def prepare_reuters(documents_dir='data/reuters/documents', output_path='data/reuters.csv',
                    encoding='latin-1'):
    """Extract id, topics and body of the Reuters stories and write them to a CSV.

    Every regular file in ``documents_dir`` is read, split on ``<REUTERS`` and
    each chunk is parsed into one row with columns ``doc_id`` (the ``NEWID``
    attribute), ``labels`` (comma-joined ``<TOPICS>`` entries, possibly empty)
    and ``text`` (the whitespace-normalized ``<BODY>``). BRIEF and UNPROC
    stories are skipped. When the module-level ``apply_preprocessing`` flag is
    true, the text is passed through ``process_text``.

    Parameters
    ----------
    documents_dir : str, default 'data/reuters/documents'
        Directory holding the raw Reuters files.
    output_path : str, default 'data/reuters.csv'
        Destination CSV; written without an index column.
    encoding : str, default 'latin-1'
        Encoding used to read the raw files. A single-byte encoding is used so
        that stray non-UTF-8 bytes cannot raise ``UnicodeDecodeError``.

    Raises
    ------
    ValueError
        If a non-skipped story has no numeric ``NEWID`` attribute.
    """
    rows = []
    # os.listdir order is arbitrary; sorting makes the CSV row order reproducible.
    for file_name in sorted(os.listdir(documents_dir)):
        path = os.path.join(documents_dir, file_name)
        if not os.path.isfile(path):
            continue
        with open(path, 'r', encoding=encoding) as file:
            content = file.read()
        # Anything before the first '<REUTERS' is file preamble, not a story.
        for document in content.split('<REUTERS')[1:]:
            row = _parse_reuters_document(document)
            if row is not None:
                rows.append(row)

    # Explicit columns keep the header stable even when no story was found.
    df = pd.DataFrame(rows, columns=['doc_id', 'labels', 'text'])
    df.to_csv(output_path, index=False)

In [5]:
def prepare_webscope(input_path='data/webscope/Webscope_R4/movie_db_yoda',
                     output_path='data/webscope_r4.csv'):
    """Convert the Webscope R4 movie table into a ``doc_id,text,labels`` CSV.

    Reads the tab-separated, header-less file at ``input_path`` as ISO-8859-1,
    keeping columns 0, 2 and 10 as ``doc_id`` (used as the index), ``text`` and
    ``labels``. The ``|`` separators in ``labels`` are replaced by commas. When
    the module-level ``apply_preprocessing`` flag is true, ``text`` is passed
    through ``process_text``. Missing values in either column are left missing
    rather than raising.

    Parameters
    ----------
    input_path : str, default 'data/webscope/Webscope_R4/movie_db_yoda'
        Raw tab-separated movie table.
    output_path : str, default 'data/webscope_r4.csv'
        Destination CSV; the ``doc_id`` index is written as the first column.
    """
    df = pd.read_table(input_path, sep='\t', encoding='ISO-8859-1',
                       names=['doc_id', 'text', 'labels'], usecols=[0, 2, 10], index_col=0)

    # na_action='ignore' skips empty fields, which pandas parses as NaN (a float);
    # calling .split on them raised AttributeError.
    df['labels'] = df['labels'].map(lambda genres: ','.join(genres.split('|')),
                                    na_action='ignore')
    if apply_preprocessing:
        df['text'] = df['text'].map(process_text, na_action='ignore')
    df.to_csv(output_path)

In [6]:
# Generate data/webscope_r4.csv; prepare_reuters() is defined above but is not
# invoked in this cell.
prepare_webscope()