In [2]:
import base64
import datetime
import glob
import numpy
import os
import pandas
import pickle
import requests
import string
import sys

In [8]:
# Some of these settings should eventually come from the environment.
SOURCE_DIR = os.path.abspath("dataset")  # root folder holding one sub-folder per category
SEARCHED_EXTENSIONS = ["txt"]  # filename suffixes of the files to ingest

# Metadata stored next to the dataframe in the exported dataset file.
DATASET_TARGET_FILE = os.path.abspath("dataset.data")
DATASET_NAME = "20NewsGroups"
DATASET_DESCRIPTION = "Dataset de 20NewsGroup: http://qwone.com/~jason/20Newsgroups"
DATASET_TEXT_LABEL = "data"  # name of the dataframe column that holds the raw text
DATASET_OBSERVATIONS = ""

In [9]:
def clean_text(text):
    """Normalise ``text`` into a lower-case identifier-like string.

    The text is lower-cased, every character other than ``0-9``, ``a-z``,
    ``_``, ``-``, ``.`` and the space is dropped, and the remaining spaces
    are turned into underscores.

    Args:
        text: String to normalise.

    Returns:
        str: The normalised string.
    """
    allowed = frozenset("0123456789abcdefghijklmnopqrstuvwxyz_-. ")
    kept = [char for char in text.lower() if char in allowed]
    return "".join(kept).replace(" ", "_")

In [10]:
def get_subdirs_dict(dir_path):
    """Map the cleaned name of each immediate sub-directory to its path.

    Args:
        dir_path: Directory whose direct children are inspected.

    Returns:
        dict: ``clean_text(name) -> os.path.join(dir_path, name)`` for every
        child of ``dir_path`` that is a directory, inserted in sorted name
        order.

    Raises:
        OSError: If ``dir_path`` cannot be listed.
    """
    result = {}
    # os.listdir() returns entries in arbitrary, filesystem-dependent order;
    # sorting keeps the key order (and anything derived from it) reproducible.
    for subdir_name in sorted(os.listdir(dir_path)):
        subdir_path = os.path.join(dir_path, subdir_name)
        if os.path.isdir(subdir_path):
            # Names that clean to the same key overwrite each other: the last
            # one in sorted order wins.
            result[clean_text(subdir_name)] = subdir_path
    return result


In [11]:
def get_subfiles_with_extensions(source_path, extensions):
    """Recursively collect the files under ``source_path`` that match ``extensions``.

    Args:
        source_path: Directory to search, including all nested directories.
        extensions: Iterable of filename suffixes (e.g. ``["txt"]``). Matching
            is a plain ``str.endswith`` on the file path, so no leading dot is
            implied.

    Returns:
        list[str]: Paths of the matching files, each listed once. Directories
        are visited depth-first and the entries of every directory are
        processed in sorted order.

    Raises:
        OSError: If a directory cannot be listed.
    """
    # A tuple lets str.endswith() test every suffix in a single call, so a file
    # matching several suffixes is added once instead of once per suffix.
    suffixes = tuple(extensions)
    files = []
    # os.listdir() order is filesystem-dependent; sort for reproducible output.
    for content in sorted(os.listdir(source_path)):
        content_path = os.path.join(source_path, content)
        if os.path.isdir(content_path):
            # extend() avoids rebuilding the whole list for every sub-directory.
            files.extend(get_subfiles_with_extensions(content_path, suffixes))
        elif content_path.endswith(suffixes):
            files.append(content_path)
    return files


In [12]:
def convert_file_to_base64(filepath):
    """Return the contents of ``filepath`` encoded as a Base64 string.

    Args:
        filepath: Path of the file to read; it is opened in binary mode.

    Returns:
        str: Base64 text of the file's bytes (empty string for an empty file).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises.
    with open(filepath, 'rb') as file:
        data = file.read()
    # Base64 output is pure ASCII, so decoding it as UTF-8 cannot fail.
    return base64.b64encode(data).decode('utf8')

In [25]:
def read_file(filepath, encoding='utf-8'):
    """Read a text file, returning ``None`` when it cannot be read.

    Args:
        filepath: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8.

    Returns:
        str | None: The decoded contents, or ``None`` if the file could not be
        opened or decoded (a message is printed in that case).
    """
    try:
        with open(filepath, 'r', encoding=encoding) as file:
            return file.read()
    except (OSError, ValueError):
        # OSError: missing or unreadable file. ValueError covers
        # UnicodeDecodeError (bytes that are invalid for `encoding`).
        # A bare `except:` would also swallow KeyboardInterrupt/SystemExit,
        # making a long extraction loop impossible to interrupt.
        print(f'Cannot read file {filepath}')
        return None

In [16]:
def export_with_pickle(file_path, object_to_save):
    """Serialize ``object_to_save`` with pickle and write it to ``file_path``.

    Args:
        file_path: Destination path; an existing file is overwritten.
        object_to_save: Any picklable object.

    Raises:
        pickle.PicklingError, AttributeError, TypeError: If the object cannot
            be pickled. The destination file is left untouched in that case.
        OSError: If the destination cannot be opened or written.
    """
    # Serialize before opening the target so that a pickling failure does not
    # truncate a file that already exists.
    picklestring = pickle.dumps(object_to_save, protocol=pickle.HIGHEST_PROTOCOL)
    # The context manager closes the handle even when write() raises.
    with open(file_path, 'wb') as file:
        file.write(picklestring)

In [22]:
def generate_dataframe(source_path, encoding="utf8"):
    """Build a one-hot labelled dataframe from a folder of categorised text files.

    Every immediate sub-directory of ``source_path`` is treated as a category.
    Each file below it whose path ends with one of ``SEARCHED_EXTENSIONS``
    becomes one row: the ``DATASET_TEXT_LABEL`` column holds the file text and
    there is one 0/1 column per category, set to 1 for the file's own category.

    Args:
        source_path: Root directory containing one sub-directory per category.
        encoding: Kept for backward compatibility; it is currently not
            forwarded to ``read_file``.

    Returns:
        pandas.DataFrame: One row per extracted file. Files that ``read_file``
        could not read are kept as rows whose text is ``None``.

    Raises:
        ValueError: If a category name equals ``DATASET_TEXT_LABEL``, since its
            label column would collide with the text column.
    """
    subdirs_dictionary = get_subdirs_dict(source_path)
    # Keys are already normalised by get_subdirs_dict(); no second clean-up needed.
    categories = list(subdirs_dictionary)
    if DATASET_TEXT_LABEL in subdirs_dictionary:
        # Previously this silently reset the text column and mixed text with labels.
        raise ValueError(
            f"Category name '{DATASET_TEXT_LABEL}' clashes with the text column name"
        )

    data = {DATASET_TEXT_LABEL: []}
    for category in categories:
        data[category] = []

    extracted_count = 0
    for subdir_category, subdir_path in subdirs_dictionary.items():
        for subfile in get_subfiles_with_extensions(subdir_path, SEARCHED_EXTENSIONS):
            # Each file is read once; the earlier Base64 conversion of every
            # file was never used and only doubled the I/O.
            data[DATASET_TEXT_LABEL].append(read_file(subfile))
            for specific_category in categories:
                data[specific_category].append(1 if specific_category == subdir_category else 0)
            extracted_count += 1
            # "\r" rewrites the same console line instead of printing one line per file.
            print(f'Extracted {extracted_count} files...', sep=" ", end="\r", flush=True)
    sys.stdout.flush()
    print("")
    print('Extracted all files...', sep=" ", end="\r", flush=True)
    return pandas.DataFrame(data)

In [23]:
def generate_dataset_file(target_file, dataframe):
    """Bundle the dataframe with the dataset metadata and pickle it.

    Args:
        target_file: Path of the file to write.
        dataframe: Dataframe stored under the ``"dataframe"`` key.

    Returns:
        bool: Always ``True`` once ``export_with_pickle`` has returned.
    """
    payload = dict(
        name=DATASET_NAME,
        description=DATASET_DESCRIPTION,
        date=datetime.datetime.now(),  # creation timestamp (naive local time)
        text_label=DATASET_TEXT_LABEL,
        observations=DATASET_OBSERVATIONS,
        dataframe=dataframe,
    )
    export_with_pickle(target_file, payload)
    return True

In [27]:
# Build the labelled dataframe from the category folders under SOURCE_DIR.
dataframe = generate_dataframe(SOURCE_DIR)
# display() is supplied by the IPython/Jupyter session; it renders the frame as cell output.
display(dataframe)

Extracted 8355 files...
Extracted all files...

Unnamed: 0,data,deportes,electronica,espacio,medicina,politica,religion,tecnologia,vehiculos
0,"From: ching@bigwpi.WPI.EDU (""The Logistician"")...",1,0,0,0,0,0,0,0
1,From: dxf12@po.CWRU.Edu (Douglas Fowler)\nSubj...,1,0,0,0,0,0,0,0
2,From: dxf12@po.CWRU.Edu (Douglas Fowler)\nSubj...,1,0,0,0,0,0,0,0
3,From: derich@netcom.com (Scotty*Tissue)\nSubje...,1,0,0,0,0,0,0,0
4,From: mmilitzo@scott.skidmore.edu (matthew mil...,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
8350,"From: ""tom neumann"" <tom.neumann@canrem.com>\n...",0,0,0,0,0,0,0,1
8351,From: davec@ECE.Concordia.CA (Dave Chu)\nSubje...,0,0,0,0,0,0,0,1
8352,From: tszeto@sneezy.ts.stratus.com (Tommy Szet...,0,0,0,0,0,0,0,1
8353,From: tquinn@heartland.bradley.edu (Terry Quin...,0,0,0,0,0,0,0,1


In [28]:
# Pickle the dataframe together with the dataset metadata into DATASET_TARGET_FILE.
generate_dataset_file(DATASET_TARGET_FILE, dataframe)

True