## Imports

In [1]:
import pandas as pd
import os
import glob

## Configuration
*input_dir:* The path to the directory that contains your text files. Please make sure the path ends with a '/' (slash). For example: `path/to/texts/`.

*dataframe_filename:* The filename for the resulting pandas DataFrame. You may use the **.p** extension to indicate a pickled file, but you are free to use whatever you like. Just make sure the same filename is used in the subsequent sentiment analysis step.

In [2]:
# Directory that holds the input .txt files (should end with '/').
input_dir = "Y:/data/projekte/cartas/data/cartas_SA_new/sentiment-annotation/all_selections/"
# File name under which the resulting DataFrame is pickled.
dataframe_filename = 'all_texts_spanish.p'

## Directory Setup (Optional)
Creates the input directory given in the configuration if it does not already exist.

In [3]:
# Make sure the input directory exists. exist_ok=True turns the call into a
# no-op when the directory is already there, so no separate existence check
# (which could race with another process creating the directory) is needed.
os.makedirs(input_dir, exist_ok=True)

## Data Preparation

### Load texts

In [11]:
# Collect every .txt file directly inside input_dir. os.path.join works whether
# or not input_dir ends with a separator, and sorting makes the load order
# (and therefore the DataFrame row order) reproducible, because glob returns
# its matches in arbitrary order.
text_file_names = sorted(glob.glob(os.path.join(input_dir, "*.txt")))
print("found {} texts".format(len(text_file_names)))

# Each entry is [bare file name, complete file content].
texts = []
for text_file_name in text_file_names:
    # os.path.basename always yields a value, so a path without any separator
    # can no longer leave the name undefined or reuse the previous file's name.
    corrected_filename = os.path.basename(text_file_name)
    with open(text_file_name, "r", encoding="utf-8") as input_file:
        texts.append([corrected_filename, input_file.read()])
print("loaded {} texts".format(len(texts)))

found 400 texts
loaded 400 texts


### Create DataFrame

In [12]:
def _parse_text_file(filename, text):
    """Split the content of one text file into attributes and body text.

    Lines of the form ``key=value`` that appear before the first ``text=``
    line are read as attributes. The remainder of the ``text=`` line and all
    following lines are joined with single spaces to form the body.

    Parameters:
        filename: Name stored under the "filename" key of the record.
        text: Complete content of the file as one string.

    Returns:
        A ``(record, attribute_count)`` tuple. ``record`` is a dict with the
        "filename", one entry per attribute and the body under "text" (an
        empty string if the file has no ``text=`` line). ``attribute_count``
        is the number of attribute lines that were read.
    """
    lines = text.split("\n")
    record = {"filename": filename}
    attribute_count = 0
    body_parts = []
    for index, line in enumerate(lines):
        # partition splits at the first "=" only, so attribute values that
        # contain "=" themselves are kept complete.
        key, separator, value = line.partition("=")
        if not separator:
            # Blank or otherwise malformed header line: skip it instead of
            # aborting the whole run.
            continue
        if key == "text":
            body_parts = [value] + lines[index + 1:]
            break
        # Numeric-looking values are stored as float, everything else as str.
        # Note that float() also accepts spellings such as "nan" or "inf".
        try:
            value = float(value)
        except ValueError:
            pass
        record[key] = value
        attribute_count += 1
    record["text"] = " ".join(body_parts)
    return record, attribute_count


print("searching files for attributes and text")
prepared_texts = []
# Largest number of attribute lines seen in any single file.
num_attributes = 0
for filename, text in texts:
    prepared_text, attribute_count = _parse_text_file(filename, text)
    num_attributes = max(num_attributes, attribute_count)
    prepared_texts.append(prepared_text)

print("found {} additional attributes in .txt files".format(num_attributes))

# One row per file, indexed by file name; files that lack an attribute get NaN
# in that column.
texts_df = pd.DataFrame(prepared_texts).set_index("filename")

searching files for attributes and text
found 4 additional attributes in .txt files


### Save DataFrame

In [13]:
# Pickle the DataFrame under the configured file name; a relative name is
# resolved against the current working directory.
texts_df.to_pickle(dataframe_filename)

In [14]:
# Display the resulting DataFrame as the cell output.
texts_df

Unnamed: 0_level_0,periodical title,author,year,issue number,text
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1735-1736_El-Duende-Crítico_Frai-Manuel-de-San-Josef_Vol-1_Nr-004_112-823.txt,El-Duende-Crítico,Frai Manuel de San Josef,1735-1736,1-004,Jueves 29 de Diciembre de 1735 El nacimiento #...
1735-1736_El-Duende-Crítico_Frai-Manuel-de-San-Josef_Vol-1_Nr-010_112-841.txt,El-Duende-Crítico,Frai Manuel de San Josef,1735-1736,1-010,"Jueves 9. de Febrero de 1736 Notas, que escrib..."
1761-06-09_El-Duende-especulativo-sobre-la-vida-civil_Juan-Antonio-Mercadal-(Francisco-Mariano-Nipho-o-Juan-Enrique-de-Graef)_Vol-1_Nr-01_093-235.txt,El-Duende-especulativo-sobre-la-vida-civil,Juan Antonio Mercadal [Francisco Mariano Nipho...,1761-06-09,1-01,Num. I. Martes 9. de Junio de 1761. ##START:ZM...
1761-06-13_El-Duende-especulativo-sobre-la-vida-civil_Juan-Antonio-Mercadal-(Francisco-Mariano-Nipho-o-Juan-Enrique-de-Graef)_Vol-1_Nr-02_093-236.txt,El-Duende-especulativo-sobre-la-vida-civil,Juan Antonio Mercadal [Francisco Mariano Nipho...,1761-06-13,1-02,"NUM. II. ##START:ZM## Decet affectus animi, ne..."
1761-06-19_El-Duende-especulativo-sobre-la-vida-civil_Juan-Antonio-Mercadal-(Francisco-Mariano-Nipho-o-Juan-Enrique-de-Graef)_Vol-1_Nr-03_093-237.txt,El-Duende-especulativo-sobre-la-vida-civil,Juan Antonio Mercadal [Francisco Mariano Nipho...,1761-06-19,1-03,NUM. III. Viernes 19. de Junio de 1761. ##STAR...
...,...,...,...,...,...
1804_El-Regañón-general_Anónimo-(Ventura-Ferrer)_Vol-2_Nr-62_7946.txt,El-Regañón-general,Anónimo [Ventura Ferrer],1804,2-62,Núm.° 62. Sábado 4 de Agosto de 1804 Costumbre...
1804_El-Regañón-general_Anónimo-(Ventura-Ferrer)_Vol-2_Nr-63_7947.txt,El-Regañón-general,Anónimo [Ventura Ferrer],1804,2-63,Núm.° 63. Miércoles 8 de Agosto de 1804. Circu...
1804_El-Regañón-general_Anónimo-(Ventura-Ferrer)_Vol-2_Nr-64_7948.txt,El-Regañón-general,Anónimo [Ventura Ferrer],1804,2-64,Núm.° 64. Sábado 11 de Agosto de 1804. Concluy...
1812_El-Pensador-Mexicano_José-Joaquín-Fernández-de-Lizardi_Vol-1_Nr-008_8080.txt,El-Pensador-Mexicano,José Joaquín Fernández de Lizardi,1812,1-008,Número 8 Ejecusión de Justicia ##START:MT## En...
