# Data loading and basic cleaning


Basic imports and a helper function that loads one section folder of the dataset into a DataFrame, adding a column for the section name and a column for the language label name.

In [2]:
from sklearn.datasets import load_files
import pandas as pd
import numpy as np
import os


def get_dataset(foldername,rootpath):
    """Load one section folder of the corpus into a DataFrame.

    Parameters
    ----------
    foldername : str
        Name of the folder inside ``rootpath``; also stored in the
        ``section`` column of every returned row.
    rootpath : str
        Directory that contains ``foldername``.

    Returns
    -------
    pandas.DataFrame
        One row per loaded document with the columns ``text``, ``label``
        (the numeric target), ``section`` and ``label_name`` (the target
        name looked up from the numeric target).
    """
    bunch = load_files(os.path.join(rootpath, foldername), encoding="utf-8")
    class_names = bunch.target_names

    # Pair every document with its numeric target, one row per document.
    rows = list(zip(bunch.data, bunch.target))
    frame = pd.DataFrame(rows, columns=["text", "label"])

    frame["section"] = foldername
    # Translate the numeric target into its readable name.
    frame["label_name"] = frame["label"].apply(lambda idx: class_names[int(idx)])
    return frame

# Root folder of the corpus; every sub-folder in it is loaded as one section.
DATA_ROOT = "./documents_challenge"

# Load each section once and combine them in a single concat call.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies all previously loaded rows on every iteration.
# Non-directory entries (stray files) are skipped because load_files expects
# a folder.
section_frames = [
    get_dataset(name, DATA_ROOT)
    for name in os.listdir(DATA_ROOT)
    if os.path.isdir(os.path.join(DATA_ROOT, name))
]

if section_frames:
    # No ignore_index: each section keeps its own row index, exactly as the
    # previous append-based loop produced.
    df = pd.concat(section_frames)
else:
    # Nothing to load: keep the expected columns so later cells still work.
    df = pd.DataFrame(columns=["text","label","section","label_name"])

df.head()

Unnamed: 0,text,label,section,label_name
0,\n\nMientras sucedía lo referido con los Carer...,1,PAN11,es
1,"Las escribí á V. E., y V. E. las trasladó al ...",1,PAN11,es
2,"I pass over in silence, not to annoy the read...",0,PAN11,en
3,"Cambiaban las formas, pero el alma permanecía...",1,PAN11,es
4,"Rose from the bench that worked,\nand walked ...",0,PAN11,en


Check that the loading process is correct and all documents are loaded

In [3]:
# (rows, columns): one row per loaded document; the columns are
# text, label, section and label_name.
df.shape

(23128, 4)

## Text cleaning

Basic text cleaning. Once the process is complete, the cleaned data is stored in a CSV file so that the cleaning does not have to be repeated every time further analysis is conducted. The steps included in this function are the most basic ones. Further work should explore token frequencies, n-grams, and replacing named entities with a chosen label.

In [5]:
from nltk.corpus import stopwords
import unidecode
import re
# Dataset language codes mapped to the matching NLTK stopword corpus names.
_STOPWORD_CORPORA = {"en": "english", "es": "spanish", "fr": "french"}
# Stopword sets loaded so far, keyed by corpus name, so each list is read only once.
_STOPWORD_CACHE = {}


def _stopwords_for(language):
    """Return the stopword set for a language code, or None for unsupported codes."""
    corpus = _STOPWORD_CORPORA.get(language) if isinstance(language, str) else None
    if corpus is None:
        return None
    if corpus not in _STOPWORD_CACHE:
        # A frozenset gives constant-time membership tests in the filter below.
        _STOPWORD_CACHE[corpus] = frozenset(stopwords.words(corpus))
    return _STOPWORD_CACHE[corpus]


def doClean(text,language):
    """Normalise a document and strip the stopwords of its language.

    Steps, in order:
      1. every non-word character is replaced by a space;
      2. digit runs that begin a token are replaced by the placeholder
         ``label_num``;
      3. runs of whitespace are collapsed to a single space;
      4. the text is lower-cased;
      5. for ``"en"``, ``"es"`` and ``"fr"`` the text is split on whitespace,
         stopwords are dropped and the remaining tokens are re-joined with
         single spaces.

    Parameters
    ----------
    text : str
        Raw document text.
    language : str
        Language code. Any value other than ``"en"``, ``"es"`` or ``"fr"``
        skips step 5, so the result may keep a leading or trailing space.

    Returns
    -------
    str
        The cleaned text.
    """
    # Remove all that doesn't resemble a word
    a = re.sub(r'\W', ' ', text)
    # Replace numbers with a placeholder token. The lookbehind matches digits
    # at the start of the text or right after whitespace WITHOUT consuming the
    # separator; the old pattern " \d+" swallowed the space (gluing the
    # placeholder to the previous word) and missed a number that opens the text.
    a = re.sub(r"(?<!\S)\d+", "label_num", a)
    # Remove extra spaces
    a = re.sub(r'\s+', ' ', a)
    # Transform to lowercase
    a = a.lower()
    # Remove stopwords by language (the stopword list is loaded once per language)
    stop = _stopwords_for(language)
    if stop is not None:
        a = " ".join(word for word in a.split() if word not in stop)
    return a
# Clean every document with the stopword list that matches its language label.
print("Cleaning text...")
cleaned_text = df.apply(
    lambda row: doClean(row["text"], row["label_name"]),
    axis=1,
)
df["text"] = cleaned_text

# Save the cleaned frame so the cleaning does not have to be re-run.
print("Done! Writing file...")
df.to_csv("clandata.csv", index=False)
print("Done!")

Cleaning text...
Done! Writing file...
Done!
