In [1]:
import pandas as pd
import re
import unicodedata

## Functions to load the raw CSV as dataframes

In [52]:
# Function to create raw dataframes

# Module-level toggle state: _label_race reads and rewrites it through `global a`.
a = 0

def read_csv(filename):
    """Load a text file as a one-column DataFrame, one row per line.

    The separator is the literal word ``'delimiter'`` (presumably chosen
    because it never occurs in the data - confirm against the input files),
    so each line ends up in a single column. No header row is assumed.

    Parameters
    ----------
    filename : str or path-like
        File to read.

    Returns
    -------
    pandas.DataFrame
        Frame with a single column labelled ``0``.
    """
    # A multi-character separator is interpreted as a regex, which only the
    # python parser supports. Asking for it explicitly avoids the
    # ParserWarning pandas emits when it has to fall back from the C engine.
    df = pd.read_csv(filename, sep='delimiter', header=None, engine='python')
    return df

def set_column_name(df, colname):
    """Relabel the frame's columns with the single name ``colname``.

    The frame is modified in place and the same object is returned.
    Assigning a one-element label list only succeeds when the frame has
    exactly one column.
    """
    new_labels = [colname]
    df.columns = new_labels
    return df

def extract_categories(df):
    """Return the distinct ``data`` values that contain the "►" marker.

    Values are taken from ``df.data.unique()``, so each one appears once and
    in order of first appearance. Every match is also printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``data`` column of strings.

    Returns
    -------
    list
        The matching values.
    """
    categories = []
    for value in df.data.unique():
        if "►" not in value:
            continue
        print(value)
        categories.append(value)
    return categories

def _label_race(row, categories):
    """Tag one row: -1 for ordinary rows, a toggling 1/0 for category rows.

    The toggle lives in the module-level variable ``a``; it is incremented
    for every category row and reset to 0 as soon as it exceeds 1, so
    successive category rows receive 1, 0, 1, 0, ... when ``a`` starts at 0.

    Parameters
    ----------
    row : mapping
        Row exposing a ``'data'`` entry.
    categories : container
        Values that identify a category row.

    Returns
    -------
    int
        ``-1`` if ``row['data']`` is not in ``categories``, otherwise the
        updated value of ``a``.
    """
    global a
    if row['data'] not in categories:
        return -1

    # Advance the shared toggle and wrap it back to 0 once it passes 1.
    a += 1
    if a > 1:
        a = 0
    return a
    
def create_tag_column(df, categories):
    """Add a ``tag`` column marking which rows are category headers.

    Rows whose ``data`` value is in ``categories`` are tagged alternately
    1, 0, 1, 0, ... in row order; every other row is tagged -1.

    The tags are computed from the frame alone, so the result is the same
    every time the function is called. The previous row-wise ``apply`` of
    ``_label_race`` depended on the module-level counter ``a``, which kept
    whatever value the previous run left behind.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``data`` column. It is modified in place.
    categories : list-like
        Values that identify a category header row.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now with the ``tag`` column.
    """
    is_header = df['data'].isin(list(categories))
    # 1-based running count of headers seen so far; odd -> 1, even -> 0.
    header_rank = is_header.cumsum()
    df['tag'] = (header_rank % 2).where(is_header, -1)
    return df

def create_dataframe_set(df):
    """Split ``df`` into consecutive chunks keyed 0, 1, 2, ...

    A row counts as a boundary when its ``tag`` is 0 or 1. Counting the
    boundaries from the bottom of the frame upwards gives every row the
    number of boundaries at or below it; rows sharing that count form one
    chunk, so each boundary row is the last row of its chunk.
    Assumes the index is already in row order, since the reversed counts
    are restored with ``sort_index`` - TODO confirm for non-default indexes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tag`` column.

    Returns
    -------
    dict
        Mapping of chunk number to the sub-frame for that chunk.
    """
    is_boundary = df['tag'].isin([0, 1])
    boundaries_below = is_boundary.iloc[::-1].cumsum().sort_index()
    # factorize turns the descending counts into codes 0, 1, 2, ... in
    # order of first appearance.
    chunk_codes, _ = pd.factorize(boundaries_below)
    return {code: chunk for code, chunk in df.groupby(chunk_codes)}

def create_dfs_by_cat(dfs, categories):
    """Pair every category header with the chunk of rows that follows it.

    For each chunk ``dfs[i]`` whose last ``data`` value is a category, a
    one-entry dict ``{category: dfs[i + 1]}`` is appended to the result.
    Chunks that do not end in a category contribute nothing.

    Parameters
    ----------
    dfs : dict
        Chunks keyed by consecutive integers starting at 0.
    categories : container
        Values that identify a category header.

    Returns
    -------
    list of dict
        One single-key dict per header, in chunk order. The same category
        may appear in more than one dict.
    """
    new_cats = []
    # The last chunk has no successor, so it can never supply a header.
    # Stopping one short avoids a KeyError on dfs[i + 1].
    for i in range(len(dfs) - 1):
        key = dfs[i]["data"].iloc[-1]
        if key not in categories:
            # Nothing to record. Appending here would repeat the previous
            # entry, or fail if there were none yet.
            continue
        new_cats.append({key: dfs[i + 1]})

    return new_cats

def create_csv(new_cats):
    """Write every frame in ``new_cats`` to ``data/<name>.csv``.

    ``new_cats`` is a list of dicts mapping a name to frame-like data. Any
    "/" in a name is replaced by "_" to form the file name. The position of
    each dict and each name are printed along the way. Files are written
    without the index and with a header row.
    """
    for position, mapping in enumerate(new_cats):
        print(position)
        for label, rows in mapping.items():
            print(label)
            frame = pd.DataFrame(rows)
            # replace() leaves names without a slash unchanged.
            safe_label = label.replace("/", "_")
            frame.to_csv(f"data/{safe_label}.csv", index = None, header=True)
 

                

## Functions to clean the data and save it as CSV

In [53]:
# Functions to clean the data

# A capitalised word: one or more uppercase letters followed by one or more
# lowercase letters.
pattern_name = re.compile("[A-Z]+[a-z]+")
# One or more "[" followed by "<digits>-<digits>]". Written as a raw string so
# that "\[" and "\]" reach the regex engine as-is instead of being parsed as
# (invalid) string escape sequences.
pattern_unused_rows = r"\[+[0-9]+-[0-9]+\]"

def remove_accent(text):
    """Return ``text`` reduced to its ASCII characters.

    The text is NFD-normalised first, so an accented letter becomes a base
    letter plus a combining mark. Everything that is not ASCII - the
    combining marks and any other non-ASCII symbol - is then discarded.
    """
    decomposed = unicodedata.normalize('NFD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8')

#def read_csv(file):
#    df = pd.read_csv(file)
#    return df

def unused_df_rows(dff):
    """Return the rows of ``dff`` whose ``data`` text starts with the
    ``pattern_unused_rows`` regex.

    Values are converted to ``str`` before matching; ``str.match`` anchors
    the pattern at the beginning of each value.
    """
    as_text = dff['data'].astype(str)
    is_unused = as_text.str.match(pattern_unused_rows)
    return dff[is_unused]

def get_index_to_filter(u_df, dff):
    """Return the index labels to remove from ``dff``.

    The result is every label of ``u_df`` followed by the label of the last
    row of ``dff`` (nothing is added for an empty ``dff``).

    Returns
    -------
    list
        Index labels; a label can appear twice if the last row of ``dff``
        is also present in ``u_df``.
    """
    labels = u_df.index.to_list()
    labels.extend(dff[-1:].index.to_list())
    return labels

def drop_rows(dff, indexes):
    """Remove the rows labelled ``indexes`` from ``dff`` itself.

    The frame is changed in place, so the caller's object loses those rows;
    the same object is returned for convenience.
    """
    dff.drop(index=indexes, inplace=True)
    return dff

def drop_column(dff):
    """Return a copy of ``dff`` without its ``tag`` column.

    The input frame is left untouched. pandas raises ``KeyError`` if there
    is no ``tag`` column.
    """
    return dff.drop(columns=['tag'])

def rename_column(dff, name):
    """Return a copy of ``dff`` with the ``data`` column renamed to ``name``.

    The input frame is not modified.
    """
    return dff.rename(columns={"data": name})

def save_clean_df(dff, filename):
    """Write ``dff`` to ``clean_data/<filename>.csv``.

    The file has a header row and no index column. The path is relative to
    the current working directory.
    """
    target = f'clean_data/{filename}.csv'
    dff.to_csv(target, index = None, header=True)


def get_name(filename):
    """Return the last substring of ``filename`` matched by ``pattern_name``.

    Raises ``IndexError`` when the pattern does not match anywhere.
    """
    matches = pattern_name.findall(filename)
    return matches[-1]


def clean_and_save(df, file_name):
    """Clean one category frame and write it under ``clean_data/``.

    Steps:

    1. Derive the output name from ``file_name`` via ``remove_accent`` and
       ``get_name``; it is used both as the column name and the file name.
    2. Drop the rows reported by ``unused_df_rows`` plus the last row.
       This happens in place, so the caller's ``df`` is modified.
    3. Drop the ``tag`` column, print the name, rename ``data`` to the name
       and save the result.
    """
    column_name = get_name(remove_accent(file_name))

    rows_to_remove = get_index_to_filter(unused_df_rows(df), df)
    drop_rows(df, rows_to_remove)  # in-place: df itself loses the rows

    without_tag = drop_column(df)
    print(column_name)
    save_clean_df(rename_column(without_tag, column_name), column_name)

    
def create_new_df(df, file_name):
    """Clean one category frame and return it.

    Performs the same cleaning as ``clean_and_save`` but hands the frame
    back instead of writing it to disk:

    1. Derive the column name from ``file_name`` via ``remove_accent`` and
       ``get_name``.
    2. Drop the rows reported by ``unused_df_rows`` plus the last row.
       This happens in place, so the caller's ``df`` is modified.
    3. Drop the ``tag`` column, print the name and rename ``data`` to it.

    TODO: the date/document/label_1/label_2 layout sketched elsewhere in
    this notebook is not produced here yet.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``data`` and ``tag`` columns.
    file_name : str
        Raw category name used to derive the column name.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with a single renamed column.
    """
    column_name = get_name(remove_accent(file_name))

    indexes = get_index_to_filter(unused_df_rows(df), df)
    drop_rows(df, indexes)  # in-place: df itself loses the rows

    df = drop_column(df)

    print(column_name)

    # Return the frame. A bare `df` expression inside a function is discarded,
    # which made the function always return None.
    return rename_column(df, column_name)

In [55]:
def grab_raw_clean_and_save(new_cats):
    """Run ``clean_and_save`` on every frame in ``new_cats``.

    ``new_cats`` is a list of dicts mapping a category name to frame-like
    data. Any "/" in a name is replaced by "_" before it is passed on.
    """
    for mapping in new_cats:
        for label, rows in mapping.items():
            # replace() leaves names without a slash unchanged.
            clean_and_save(pd.DataFrame(rows), label.replace("/", "_"))

def main(filename="2019-01-01/2019-01-01.csv"):
    """Run the whole pipeline for one raw file.

    Loads the file as a single ``data`` column, finds the category header
    rows, splits the frame at those headers and cleans and saves each
    resulting chunk.

    Parameters
    ----------
    filename : str, optional
        Path of the raw file to process. Defaults to the file the notebook
        was originally written for, so ``main()`` behaves as before.
    """
    df = read_csv(filename)
    df = set_column_name(df, "data")

    categories = extract_categories(df)

    df = create_tag_column(df, categories)

    dfs = create_dataframe_set(df)

    new_cats = create_dfs_by_cat(dfs, categories)

    grab_raw_clean_and_save(new_cats)

# Run the pipeline defined above; the printed names below are its output.
main()

  


►Casas y chalets
►Lotes y terrenos
►Departamentos
►Locales Comerciales
►Otros
►Habitaciones
►Automóviles
►Camionetas
►Electricidad
►Carpintería
►Plomería
►Albañilería
►Técnicos
►Pintores/decoradores
►Profesores
►Amplificaciones
►Productos
Casas
Casas
Lotes
Departamentos
Departamentos
Departamentos
Comerciales
Comerciales
Otros
Habitaciones
Automoviles
Automoviles
Camionetas
Electricidad
Carpinteria
Plomeria
Albanileria
Tecnicos
Pintores
Profesores
Amplificaciones
Productos
Otros
Otros


## Create unique dataframe

In [None]:
def create_unique_df(new_cats, date):
    """Run ``clean_and_save`` on every frame in ``new_cats``.

    ``new_cats`` is a list of dicts mapping a category name to frame-like
    data. Any "/" in a name is replaced by "_" before it is passed on.

    NOTE(review): ``date`` is accepted but never used, and the body
    currently does the same work as ``grab_raw_clean_and_save``. It looks
    like a work in progress - confirm the intended behaviour.
    """
    for mapping in new_cats:
        for label, rows in mapping.items():
            # replace() leaves names without a slash unchanged.
            clean_and_save(pd.DataFrame(rows), label.replace("/", "_"))

def main():
    """Load the raw file and split it into per-category chunks.

    NOTE(review): this definition replaces the earlier ``main`` in this
    notebook, and the computed ``new_cats`` is neither returned nor passed
    on, so calling it has no visible result beyond the category names
    printed by ``extract_categories``. It looks unfinished.
    """
    source_path = "2019-01-01/2019-01-01.csv"

    raw = set_column_name(read_csv(source_path), "data")
    categories = extract_categories(raw)

    tagged = create_tag_column(raw, categories)
    chunks = create_dataframe_set(tagged)

    new_cats = create_dfs_by_cat(chunks, categories)
    


In [58]:
# One hand-written sample record: date, document, label_1, label_2.
data = [
    ["2001-1-1", "sadasd", "lal", "lel"],
]

# Single-row frame built from the sample record.
df_1 = pd.DataFrame(
    data,
    columns=["date", "document", "label_1", "label_2"],
)

In [56]:
# Empty frame that only fixes the column layout; rows are added later.
df = pd.DataFrame(
    columns=["date", "document", "label_1", "label_2"],
)
df.head()

Unnamed: 0,date,document,label_1,label_2


In [60]:
# Stack the sample row under the empty frame and renumber the index.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement and gives the same result here.
df = pd.concat([df, df_1], ignore_index=True)
df.head()

Unnamed: 0,date,document,label_1,label_2
0,2001-1-1,sadasd,lal,lel
