In [1]:
import pandas as pd
import numpy as np
import plotly

In [2]:
# Load every stock-market CSV used by the notebook. The files are read in the
# order listed and bound, position by position, to the names on the left.
industry, pharma, tourism, dax, sp500, hero, nasdaq100, psi20, retail, cac = (
    pd.read_csv(csv_path)
    for csv_path in (
        'sm_dj_industry.csv',
        'sm_dj_pharma.csv',
        'sm_dj_tourism.csv',
        'sm_dax.csv',
        'sm_sp500.csv',
        'sm_hero.csv',
        'sm_nasdaq.csv',
        'sm_psi20.csv',
        'sm_dj_retail.csv',
        'sm_dj_cac.csv',
    )
)

In [3]:
def _normalise_decimal(series):
    """Rewrite '1.234,56'-style strings ('.' thousands, ',' decimal) as '1234.56'."""
    return (
        series.str.replace('.', '', regex=False)
              .str.replace(',', '.', regex=False)
    )


def pre_processor(df):
    """Apply the shared pre-processing to one stock market dataframe, in place.

    Steps performed on ``df``:

    * 'Último', 'Abertura', 'Alta' and 'Baixa' are converted from strings
      using '.' as thousands separator and ',' as decimal separator to float.
    * 'Vol.' and 'Var. %' have their last character removed, are converted
      the same way, and an empty remainder becomes 0.0.
    * 'Vol.' is renamed to 'Vol. (M)'.
    * 'Data' is parsed as a '%d.%m.%Y' date; 'Year' (int), 'Month'
      (zero-padded two-character string) and 'Year+Month' ('YYYY-MM') are
      added.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame whose columns listed above still hold strings. It is
        modified in place.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If an expected column is missing. This includes calling the function
        a second time on the same frame, because 'Vol.' has been renamed.
    AttributeError
        If one of the columns to convert does not hold strings.
    ValueError
        If a cleaned value is not a valid float or a date does not match
        '%d.%m.%Y'.
    """
    # Columns whose values are plain localized numbers.
    plain_columns = ['Último', 'Abertura', 'Alta', 'Baixa']

    # Columns whose values end with one extra character that must be dropped.
    suffixed_columns = ['Vol.', 'Var. %']

    for column in plain_columns:
        df[column] = _normalise_decimal(df[column]).astype(float)

    for column in suffixed_columns:
        # The last character is dropped unconditionally, whatever it is.
        # NOTE(review): the rename below presumes every volume is expressed in
        # millions ('M'); a 'K' or 'B' suffix would be silently mis-scaled.
        # TODO confirm against the raw files.
        stripped = _normalise_decimal(df[column].str[:-1])
        # A value that was only the suffix (or a one-character placeholder)
        # is now empty and is treated as zero.
        df[column] = stripped.mask(stripped == '', '0').astype(float)

    # Keep the measure unit in the column name. rename() returns a new frame
    # by default, so it must be applied in place to affect the caller's frame.
    df.rename(columns={'Vol.': 'Vol. (M)'}, inplace=True)

    # 'Data' is Portuguese for date.
    df['Data'] = pd.to_datetime(df['Data'], errors='raise', format='%d.%m.%Y')
    df['Year'] = df['Data'].dt.year
    # Two-character month for every row, so the column has a single type.
    df['Month'] = df['Data'].dt.strftime('%m')

    # Year and month combined into one sortable 'YYYY-MM' key.
    df['Year+Month'] = df['Year'].astype(str) + '-' + df['Month']

In [4]:
# Every loaded frame goes through the same pre-processing; pre_processor
# modifies each frame in place, so nothing is collected here.
df_list = [
    nasdaq100,
    pharma,
    industry,
    tourism,
    dax,
    hero,
    psi20,
    retail,
    cac,
    sp500,
]

for df in df_list:
    pre_processor(df)

In [6]:
# Pair each frame with the base name of the file it is written to.
df_list = [
    [nasdaq100, 'nasdaq100'],
    [pharma, 'pharma'],
    [industry, 'industry'],
    [tourism, 'tourism'],
    [dax, 'dax'],
    [hero, 'hero'],
    [psi20, 'psi20'],
    [retail, 'retail'],
    [cac, 'cac'],
    [sp500, 'sp500'],
]

# Write every frame to '<name>.csv' in the working directory. The files are
# tab-separated despite the '.csv' suffix.
for df in df_list:
    frame, label = df
    frame.to_csv(label + '.csv', sep='\t')