In [15]:
import pandas as pd

# Padronizar datas + remover nulos

**O Yahoo Finance já traz a data como índice, mas ao recarregar os dados do CSV ela volta como a coluna `Date`. Vamos:**

- garantir que é datetime

- remover valores nulos

- Datas padronizadas evitam bugs em séries temporais

- dropna() evita métricas erradas

In [16]:
def clean_market_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of a market-data frame.

    If a ``Date`` column is present it is parsed as UTC, promoted to the
    index and made timezone-naive. Rows containing any null value are
    dropped, including rows whose ``Date`` is missing (``NaT``).

    Args:
        df: Raw market data. The input frame is not modified.

    Returns:
        A new DataFrame without null rows, indexed by ``Date`` when that
        column exists in the input.
    """
    cleaned = df.copy()

    has_date_column = "Date" in cleaned.columns
    if has_date_column:
        # Parse as UTC so values carrying different offsets share one timeline.
        cleaned["Date"] = pd.to_datetime(cleaned["Date"], utc=True)

    # Drop nulls while "Date" is still a regular column: dropna() only looks
    # at columns, so a NaT that had already been moved into the index would
    # survive the cleaning step.
    cleaned = cleaned.dropna()

    if has_date_column:
        cleaned = cleaned.set_index("Date")
        # Remove the timezone info, keeping the UTC wall-clock time.
        cleaned.index = cleaned.index.tz_localize(None)

    return cleaned

# Retorno diário
**Fórmula:**

> retorno = (preço_atual / preço_anterior) - 1

- Usei pct_change() para calcular retornos percentuais diários.


In [17]:
def calculate_daily_return(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with an added ``daily_return`` column.

    The new column is the row-over-row percentage change of ``Close``
    (``current / previous - 1``); the first row has no predecessor and is NaN.

    Args:
        df: Frame containing a ``Close`` column. It is not modified.

    Returns:
        A new DataFrame with the ``daily_return`` column set.
    """
    close_prices = df["Close"]

    with_returns = df.copy()
    with_returns["daily_return"] = close_prices.pct_change()
    return with_returns

# Média móvel (7 dias de pregão)


In [18]:
def calculate_moving_average(df: pd.DataFrame, window: int = 7) -> pd.DataFrame:
    """Return a copy of ``df`` with a rolling mean of ``Close``.

    The result is stored in a column named ``ma_<window>``. The window counts
    rows, so the first ``window - 1`` rows are NaN.

    Args:
        df: Frame containing a ``Close`` column. It is not modified.
        window: Number of consecutive rows averaged together.

    Returns:
        A new DataFrame with the ``ma_<window>`` column set.
    """
    column_name = f"ma_{window}"
    rolling_mean = df["Close"].rolling(window=window).mean()

    with_average = df.copy()
    with_average[column_name] = rolling_mean
    return with_average

# Volatilidade simples

**Definição:**

- desvio padrão dos retornos

- janela móvel (ex.: 7 pregões — a janela conta linhas/observações, não dias corridos)

In [19]:
def calculate_volatility(df: pd.DataFrame, window: int = 7) -> pd.DataFrame:
    """Return a copy of ``df`` with the rolling standard deviation of returns.

    The result is stored in a column named ``volatility_<window>``. An
    existing ``daily_return`` column is used when present; otherwise the
    returns are derived from ``Close`` for this computation only, so the
    function also works on a frame that has not been through the return step.

    Args:
        df: Frame containing ``daily_return`` or, failing that, ``Close``.
            It is not modified.
        window: Number of consecutive rows in each rolling window.

    Returns:
        A new DataFrame with the ``volatility_<window>`` column set.

    Raises:
        KeyError: If neither ``daily_return`` nor ``Close`` is present.
    """
    result = df.copy()

    if "daily_return" in result.columns:
        returns = result["daily_return"]
    else:
        # Fallback: compute the returns locally instead of failing with a
        # KeyError; no "daily_return" column is added to the output.
        returns = result["Close"].pct_change()

    result[f"volatility_{window}"] = returns.rolling(window=window).std()

    return result

# Função FINAL (pipeline completo)

In [20]:
def process_market_data(df: pd.DataFrame, window: int = 7) -> pd.DataFrame:
    """Run the full pipeline: clean, daily return, moving average, volatility.

    Args:
        df: Raw market data; passed through ``clean_market_data`` first.
        window: Rolling window, in rows, shared by the moving average and the
            volatility. Defaults to 7, the value previously hard-coded here.

    Returns:
        The cleaned DataFrame with ``daily_return``, ``ma_<window>`` and
        ``volatility_<window>`` columns added.
    """
    return (
        clean_market_data(df)
        .pipe(calculate_daily_return)
        .pipe(calculate_moving_average, window=window)
        .pipe(calculate_volatility, window=window)
    )

# Exemplo de Uso

In [21]:
# Forward slashes resolve on Windows, Linux and macOS alike; the previous
# backslash-separated path was only valid on Windows.
df = pd.read_csv("../data/raw/AAPL_stock_data.csv", sep=";")
processed_df = process_market_data(df)

In [22]:
# Show the processed frame's shape as a (rows, columns) tuple.
display(processed_df.shape)


(249, 10)

In [23]:
# Preview the first 10 rows of the processed frame.
display(processed_df.head(10))


Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,daily_return,ma_7,volatility_7
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2025-01-02 05:00:00,247.80922,247.978468,240.731247,242.752106,55740700,0.0,0.0,,,
2025-01-03 05:00:00,242.264297,243.080598,240.800915,242.264297,40244100,0.0,0.0,-0.002009,,
2025-01-06 05:00:00,243.210031,246.216438,242.105028,243.896927,45045600,0.0,0.0,0.006739,,
2025-01-07 05:00:00,241.885999,244.444435,240.263348,241.119476,40856000,0.0,0.0,-0.011388,,
2025-01-08 05:00:00,240.830782,242.612732,238.969207,241.607269,37628900,0.0,0.0,0.002023,,
2025-01-10 05:00:00,238.929391,239.078725,231.950958,235.78363,61710900,0.0,0.0,-0.024104,,
2025-01-13 05:00:00,232.478557,233.613424,228.685713,233.344635,49630700,0.0,0.0,-0.010344,240.109763,
2025-01-14 05:00:00,233.693058,235.056885,231.423325,232.229675,39435300,0.0,0.0,-0.004778,238.606559,0.010159
2025-01-15 05:00:00,233.583558,237.884115,233.374497,236.799011,39832000,0.0,0.0,0.019676,237.825803,0.014183
2025-01-16 05:00:00,236.281354,236.938371,227.003309,227.232269,71759100,0.0,0.0,-0.0404,235.445138,0.019051


In [24]:
# Preview the last 10 rows of the processed frame.
display(processed_df.tail(10))


Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,daily_return,ma_7,volatility_7
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2025-12-16 05:00:00,272.820007,275.5,271.790009,274.609985,37648600,0.0,0.0,0.001824,276.982853,0.006508
2025-12-17 05:00:00,275.01001,276.160004,271.640015,271.839996,50138700,0.0,0.0,-0.010087,276.118565,0.007182
2025-12-18 05:00:00,273.609985,273.630005,266.950012,272.190002,51630700,0.0,0.0,0.001288,275.405709,0.007376
2025-12-19 05:00:00,272.149994,274.600006,269.899994,273.670013,144632000,0.0,0.0,0.005437,274.675711,0.007314
2025-12-22 05:00:00,272.859985,273.880005,270.51001,270.970001,36571800,0.0,0.0,-0.009866,273.66714,0.007812
2025-12-23 05:00:00,270.839996,272.5,269.559998,272.359985,29642000,0.0,0.0,0.00513,272.821424,0.008366
2025-12-24 05:00:00,272.339996,275.429993,272.200012,273.809998,17910600,0.0,0.0,0.005324,272.778569,0.00693
2025-12-26 05:00:00,274.160004,275.369995,272.859985,273.399994,21521800,0.0,0.0,-0.001497,272.605713,0.006887
2025-12-29 05:00:00,272.690002,274.359985,272.350006,273.76001,23715200,0.0,0.0,0.001317,272.880001,0.005476
2025-12-30 05:00:00,272.809998,274.079987,272.279999,273.079987,22139600,0.0,0.0,-0.002484,273.007141,0.005629


In [25]:
# info() prints its summary itself and returns None, so it is called directly;
# wrapping it in display() rendered a stray "None" after the summary.
processed_df.info()


<class 'pandas.DataFrame'>
DatetimeIndex: 249 entries, 2025-01-02 05:00:00 to 2025-12-30 05:00:00
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Open          249 non-null    float64
 1   High          249 non-null    float64
 2   Low           249 non-null    float64
 3   Close         249 non-null    float64
 4   Volume        249 non-null    int64  
 5   Dividends     249 non-null    float64
 6   Stock Splits  249 non-null    float64
 7   daily_return  248 non-null    float64
 8   ma_7          243 non-null    float64
 9   volatility_7  242 non-null    float64
dtypes: float64(9), int64(1)
memory usage: 21.4 KB


None

In [26]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
display(processed_df.describe())

Unnamed: 0,Open,High,Low,Close,Volume,Dividends,Stock Splits,daily_return,ma_7,volatility_7
count,249.0,249.0,249.0,249.0,249.0,249.0,249.0,248.0,243.0,242.0
mean,231.418907,234.109716,228.99938,231.618751,54283740.0,0.004137,0.0,0.000681,230.974547,0.016509
std,26.607098,26.295522,26.751525,26.555819,23362970.0,0.032443,0.0,0.020489,25.979072,0.012631
min,171.364011,189.691359,168.633375,171.832428,17910600.0,0.0,0.0,-0.092456,189.665721,0.003744
25%,209.488932,211.516708,207.283587,209.568771,40896200.0,0.0,0.0,-0.006729,208.245135,0.009425
50%,229.027982,232.080362,226.850094,229.497528,48013300.0,0.0,0.0,0.00079,229.654515,0.014009
75%,253.853917,255.672152,250.796869,253.793961,57365700.0,0.0,0.0,0.006617,251.512616,0.018844
max,286.200012,288.619995,283.299988,286.190002,184395900.0,0.26,0.0,0.153288,281.380005,0.084847
