In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("data/stock_details_5_years.csv")
df

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Company
0,2018-11-29 00:00:00-05:00,43.829761,43.863354,42.639594,43.083508,167080000,0.00,0.0,AAPL
1,2018-11-29 00:00:00-05:00,104.769074,105.519257,103.534595,104.636131,28123200,0.00,0.0,MSFT
2,2018-11-29 00:00:00-05:00,54.176498,55.007500,54.099998,54.729000,31004000,0.00,0.0,GOOGL
3,2018-11-29 00:00:00-05:00,83.749496,84.499496,82.616501,83.678497,132264000,0.00,0.0,AMZN
4,2018-11-29 00:00:00-05:00,39.692784,40.064904,38.735195,39.037853,54917200,0.04,0.0,NVDA
...,...,...,...,...,...,...,...,...,...
602957,2023-11-29 00:00:00-05:00,26.360001,26.397499,26.120001,26.150000,1729147,0.00,0.0,PPL
602958,2023-11-29 00:00:00-05:00,27.680000,28.535000,27.680000,28.350000,1940066,0.00,0.0,FITB
602959,2023-11-29 00:00:00-05:00,75.940002,76.555000,75.257500,75.610001,298699,0.00,0.0,IFF
602960,2023-11-29 00:00:00-05:00,45.230000,45.259998,44.040001,44.209999,2217579,0.00,0.0,CCJ


In [3]:
def add_julian_date_to_data(df):
    df = df.copy()
    df['Date'] = pd.to_datetime(df['Date'])
    df['julian_date'] = df['Date'].apply(lambda x: x.to_julian_date())
    return df


In [4]:
def add_days_distance_between_target_and_last_data_to_data(df):
    """
    data have gaps (weekends etc.) - this will add the information about how long was the gap from last known data
    so on normal days it should be 1 but for example on monday it should be 3
    """
    df = df.copy()
    
    df["data_freshness"] = df["julian_date"] - df.shift(1)["julian_date"]
    
    return df


In [5]:
def add_targed_to_data(df):
    df = df.copy()
    df["target"] = df.shift(-1)["Close"]
    return df
    

In [6]:
def add_shifts_to_data(df):
    df = df.copy()
    window_size = 10
    columns_to_take_shifts = ["Open", "High", "Low", "Close", "Volume"]
    for i in range(window_size):
        for column in columns_to_take_shifts:
            df[f"shift_{i}_{column}"] = df[f"{column}"].shift(i+1)
    return df
    
    
    
    

In [7]:
def split_by_company_apply_transform_merge_again(df):
    companies = df['Company'].unique()
    companies_dfs = []
    for company in companies:
        single_company_df =  df[df['Company'] == company]
        single_company_df = single_company_df.copy()
        single_company_df = add_julian_date_to_data(single_company_df)
        single_company_df = add_shifts_to_data(single_company_df)
        single_company_df = add_days_distance_between_target_and_last_data_to_data(single_company_df)
        single_company_df = add_targed_to_data(single_company_df)
        single_company_df = single_company_df.dropna()
        
        companies_dfs.append(single_company_df)
        
    combined_df = pd.concat(companies_dfs)
    sorted_df = combined_df.sort_values(by='Date')
    return sorted_df
        
    

In [8]:
df = split_by_company_apply_transform_merge_again(df)
df

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Company,julian_date,...,shift_8_Low,shift_8_Close,shift_8_Volume,shift_9_Open,shift_9_High,shift_9_Low,shift_9_Close,shift_9_Volume,data_freshness,target
4585,2018-12-14 00:00:00-05:00,40.552001,40.571198,39.659377,39.707367,162814800,0.0,0.0,AAPL,2458466.5,...,42.478826,42.850754,158126000.0,43.829761,43.863354,42.639594,43.083508,167080000.0,1.0,39.337856
4994,2018-12-14 00:00:00-05:00,77.874250,78.819199,77.266780,77.595825,1936700,0.0,0.0,TROW,2458466.5,...,80.861906,83.206703,3022300.0,81.757944,82.863347,81.305733,81.958931,1491200.0,1.0,76.802750
4683,2018-12-14 00:00:00-05:00,83.190394,83.507646,82.062390,82.591141,6337800,0.0,0.0,MDT,2458466.5,...,85.270137,85.948700,6581400.0,84.054019,85.754840,83.992332,85.129150,6013900.0,1.0,80.943184
4684,2018-12-14 00:00:00-05:00,57.113197,57.536196,56.593269,57.113197,2179800,0.0,0.0,PLD,2458466.5,...,58.796359,59.342724,3558700.0,59.386800,59.474923,58.778742,59.201740,1589200.0,1.0,54.618435
4993,2018-12-14 00:00:00-05:00,27.840000,28.770000,27.730000,28.180000,4867000,0.0,0.0,TCOM,2458466.5,...,27.660000,28.850000,4217800.0,28.760000,28.760000,27.500000,28.040001,6433700.0,1.0,27.309999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
602053,2023-11-28 00:00:00-05:00,119.970001,120.110001,118.559998,118.849998,3493700,0.0,0.0,GE,2460276.5,...,115.570000,117.250000,4794600.0,115.139999,116.559998,114.709999,115.519997,4592000.0,1.0,119.305000
602225,2023-11-28 00:00:00-05:00,119.139999,119.690002,118.599998,119.580002,1903100,0.0,0.0,PAYX,2460276.5,...,115.339996,116.260002,1737800.0,114.160004,114.870003,113.870003,114.480003,1275400.0,1.0,121.550003
602129,2023-11-28 00:00:00-05:00,38.720001,38.900002,38.599998,38.720001,937500,0.0,0.0,RELX,2460276.5,...,36.040001,36.209999,883500.0,36.110001,36.400002,36.060001,36.360001,500300.0,1.0,38.605000
602187,2023-11-28 00:00:00-05:00,36.709999,37.500000,36.459999,37.220001,11337400,0.0,0.0,FCX,2460276.5,...,34.549999,35.169998,11724600.0,33.610001,33.930000,33.380001,33.430000,8944500.0,1.0,36.869999


In [9]:
max(df["data_freshness"])

4.0