In [1060]:
import numpy as np
import pandas as pd
import pickle

## IMPORT DATA

#### SHOW SAVED PAIRS

In [1061]:
# get dataframe from the previous step
df = pd.read_pickle("pairs_to_download.pkl")
print(df)

x_list = df.index.tolist()
y_list = df.columns.tolist()

new_pairs_list = [] # list of pairs as tuples (A,B)

for i in range(len(x_list)):
    for j in range(len(y_list)):
        if df.iloc[i,j]>0:
            new_pairs_list.append((x_list[i], y_list[j]))

new_pairs_list

          ES       OGE       WEC      WTRG       XEL
D   0.011405       NaN  0.004544       NaN  0.019043
ES       NaN       NaN  0.010105       NaN       NaN
NI       NaN  0.000979       NaN       NaN       NaN
SO       NaN       NaN       NaN  0.003839       NaN


[('D', 'ES'),
 ('D', 'WEC'),
 ('D', 'XEL'),
 ('ES', 'WEC'),
 ('NI', 'OGE'),
 ('SO', 'WTRG')]

#### SELECT TICKERS & RESOLUTION TO READ DATA

In [1062]:
# PICK FILENAME FORMAT
# "Y": "MA_1D.csv" for downloads with yfinance
# "T": "NYSE_MA, 1D.csv" for downloads from TradingView

file_format = "Y"

# SELECT TICKERS
ticker1 ="OGE"
ticker2 ="NI"

# need exchange info if fileformat is T
ticker1_exchange = 'NYSE'
ticker2_exchange = 'NYSE'

# SELECT RESOLUTION
# 5m,30m,1h,1d
resolution = '1d'

if file_format == "T":
    filename1 = ticker1_exchange.upper() +"_" + ticker1 + ", " + resolution.upper() + ".csv"
    filename2 = ticker2_exchange.upper() +"_" + ticker2 + ", " + resolution.upper() + ".csv"
elif file_format == "Y":
    filename1 = ticker1 +"_" + resolution.upper() + ".csv"
    filename2 = ticker2 +"_" + resolution.upper() + ".csv"
else:
    filename1 = "NYSE_MA, 1D.csv"
    filename2 = "NYSE_V, 1D.csv"


#### READ SAMPLE DATA

In [1063]:
# get a sample from file1
import random
p = 0.01  # 1% of all the lines
df1_sample = pd.read_csv(filename1,header=0, skiprows=lambda i: i>0 and random.random() > p)
df1_sample.head()

Unnamed: 0,time,Open,High,Low,Close,Adj Close,Volume,ticker
0,2018-05-14 00:00:00-04:00,34.259998,34.299999,33.790001,33.990002,28.005199,1777000,OGE
1,2018-12-06 00:00:00-05:00,40.279999,40.540001,39.310001,40.360001,33.896553,2368200,OGE
2,2019-01-17 00:00:00-05:00,40.049999,40.439999,40.0,40.189999,34.065327,1461500,OGE
3,2019-10-03 00:00:00-04:00,44.700001,44.779999,44.23,44.650002,38.496525,874200,OGE
4,2019-12-12 00:00:00-05:00,42.98,43.360001,42.880001,43.139999,37.528828,842300,OGE


In [1064]:
# get a sample from file2
df2_sample = pd.read_csv(filename2,header=0, skiprows=lambda i: i>0 and random.random() > p)
df2_sample.head()

Unnamed: 0,time,Open,High,Low,Close,Adj Close,Volume,ticker
0,2018-01-02 00:00:00-05:00,25.639999,25.690001,25.309999,25.360001,21.766855,2695100,NI
1,2018-02-28 00:00:00-05:00,23.34,23.459999,23.110001,23.129999,20.021904,5825000,NI
2,2018-09-18 00:00:00-04:00,25.540001,25.959999,25.450001,25.92,22.788452,6549200,NI
3,2019-01-02 00:00:00-05:00,25.290001,25.290001,24.690001,25.07,22.208942,2717400,NI
4,2019-03-19 00:00:00-04:00,28.02,28.15,27.73,27.799999,24.813883,1882100,NI


#### GET ALL DATA

In [1065]:
df1 = pd.read_csv(filename1, parse_dates=["time"])
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   time       1259 non-null   object 
 1   Open       1259 non-null   float64
 2   High       1259 non-null   float64
 3   Low        1259 non-null   float64
 4   Close      1259 non-null   float64
 5   Adj Close  1259 non-null   float64
 6   Volume     1259 non-null   int64  
 7   ticker     1259 non-null   object 
dtypes: float64(5), int64(1), object(2)
memory usage: 78.8+ KB


In [1066]:
df2=pd.read_csv(filename2, parse_dates=["time"])
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   time       1259 non-null   object 
 1   Open       1259 non-null   float64
 2   High       1259 non-null   float64
 3   Low        1259 non-null   float64
 4   Close      1259 non-null   float64
 5   Adj Close  1259 non-null   float64
 6   Volume     1259 non-null   int64  
 7   ticker     1259 non-null   object 
dtypes: float64(5), int64(1), object(2)
memory usage: 78.8+ KB


#### FILTER

In [1067]:
df1.columns = map(str.lower, df1.columns)
df1

Unnamed: 0,time,open,high,low,close,adj close,volume,ticker
0,2017-10-18 00:00:00-04:00,36.910000,37.040001,36.860001,36.869999,29.759892,745900,OGE
1,2017-10-19 00:00:00-04:00,36.869999,37.009998,36.810001,36.919998,29.800251,845100,OGE
2,2017-10-20 00:00:00-04:00,36.910000,37.209999,36.869999,37.090000,29.937468,542500,OGE
3,2017-10-23 00:00:00-04:00,37.290001,37.290001,37.049999,37.230000,30.050472,666900,OGE
4,2017-10-24 00:00:00-04:00,37.150002,37.320000,36.810001,37.000000,29.864830,1202300,OGE
...,...,...,...,...,...,...,...,...
1254,2022-10-12 00:00:00-04:00,34.630001,34.650002,33.959999,33.980000,33.980000,1110000,OGE
1255,2022-10-13 00:00:00-04:00,33.610001,34.650002,33.279999,34.500000,34.500000,1564200,OGE
1256,2022-10-14 00:00:00-04:00,34.730000,35.009998,33.619999,33.759998,33.759998,1308700,OGE
1257,2022-10-17 00:00:00-04:00,34.169998,35.130001,34.160000,34.590000,34.590000,1175100,OGE


In [1068]:
df1.columns = map(str.lower, df1.columns)
df1 = df1[['time','close','volume']]
df1 = df1.rename(columns={"time": "datetime"})
df1['datetime'] = pd.to_datetime(df1['datetime'], utc=True)  # be sure that you get datetime

# we don't want null or zero values (price & volume), use forward fill
df1['close'] = df1['close'].fillna(method='ffill')
df1['volume'] = df1['volume'].replace(to_replace=0, method='ffill')
df1

Unnamed: 0,datetime,close,volume
0,2017-10-18 04:00:00+00:00,36.869999,745900
1,2017-10-19 04:00:00+00:00,36.919998,845100
2,2017-10-20 04:00:00+00:00,37.090000,542500
3,2017-10-23 04:00:00+00:00,37.230000,666900
4,2017-10-24 04:00:00+00:00,37.000000,1202300
...,...,...,...
1254,2022-10-12 04:00:00+00:00,33.980000,1110000
1255,2022-10-13 04:00:00+00:00,34.500000,1564200
1256,2022-10-14 04:00:00+00:00,33.759998,1308700
1257,2022-10-17 04:00:00+00:00,34.590000,1175100


In [1069]:
df2.columns = map(str.lower, df2.columns)
df2 = df2[['time','close','volume']]
df2 = df2.rename(columns={"time": "datetime"})
df2['datetime'] = pd.to_datetime(df2['datetime'], utc=True)  # be sure that you get datetime

# we don't want null or zero values (price & volume), use forward fill
df2['close'] = df2['close'].fillna(method='ffill')
df2['volume'] = df2['volume'].replace(to_replace=0, method='ffill')
df2

Unnamed: 0,datetime,close,volume
0,2017-10-18 04:00:00+00:00,26.430000,1241200
1,2017-10-19 04:00:00+00:00,26.760000,1522500
2,2017-10-20 04:00:00+00:00,26.660000,1847700
3,2017-10-23 04:00:00+00:00,26.820000,1225300
4,2017-10-24 04:00:00+00:00,26.670000,1408300
...,...,...,...
1254,2022-10-12 04:00:00+00:00,24.190001,4083200
1255,2022-10-13 04:00:00+00:00,25.240000,7062000
1256,2022-10-14 04:00:00+00:00,24.490000,5675700
1257,2022-10-17 04:00:00+00:00,24.889999,6351100


#### ADD TICKER INFORMATION

In [1070]:
df1_1 = df1.copy();
if not 'ticker' in df1_1.columns:
    df1_1['ticker'] = ticker1;
df1_1.head()

Unnamed: 0,datetime,close,volume,ticker
0,2017-10-18 04:00:00+00:00,36.869999,745900,OGE
1,2017-10-19 04:00:00+00:00,36.919998,845100,OGE
2,2017-10-20 04:00:00+00:00,37.09,542500,OGE
3,2017-10-23 04:00:00+00:00,37.23,666900,OGE
4,2017-10-24 04:00:00+00:00,37.0,1202300,OGE


In [1071]:
df2_1 = df2.copy();
if not 'ticker' in df2_1.columns:
    df2_1['ticker'] = ticker2;
df2_1.head()

Unnamed: 0,datetime,close,volume,ticker
0,2017-10-18 04:00:00+00:00,26.43,1241200,NI
1,2017-10-19 04:00:00+00:00,26.76,1522500,NI
2,2017-10-20 04:00:00+00:00,26.66,1847700,NI
3,2017-10-23 04:00:00+00:00,26.82,1225300,NI
4,2017-10-24 04:00:00+00:00,26.67,1408300,NI


## METHOD 1 (CONCATENATE  & KEEP ALL ROWS)

In [1072]:
df12_c = pd.concat([df2_1,df1_1]).reset_index(drop=True) # need to drop old index values
df12_c

Unnamed: 0,datetime,close,volume,ticker
0,2017-10-18 04:00:00+00:00,26.430000,1241200,NI
1,2017-10-19 04:00:00+00:00,26.760000,1522500,NI
2,2017-10-20 04:00:00+00:00,26.660000,1847700,NI
3,2017-10-23 04:00:00+00:00,26.820000,1225300,NI
4,2017-10-24 04:00:00+00:00,26.670000,1408300,NI
...,...,...,...,...
2513,2022-10-12 04:00:00+00:00,33.980000,1110000,OGE
2514,2022-10-13 04:00:00+00:00,34.500000,1564200,OGE
2515,2022-10-14 04:00:00+00:00,33.759998,1308700,OGE
2516,2022-10-17 04:00:00+00:00,34.590000,1175100,OGE


In [1073]:
#df12_c_agg = df12_c.groupby('DateTime').agg({'ticker':['close','Volume']})
df12_c_agg = df12_c.groupby('ticker')
df12_c_agg.head()

Unnamed: 0,datetime,close,volume,ticker
0,2017-10-18 04:00:00+00:00,26.43,1241200,NI
1,2017-10-19 04:00:00+00:00,26.76,1522500,NI
2,2017-10-20 04:00:00+00:00,26.66,1847700,NI
3,2017-10-23 04:00:00+00:00,26.82,1225300,NI
4,2017-10-24 04:00:00+00:00,26.67,1408300,NI
1259,2017-10-18 04:00:00+00:00,36.869999,745900,OGE
1260,2017-10-19 04:00:00+00:00,36.919998,845100,OGE
1261,2017-10-20 04:00:00+00:00,37.09,542500,OGE
1262,2017-10-23 04:00:00+00:00,37.23,666900,OGE
1263,2017-10-24 04:00:00+00:00,37.0,1202300,OGE


In [1074]:
df12_c_1 = df12_c.set_index(["ticker", "datetime"]).sort_index() # set indexes
df12_c_1.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,close,volume
ticker,datetime,Unnamed: 2_level_1,Unnamed: 3_level_1
NI,2017-10-18 04:00:00+00:00,26.43,1241200
NI,2017-10-19 04:00:00+00:00,26.76,1522500
NI,2017-10-20 04:00:00+00:00,26.66,1847700
NI,2017-10-23 04:00:00+00:00,26.82,1225300
NI,2017-10-24 04:00:00+00:00,26.67,1408300


#### SLICE DATAFRAME TO GET TICKER DATA

In [1075]:
df12_c_1.xs(ticker1)

Unnamed: 0_level_0,close,volume
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-10-18 04:00:00+00:00,36.869999,745900
2017-10-19 04:00:00+00:00,36.919998,845100
2017-10-20 04:00:00+00:00,37.090000,542500
2017-10-23 04:00:00+00:00,37.230000,666900
2017-10-24 04:00:00+00:00,37.000000,1202300
...,...,...
2022-10-12 04:00:00+00:00,33.980000,1110000
2022-10-13 04:00:00+00:00,34.500000,1564200
2022-10-14 04:00:00+00:00,33.759998,1308700
2022-10-17 04:00:00+00:00,34.590000,1175100


In [1076]:
df12_c_1.xs(ticker2)

Unnamed: 0_level_0,close,volume
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-10-18 04:00:00+00:00,26.430000,1241200
2017-10-19 04:00:00+00:00,26.760000,1522500
2017-10-20 04:00:00+00:00,26.660000,1847700
2017-10-23 04:00:00+00:00,26.820000,1225300
2017-10-24 04:00:00+00:00,26.670000,1408300
...,...,...
2022-10-12 04:00:00+00:00,24.190001,4083200
2022-10-13 04:00:00+00:00,25.240000,7062000
2022-10-14 04:00:00+00:00,24.490000,5675700
2022-10-17 04:00:00+00:00,24.889999,6351100


In [1077]:
# END OF METHOD 1, CONTINUE WITH METHOD 2

## METHOD 2 (MERGE TICKERS  & KEEP ONLY INNER TIME DATA)

In [1078]:
df12_m = pd.merge(df1_1, df2_1, on="datetime")
df12_m.head()

Unnamed: 0,datetime,close_x,volume_x,ticker_x,close_y,volume_y,ticker_y
0,2017-10-18 04:00:00+00:00,36.869999,745900,OGE,26.43,1241200,NI
1,2017-10-19 04:00:00+00:00,36.919998,845100,OGE,26.76,1522500,NI
2,2017-10-20 04:00:00+00:00,37.09,542500,OGE,26.66,1847700,NI
3,2017-10-23 04:00:00+00:00,37.23,666900,OGE,26.82,1225300,NI
4,2017-10-24 04:00:00+00:00,37.0,1202300,OGE,26.67,1408300,NI


#### TIME ZONE CONVERSION (IF NEEDED)

In [1079]:
df12_m['datetime'] = pd.to_datetime(df12_m['datetime'], utc=True)
#df12_m.datetime = df12_m.datetime.dt.tz_convert("Europe/Istanbul")
#df12_m.datetime = df12_m.datetime.dt.tz_convert("Canada/Pacific")
df12_m.datetime = df12_m.datetime.dt.tz_convert("UTC")
df12_m

Unnamed: 0,datetime,close_x,volume_x,ticker_x,close_y,volume_y,ticker_y
0,2017-10-18 04:00:00+00:00,36.869999,745900,OGE,26.430000,1241200,NI
1,2017-10-19 04:00:00+00:00,36.919998,845100,OGE,26.760000,1522500,NI
2,2017-10-20 04:00:00+00:00,37.090000,542500,OGE,26.660000,1847700,NI
3,2017-10-23 04:00:00+00:00,37.230000,666900,OGE,26.820000,1225300,NI
4,2017-10-24 04:00:00+00:00,37.000000,1202300,OGE,26.670000,1408300,NI
...,...,...,...,...,...,...,...
1254,2022-10-12 04:00:00+00:00,33.980000,1110000,OGE,24.190001,4083200,NI
1255,2022-10-13 04:00:00+00:00,34.500000,1564200,OGE,25.240000,7062000,NI
1256,2022-10-14 04:00:00+00:00,33.759998,1308700,OGE,24.490000,5675700,NI
1257,2022-10-17 04:00:00+00:00,34.590000,1175100,OGE,24.889999,6351100,NI


#### GET START & END DATES & SET INDEX

In [1080]:
# get start and end days of time series
min_date, max_date = df12_m.datetime.min(), df12_m.datetime.max()
#str(min_date), str(max_date)
min_date.strftime('%m/%d/%Y'), max_date.strftime('%m/%d/%Y')

('10/18/2017', '10/18/2022')

In [1081]:
df12_m = df12_m.set_index("datetime").sort_index()
df12_m.head()

Unnamed: 0_level_0,close_x,volume_x,ticker_x,close_y,volume_y,ticker_y
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-10-18 04:00:00+00:00,36.869999,745900,OGE,26.43,1241200,NI
2017-10-19 04:00:00+00:00,36.919998,845100,OGE,26.76,1522500,NI
2017-10-20 04:00:00+00:00,37.09,542500,OGE,26.66,1847700,NI
2017-10-23 04:00:00+00:00,37.23,666900,OGE,26.82,1225300,NI
2017-10-24 04:00:00+00:00,37.0,1202300,OGE,26.67,1408300,NI


#### GET CUSTOM DATES (IF NEEDED):

In [1082]:
# CUSTOM DATE
import datetime
from_date = datetime.datetime(2018, 6, 1)
to_date = datetime.datetime(2022, 1, 1)
# from_date = '2021-01-01 13:30:00+00:00' # UTC
# to_date = '2022-01-01 13:30:00+00:00' # UTC
# date_format = '%Y-%m-%d'
# from_date = datetime.strptime(from_date, date_format)
# to_date = datetime.strptime(to_date, date_format)

new_index = pd.date_range(from_date, to_date, freq='1D', tz='UTC')
new_index

DatetimeIndex(['2018-06-01 00:00:00+00:00', '2018-06-02 00:00:00+00:00',
               '2018-06-03 00:00:00+00:00', '2018-06-04 00:00:00+00:00',
               '2018-06-05 00:00:00+00:00', '2018-06-06 00:00:00+00:00',
               '2018-06-07 00:00:00+00:00', '2018-06-08 00:00:00+00:00',
               '2018-06-09 00:00:00+00:00', '2018-06-10 00:00:00+00:00',
               ...
               '2021-12-23 00:00:00+00:00', '2021-12-24 00:00:00+00:00',
               '2021-12-25 00:00:00+00:00', '2021-12-26 00:00:00+00:00',
               '2021-12-27 00:00:00+00:00', '2021-12-28 00:00:00+00:00',
               '2021-12-29 00:00:00+00:00', '2021-12-30 00:00:00+00:00',
               '2021-12-31 00:00:00+00:00', '2022-01-01 00:00:00+00:00'],
              dtype='datetime64[ns, UTC]', length=1311, freq='D')

In [1083]:
df12_m_custom = df12_m.reindex(new_index, method="ffill")
df12_m_custom

Unnamed: 0,close_x,volume_x,ticker_x,close_y,volume_y,ticker_y
2018-06-01 00:00:00+00:00,35.020000,2505000,OGE,25.299999,2408500,NI
2018-06-02 00:00:00+00:00,34.380001,1578700,OGE,24.740000,2873500,NI
2018-06-03 00:00:00+00:00,34.380001,1578700,OGE,24.740000,2873500,NI
2018-06-04 00:00:00+00:00,34.380001,1578700,OGE,24.740000,2873500,NI
2018-06-05 00:00:00+00:00,34.490002,1459900,OGE,24.490000,3544100,NI
...,...,...,...,...,...,...
2021-12-28 00:00:00+00:00,37.509998,603000,OGE,27.070000,3176600,NI
2021-12-29 00:00:00+00:00,37.830002,426100,OGE,27.360001,1666600,NI
2021-12-30 00:00:00+00:00,38.119999,674700,OGE,27.700001,3222000,NI
2021-12-31 00:00:00+00:00,38.250000,612100,OGE,27.549999,2503800,NI


#### SAVE TO FILE FOR MORE ANALYSIS

In [1084]:
df12_m.to_pickle("saved.pkl")
if 'df12_m_custom' in locals():
    df12_m_custom.to_pickle("saved_custom.pkl")