At this stage, we will prepare the downloaded data for individual pair analysis

In [65]:
import numpy as np
import pandas as pd
import pickle

## IMPORT DATA

#### SHOW SAVED PAIRS

In [66]:
# get dataframe from the previous step
df = pd.read_pickle("pairs_to_download.pkl")
print(df)

x_list = df.index.tolist()
y_list = df.columns.tolist()

new_pairs_list = [] # list of pairs as tuples (A,B)

for i in range(len(x_list)):
    for j in range(len(y_list)):
        if df.iloc[i,j]>0:
            new_pairs_list.append((x_list[i], y_list[j]))

new_pairs_list

          DUK       VST       WEC     WTRG       XEL
AEE  0.032540       NaN       NaN      NaN       NaN
AEP       NaN  0.033961       NaN      NaN       NaN
CMS  0.000526       NaN       NaN      NaN       NaN
CNP       NaN  0.017855       NaN      NaN       NaN
ED        NaN  0.003137       NaN      NaN       NaN
OGE       NaN       NaN       NaN      NaN  0.031319
RUN       NaN       NaN       NaN  0.04778       NaN
VST       NaN       NaN  0.047495      NaN       NaN


[('AEE', 'DUK'),
 ('AEP', 'VST'),
 ('CMS', 'DUK'),
 ('CNP', 'VST'),
 ('ED', 'VST'),
 ('OGE', 'XEL'),
 ('RUN', 'WTRG'),
 ('VST', 'WEC')]

#### SELECT TICKERS & RESOLUTION TO READ DATA

In [67]:
# PICK FILENAME FORMAT
# "Y": "MA_1D.csv" for downloads with yfinance
# "T": "NYSE_MA, 1D.csv" for downloads from TradingView

file_format = "Y"

# SELECT TICKERS
ticker1 ="CNP"
ticker2 ="VST"

# need exchange info if fileformat is T
ticker1_exchange = 'NYSE'
ticker2_exchange = 'NYSE'

# SELECT RESOLUTION
# 5m,30m,1h,1d
resolution = '1D'

if file_format == "T":
    filename1 = "data/"+ ticker1_exchange.upper() +"_" + ticker1 + ", " + resolution.upper() + ".csv"
    filename2 = "data/"+ ticker2_exchange.upper() +"_" + ticker2 + ", " + resolution.upper() + ".csv"
elif file_format == "Y":
    filename1 = "data/"+ ticker1 +"_" + resolution.upper() + ".csv"
    filename2 = "data/"+ ticker2 +"_" + resolution.upper() + ".csv"
else:
    filename1 = "data/NYSE_MA, 1D.csv"
    filename2 = "data/NYSE_V, 1D.csv"


#### READ SAMPLE DATA

In [68]:
# get a sample from file1
import random
p = 0.01  # 1% of all the lines
df1_sample = pd.read_csv(filename1,header=0, skiprows=lambda i: i>0 and random.random() > p)
df1_sample.head()

Unnamed: 0,time,Open,High,Low,Close,Adj Close,Volume,ticker
0,2022-02-17 00:00:00-05:00,26.83,26.940001,26.42,26.809999,26.518515,7949100,CNP
1,2022-03-07 00:00:00-05:00,28.91,29.049999,28.4,28.73,28.417639,5326400,CNP


In [69]:
# get a sample from file2
df2_sample = pd.read_csv(filename2,header=0, skiprows=lambda i: i>0 and random.random() > p)
df2_sample.head()

Unnamed: 0,time,Open,High,Low,Close,Adj Close,Volume,ticker
0,2022-04-01 00:00:00-04:00,23.309999,23.5,23.08,23.440001,23.087934,4560100,VST
1,2022-10-21 00:00:00-04:00,21.48,21.768,21.120001,21.75,21.75,4248900,VST


#### GET ALL DATA

In [70]:
df1 = pd.read_csv(filename1, parse_dates=["time"])
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252 entries, 0 to 251
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   time       252 non-null    object 
 1   Open       252 non-null    float64
 2   High       252 non-null    float64
 3   Low        252 non-null    float64
 4   Close      252 non-null    float64
 5   Adj Close  252 non-null    float64
 6   Volume     252 non-null    int64  
 7   ticker     252 non-null    object 
dtypes: float64(5), int64(1), object(2)
memory usage: 15.9+ KB


In [71]:
df2=pd.read_csv(filename2, parse_dates=["time"])
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252 entries, 0 to 251
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   time       252 non-null    object 
 1   Open       252 non-null    float64
 2   High       252 non-null    float64
 3   Low        252 non-null    float64
 4   Close      252 non-null    float64
 5   Adj Close  252 non-null    float64
 6   Volume     252 non-null    int64  
 7   ticker     252 non-null    object 
dtypes: float64(5), int64(1), object(2)
memory usage: 15.9+ KB


#### FILTER

In [72]:
df1.columns = map(str.lower, df1.columns)
df1

Unnamed: 0,time,open,high,low,close,adj close,volume,ticker
0,2021-11-01 00:00:00-04:00,26.120001,26.350000,26.040001,26.290001,25.675735,3730100,CNP
1,2021-11-02 00:00:00-04:00,26.370001,26.370001,25.830000,26.190001,25.578075,5620500,CNP
2,2021-11-03 00:00:00-04:00,26.049999,26.420000,26.010000,26.290001,25.675735,3350900,CNP
3,2021-11-04 00:00:00-04:00,26.440001,26.740000,26.309999,26.500000,25.880831,3364800,CNP
4,2021-11-05 00:00:00-04:00,26.709999,26.889999,26.430000,26.670000,26.046860,3179000,CNP
...,...,...,...,...,...,...,...,...
247,2022-10-25 00:00:00-04:00,27.200001,27.820000,27.200001,27.650000,27.650000,3556000,CNP
248,2022-10-26 00:00:00-04:00,27.830000,27.969999,27.549999,27.600000,27.600000,3132200,CNP
249,2022-10-27 00:00:00-04:00,27.889999,28.260000,27.840000,27.990000,27.990000,2833500,CNP
250,2022-10-28 00:00:00-04:00,28.070000,28.820000,28.070000,28.809999,28.809999,2803400,CNP


In [73]:
df1.columns = map(str.lower, df1.columns)
df1 = df1[['time','close','volume']]
df1 = df1.rename(columns={"time": "datetime"})
df1['datetime'] = pd.to_datetime(df1['datetime'], utc=True)  # be sure that you get datetime

# we don't want null or zero values (price & volume), use forward fill
df1['close'] = df1['close'].fillna(method='ffill')
df1['volume'] = df1['volume'].replace(to_replace=0, method='ffill')
df1

Unnamed: 0,datetime,close,volume
0,2021-11-01 04:00:00+00:00,26.290001,3730100
1,2021-11-02 04:00:00+00:00,26.190001,5620500
2,2021-11-03 04:00:00+00:00,26.290001,3350900
3,2021-11-04 04:00:00+00:00,26.500000,3364800
4,2021-11-05 04:00:00+00:00,26.670000,3179000
...,...,...,...
247,2022-10-25 04:00:00+00:00,27.650000,3556000
248,2022-10-26 04:00:00+00:00,27.600000,3132200
249,2022-10-27 04:00:00+00:00,27.990000,2833500
250,2022-10-28 04:00:00+00:00,28.809999,2803400


In [74]:
df2.columns = map(str.lower, df2.columns)
df2 = df2[['time','close','volume']]
df2 = df2.rename(columns={"time": "datetime"})
df2['datetime'] = pd.to_datetime(df2['datetime'], utc=True)  # be sure that you get datetime

# we don't want null or zero values (price & volume), use forward fill
df2['close'] = df2['close'].fillna(method='ffill')
df2['volume'] = df2['volume'].replace(to_replace=0, method='ffill')
df2

Unnamed: 0,datetime,close,volume
0,2021-11-01 04:00:00+00:00,20.010000,2756500
1,2021-11-02 04:00:00+00:00,19.870001,4006000
2,2021-11-03 04:00:00+00:00,19.680000,2460100
3,2021-11-04 04:00:00+00:00,19.209999,6093500
4,2021-11-05 04:00:00+00:00,20.500000,9136100
...,...,...,...
247,2022-10-25 04:00:00+00:00,21.990000,4838400
248,2022-10-26 04:00:00+00:00,21.900000,3419400
249,2022-10-27 04:00:00+00:00,21.980000,3738400
250,2022-10-28 04:00:00+00:00,22.980000,4167400


#### ADD TICKER INFORMATION

In [75]:
# if not added already in the previous steps

In [76]:
df1_1 = df1.copy();
if not 'ticker' in df1_1.columns:
    df1_1['ticker'] = ticker1;
df1_1.head()

Unnamed: 0,datetime,close,volume,ticker
0,2021-11-01 04:00:00+00:00,26.290001,3730100,CNP
1,2021-11-02 04:00:00+00:00,26.190001,5620500,CNP
2,2021-11-03 04:00:00+00:00,26.290001,3350900,CNP
3,2021-11-04 04:00:00+00:00,26.5,3364800,CNP
4,2021-11-05 04:00:00+00:00,26.67,3179000,CNP


In [77]:
df2_1 = df2.copy();
if not 'ticker' in df2_1.columns:
    df2_1['ticker'] = ticker2;
df2_1.head()

Unnamed: 0,datetime,close,volume,ticker
0,2021-11-01 04:00:00+00:00,20.01,2756500,VST
1,2021-11-02 04:00:00+00:00,19.870001,4006000,VST
2,2021-11-03 04:00:00+00:00,19.68,2460100,VST
3,2021-11-04 04:00:00+00:00,19.209999,6093500,VST
4,2021-11-05 04:00:00+00:00,20.5,9136100,VST


## METHOD 1 (CONCATENATE  & KEEP ALL ROWS)

In [78]:
df12_c = pd.concat([df2_1,df1_1]).reset_index(drop=True) # need to drop old index values
df12_c

Unnamed: 0,datetime,close,volume,ticker
0,2021-11-01 04:00:00+00:00,20.010000,2756500,VST
1,2021-11-02 04:00:00+00:00,19.870001,4006000,VST
2,2021-11-03 04:00:00+00:00,19.680000,2460100,VST
3,2021-11-04 04:00:00+00:00,19.209999,6093500,VST
4,2021-11-05 04:00:00+00:00,20.500000,9136100,VST
...,...,...,...,...
499,2022-10-25 04:00:00+00:00,27.650000,3556000,CNP
500,2022-10-26 04:00:00+00:00,27.600000,3132200,CNP
501,2022-10-27 04:00:00+00:00,27.990000,2833500,CNP
502,2022-10-28 04:00:00+00:00,28.809999,2803400,CNP


In [79]:
#df12_c_agg = df12_c.groupby('DateTime').agg({'ticker':['close','Volume']})
df12_c_agg = df12_c.groupby('ticker')
df12_c_agg.head()

Unnamed: 0,datetime,close,volume,ticker
0,2021-11-01 04:00:00+00:00,20.01,2756500,VST
1,2021-11-02 04:00:00+00:00,19.870001,4006000,VST
2,2021-11-03 04:00:00+00:00,19.68,2460100,VST
3,2021-11-04 04:00:00+00:00,19.209999,6093500,VST
4,2021-11-05 04:00:00+00:00,20.5,9136100,VST
252,2021-11-01 04:00:00+00:00,26.290001,3730100,CNP
253,2021-11-02 04:00:00+00:00,26.190001,5620500,CNP
254,2021-11-03 04:00:00+00:00,26.290001,3350900,CNP
255,2021-11-04 04:00:00+00:00,26.5,3364800,CNP
256,2021-11-05 04:00:00+00:00,26.67,3179000,CNP


In [80]:
df12_c_1 = df12_c.set_index(["ticker", "datetime"]).sort_index() # set indexes
df12_c_1.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,close,volume
ticker,datetime,Unnamed: 2_level_1,Unnamed: 3_level_1
CNP,2021-11-01 04:00:00+00:00,26.290001,3730100
CNP,2021-11-02 04:00:00+00:00,26.190001,5620500
CNP,2021-11-03 04:00:00+00:00,26.290001,3350900
CNP,2021-11-04 04:00:00+00:00,26.5,3364800
CNP,2021-11-05 04:00:00+00:00,26.67,3179000


#### SLICE DATAFRAME TO GET TICKER DATA

In [81]:
df12_c_1.xs(ticker1)

Unnamed: 0_level_0,close,volume
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-11-01 04:00:00+00:00,26.290001,3730100
2021-11-02 04:00:00+00:00,26.190001,5620500
2021-11-03 04:00:00+00:00,26.290001,3350900
2021-11-04 04:00:00+00:00,26.500000,3364800
2021-11-05 04:00:00+00:00,26.670000,3179000
...,...,...
2022-10-25 04:00:00+00:00,27.650000,3556000
2022-10-26 04:00:00+00:00,27.600000,3132200
2022-10-27 04:00:00+00:00,27.990000,2833500
2022-10-28 04:00:00+00:00,28.809999,2803400


In [82]:
df12_c_1.xs(ticker2)

Unnamed: 0_level_0,close,volume
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-11-01 04:00:00+00:00,20.010000,2756500
2021-11-02 04:00:00+00:00,19.870001,4006000
2021-11-03 04:00:00+00:00,19.680000,2460100
2021-11-04 04:00:00+00:00,19.209999,6093500
2021-11-05 04:00:00+00:00,20.500000,9136100
...,...,...
2022-10-25 04:00:00+00:00,21.990000,4838400
2022-10-26 04:00:00+00:00,21.900000,3419400
2022-10-27 04:00:00+00:00,21.980000,3738400
2022-10-28 04:00:00+00:00,22.980000,4167400


In [83]:
# END OF METHOD 1, CONTINUE WITH METHOD 2

## METHOD 2 (MERGE TICKERS  & KEEP ONLY INNER TIME DATA)

In [84]:
df12_m = pd.merge(df1_1, df2_1, on="datetime")
df12_m.head()

Unnamed: 0,datetime,close_x,volume_x,ticker_x,close_y,volume_y,ticker_y
0,2021-11-01 04:00:00+00:00,26.290001,3730100,CNP,20.01,2756500,VST
1,2021-11-02 04:00:00+00:00,26.190001,5620500,CNP,19.870001,4006000,VST
2,2021-11-03 04:00:00+00:00,26.290001,3350900,CNP,19.68,2460100,VST
3,2021-11-04 04:00:00+00:00,26.5,3364800,CNP,19.209999,6093500,VST
4,2021-11-05 04:00:00+00:00,26.67,3179000,CNP,20.5,9136100,VST


#### TIME ZONE CONVERSION (IF NEEDED)

In [85]:
df12_m['datetime'] = pd.to_datetime(df12_m['datetime'], utc=True)
#df12_m.datetime = df12_m.datetime.dt.tz_convert("Europe/Istanbul")
#df12_m.datetime = df12_m.datetime.dt.tz_convert("Canada/Pacific")
df12_m.datetime = df12_m.datetime.dt.tz_convert("UTC")
df12_m

Unnamed: 0,datetime,close_x,volume_x,ticker_x,close_y,volume_y,ticker_y
0,2021-11-01 04:00:00+00:00,26.290001,3730100,CNP,20.010000,2756500,VST
1,2021-11-02 04:00:00+00:00,26.190001,5620500,CNP,19.870001,4006000,VST
2,2021-11-03 04:00:00+00:00,26.290001,3350900,CNP,19.680000,2460100,VST
3,2021-11-04 04:00:00+00:00,26.500000,3364800,CNP,19.209999,6093500,VST
4,2021-11-05 04:00:00+00:00,26.670000,3179000,CNP,20.500000,9136100,VST
...,...,...,...,...,...,...,...
247,2022-10-25 04:00:00+00:00,27.650000,3556000,CNP,21.990000,4838400,VST
248,2022-10-26 04:00:00+00:00,27.600000,3132200,CNP,21.900000,3419400,VST
249,2022-10-27 04:00:00+00:00,27.990000,2833500,CNP,21.980000,3738400,VST
250,2022-10-28 04:00:00+00:00,28.809999,2803400,CNP,22.980000,4167400,VST


#### GET START & END DATES & SET INDEX

In [86]:
# get start and end days of time series
min_date, max_date = df12_m.datetime.min(), df12_m.datetime.max()
#str(min_date), str(max_date)
min_date.strftime('%m/%d/%Y'), max_date.strftime('%m/%d/%Y')

('11/01/2021', '10/31/2022')

In [87]:
df12_m = df12_m.set_index("datetime").sort_index()
df12_m.head()

Unnamed: 0_level_0,close_x,volume_x,ticker_x,close_y,volume_y,ticker_y
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-11-01 04:00:00+00:00,26.290001,3730100,CNP,20.01,2756500,VST
2021-11-02 04:00:00+00:00,26.190001,5620500,CNP,19.870001,4006000,VST
2021-11-03 04:00:00+00:00,26.290001,3350900,CNP,19.68,2460100,VST
2021-11-04 04:00:00+00:00,26.5,3364800,CNP,19.209999,6093500,VST
2021-11-05 04:00:00+00:00,26.67,3179000,CNP,20.5,9136100,VST


#### GET CUSTOM DATES (IF NEEDED):

In [88]:
# CUSTOM DATE
import datetime
from_date = datetime.datetime(2018, 6, 1)
to_date = datetime.datetime(2022, 1, 1)
# from_date = '2021-01-01 13:30:00+00:00' # UTC
# to_date = '2022-01-01 13:30:00+00:00' # UTC
# date_format = '%Y-%m-%d'
# from_date = datetime.strptime(from_date, date_format)
# to_date = datetime.strptime(to_date, date_format)

new_index = pd.date_range(from_date, to_date, freq='1D', tz='UTC')
new_index

DatetimeIndex(['2018-06-01 00:00:00+00:00', '2018-06-02 00:00:00+00:00',
               '2018-06-03 00:00:00+00:00', '2018-06-04 00:00:00+00:00',
               '2018-06-05 00:00:00+00:00', '2018-06-06 00:00:00+00:00',
               '2018-06-07 00:00:00+00:00', '2018-06-08 00:00:00+00:00',
               '2018-06-09 00:00:00+00:00', '2018-06-10 00:00:00+00:00',
               ...
               '2021-12-23 00:00:00+00:00', '2021-12-24 00:00:00+00:00',
               '2021-12-25 00:00:00+00:00', '2021-12-26 00:00:00+00:00',
               '2021-12-27 00:00:00+00:00', '2021-12-28 00:00:00+00:00',
               '2021-12-29 00:00:00+00:00', '2021-12-30 00:00:00+00:00',
               '2021-12-31 00:00:00+00:00', '2022-01-01 00:00:00+00:00'],
              dtype='datetime64[ns, UTC]', length=1311, freq='D')

In [89]:
df12_m_custom = df12_m.reindex(new_index, method="ffill")
df12_m_custom

Unnamed: 0,close_x,volume_x,ticker_x,close_y,volume_y,ticker_y
2018-06-01 00:00:00+00:00,,,,,,
2018-06-02 00:00:00+00:00,,,,,,
2018-06-03 00:00:00+00:00,,,,,,
2018-06-04 00:00:00+00:00,,,,,,
2018-06-05 00:00:00+00:00,,,,,,
...,...,...,...,...,...,...
2021-12-28 00:00:00+00:00,27.580000,2132800.0,CNP,22.190001,2283600.0,VST
2021-12-29 00:00:00+00:00,27.850000,2119100.0,CNP,22.049999,1864300.0,VST
2021-12-30 00:00:00+00:00,27.950001,2618600.0,CNP,21.870001,2895400.0,VST
2021-12-31 00:00:00+00:00,27.830000,2815300.0,CNP,22.389999,7636100.0,VST


#### SAVE TO FILE FOR MORE ANALYSIS

In [90]:
df12_m.to_pickle("saved.pkl")
if 'df12_m_custom' in locals():
    df12_m_custom.to_pickle("saved_custom.pkl")