In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#### Import USD/JPY datafile

In [2]:
# Load the raw USD/JPY price file (comma-separated text) into a DataFrame.
USDJPY = pd.read_csv('./datafiles/usdjpy.txt', sep=',')
# Bare last expression so the notebook renders the frame for a first look.
USDJPY

Unnamed: 0,<TICKER>,<PER>,<DATE>,<TIME>,<OPEN>,<HIGH>,<LOW>,<CLOSE>,<VOL>,<OPENINT>
0,USDJPY,D,19710104,0,357.7300,357.730,357.7300,357.730,0,0
1,USDJPY,D,19710105,0,357.8100,357.810,357.8100,357.810,0,0
2,USDJPY,D,19710106,0,357.8700,357.870,357.8700,357.870,0,0
3,USDJPY,D,19710107,0,357.8700,357.870,357.8700,357.870,0,0
4,USDJPY,D,19710108,0,357.8300,357.830,357.8300,357.830,0,0
...,...,...,...,...,...,...,...,...,...,...
13354,USDJPY,D,20230309,0,137.2690,137.291,135.9480,136.163,0,0
13355,USDJPY,D,20230310,0,136.1675,136.990,134.1190,134.860,0,0
13356,USDJPY,D,20230313,0,134.4365,135.037,132.2980,133.211,0,0
13357,USDJPY,D,20230314,0,133.2090,134.898,133.0390,134.242,0,0


In [3]:
# Inspect the column dtypes of the raw frame; <DATE> is read as int64 rather than as a datetime.
USDJPY.dtypes

<TICKER>      object
<PER>         object
<DATE>         int64
<TIME>         int64
<OPEN>       float64
<HIGH>       float64
<LOW>        float64
<CLOSE>      float64
<VOL>          int64
<OPENINT>      int64
dtype: object

#### Create general function to clean imported datafiles:
- column names to lowercase and remove <> signs
- drop columns vol, openint, time, per and ticker
- convert date column from int to datetime

In [4]:
def cleanData(df):
    """Return a cleaned copy of a raw price DataFrame.

    The input frame is left untouched, so the cell calling this function can
    be re-run safely and the raw data stays available for inspection.

    Steps:
      * column names are lower-cased and the '<' / '>' characters removed
      * the columns ticker, per, time, vol and openint are dropped
      * the date column is parsed from YYYYMMDD integers to datetime

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with columns such as '<TICKER>', '<DATE>', '<OPEN>', ...

    Returns
    -------
    pandas.DataFrame
        New frame with the columns date, open, high, low and close (plus any
        other column of the input that is not in the drop list).

    Raises
    ------
    KeyError
        If one of the columns to drop, or the date column, is missing.
    """
    # rename() returns a new frame; assigning to df.columns would instead
    # rename the columns of the caller's DataFrame as a hidden side effect.
    cleaned = df.rename(
        columns=lambda name: name.lower().replace('<', '').replace('>', '')
    )
    cleaned = cleaned.drop(columns=['ticker', 'per', 'time', 'vol', 'openint'])
    cleaned['date'] = pd.to_datetime(cleaned['date'], format='%Y%m%d')
    return cleaned

#### clean USDJPY datafile

In [9]:
# Apply the generic cleaning function to the raw USD/JPY frame.
USDJPY_cleaned = cleanData(USDJPY)
# Display the result to check the renamed columns and the parsed dates.
USDJPY_cleaned

Unnamed: 0,date,open,high,low,close
0,1971-01-04,357.7300,357.730,357.7300,357.730
1,1971-01-05,357.8100,357.810,357.8100,357.810
2,1971-01-06,357.8700,357.870,357.8700,357.870
3,1971-01-07,357.8700,357.870,357.8700,357.870
4,1971-01-08,357.8300,357.830,357.8300,357.830
...,...,...,...,...,...
13354,2023-03-09,137.2690,137.291,135.9480,136.163
13355,2023-03-10,136.1675,136.990,134.1190,134.860
13356,2023-03-13,134.4365,135.037,132.2980,133.211
13357,2023-03-14,133.2090,134.898,133.0390,134.242


#### Check for NaN values

In [10]:
# Count missing values per column of the cleaned frame.
USDJPY_cleaned.isna().sum()

date     0
open     0
high     0
low      0
close    0
dtype: int64

#### Very old data rows show no difference between open, high, low and close within a day (see the 1971 rows above, where all four values are identical). Such rows are not useful for testing trading strategies. I want to test my strategies on data from the last 20 years, where open, high, low and close actually differ. Consequently, all data before 2003 is dropped:

In [15]:
# Keep only rows dated 2003 or later (year > 2002) and renumber the index from 0.
USDJPY_cleaned_20 = USDJPY_cleaned[USDJPY_cleaned['date'].dt.year > 2002].reset_index(drop=True)
USDJPY_cleaned_20

Unnamed: 0,date,open,high,low,close
0,2003-01-01,118.7800,118.780,118.6500,118.700
1,2003-01-02,118.6900,120.080,118.6900,120.050
2,2003-01-03,120.0500,120.160,119.5800,119.710
3,2003-01-06,119.8200,119.860,118.5800,119.150
4,2003-01-07,119.3100,120.540,119.2500,120.410
...,...,...,...,...,...
5234,2023-03-09,137.2690,137.291,135.9480,136.163
5235,2023-03-10,136.1675,136.990,134.1190,134.860
5236,2023-03-13,134.4365,135.037,132.2980,133.211
5237,2023-03-14,133.2090,134.898,133.0390,134.242


#### Before saving the data as CSV and in SQL, the open, high, low and close values are converted to ticks. The tick size is the minimum price movement of a trading instrument in a market (reference: https://www.investopedia.com/terms/t/tick-size.asp). Expressing prices in ticks makes profits and losses easier to compare between different trading pairs.

In [21]:
def convertToTicks(df, ticksize):
    """Return a copy of ``df`` with every float64 column divided by ``ticksize``.

    The input frame is not modified. Dividing in place would make the calling
    cell non-idempotent: running it a second time would divide the prices by
    the tick size again.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose float64 columns hold prices; columns of any other dtype
        (e.g. the datetime ``date`` column) are passed through unchanged.
    ticksize : float
        Divisor applied to every float64 column.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and index as ``df``.
    """
    converted = df.copy()
    # Only float64 columns are treated as prices, as before.
    price_columns = converted.select_dtypes(include='float64').columns
    for column in price_columns:
        converted[column] = converted[column].div(ticksize)
    return converted

In [22]:
# Express the float price columns of the 2003+ frame in ticks, using a tick size of 0.005.
# NOTE(review): the most recent rows come out as fractional tick counts (e.g. 27453.8),
# so the quotes are finer than 0.005 — confirm this is the intended tick size.
USDJPY_ticks = convertToTicks(USDJPY_cleaned_20, 0.005)
USDJPY_ticks

Unnamed: 0,date,open,high,low,close
0,2003-01-01,23756.0,23756.0,23730.0,23740.0
1,2003-01-02,23738.0,24016.0,23738.0,24010.0
2,2003-01-03,24010.0,24032.0,23916.0,23942.0
3,2003-01-06,23964.0,23972.0,23716.0,23830.0
4,2003-01-07,23862.0,24108.0,23850.0,24082.0
...,...,...,...,...,...
5234,2023-03-09,27453.8,27458.2,27189.6,27232.6
5235,2023-03-10,27233.5,27398.0,26823.8,26972.0
5236,2023-03-13,26887.3,27007.4,26459.6,26642.2
5237,2023-03-14,26641.8,26979.6,26607.8,26848.4


#### Save converted USDJPY datafile as csv:

In [24]:
# Write the tick-converted frame to CSV without the index column.
# Assumes the ./cleaned_datafiles/ directory already exists — TODO confirm.
USDJPY_ticks.to_csv('./cleaned_datafiles/usdjpy.csv', index=False) 