# Step 2 - Feature engineering

# import

In [9]:
from math import sqrt
from datetime import datetime
from dateutil.relativedelta import relativedelta

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Set default font size
plt.rcParams['font.size'] = 24

# Internal ipython tool for setting figure size
from IPython.core.pylabtools import figsize

# Input CSV files, given relative to the notebook's working directory.
# Judging by the file names these are 1-minute GBPUSD ask candles for two
# consecutive date ranges -- TODO confirm against the step that produced them.
# In the cells visible here, csv_path is loaded into `df` and also passed to
# max_profit(); test_csv_path is passed to max_profit() only.
csv_path = 'data/GBPUSD_Candlestick_1_M_ASK_01.02.2018-01.02.2019_cleaned.csv'
test_csv_path = 'data/GBPUSD_Candlestick_1_M_ASK_01.02.2019-24.04.2019_cleaned.csv'

# Load cleaned data

In [2]:
# load csv
# The file's own header row is discarded (header=0) and replaced by `names`.
# parse_dates has to name the column explicitly: parse_dates=True only tries
# to parse the index, and no index_col is set here, so 'time' would otherwise
# be left as plain strings and the index below would not be a DatetimeIndex.
df = pd.read_csv(csv_path,
                 header=0,
                 names=['time', 'open', 'high', 'low', 'close', 'volume',
                        'hour', 'volatility'],
                 parse_dates=['time'])
df = df.set_index('time')
print('Row count = %d' % len(df))
df.head(5)

Row count = 374596


Unnamed: 0_level_0,open,high,low,close,volume,hour,volatility
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-02-01 00:00:00,1.41996,1.42005,1.41994,1.42005,81.63,0,1.1
2018-02-01 00:01:00,1.42008,1.42014,1.41993,1.41997,130.89,0,2.1
2018-02-01 00:02:00,1.41996,1.4203,1.41996,1.4202,109.85,0,3.4
2018-02-01 00:03:00,1.4202,1.4206,1.4202,1.42043,213.38,0,4.0
2018-02-01 00:04:00,1.42043,1.42048,1.42043,1.42045,133.74,0,0.5


# calculate max profit

In [11]:
# function to calculate pips
def pip(price=None, _abs=False):
    """Convert a price difference into pips, or return the size of one pip.

    Args:
        price: Price difference to convert; anything ``float()`` accepts.
            When omitted (``None``) the pip unit itself is returned.
        _abs: When true, the magnitude of ``price`` is converted, so the
            result is never negative.

    Returns:
        float: ``price / 0.0001`` when ``price`` is given, otherwise the pip
        unit ``0.0001``.
    """
    pip_unit = 0.0001

    # Test for None explicitly rather than truthiness: a price difference of
    # exactly 0 is a real input and must convert to 0 pips, not fall through
    # to the "no argument" branch and come back as the pip unit.
    if price is None:
        return pip_unit

    price = float(price)
    if _abs:
        price = abs(price)
    return price / pip_unit


def score(buy, sell, take_profit):
    """Rate a buy/sell excursion pair against a take-profit target.

    Only the magnitudes of ``buy``, ``sell`` and ``take_profit`` are used for
    the size of the result; the sign of the result says which side was
    larger.

    Args:
        buy: Excursion in the buy direction.
        sell: Excursion in the sell direction.
        take_profit: Target the excursions are measured against; must be
            non-zero, otherwise the divisions below raise ZeroDivisionError.

    Returns:
        float: The combined score, negated when ``abs(sell) > abs(buy)``.
    """
    buy_size, sell_size = abs(buy), abs(sell)
    larger = max(buy_size, sell_size)
    smaller = min(buy_size, sell_size)

    # A near-zero smaller side would blow the ratio up, so below 0.2 the
    # larger side is used on its own instead of the quotient.
    ratio = larger if smaller < 0.2 else larger / smaller
    ratio_root = sqrt(ratio)

    target = abs(float(take_profit))
    reward = larger / target
    risk = smaller / target

    total = ratio_root
    # Larger side short of the target: scale down quadratically. Otherwise a
    # bonus is added, but only when the ratio term itself exceeds 1.
    if reward < 1:
        total *= reward ** 2
    elif ratio_root > 1:
        total += sqrt(reward)

    # Smaller side beyond the target: penalise. Otherwise reward a small
    # adverse move, again only when the ratio exceeds 1.
    if risk > 1:
        total /= sqrt(risk)
    elif ratio > 1:
        total += 1 - sqrt(risk)

    return -1 * total if sell_size > buy_size else total

In [12]:
# the max sell&buy profit during 1 hour


def max_profit(path, delta, take_profit=5):
    """Build a look-ahead profit table for every candle in a CSV file.

    The file is read with ``csv.reader``. Its first row is skipped as a
    header; every other row must hold a ``%Y-%m-%d %H:%M:%S`` timestamp in
    column 0 and float-parsable high, low and close values in columns 2, 3
    and 4.

    For each candle the following candles are scanned for as long as their
    timestamp is not later than ``time + delta``. Relative to the current
    candle's close, the scan records (in pips, via ``pip``):

    * ``max_buy`` - the largest positive move of any later high,
    * ``max_sell`` - the most negative move of any later low,
    * ``max_buy_count`` / ``max_sell_count`` - how many scanned candles had a
      positive high move / negative low move,
    * ``score`` - ``score(max_buy, max_sell, take_profit)``.

    Rows are assumed to be in ascending time order (the scan stops at the
    first candle beyond the window). The row number is printed every 10000
    rows as a progress indicator.

    Args:
        path: Path of the CSV file to read.
        delta: Window length; anything that can be added to a ``datetime``
            (for example ``relativedelta`` or ``timedelta``).
        take_profit: Target handed to ``score``. Defaults to 5, the value
            that used to be hard-coded.

    Returns:
        pandas.DataFrame: Indexed by ``time`` with columns ``max_buy``,
        ``max_buy_count``, ``max_sell``, ``max_sell_count`` and ``score``.
    """
    import csv

    time_format = '%Y-%m-%d %H:%M:%S'

    # newline='' is what the csv module asks for when it is given a file
    # object, so that it does its own line-ending handling.
    with open(path, newline='') as tick:
        reader = csv.reader(tick)
        next(reader, None)  # first row is the header
        # Parse every candle exactly once. Each row takes part in many
        # windows, and re-running strptime/float for each of them was the
        # dominant cost of the scan.
        candles = [(datetime.strptime(row[0], time_format), float(row[2]),
                    float(row[3]), float(row[4])) for row in reader]

    length = len(candles)
    result = []
    for pos, (time, _high, _low, close) in enumerate(candles):
        end = time + delta
        max_sell, max_buy = 0, 0
        max_sell_count, max_buy_count = 0, 0

        nxt = pos + 1
        while nxt < length:
            ctime, chigh, clow, _cclose = candles[nxt]
            # Check the window BEFORE using the candle. Testing afterwards
            # let the first candle past the window (for instance the one
            # following a gap in the data) leak into the maxima. A candle
            # stamped exactly at `end` is still included, so results on
            # gap-free data are unchanged.
            if ctime > end:
                break

            max_s = pip(clow - close)
            max_b = pip(chigh - close)
            if max_s < max_sell:
                max_sell = max_s
            if max_s < 0:
                max_sell_count += 1
            if max_b > max_buy:
                max_buy = max_b
            if max_b > 0:
                max_buy_count += 1

            nxt += 1

        _score = score(max_buy, max_sell, take_profit)
        result.append((time, max_buy, max_buy_count, max_sell,
                       max_sell_count, _score))

        # Progress marker, numbered like the rows of the file (header = 0).
        index = pos + 1
        if not index % 10000:
            print(index)

    profit_df = pd.DataFrame(result,
                             columns=[
                                 'time', 'max_buy', 'max_buy_count',
                                 'max_sell', 'max_sell_count', 'score'
                             ])
    profit_df = profit_df.set_index('time')
    return profit_df

In [13]:
# %%timeit -n 1 -r 5
# Build the profit table for the file at test_csv_path using a 60-minute
# look-ahead window, then write it to CSV with floats formatted to five
# decimal places. Note that `profit_df` is reassigned by this cell.
profit_df = max_profit(test_csv_path, relativedelta(minutes=60))
profit_df.to_csv(
    'data/GBPUSD_Candlestick_1_M_ASK_01.02.2019-24.04.2019_profit_H1.csv',
    float_format='%.5f')

10000
20000
30000
40000
50000
60000
70000
80000


In [14]:
# Build the profit table for the file at csv_path using a 60-minute
# look-ahead window, then write it to CSV with floats formatted to five
# decimal places. Note that `profit_df` is reassigned by this cell.
profit_df = max_profit(csv_path, relativedelta(minutes=60))
profit_df.to_csv(
    'data/GBPUSD_Candlestick_1_M_ASK_01.02.2018-01.02.2019_profit_H1.csv',
    float_format='%.5f')

10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
