# An Introduction to Machine Learning in Finance

In [1]:
import numpy as np
import pandas as pd

#### 1. Load data from github url

Tips: The URL for Github (Google Search: github raw url)

In [2]:
data_url = "https://raw.githubusercontent.com/lttoby/FinTech-Course/master/sample_2.csv"
sample_data = pd.read_csv(data_url)

#### 2. Review (Play) the dataset

Tips: Usage for Pandas (Google Search: 10 minutes to pandas)

##### Tips For columns in the dataset:

PE: price-to-earnings ratio
PB: price-to-book ratio
TTM: trailing 12 months
ROE: return on qquity
ROA: return on asset
turnover_rate: share turnover rate

In [3]:
# sample_data.head()
# sample_data.tail()
# sample_data.describe()
# sample_data.shape
# sample_data[sample_data.stock_code == "000001.SZ"]
# sample_data[sample_data.trade_date == "2017-03-20"]
# sample_data[sample_data.stock_code == "000001.SZ"].head()
# sample_data[(sample_data.trade_date == "2017-03-20") & (sample_data.stock_code == "000001.SZ")]
# sample_data[(sample_data.trade_date > "2017-03-20") & (sample_data.stock_code == "000001.SZ") & (sample_data.trade_date < "2017-04-20")]
# sample_data["trade_date"] = pd.to_datetime(sample_data["trade_date"])
# sub_data = sample_data[sample_data.stock_code == "000001.SH"]
# sub_data.plot(x="trade_date", y="open", kind='line')
# sample_data.groupby('stock_code').count()
# sample_data.groupby('stock_code').mean()
# sub_data = sample_data[["stock_code", "return_adj"]]
# yearly_return = sub_data.groupby('stock_code').mean() * 252
# yearly_return.sort_values("return_adj")
# yearly_return.sort_values("return_adj", ascending=False)
# sub_data = sample_data[["stock_code", "turnover_rate"]]
# yearly_volumn = sub_data.groupby('stock_code').mean() * 252
# yearly_volumn.sort_values("turnover_rate", ascending=False)

#### 3. Process the Data

##### Get lag data (days after or days before)

In [4]:
# sub_data = sample_data.head()
# pd.set_option('mode.chained_assignment', None)
# sub_data["turnover_rate_lag_1"] = sub_data["turnover_rate"].shift(1)
# sub_data["turnover_rate_lag_1"] = sub_data["turnover_rate"].shift(-1)

In [5]:
def generate_a_lag_variable(original_data, the_column, lag_days):
    name_of_the_new_column = the_column + "_lag_" + str(lag_days)
    original_data[name_of_the_new_column] = original_data[the_column].shift(lag_days)
    return original_data

In [6]:
# generate_a_lag_variable(sub_data, "turnover_rate", 1)
# generate_a_lag_variable(sub_data, "turnover_rate", -1)
# data_wrong = generate_a_lag_variable(sample_data, "turnover_rate", -1)
# data_wrong[data_wrong.stock_code == "000001.SZ"].tail()
# data_wrong[data_wrong.stock_code == "000002.SZ"].head()

##### Get distinct stock codes

In [7]:
def all_distinct_stock_codes(sample_data):
    stock_code_array = sample_data["stock_code"].values
    stock_code_list = list(stock_code_array)
    distinct_code = list(set(stock_code_list))
    distinct_code.sort()
    return distinct_code

In [8]:
distinct_stock_code = all_distinct_stock_codes(sample_data)

##### Generate correct lag data

In [9]:
data_total = list()
data_input = sample_data

for stock_code_i in distinct_stock_code:
    data_i = data_input[data_input["stock_code"] == stock_code_i]
    data_i = generate_a_lag_variable(data_i, "return_adj", -5)
    data_total.append(data_i)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [10]:
data_output = pd.concat(data_total)

In [11]:
# data_output[data_output.stock_code == "000001.SZ"]
# data_output[data_output.stock_code == "000001.SZ"].tail(10)
# data_output[data_output.stock_code == "000002.SZ"].head(10)

In [12]:
stock_code_i = distinct_stock_code[0]
data_i = sample_data[sample_data.stock_code == stock_code_i]

##### Generate cumulative return

In [78]:
data_total = list()
cumulative_days = 20
col_name = "cumulative_return_" + str(cumulative_days)
data_input = sample_data

for stock_code_i in distinct_stock_code:
    data_i = data_input[data_input["stock_code"] == stock_code_i]
    cumulative_return_in_10_days = 1
    for i in range(cumulative_days):
        cumulative_return_in_10_days = cumulative_return_in_10_days * (data_i["return_adj"].shift(-(i+1)) + 1)
    data_i[col_name] = cumulative_return_in_10_days - 1
    data_total.append(data_i)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [79]:
data_with_cumulative_return = pd.concat(data_total)

In [15]:
# (data_with_cumulative_return["return_adj"][1] + 1) * (data_with_cumulative_return["return_adj"][2] + 1) - 1
# data_with_cumulative_return[data_with_cumulative_return["stock_code"] == "000001.SZ"].tail()

In [80]:
data_with_cumulative_return = data_with_cumulative_return.dropna()

In [81]:
data_total = list()
cumulative_days = 20
col_name = "momentum_" + str(cumulative_days)
data_input = data_with_cumulative_return

for stock_code_i in distinct_stock_code:
    data_i = data_input[data_input["stock_code"] == stock_code_i]
    cumulative_return_in_10_days = 1
    for i in range(cumulative_days):
        cumulative_return_in_10_days = cumulative_return_in_10_days * (data_i["return_adj"].shift(i+1) + 1)
    data_i[col_name] = cumulative_return_in_10_days - 1
    data_total.append(data_i)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [82]:
data_with_momentum = pd.concat(data_total)

In [19]:
# (data_with_momentum["return_adj"][0] + 1) * (data_with_momentum["return_adj"][1] + 1) - 1
# data_with_momentum[data_with_momentum["stock_code"] == "000001.SZ"].tail()

In [83]:
data_with_momentum = data_with_momentum.dropna()

##### Generate dummy output （or classification）

Tips: dummy = 0, 1

In [90]:
def all_distinct_stock_codes(sample_data):
    trade_date_array = sample_data["trade_date"].values
    trade_date_list = list(trade_date_array)
    distinct_trade_date = list(set(trade_date_list))
    distinct_trade_date.sort()
    return distinct_trade_date

In [91]:
distinct_date = all_distinct_stock_codes(sample_data)

In [95]:
percentage = 0.4
data_total = list()
data_input = data_with_momentum
cumulative_name = "cumulative_return_20"

In [96]:
for day_i in distinct_date:
    data_i = data_input[data_input["trade_date"] == day_i]
    data_i = data_i.sort_values(cumulative_name)
    top_rows_to_select = int(percentage*data_i.shape[0])
    remaining_rows = data_i.shape[0] - int(percentage*data_i.shape[0]) * 2
    win_loss_label_i = [-1] * top_rows_to_select + [0] * remaining_rows + [1] * top_rows_to_select
    data_i["win_loss"] = win_loss_label_i
    data_total.append(data_i)

In [97]:
data_with_label = pd.concat(data_total)

#### 4. Simple Machine Learning Models

##### Run linear regression

In [104]:
data_with_label.head()

Unnamed: 0,trade_date,stock_code,pe_ttm,pb,roe_ttm,roa_ttm,return_adj,open,close,high,low,turnover_rate,cumulative_return_20,momentum_20,win_loss
6852,2017-04-19,000503.SZ,2372.14,21.9928,0.009271,0.00878,0.005817,34.23,34.58,35.08,33.6,0.01034,-0.279063,-0.161464,-1.0
96644,2017-04-19,600893.SH,75.7789,3.67634,0.048514,0.018563,0.003663,35.35,35.62,35.76,35.0,0.007931,-0.244873,0.041373,-1.0
13684,2017-04-19,000768.SZ,145.985,3.7519,0.025701,0.011602,-0.045364,25.13,23.99,25.56,23.6,0.02281,-0.243435,0.071185,-1.0
102500,2017-04-19,601117.SH,17.4571,1.50007,0.085929,0.029871,-0.013483,8.9,8.78,8.98,8.61,0.008561,-0.238041,0.111111,-1.0
38572,2017-04-19,002558.SZ,151.698,17.0291,0.112257,0.100643,0.00516,74.9,74.03,74.9,70.12,0.035327,-0.230042,-0.07926,-1.0


In [105]:
data_input = data_with_momentum

y = data_input["cumulative_return_20"].values
feature_names = ["momentum_20", "pe_ttm"]
x = data_input[feature_names].values

In [106]:
from sklearn.cross_validation import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.7, random_state=123)
# x_train.shape
# x_train.shape[0] / x_test.shape[0]

In [107]:
from sklearn.linear_model import LinearRegression

lm = LinearRegression()
lm.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [108]:
from sklearn.metrics import r2_score

# print R^2 for the training set
print('The R-squared value for the training set is: {:0.4f}'.format(r2_score(y_train, lm.predict(x_train))))

# print intercept and coefficients
param_df = pd.DataFrame({"Coefficient": [lm.intercept_] + list(lm.coef_), "Feature": ['intercept'] + list(feature_names)})
param_df[['Feature', 'Coefficient']]

The R-squared value for the training set is: 0.0009


Unnamed: 0,Feature,Coefficient
0,intercept,0.007793
1,momentum_20,-0.017445
2,pe_ttm,1e-06


##### Run linear regression

In [109]:
data_input = data_with_label
data_with_binary_label = data_input[data_input["win_loss"] != 0]

y = data_with_binary_label["win_loss"].values
feature_names = ["momentum_20", "pe_ttm"]
x = data_with_binary_label[feature_names].values

In [110]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.7, random_state=123)

In [111]:
from sklearn.linear_model.logistic import LogisticRegression
classifier = LogisticRegression()
classifier.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [112]:
predictions=classifier.predict_proba(x_test)

In [113]:
from sklearn.metrics import roc_curve,auc
false_positive_rate, recall, thresholds = roc_curve(y_test, predictions[:, 1])
roc_auc=auc(false_positive_rate,recall)
roc_auc

0.5152999323946328