# 说明
对初始数据进行缺失值填充后，将训练集进一步划分为各算法使用的训练部分和测试部分，并分别应用五种算法。结果表明，相比于SVM、AdaBoost、KNN、决策树四种算法，随机森林可以稳定达到最优结果。故对测试集数据应用训练后的随机森林，输出每行的预测结果，并按交易者汇总得到其总的分类得分。最终结果可在AMF_test_Y.csv中查看。

# 初始化

In [1]:
# 导入相关包
import numpy as np
import pandas as pd
# 时间计算
import time
# 数据打乱
from sklearn.utils import shuffle
# 数据分割
from sklearn import model_selection
# 结果评价
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

# Load the raw inputs: training features, training labels, test features.
train_x_init, train_y_init, test_x_init = (
    pd.read_csv(csv_name)
    for csv_name in ('AMF_train_X.csv', 'AMF_train_Y.csv', 'AMF_test_X.csv')
)

# Report the (rows, columns) shape of every table that was just loaded.
for caption, frame in (
    ("The shape of train_x_init equals to ", train_x_init),
    ("The shape of train_y_init equals to ", train_y_init),
    ("The shape of test_x_init equals to ", test_x_init),
):
    print(caption, frame.shape)

The shape of train_x_init equals to  (105782, 39)
The shape of train_y_init equals to  (86, 2)
The shape of test_x_init equals to  (85304, 39)


# 数据处理

In [2]:
# Fill missing values with the per-column median (replace .median with .mean
# to impute with the average instead).
# numeric_only=True restricts the statistic to numeric columns: the frames
# also hold string columns (e.g. "Trader"), which older pandas skipped with a
# warning and which pandas 2.x rejects with a TypeError.
train_x_init = train_x_init.fillna(train_x_init.median(numeric_only=True))
test_x_init = test_x_init.fillna(test_x_init.median(numeric_only=True))

# Collect the trader names belonging to each class of the label table.
# The column is read with ["type"] rather than attribute access so the lookup
# cannot be shadowed by a DataFrame attribute of the same name.
trader_types = train_y_init["type"]
train_traders_of_HFT = list(train_y_init[trader_types == "HFT"]["Trader"])
train_traders_of_NONHFT = list(train_y_init[trader_types == "NON HFT"]["Trader"])
train_traders_of_MIX = list(train_y_init[trader_types == "MIX"]["Trader"])

# Show which traders fall into which class.
print("Traders of HFT in Dataset Train are as follows:\n", train_traders_of_HFT)
print("Traders of NON HFT in Dataset Train are as follows:\n", train_traders_of_NONHFT)
print("Traders of MIX in Dataset Train are as follows:\n", train_traders_of_MIX)

Traders of HFT in Dataset Train are as follows:
 ['Trader_328', 'Trader_386', 'Trader_120', 'Trader_132', 'Trader_136', 'Trader_453', 'Trader_226', 'Trader_40', 'Trader_161', 'Trader_385', 'Trader_422', 'Trader_341', 'Trader_244', 'Trader_35', 'Trader_278']
Traders of NON HFT in Dataset Train are as follows:
 ['Trader_114', 'Trader_110', 'Trader_57', 'Trader_128', 'Trader_59', 'Trader_435', 'Trader_293', 'Trader_280', 'Trader_158', 'Trader_60', 'Trader_179', 'Trader_208', 'Trader_222', 'Trader_237', 'Trader_256', 'Trader_184', 'Trader_398', 'Trader_425', 'Trader_424', 'Trader_169', 'Trader_54', 'Trader_46', 'Trader_177', 'Trader_16', 'Trader_221', 'Trader_279', 'Trader_127', 'Trader_442', 'Trader_446', 'Trader_255', 'Trader_41', 'Trader_10', 'Trader_178', 'Trader_257', 'Trader_340', 'Trader_191', 'Trader_105', 'Trader_275', 'Trader_312', 'Trader_375', 'Trader_155', 'Trader_380', 'Trader_149', 'Trader_288', 'Trader_51', 'Trader_195', 'Trader_150']
Traders of MIX in Dataset Train are as 

  train_x_init = train_x_init.fillna(train_x_init.median())
  test_x_init = test_x_init.fillna(test_x_init.median())


In [3]:
# Build one trader -> numeric label lookup: HFT -> 1, NON HFT -> -1, MIX -> 0.
# Later entries overwrite earlier ones, so the insertion order below gives HFT
# precedence over NON HFT over MIX if a trader were listed more than once.
label_of_trader = {
    **dict.fromkeys(train_traders_of_MIX, 0),
    **dict.fromkeys(train_traders_of_NONHFT, -1),
    **dict.fromkeys(train_traders_of_HFT, 1),
}

# Translate the Trader column into per-row labels in a single pass.
train_x_label = train_x_init["Trader"].map(label_of_trader)

# A trader missing from the label table would otherwise slip through as a
# non-numeric label and only fail much later; stop here with a clear message.
unlabelled = train_x_label.isna()
if unlabelled.any():
    unknown_traders = sorted(set(train_x_init.loc[unlabelled, "Trader"]))
    raise ValueError(f"Traders without a known type: {unknown_traders}")
train_x_label = train_x_label.astype("int64")

# Append the labels to the feature table as its last column.
train_x_init.loc[:, "Label"] = train_x_label

# Drop the three identifier columns that are not used as features.
f_to_delete = ["Index", "Share", "Day"]
train_x_init.drop(columns=f_to_delete, inplace=True)

# Preview the first five rows after processing.
train_x_init.head()

Unnamed: 0,Trader,OTR,OCR,OMR,min_time_two_events,mean_time_two_events,10_p_time_two_events,med_time_two_events,25_p_time_two_events,75_p_time_two_events,...,mean_dt_TV1_TV2,med_dt_TV1_TV2,min_dt_TV1_TV3,mean_dt_TV1_TV3,med_dt_TV1_TV3,min_dt_TV1_TV4,mean_dt_TV1_TV4,med_dt_TV1_TV4,NbSecondWithAtLeatOneTrade,Label
0,Trader_10,2.272727,8.333333,12.5,0.0,5117.8303,0.0,419.6885,10.722543,984.32056,...,110.42737,22.413161,0.00026,111.633327,23.665962,0.00027,113.158721,23.435835,4,-1
1,Trader_10,1.696629,25.166667,21.571429,0.0,1846.968401,7.4e-05,0.003374,0.000204,8.768699,...,110.42737,22.413161,0.00026,111.633327,23.665962,0.00027,113.158721,23.435835,15,-1
2,Trader_10,1.482759,47.3,118.25,0.0,686.30063,7.1e-05,0.000599,0.000129,5.725427,...,110.42737,22.413161,0.00026,111.633327,23.665962,0.00027,113.158721,23.435835,63,-1
3,Trader_10,1.705882,14.5,29.0,0.0,2174.335265,0.0,6.152666,0.000945,62.444176,...,110.42737,22.413161,0.00026,111.633327,23.665962,0.00027,113.158721,23.435835,4,-1
4,Trader_10,1.51773,26.75,14.0,0.0,944.008551,7.1e-05,0.001364,0.000146,2.22542,...,110.42737,22.413161,0.00026,111.633327,23.665962,0.00027,113.158721,23.435835,38,-1


In [4]:
# Partition the rows by class label: 1 = HFT, 0 = MIX, -1 = NON HFT.
row_labels = train_x_init["Label"]
train_x_hft = train_x_init.loc[row_labels == 1]
train_x_mix = train_x_init.loc[row_labels == 0]
train_x_non = train_x_init.loc[row_labels == -1]

# Report how many rows ended up in each group.
for caption, group in (
    ("The size of train_x_hft equals to", train_x_hft),
    ("The size of train_x_mix equals to", train_x_mix),
    ("The size of train_x_non equals to", train_x_non),
):
    print(caption, group.shape)

The size of train_x_hft equals to (31950, 37)
The size of train_x_mix equals to (51483, 37)
The size of train_x_non equals to (22349, 37)


# 整理输入输出

In [5]:
# Shuffle the training rows. shuffle() returns a new object, so no explicit
# copy is needed, and the fixed seed makes the row order (and therefore every
# score computed below) reproducible from run to run.
train_x_shuffle = shuffle(train_x_init, random_state=75)

# Split into model inputs and targets: columns 1..35 are the features
# (column 0 is "Trader"), "Label" is the target.
data_x = train_x_shuffle.iloc[:, 1:36]
data_y = train_x_shuffle["Label"]

# Hold out 20% of the rows for evaluation.
# NOTE(review): the split is made per row, not per trader, so rows of the same
# trader can land in both partitions; the hold-out scores therefore measure
# row-level accuracy on traders the model has already seen.
train_x, test_x, train_y, test_y = model_selection.train_test_split(
    data_x, data_y, test_size=0.2, random_state=75
)

# Show the sizes of the two partitions.
print("The size of train_x equals to", train_x.shape)
print("The size of test_x equals to", test_x.shape)

The size of train_x equals to (84625, 35)
The size of test_x equals to (21157, 35)


# 构建模型、训练、进行单行评分

## SVM

In [6]:
# SVM baseline, kept disabled; the figures recorded at the bottom come from
# an earlier run of this cell.
# from sklearn.svm import SVC

# start = time.time()
# svc = SVC()
# svc.fit(train_x, train_y)
# predict_y = svc.predict(test_x)
# end = time.time()

# # Print the row-level scores
# print("Time consumption is", end - start, "seconds")
# print("Mean squared error is", mean_squared_error(test_y, predict_y))
# print('Accuracy score is', accuracy_score(test_y, predict_y))

# # SVM takes a very long time and its accuracy is not high
# # Time consumption is 519.4578828811646 seconds
# # Mean squared error is 0.25518740842274423
# # Accuracy score is 0.7901876447511462

## AdaBoost

In [7]:
from sklearn.ensemble import AdaBoostClassifier

# Time the construction, fitting and hold-out prediction of AdaBoost.
start = time.time()
ada = AdaBoostClassifier()
ada.fit(train_x, train_y)
predict_y = ada.predict(test_x)
end = time.time()

# Row-level scores on the hold-out partition.
elapsed_seconds = end - start
row_mse = mean_squared_error(test_y, predict_y)
row_accuracy = accuracy_score(test_y, predict_y)
print("Time consumption is", elapsed_seconds, "seconds")
print("Mean squared error is", row_mse)
print('Accuracy score is', row_accuracy)

Time consumption is 11.72180986404419 seconds
Mean squared error is 0.17181074821572057
Accuracy score is 0.8637803091175498


## KNN

In [8]:
from sklearn.neighbors import KNeighborsClassifier

# Time the construction, fitting and hold-out prediction of k-nearest neighbours.
start = time.time()
knn = KNeighborsClassifier()
# fit() returns the estimator itself, so prediction can be chained onto it.
predict_y = knn.fit(train_x, train_y).predict(test_x)
end = time.time()

# Row-level scores on the hold-out partition.
print("Time consumption is", end - start, "seconds")
for caption, metric in (
    ("Mean squared error is", mean_squared_error),
    ('Accuracy score is', accuracy_score),
):
    print(caption, metric(test_y, predict_y))

Time consumption is 29.115871906280518 seconds
Mean squared error is 0.174977548801815
Accuracy score is 0.8702557073309071


## Decision Tree

In [9]:
from sklearn.tree import DecisionTreeClassifier

# Time the construction, fitting and hold-out prediction of a decision tree.
start = time.time()
dtc = DecisionTreeClassifier()
dtc.fit(train_x, train_y)
predict_y = dtc.predict(test_x)
end = time.time()

# Row-level scores on the hold-out partition.
report_lines = (
    ("Time consumption is", end - start, "seconds"),
    ("Mean squared error is", mean_squared_error(test_y, predict_y)),
    ('Accuracy score is', accuracy_score(test_y, predict_y)),
)
for report_line in report_lines:
    print(*report_line)

Time consumption is 4.228555917739868 seconds
Mean squared error is 0.05657701942619464
Accuracy score is 0.9612894077610247


## Random Forest

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Time the construction, fitting and hold-out prediction of the random forest.
start = time.time()
# This forest is the model reused later for the trader-level predictions, so
# it is seeded: without random_state every run grows different trees and the
# final classification is not reproducible.
rfc = RandomForestClassifier(n_estimators=10, random_state=75)
rfc = rfc.fit(train_x, train_y)
predict_y = rfc.predict(test_x)
end = time.time()

# Row-level scores on the hold-out partition.
print("Time consumption is", end - start, "seconds")
print("Mean squared error is", mean_squared_error(test_y, predict_y))
print('Accuracy score is', accuracy_score(test_y, predict_y))

Time consumption is 3.029448986053467 seconds
Mean squared error is 0.03932504608403838
Accuracy score is 0.9728694994564446


# 选定模型 测试整体评分

In [11]:
# The random forest reached the highest accuracy, so it is the model used
# from here on.
# Predict every row of the full training table.
# NOTE(review): most of these rows were used to fit rfc, so the trader-level
# score computed from them is optimistic.
train_x_test = train_x_init.iloc[:, 1:36]
predict_y_all = rfc.predict(train_x_test)

# Store the row-level predictions as the last column of the table.
train_x_init.loc[:, "Label_Predicted"] = predict_y_all

# Average every numeric column per trader. GroupBy.mean is used directly
# instead of agg(np.mean), which recent pandas versions flag as deprecated
# usage; numeric_only=True keeps any non-numeric column out of the average.
grouped_train = train_x_init.groupby("Trader", as_index=False)
grouped_train_mean = grouped_train.mean(numeric_only=True)

# Preview the first five rows of the aggregated table.
grouped_train_mean.head()

Unnamed: 0,Trader,OTR,OCR,OMR,min_time_two_events,mean_time_two_events,10_p_time_two_events,med_time_two_events,25_p_time_two_events,75_p_time_two_events,...,med_dt_TV1_TV2,min_dt_TV1_TV3,mean_dt_TV1_TV3,med_dt_TV1_TV3,min_dt_TV1_TV4,mean_dt_TV1_TV4,med_dt_TV1_TV4,NbSecondWithAtLeatOneTrade,Label,Label_Predicted
0,Trader_10,2.133353,24.362271,43.820981,8.956736,2431.882764,9.332928,831.154295,31.667709,2974.318122,...,23.030473,0.00026,111.633327,23.665962,0.00027,113.158721,23.435835,16.805195,-1.0,-1.0
1,Trader_105,5.399858,39.622702,13.741532,20.38837,4710.344772,20.451175,329.882101,40.584926,4946.006166,...,22.413161,0.00026,111.633327,23.665962,0.00027,113.158721,23.435835,49.258772,-1.0,-0.999513
2,Trader_107,3.663011,7.270289,91.532415,0.597774,88.883777,0.687852,16.22921,2.731964,71.448717,...,142.14464,109.640401,286.472612,150.788459,103.521249,288.920299,141.993293,75.37146,0.0,-0.005447
3,Trader_110,3.508333,8.75,14.0,1510.086341,14886.664609,1510.086341,2909.984216,1510.086341,26757.402475,...,22.413161,0.00026,111.633327,23.665962,0.00027,113.158721,23.435835,1.625,-1.0,-1.0
4,Trader_114,5.013444,26.304692,14.184935,251.787335,7331.947733,255.691851,1935.31473,463.869943,10727.553461,...,466.19599,235.666496,762.116732,466.264366,235.666496,762.199985,466.251806,105.414524,-1.0,-1.0


In [12]:
# Turn each trader's mean row prediction into a class:
# above 0.33 -> 1.0, below -0.33 -> -1.0, anything else -> 0.0.
mean_prediction = grouped_train_mean["Label_Predicted"]
predict_y = np.select(
    [mean_prediction > 0.33, mean_prediction < - 0.33],
    [1.0, - 1.0],
    default=0.0,
).tolist()

# Compare the trader-level classes with the true trader labels.
final_accuracy = accuracy_score(grouped_train_mean["Label"], predict_y)
print('The final accuracy score is', final_accuracy)

The final accuracy score is 0.9883720930232558


# 输出测试集结果

In [13]:
# Preview the first five rows of the test data.
test_x_init.head(n=5)

Unnamed: 0,Index,Share,Day,Trader,OTR,OCR,OMR,min_time_two_events,mean_time_two_events,10_p_time_two_events,...,min_dt_TV1_TV2,mean_dt_TV1_TV2,med_dt_TV1_TV2,min_dt_TV1_TV3,mean_dt_TV1_TV3,med_dt_TV1_TV3,min_dt_TV1_TV4,mean_dt_TV1_TV4,med_dt_TV1_TV4,NbSecondWithAtLeatOneTrade
0,1,Isin_106,Date_12,Adelaide,1.114754,68.0,13.669274,1.2e-05,22.905096,6.1e-05,...,0.00032,125.959824,25.621796,0.000452,125.585,26.665,0.000543,124.177475,25.966,16
1,2,Isin_94,Date_2,Adelaide,1.666667,5.0,13.669274,0.00197,67.630572,0.00197,...,0.00032,125.959824,25.621796,0.000452,125.585,26.665,0.000543,124.177475,25.966,1
2,3,Isin_106,Date_13,Adelaide,1.088235,37.0,13.669274,0.0,39.50649,0.0,...,0.00032,125.959824,25.621796,0.000452,125.585,26.665,0.000543,124.177475,25.966,6
3,4,Isin_106,Date_7,Adelaide,2.25,4.5,13.669274,0.0,103.378733,0.0,...,0.00032,125.959824,25.621796,0.000452,125.585,26.665,0.000543,124.177475,25.966,2
4,5,Isin_106,Date_20,Adelaide,1.266667,19.0,13.669274,2.6e-05,49.417177,6.3e-05,...,0.00032,125.959824,25.621796,0.000452,125.585,26.665,0.000543,124.177475,25.966,5


In [14]:
# Build the model input from exactly the feature columns rfc was fitted on.
# Selecting by name (instead of the positional slice 4:40, which runs past the
# last feature) keeps the column order aligned with training and stays
# correct if this cell is re-run after "Score_Predicted" has been appended.
test_x = test_x_init.loc[:, train_x.columns]

# Row-level predictions from the selected model.
predict_y = rfc.predict(test_x)

# Store the row-level predictions as the last column of the table.
test_x_init.loc[:, "Score_Predicted"] = predict_y

# Average every numeric column per trader. The test table still contains
# string columns, which agg(np.mean) cannot average on pandas 2.x;
# numeric_only=True skips them explicitly.
grouped_test = test_x_init.groupby("Trader", as_index=False)
grouped_test_mean = grouped_test.mean(numeric_only=True)

# Turn each trader's mean score into a class name:
# above 0.33 -> "HFT", below -0.33 -> "NON HFT", anything else -> "MIX".
label_y = [
    "HFT" if score > 0.33 else "NON HFT" if score < -0.33 else "MIX"
    for score in grouped_test_mean["Score_Predicted"]
]

# Store the trader-level class as the last column of the aggregated table.
grouped_test_mean.loc[:, "type"] = label_y

# Preview the result.
grouped_test_mean.head()

Unnamed: 0,Trader,Index,OTR,OCR,OMR,min_time_two_events,mean_time_two_events,10_p_time_two_events,med_time_two_events,25_p_time_two_events,...,med_dt_TV1_TV2,min_dt_TV1_TV3,mean_dt_TV1_TV3,med_dt_TV1_TV3,min_dt_TV1_TV4,mean_dt_TV1_TV4,med_dt_TV1_TV4,NbSecondWithAtLeatOneTrade,Score_Predicted,type
0,Adelaide,5.5,1.384594,25.65,14.102346,0.0002026,40.031248,0.000218,0.001789,0.000356,...,25.621796,0.000452,125.585,26.665,0.000543,124.177475,25.966,4.9,-0.9,NON HFT
1,Alana,29.0,3.717499,50.14626,11.395083,213.8264,6642.86696,213.826401,1851.489926,726.763544,...,25.621796,0.000452,125.585,26.665,0.000543,124.177475,25.966,20.540541,-1.0,NON HFT
2,Alcmene,71.5,4.676963,9.5625,13.669274,3878.636,10675.070452,3878.636358,4966.611609,3912.782193,...,743.161102,728.067364,971.106887,743.747904,728.067415,970.315155,743.354717,45.8125,-0.9375,NON HFT
3,Alice,1144.0,55.519459,2.094555,13.669274,2.484502e-07,107.664516,0.11227,8.548595,0.83613,...,14.320273,5.737502,99.258898,14.292017,18.109038,111.895259,26.285304,475.942299,0.997616,HFT
4,Alices Sister,2284.0,2.787412,14.491364,10.557443,156.9779,1762.724624,157.36896,256.485343,163.055533,...,95.491069,67.350119,254.654454,96.397459,67.350199,253.431522,95.790131,7.584699,-0.540984,NON HFT


# 将结果写入到csv文件中

In [15]:
# Keep only the trader name and its predicted class.
output_columns = ["Trader", "type"]
result = grouped_test_mean.loc[:, output_columns]

# Preview the first five rows of the result.
result.head(n=5)

Unnamed: 0,Trader,type
0,Adelaide,NON HFT
1,Alana,NON HFT
2,Alcmene,NON HFT
3,Alice,HFT
4,Alices Sister,NON HFT


In [16]:
# Write the trader-level classification to a comma-separated file, leaving
# out the row index.
result.to_csv(path_or_buf="AMF_test_Y.csv", sep=',', index=False)