# TDA聚类
TDA-based association analysis between reaction conditions and yield

In [None]:
import math
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn import datasets
%matplotlib inline
matplotlib.rc('font', family='Arial Unicode MS',)  ## 解决图上文字乱码的问题

In [None]:
from tda import TDA

In [None]:
# Load the feature matrix (x.csv) and the target values (y.csv), skipping the
# header row of each file. The files are opened in a `with` block so the
# handles are closed as soon as parsing finishes instead of being leaked.
with open("x.csv", "rb") as x_file:
    data = np.loadtxt(x_file, delimiter=",", skiprows=1)
with open("y.csv", "rb") as y_file:
    y = np.loadtxt(y_file, delimiter=",", skiprows=1)

In [None]:
def distance(a, b):
    """Return the Euclidean distance between the paired elements of a and b.

    Elements are paired with ``zip``, so any surplus elements of the longer
    sequence are ignored.
    """
    squared_total = 0
    for left, right in zip(a, b):
        squared_total += (left - right) ** 2
    return math.sqrt(squared_total)
def L_inf(n, i):
    """Return the largest distance from element ``i`` of ``n`` to any element of ``n``.

    Returns 0 when ``n`` is empty or when no distance exceeds 0.
    """
    farthest = 0
    for j in range(len(n)):
        # max() keeps the current value unless the new distance is strictly larger.
        farthest = max(farthest, distance(n[i], n[j]))
    return farthest

In [None]:
# Build the TDA model from the project-local `tda` module: `distance` is passed
# as the first argument and `L_inf` inside the single tuple of the second.
# NOTE(review): 15 and 0.6 look like the interval count and overlap of the
# L_inf filter, and the trailing 5 like a clustering parameter — confirm
# against the TDA class before changing them.
t = TDA(distance, [(L_inf, 15,0.6)],5)
t.fit(data)
# Cell output: the fitted model's `binums` attribute and the number of clusters.
t.binums, len(t.clusters)

In [None]:
# Colour the TDA graph with the y value looked up by the lambda's second
# argument; the first argument `d` is ignored.
# NOTE(review): presumably `i` is the sample index and `d` the sample itself;
# the meaning of figsize=6 and Pk=1 is defined by TDA.dye — confirm there.
t.dye(lambda d, i: y[i], figsize=6, Pk=1)
# Save the current figure as a 300-dpi TIFF with a tight bounding box, then display it.
plt.savefig('tda.tif', dpi=300, bbox_inches='tight')
plt.show()

# 多因素方差分析
(Interaction-based association analysis between reaction conditions and yield: 反应条件与产率的交互作用分析)

In [None]:
# Load the interaction table (header row skipped) and label its three columns.
# The file is opened in a `with` block so the handle is closed after parsing
# instead of being leaked.
with open("xjiaohu.csv", "rb") as table_file:
    data = np.loadtxt(table_file, delimiter=",", skiprows=1)
df = pd.DataFrame(data, columns = ['Aryl', 'Ligand', 'y'])
df.head()

In [None]:
# Reshape the long table into one column per (Ligand, Aryl) combination, each
# column holding the y values observed for that combination. Shorter groups
# are padded with NaN.
#
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every
# iteration, so the per-group frames are collected and joined with one concat.
# `data` and `data_list` keep their original names: `data` ends up holding the
# values of the last group and `data_list` the values of every group in order.
data_list = []
group_frames = []
for i in df.Ligand.unique():
    for j in df.Aryl.unique():
        data = df[(df.Ligand == i) & (df.Aryl == j)]['y'].values
        data_list.append(data)
        group_frames.append(
            pd.DataFrame(data, columns=pd.MultiIndex.from_arrays([[i], [j]]))
        )
# pd.concat rejects an empty list, so fall back to the empty frame the original produced.
df1 = pd.concat(group_frames, axis=1) if group_frames else pd.DataFrame()
df1


In [None]:
# Show how many non-missing observations each (Ligand, Aryl) column of df1 contains
df1.count().to_frame()


In [None]:
# Mean y of every df1 column, rounded to one decimal; unstacking the inner
# column level (Aryl) turns it into the columns, leaving one row per Ligand value.
df_mean = df1.mean().to_frame().unstack().round(1)
# Relabel the 15 columns as the strings '1'..'15' and select them in that order.
aryl_labels = [str(position) for position in range(1, 16)]
df_mean.columns = aryl_labels
df_mean = df_mean[aryl_labels]
df_mean

In [None]:
# Plotting helper
def draw_pics(data, feature):
    """Plot each row of ``data`` as one marked line over its columns.

    Args:
        data: DataFrame whose index entries become the legend labels and whose
            columns become the x positions.
        feature: Text used as the x-axis label.

    The y-axis is labelled "Estimated Marginal Mean" and the figure is shown
    immediately with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(6, 4))
    for i in data.index:
        ax.plot(data.columns, data.loc[i, :], label=i, marker='o')
    # Build the legend once, after all lines exist, instead of once per line.
    # It is skipped for an empty frame, matching the original behaviour.
    if len(data.index):
        ax.legend()
    ax.set_xlabel(feature, fontdict={'fontsize': 14})
    ax.set_ylabel("Estimated Marginal Mean", fontdict={'fontsize': 14})
    plt.show()
# Plot one line per row of df_mean (the Ligand values) showing how the mean y changes across the Aryl columns
draw_pics(df_mean, 'Aryl')


# 产率预测

In [None]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import matplotlib.pylab as plt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
import fire
from sklearn import metrics
from matplotlib import pyplot

In [None]:
# Reload the feature matrix and target values for the yield-prediction section
# (header row skipped). The `with` blocks close the file handles after parsing
# instead of leaking them.
with open("x.csv", "rb") as x_file:
    data = np.loadtxt(x_file, delimiter=",", skiprows=1)
with open("y.csv", "rb") as y_file:
    y = np.loadtxt(y_file, delimiter=",", skiprows=1)

In [None]:
import time
# Time the model fit together with the prediction step.
start = time.time()

# LightGBM regressor with fixed hyper-parameters.
# NOTE(review): X_train, y_train and X_test are not defined in this section;
# in this file they are only assigned in the final cells, from the result of
# Upgrade(...), so those cells must have been run before this one.
gbm = lgb.LGBMRegressor(reg_alpha=0.11,reg_lambda=1,min_child_samples=8,min_child_weight=0,colsample_bytree=0.75,subsample=1,
                        num_leaves=51,max_depth=12,min_split_gain=0,learning_rate=0.1,n_estimators=515)
gbm.fit(X_train, y_train)
# Predict on the test set
y_pred = gbm.predict(X_test)

end = time.time()
# Print the elapsed wall-clock time in seconds with four decimals.
print("循环运行时间:%.4f秒"%(end-start))

In [None]:
from sklearn import metrics
# Evaluate regression performance on the test set
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
# The mean squared error is computed once and reused for the RMSE line.
mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))
print('R2:', metrics.r2_score(y_test, y_pred))

# 多样性抽样

In [None]:
import numpy as np
import pandas as pd
import torch
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from keras.layers import Dense,Dropout,Flatten,Conv1D,MaxPool1D,BatchNormalization,Activation
import matplotlib.pyplot as plt
from keras import backend as K

In [None]:
# Reload the feature matrix and target values for the diversity-sampling
# section (header row skipped). The `with` blocks close the file handles after
# parsing instead of leaking them.
with open("x.csv", "rb") as x_file:
    data = np.loadtxt(x_file, delimiter=",", skiprows=1)
with open("y.csv", "rb") as y_file:
    y = np.loadtxt(y_file, delimiter=",", skiprows=1)

In [None]:
# Convert the NumPy feature array to a torch tensor for the torch-based similarity code
X=torch.from_numpy(data)

In [None]:
# Split the data into an initial training set (10% of the rows) and an unlabeled pool.
# No random_state is passed, so the split is different on every run.
from sklearn.model_selection import train_test_split
x_training, x_unlabeled, y_training, y_unlabeled = train_test_split(X,y,train_size=0.1)

In [None]:
# Append newly selected samples to the training features.
# data_var comes from the similarity computation and holds rows taken from x_test.
def upTrain_x(data_var,X):
    """Return ``X`` with the rows in ``data_var`` appended along dim 0.

    Args:
        data_var: Sequence of 1-D tensors, one per sample to add.
        X: 2-D tensor of current training features.

    Returns:
        A new tensor containing the rows of ``X`` followed by the rows of
        ``data_var``; ``X`` itself when ``data_var`` is empty.
    """
    if len(data_var) == 0:
        # Nothing to add; torch.stack would reject an empty list.
        return X
    # Stack the detached rows directly instead of converting each tensor to
    # NumPy and rebuilding a tensor from a Python list of arrays.
    final_update = torch.stack([item.detach() for item in data_var], dim=0)
    return torch.cat((X, final_update), dim=0)
    
# Extend the training targets
## train_y = train_y + (data_var <- test_y)
def upTrain_y(data_var,Y):
    """Return ``Y`` followed by ``data_var``, concatenated along dim 0."""
    pieces = [Y, data_var]
    return torch.cat(pieces, 0)

# Split the pool targets:
## test_y =>
##          1. test_y   (entries that stay in the pool)
##          2. data_var -> train_y (entries that move to the training set)
## return: (test_y,data_var)
def upTest_y(line_num,Y):
    """Split ``Y`` into the entries that stay and the entries listed in ``line_num``.

    Args:
        line_num: Positions (row numbers) of the entries to move out of ``Y``.
            Positions outside ``range(len(Y))`` are ignored.
        Y: Target values, indexable and convertible to a NumPy array.

    Returns:
        ``(remaining, moved)`` as float64 tensors, each in ascending position order.
    """
    values = np.asarray(Y, dtype='float64')
    # A set gives O(1) membership tests, and one boolean mask replaces the
    # repeated np.append calls, which copied the whole array on every element.
    selected = set(line_num)
    moved = np.array([i in selected for i in range(len(values))], dtype=bool)
    out = values[~moved].ravel()
    data_var = values[moved].ravel()
    return (torch.tensor(out), torch.tensor(data_var))

# Remove the selected rows from the pool features. The removed rows are not
# returned here; the similarity step has already returned them.
## test_x =>
##          1. test_x   (rows that stay in the pool)
##          2. data_var -> train_x
def upTest_x(line_num,X):
    """Return ``X`` without the rows whose positions are listed in ``line_num``.

    Args:
        line_num: Positions of the rows to drop.
        X: 2-D tensor (or sequence of 1-D tensors) of pool features.

    Returns:
        A tensor of shape (kept_rows, features); it has zero rows when every
        row is dropped.
    """
    rows = X if isinstance(X, torch.Tensor) else torch.stack(list(X), dim=0)
    dropped = set(line_num)  # O(1) membership instead of scanning a list per row
    keep = [i for i in range(len(rows)) if i not in dropped]
    # Indexing with an explicit long tensor also works when `keep` is empty,
    # where stacking an empty list of rows would raise.
    return rows[torch.tensor(keep, dtype=torch.long)]

# Similarity between the training set and the test (pool) set
## x_train: training features
## x_test: pool features
## return:
###       1. line_num
###       2. data_var <- x_test
def CalSim(x_train,x_test,n):
    """Pick the ``n`` pool rows that are least similar to the training set.

    For every row of ``x_test`` the maximum cosine similarity to any row of
    ``x_train`` is computed; the ``n`` rows with the smallest maximum are selected.

    Args:
        x_train: 2-D tensor of training features.
        x_test: 2-D tensor of pool features.
        n: Number of rows to select.

    Returns:
        ``(line_num, data_var)``: the selected row positions in ascending
        order, and the corresponding rows of ``x_test`` in the same order.
    """
    # Largest similarity of each pool row to any labelled row.
    similarity_list = [
        torch.max(torch.cosine_similarity(row, x_train, dim=-1)).item()
        for row in x_test
    ]
    # Sort positions by ascending similarity. This replaces the round trip
    # through two object-dtype DataFrames; np.argsort's default algorithm is
    # the same default pandas' sort_values uses.
    order = np.argsort(np.asarray(similarity_list, dtype=float))
    line_num = sorted(int(k) for k in order[:n])
    data_var = [x_test[k] for k in line_num]
    return (line_num, data_var)


In [None]:
def Upgrade(n,m,train_x,train_y,test_x,test_y):
    """Run ``n`` rounds that each move ``m`` pool samples into the training set.

    Every round asks CalSim for the ``m`` pool rows to move, then updates the
    training features/targets and the pool features/targets accordingly,
    printing the new sizes as it goes.

    Args:
        n: Number of rounds.
        m: Number of pool samples moved per round.
        train_x: Tensor of training features.
        train_y: Training targets (converted with ``torch.tensor``).
        test_x: Tensor of pool features.
        test_y: Pool targets (converted with ``torch.tensor``).

    Returns:
        ``(train_x, train_y, test_x, test_y)`` as NumPy arrays.
    """
    labelled_x = train_x
    labelled_y = torch.tensor(train_y)
    pool_x = test_x
    pool_y = torch.tensor(test_y)
    for count in range(n):
        print("Loop <{}> |".format(count),end="\t")
        picked_rows, picked_x = CalSim(labelled_x, pool_x, m)
        # Grow the training features with the picked rows.
        labelled_x = upTrain_x(picked_x, labelled_x)
        print("New train_x = ({},{})\t".format(labelled_x.shape[0],labelled_x.shape[1]),end="\t")
        # Drop the picked rows from the pool features.
        pool_x = upTest_x(picked_rows, pool_x)
        print("New test_x = ({},{})\t".format(pool_x.shape[0],pool_x.shape[1]),end="\t")
        # Split the pool targets into what stays and what moves.
        pool_y, picked_y = upTest_y(picked_rows, pool_y)
        print("New test_y_ = ({},)\t".format(pool_y.shape[0]),end="\t")
        # Grow the training targets with the moved values.
        labelled_y = upTrain_y(picked_y, labelled_y)
        print("New train_y_ = ({},)\t".format(labelled_y.shape[0]),end="\n")
    # Convert every tensor to a NumPy array for the caller.
    return (labelled_x.numpy(), labelled_y.numpy(), pool_x.numpy(), pool_y.numpy())


In [None]:
# Run 6 selection rounds, each moving 395 samples from the unlabeled pool into the training set
a = Upgrade(6,395,x_training, y_training,x_unlabeled,y_unlabeled)

In [None]:
# Upgrade returns (train_x, train_y, test_x, test_y); unpack it in that order.
X_train, y_train, X_test, y_test = a