## 1. ML Evaluation

### utils

In [1]:
# 统计每个元素的个数
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from hmmlearn.hmm import GaussianHMM

from sklearn import svm
import csv
import pandas as pd
import numpy as np
 
def count(arr_gb):
    arr_gb = pd.Series(arr_gb)       # 转换数据类型
    arr_gb = arr_gb.value_counts()   # 计数
    arr_gb.sort_index(inplace=True)  # 排序
    return arr_gb

def deal_with_fake_data(path):
    X = np.array(pd.read_csv(path, sep=';'))
    #  从 hours-per-week 这一列生成 labels
    hours_per_week = X[:,-2]
    mean_ = np.mean(hours_per_week)
    y = []
    for i in hours_per_week:
        if i < mean_:
            y.append(0)
        else:
            y.append(1)
    y = np.array(y)
    X = np.delete(X, -2, 1)
    print(X.shape, y.shape)
    print("labels 分布")
    print(count(y))
    return X, y

def deal_with_real_data(path):
    X = np.array(pd.read_csv(path, sep=','))
    #  从 hours-per-week 这一列生成 labels
    hours_per_week = X[:,-2]
    mean_ = np.mean(hours_per_week)
    y = []
    for i in hours_per_week:
        if i < mean_:
            y.append(0)
        else:
            y.append(1)
    y = np.array(y)
    X = np.delete(X, -2, 1)
    print(X.shape, y.shape)
    print("labels 分布")
    print(count(y))
    return X, y

def split_data(X, y):
    scaler = StandardScaler() # 标准化转换
    scaler.fit(X)  # 训练标准化对象
    X = scaler.transform(X)
#     return train_test_split(X, y, test_size=0.3, random_state=1)
    return train_test_split(X, y, test_size=0.4)


def result_compare(model):
    if model == "randomForest":
        model = RandomForestClassifier()
    elif model == "svm":
        model = svm.SVC()
    elif model == "reg":
        model = LogisticRegression()
    elif model == "gaussian":
        model = GaussianNB()
    elif model == "hmm":
        model = GaussianHMM()
    else:
        print("无法识别的模型名")
        return
    
    # ground truth
    clf = model
    clf.fit(X_real_train, y_real_train)
    y_real_pred = clf.predict(X_real_test)
    print("ground truth 的结果:")
    print(classification_report(y_real_test, y_real_pred))
    print("--------------------------------------")


    # Original
    clf = model
    clf.fit(X_original_train, y_original_train)
    y_original_pred = clf.predict(X_real_test)
    print("Metric - Original 的结果:")
    print(classification_report(y_real_test, y_original_pred))
    print("--------------------------------------")


    # New
    clf = model
    clf.fit(X_new_train, y_new_train)
    y_new_pred = clf.predict(X_real_test)
    print("Metric - New 的结果:")
    print(classification_report(y_real_test, y_new_pred))
    print("--------------------------------------")

### 处理数据

##### REAL

In [2]:
import pandas as pd
import numpy as np

GROUND_TRUTH_X_PATH = "/Users/luminshen/Documents/代码/PycharmProjects/Research/-GAN-/Table-GAN/tableGAN/data/Ticket/Ticket.csv"
# GROUND_TRUTH_y_PATH = "/Users/luminshen/Documents/代码/PycharmProjects/Research/-GAN-/Table-GAN/tableGAN/data/Ticket/Ticket_labels.csv"

X_real, y_real = deal_with_real_data(GROUND_TRUTH_X_PATH)
X_real_train, X_real_test, y_real_train, y_real_test = split_data(X_real, y_real)

(2000, 13) (2000,)
labels 分布
0    1105
1     895
dtype: int64


##### FAKE-ORIGINAL

In [3]:
FAKE_ORIGINAL_X_PATH = "/Users/luminshen/Documents/代码/PycharmProjects/Research/-GAN-/Table-GAN/tableGAN/samples/Ticket/Ticket_OI_11_00_fake.csv"

X_original, y_original = deal_with_fake_data(FAKE_ORIGINAL_X_PATH)
X_original_train, X_original_test, y_original_train, y_original_test = split_data(X_original, y_original)

(2000, 13) (2000,)
labels 分布
0    1222
1     778
dtype: int64


##### new gan

In [4]:
FAKE_NEW_X_PATH = "/Users/luminshen/Desktop/模型/new gan/T-3>9>/Ticket_OI_11_00_fake.csv"

X_new, y_new = deal_with_fake_data(FAKE_NEW_X_PATH)
X_new_train, X_new_test, y_new_train, y_new_test = split_data(X_new, y_new)

(2000, 13) (2000,)
labels 分布
0    1143
1     857
dtype: int64


### Random Forest

In [5]:
result_compare("randomForest")

ground truth 的结果:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97       445
           1       0.97      0.95      0.96       355

    accuracy                           0.97       800
   macro avg       0.97      0.97      0.97       800
weighted avg       0.97      0.97      0.97       800

--------------------------------------
Metric - Original 的结果:
              precision    recall  f1-score   support

           0       0.66      0.73      0.69       445
           1       0.61      0.52      0.56       355

    accuracy                           0.64       800
   macro avg       0.63      0.63      0.63       800
weighted avg       0.63      0.64      0.63       800

--------------------------------------
Metric - New 的结果:
              precision    recall  f1-score   support

           0       0.79      0.71      0.75       445
           1       0.68      0.76      0.72       355

    accuracy                           0.73  

### SVM

In [6]:
result_compare( "svm")

ground truth 的结果:
              precision    recall  f1-score   support

           0       0.87      0.99      0.92       445
           1       0.98      0.81      0.89       355

    accuracy                           0.91       800
   macro avg       0.92      0.90      0.91       800
weighted avg       0.92      0.91      0.91       800

--------------------------------------
Metric - Original 的结果:
              precision    recall  f1-score   support

           0       0.65      0.67      0.66       445
           1       0.57      0.55      0.56       355

    accuracy                           0.62       800
   macro avg       0.61      0.61      0.61       800
weighted avg       0.62      0.62      0.62       800

--------------------------------------
Metric - New 的结果:
              precision    recall  f1-score   support

           0       0.74      0.80      0.77       445
           1       0.72      0.65      0.68       355

    accuracy                           0.73  

### Logistic Regression

In [7]:
result_compare("reg")

ground truth 的结果:
              precision    recall  f1-score   support

           0       0.81      0.97      0.88       445
           1       0.95      0.72      0.82       355

    accuracy                           0.86       800
   macro avg       0.88      0.84      0.85       800
weighted avg       0.87      0.86      0.85       800

--------------------------------------
Metric - Original 的结果:
              precision    recall  f1-score   support

           0       0.72      0.77      0.75       445
           1       0.69      0.63      0.66       355

    accuracy                           0.71       800
   macro avg       0.71      0.70      0.70       800
weighted avg       0.71      0.71      0.71       800

--------------------------------------
Metric - New 的结果:
              precision    recall  f1-score   support

           0       0.73      0.82      0.77       445
           1       0.74      0.62      0.67       355

    accuracy                           0.73  

### GaussianNB

In [8]:
result_compare("gaussian")

ground truth 的结果:
              precision    recall  f1-score   support

           0       0.89      0.86      0.87       445
           1       0.83      0.87      0.85       355

    accuracy                           0.86       800
   macro avg       0.86      0.86      0.86       800
weighted avg       0.86      0.86      0.86       800

--------------------------------------
Metric - Original 的结果:
              precision    recall  f1-score   support

           0       0.74      0.77      0.76       445
           1       0.70      0.66      0.68       355

    accuracy                           0.72       800
   macro avg       0.72      0.72      0.72       800
weighted avg       0.72      0.72      0.72       800

--------------------------------------
Metric - New 的结果:
              precision    recall  f1-score   support

           0       0.69      0.94      0.80       445
           1       0.86      0.47      0.61       355

    accuracy                           0.73  

## 2. Statistics

### utils

In [9]:
import numpy as np
import scipy

def get_kldiverge(data_1, data_2):
    if data_1.split("/")[-1] == "Ticket.csv":
        A = np.array(pd.read_csv(data_1, sep=',')).astype(float)
    else:
        A = np.array(pd.read_csv(data_1, sep=';')).astype(float)
        
    if data_2.split("/")[-1] == "Ticket.csv":
        B = np.array(pd.read_csv(data_2, sep=',')).astype(float)
    else:
        B = np.array(pd.read_csv(data_2, sep=';')).astype(float)

    KL = scipy.stats.entropy(A, B) 
    
#     print(KL)
    
    res = 0
    count = 0
    for kl in KL:
        if (not np.isnan(kl)) and (kl != np.float("inf")):
            res += kl
            count += 1
            
    return res/count

def get_mse(data_1, data_2):
    if data_1.split("/")[-1] == "Ticket.csv":
        A = np.array(pd.read_csv(data_1, sep=',')).astype(float)
    else:
        A = np.array(pd.read_csv(data_1, sep=';')).astype(float)
        
    if data_2.split("/")[-1] == "Ticket.csv":
        B = np.array(pd.read_csv(data_2, sep=',')).astype(float)
    else:
        B = np.array(pd.read_csv(data_2, sep=';')).astype(float)
    
#     print(A.shape, B.shape)
    X = np.append(A, B, axis=0)
    y = np.append(np.zeros(A.shape[0]), np.ones(B.shape[0]))
#     print(X.shape, y.shape)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    y_pred = clf.predict_proba(X_test)
    
    # 计算 MSE
    c = 616/(584+616)
    res = 0
    for i in range(X.shape[0]):
        res += (y_pred[0][0] - c)**2
    
    print(res/X.shape[0])

### 结果

##### kl diverge


In [10]:
GROUND_TRUTH_PATH = "/Users/luminshen/Documents/代码/PycharmProjects/Research/-GAN-/Table-GAN/tableGAN/data/Ticket/Ticket.csv"
FAKE_ORIGINAL_PATH = "/Users/luminshen/Documents/代码/PycharmProjects/Research/-GAN-/Table-GAN/tableGAN/samples/Ticket/Ticket_OI_11_00_fake.csv"
FAKE_NEW_PATH = "/Users/luminshen/Desktop/模型/new gan/T-3>9>/Ticket_OI_11_00_fake.csv"

print(get_kldiverge(GROUND_TRUTH_PATH, FAKE_ORIGINAL_PATH))
print(get_kldiverge(GROUND_TRUTH_PATH, FAKE_NEW_PATH))

0.35653191205703677
0.474644305902042


  pk = 1.0*pk / np.sum(pk, axis=axis, keepdims=True)
  qk = 1.0*qk / np.sum(qk, axis=axis, keepdims=True)


##### Propensity Mean Squared Error

In [11]:
get_mse(GROUND_TRUTH_PATH, FAKE_ORIGINAL_PATH)
get_mse(GROUND_TRUTH_PATH, FAKE_NEW_PATH)

0.1475889411332195
0.11697347529745791
