## 1. ML Evaluation

### utils

In [1]:
# 统计每个元素的个数
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from hmmlearn.hmm import GaussianHMM

from sklearn import svm
import csv
import pandas as pd
import numpy as np
 
def count(arr_gb):
    arr_gb = pd.Series(arr_gb)       # 转换数据类型
    arr_gb = arr_gb.value_counts()   # 计数
    arr_gb.sort_index(inplace=True)  # 排序
    return arr_gb

def deal_with_fake_data(path):
    X = np.array(pd.read_csv(path, sep=';'))
    #  从 hours-per-week 这一列生成 labels
    hours_per_week = X[:,-2]
    mean_ = np.mean(hours_per_week)
    y = []
    for i in hours_per_week:
        if i < mean_:
            y.append(0)
        else:
            y.append(1)
    y = np.array(y)
    X = np.delete(X, -2, 1)
    print(X.shape, y.shape)
    print("labels 分布")
    print(count(y))
    return X, y

def split_data(X, y):
    scaler = StandardScaler() # 标准化转换
    scaler.fit(X)  # 训练标准化对象
    X = scaler.transform(X)
    return train_test_split(X, y, test_size=0.3, random_state=1)
#     return train_test_split(X, y, test_size=0.6)


def result_compare(model):
    if model == "randomForest":
        model = RandomForestClassifier()
    elif model == "svm":
        model = svm.SVC()
    elif model == "reg":
        model = LogisticRegression()
    elif model == "gaussian":
        model = GaussianNB()
    elif model == "hmm":
        model = GaussianHMM()
    else:
        print("无法识别的模型名")
        return
    
    # ground truth
    clf = model
    clf.fit(X_real_train, y_real_train)
    y_real_pred = clf.predict(X_real_test)
    print("ground truth 的结果:")
    print(classification_report(y_real_test, y_real_pred))
    print("--------------------------------------")


    # Original
    clf = model
    clf.fit(X_original_train, y_original_train)
    y_original_pred = clf.predict(X_real_test)
    print("Metric - Original 的结果:")
    print(classification_report(y_real_test, y_original_pred))
    print("--------------------------------------")


    # New
    clf = model
    clf.fit(X_new_train, y_new_train)
    y_new_pred = clf.predict(X_real_test)
    print("Metric - New 的结果:")
    print(classification_report(y_real_test, y_new_pred))
    print("--------------------------------------")
    
    # New 3>,9>
    clf = model
    clf.fit(X_new2_train, y_new2_train)
    y_new2_pred = clf.predict(X_real_test)
    print("Metric - New 3>9> 的结果:")
    print(classification_report(y_real_test, y_new2_pred))
    print("--------------------------------------")
    
#     # Rule Data
#     clf = model
#     clf.fit(X_rule_data_train, y_rule_data_train)
#     y_rule_data_pred = clf.predict(X_real_test)
#     print("Metric - Rule Data 的结果:")
#     print(classification_report(y_real_test, y_rule_data_pred))
#     print("--------------------------------------")



### 处理数据

##### REAL

In [2]:
import pandas as pd
import numpy as np

GROUND_TRUTH_X_PATH = "/Users/luminshen/Documents/代码/PycharmProjects/Research/-GAN-/Table-GAN/tableGAN/data/Adult/Adult.csv"
GROUND_TRUTH_y_PATH = "/Users/luminshen/Documents/代码/PycharmProjects/Research/-GAN-/Table-GAN/tableGAN/data/Adult/Adult_labels.csv"

X_real = np.array(pd.read_csv(GROUND_TRUTH_X_PATH, sep=',')[1:])
y_real = np.array(pd.read_csv(GROUND_TRUTH_y_PATH, sep=',').astype(int)).flatten()
#  去除 hours-per-week 这一列，因为它被用来生成 labels
X_real = np.delete(X_real, -2, 1)
print(X_real.shape, y_real.shape)
print("labels 分布")
print(count(y_real))

X_real_train, X_real_test, y_real_train, y_real_test = split_data(X_real, y_real)

(1999, 13) (1999,)
labels 分布
0    1389
1     610
dtype: int64


##### FAKE-ORIGINAL

In [3]:
FAKE_ORIGINAL_X_PATH = "/Users/luminshen/Desktop/模型/original gan/300分钟/Adult_OI_11_00_fake.csv"

X_original, y_original = deal_with_fake_data(FAKE_ORIGINAL_X_PATH)
X_original_train, X_original_test, y_original_train, y_original_test = split_data(X_original, y_original)

(2000, 13) (2000,)
labels 分布
0    1393
1     607
dtype: int64


##### new gan

In [4]:
FAKE_NEW_X_PATH = "/Users/luminshen/Desktop/模型/new gan/0相同，10相同/600分钟 w2/Adult_OI_11_00_fake.csv"

X_new, y_new = deal_with_fake_data(FAKE_NEW_X_PATH)
X_new_train, X_new_test, y_new_train, y_new_test = split_data(X_new, y_new)

(2000, 13) (2000,)
labels 分布
0    1055
1     945
dtype: int64


##### new gan 2

In [5]:
FAKE_NEW2_X_PATH = "/Users/luminshen/Desktop/模型/new gan/3>, 9>/Adult_OI_11_00_fake.csv"

X_new2, y_new2 = deal_with_fake_data(FAKE_NEW2_X_PATH)
X_new2_train, X_new2_test, y_new2_train, y_new2_test = split_data(X_new2, y_new2)

(2000, 13) (2000,)
labels 分布
0    1012
1     988
dtype: int64


##### gan with data following rule

In [6]:
FAKE_RULE_DATA_X_PATH = "/Users/luminshen/Desktop/模型/gan with data following rules/Adult_OI_11_00_fake.csv"

X_rule_data, y_rule_data = deal_with_fake_data(FAKE_RULE_DATA_X_PATH)
X_rule_data_train, X_rule_data, y_rule_data_train, y_rule_data_test = split_data(X_rule_data, y_rule_data)

(200, 13) (200,)
labels 分布
0     96
1    104
dtype: int64


### Random Forest

In [7]:
result_compare("randomForest")

ground truth 的结果:
              precision    recall  f1-score   support

           0       0.73      0.88      0.80       406
           1       0.55      0.31      0.40       194

    accuracy                           0.70       600
   macro avg       0.64      0.60      0.60       600
weighted avg       0.67      0.70      0.67       600

--------------------------------------
Metric - Original 的结果:
              precision    recall  f1-score   support

           0       0.68      0.96      0.79       406
           1       0.33      0.04      0.07       194

    accuracy                           0.66       600
   macro avg       0.51      0.50      0.43       600
weighted avg       0.57      0.66      0.56       600

--------------------------------------
Metric - New 的结果:
              precision    recall  f1-score   support

           0       0.65      0.62      0.63       406
           1       0.28      0.31      0.29       194

    accuracy                           0.52  

### SVM

In [8]:
result_compare( "svm")

ground truth 的结果:
              precision    recall  f1-score   support

           0       0.71      0.90      0.79       406
           1       0.51      0.21      0.30       194

    accuracy                           0.68       600
   macro avg       0.61      0.56      0.55       600
weighted avg       0.64      0.68      0.63       600

--------------------------------------
Metric - Original 的结果:
              precision    recall  f1-score   support

           0       0.65      0.55      0.60       406
           1       0.29      0.38      0.33       194

    accuracy                           0.50       600
   macro avg       0.47      0.47      0.46       600
weighted avg       0.53      0.50      0.51       600

--------------------------------------
Metric - New 的结果:
              precision    recall  f1-score   support

           0       0.68      0.58      0.63       406
           1       0.33      0.43      0.37       194

    accuracy                           0.53  

### Logistic Regression

In [9]:
result_compare("reg")

ground truth 的结果:
              precision    recall  f1-score   support

           0       0.70      0.91      0.79       406
           1       0.50      0.19      0.28       194

    accuracy                           0.68       600
   macro avg       0.60      0.55      0.53       600
weighted avg       0.64      0.68      0.63       600

--------------------------------------
Metric - Original 的结果:
              precision    recall  f1-score   support

           0       0.67      0.64      0.65       406
           1       0.31      0.34      0.32       194

    accuracy                           0.54       600
   macro avg       0.49      0.49      0.49       600
weighted avg       0.55      0.54      0.55       600

--------------------------------------
Metric - New 的结果:
              precision    recall  f1-score   support

           0       0.66      0.56      0.61       406
           1       0.30      0.39      0.34       194

    accuracy                           0.51  

### GaussianNB

In [10]:
result_compare("gaussian")

ground truth 的结果:
              precision    recall  f1-score   support

           0       0.73      0.78      0.75       406
           1       0.46      0.40      0.43       194

    accuracy                           0.66       600
   macro avg       0.60      0.59      0.59       600
weighted avg       0.65      0.66      0.65       600

--------------------------------------
Metric - Original 的结果:
              precision    recall  f1-score   support

           0       0.67      0.90      0.77       406
           1       0.26      0.07      0.11       194

    accuracy                           0.64       600
   macro avg       0.47      0.49      0.44       600
weighted avg       0.54      0.64      0.56       600

--------------------------------------
Metric - New 的结果:
              precision    recall  f1-score   support

           0       0.64      0.34      0.45       406
           1       0.30      0.60      0.40       194

    accuracy                           0.43  

## 2. Statistics

### utils

In [20]:
import numpy as np
import scipy

def get_kldiverge(data_1, data_2):
    if data_1.split("/")[-1] == "Adult.csv":
        A = np.array(pd.read_csv(data_1, sep=',')).astype(float)
    else:
        A = np.array(pd.read_csv(data_1, sep=';')).astype(float)
        
    if data_2.split("/")[-1] == "Adult.csv":
        B = np.array(pd.read_csv(data_2, sep=',')).astype(float)
    else:
        B = np.array(pd.read_csv(data_2, sep=';')).astype(float)

    KL = scipy.stats.entropy(A, B) 
    
#     print(KL)
    
    res = 0
    count = 0
    for kl in KL:
        if kl != float('inf'):
            res += kl
            count += 1
            
    return res/count

def get_mse(data_1, data_2):
    if data_1.split("/")[-1] == "Adult.csv":
        A = np.array(pd.read_csv(data_1, sep=',')).astype(float)
    else:
        A = np.array(pd.read_csv(data_1, sep=';')).astype(float)
        
    if data_2.split("/")[-1] == "Adult.csv":
        B = np.array(pd.read_csv(data_2, sep=',')).astype(float)
    else:
        B = np.array(pd.read_csv(data_2, sep=';')).astype(float)
    
#     print(A.shape, B.shape)
    X = np.append(A, B, axis=0)
    y = np.append(np.zeros(A.shape[0]), np.ones(B.shape[0]))
#     print(X.shape, y.shape)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=1)

    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    y_pred = clf.predict_proba(X_test)
    
    # 计算 MSE
#     print(count(y_test))
    c = 1395/2800
    res = 0
    for i in range(X.shape[0]):
        res += (y_pred[0][0] - c)**2
    
    print(res/X.shape[0])

### 结果

##### kl diverge


In [21]:
GROUND_TRUTH_PATH = "/Users/luminshen/Documents/代码/PycharmProjects/Research/-GAN-/Table-GAN/tableGAN/data/Adult/Adult.csv"
FAKE_ORIGINAL_PATH = "/Users/luminshen/Desktop/模型/original gan/300分钟/Adult_OI_11_00_fake.csv"
FAKE_NEW_PATH = "/Users/luminshen/Desktop/模型/new gan/0相同，10相同/600分钟 w2/Adult_OI_11_00_fake.csv"
FAKE_NEW2_PATH = "/Users/luminshen/Desktop/模型/new gan/3>, 9>/Adult_OI_11_00_fake.csv"

print(get_kldiverge(GROUND_TRUTH_PATH, FAKE_ORIGINAL_PATH))
print(get_kldiverge(GROUND_TRUTH_PATH, FAKE_NEW_PATH))
print(get_kldiverge(GROUND_TRUTH_PATH, FAKE_NEW2_PATH))

0.20904159718322513
0.2201262291237125
0.22591861288510268


##### Propensity Mean Squared Error

In [22]:
get_mse(GROUND_TRUTH_PATH, FAKE_ORIGINAL_PATH)
get_mse(GROUND_TRUTH_PATH, FAKE_NEW_PATH)
get_mse(GROUND_TRUTH_PATH, FAKE_NEW2_PATH)

0.25163943621930857
0.07102413027610538
0.14479387365956412


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
