## 1. ML Evaluation

### utils

In [1]:
# 统计每个元素的个数
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from sklearn import svm
import csv
import pandas as pd
import numpy as np
 
def count(arr_gb):
    arr_gb = pd.Series(arr_gb)       # 转换数据类型
    arr_gb = arr_gb.value_counts()   # 计数
    arr_gb.sort_index(inplace=True)  # 排序
    return arr_gb

def deal_with_fake_data(path):
    X = np.array(pd.read_csv(path, sep=';'))
    #  从 hours-per-week 这一列生成 labels
    hours_per_week = X[:,-2]
    mean_ = np.mean(hours_per_week)
    y = []
    for i in hours_per_week:
        if i < mean_:
            y.append(0)
        else:
            y.append(1)
    y = np.array(y)
    X = np.delete(X, -2, 1)
    print(X.shape, y.shape)
    print("labels 分布")
    print(count(y))
    return X, y

def split_data(X, y):
    scaler = StandardScaler() # 标准化转换
    scaler.fit(X)  # 训练标准化对象
    X = scaler.transform(X)
    return train_test_split(X, y, test_size=0.5, random_state=1)

def result_compare(model):
    if model == "randomForest":
        model = RandomForestClassifier()
    elif model == "svm":
        model = svm.SVC()
    elif model == "reg":
        model = LogisticRegression()
    else:
        print("无法识别的模型名")
        return
    
    # ground truth
    clf = model
    clf.fit(X_real_train, y_real_train)
    y_real_pred = clf.predict(X_real_test)
    print("ground truth 的结果:")
    print(classification_report(y_real_test, y_real_pred))
    print("--------------------------------------")


    # Original
    clf = model
    clf.fit(X_original_train, y_original_train)
    y_original_pred = clf.predict(X_real_test)
    print("Metric - Original 的结果:")
    print(classification_report(y_real_test, y_original_pred))
    print("--------------------------------------")


    # New
    clf = model
    clf.fit(X_new_train, y_new_train)
    y_new_pred = clf.predict(X_real_test)
    print("Metric - New 的结果:")
    print(classification_report(y_real_test, y_new_pred))
    print("--------------------------------------")
    
    # New 3>,9>
    clf = model
    clf.fit(X_new2_train, y_new2_train)
    y_new2_pred = clf.predict(X_real_test)
    print("Metric - New 3>9> 的结果:")
    print(classification_report(y_real_test, y_new2_pred))
    print("--------------------------------------")
    
#     # Rule Data
#     clf = model
#     clf.fit(X_rule_data_train, y_rule_data_train)
#     y_rule_data_pred = clf.predict(X_real_test)
#     print("Metric - Rule Data 的结果:")
#     print(classification_report(y_real_test, y_rule_data_pred))
#     print("--------------------------------------")



### 处理数据

##### REAL

In [2]:
import pandas as pd
import numpy as np

GROUND_TRUTH_X_PATH = "/Users/luminshen/Documents/代码/PycharmProjects/Research/-GAN-/Table-GAN/tableGAN/data/Adult/Adult.csv"
GROUND_TRUTH_y_PATH = "/Users/luminshen/Documents/代码/PycharmProjects/Research/-GAN-/Table-GAN/tableGAN/data/Adult/Adult_labels.csv"

X_real = np.array(pd.read_csv(GROUND_TRUTH_X_PATH, sep=',')[1:])
y_real = np.array(pd.read_csv(GROUND_TRUTH_y_PATH, sep=',').astype(int)).flatten()
#  去除 hours-per-week 这一列，因为它被用来生成 labels
X_real = np.delete(X_real, -2, 1)
print(X_real.shape, y_real.shape)
print("labels 分布")
print(count(y_real))

X_real_train, X_real_test, y_real_train, y_real_test = split_data(X_real, y_real)

(1999, 13) (1999,)
labels 分布
0    1389
1     610
dtype: int64


##### FAKE-ORIGINAL

In [3]:
FAKE_ORIGINAL_X_PATH = "/Users/luminshen/Desktop/模型/original gan/300分钟/Adult_OI_11_00_fake.csv"

X_original, y_original = deal_with_fake_data(FAKE_ORIGINAL_X_PATH)
X_original_train, X_original_test, y_original_train, y_original_test = split_data(X_original, y_original)

(2000, 13) (2000,)
labels 分布
0    1393
1     607
dtype: int64


##### new gan

In [4]:
FAKE_NEW_X_PATH = "/Users/luminshen/Desktop/模型/new gan/0相同，10相同/600分钟 w2/Adult_OI_11_00_fake.csv"

X_new, y_new = deal_with_fake_data(FAKE_NEW_X_PATH)
X_new_train, X_new_test, y_new_train, y_new_test = split_data(X_new, y_new)

(2000, 13) (2000,)
labels 分布
0    1055
1     945
dtype: int64


##### new gan 2

In [5]:
FAKE_NEW2_X_PATH = "/Users/luminshen/Desktop/模型/new gan/3>, 9>/Adult_OI_11_00_fake.csv"

X_new2, y_new2 = deal_with_fake_data(FAKE_NEW2_X_PATH)
X_new2_train, X_new2_test, y_new2_train, y_new2_test = split_data(X_new2, y_new2)

(2000, 13) (2000,)
labels 分布
0    1012
1     988
dtype: int64


##### gan with data following rule

In [6]:
FAKE_RULE_DATA_X_PATH = "/Users/luminshen/Desktop/模型/gan with data following rules/Adult_OI_11_00_fake.csv"

X_rule_data, y_rule_data = deal_with_fake_data(FAKE_RULE_DATA_X_PATH)
X_rule_data_train, X_rule_data, y_rule_data_train, y_rule_data_test = split_data(X_rule_data, y_rule_data)

(200, 13) (200,)
labels 分布
0     96
1    104
dtype: int64


### Random Forest

In [7]:
result_compare("randomForest")

ground truth 的结果:
              precision    recall  f1-score   support

           0       0.76      0.87      0.81       689
           1       0.58      0.39      0.47       311

    accuracy                           0.72      1000
   macro avg       0.67      0.63      0.64      1000
weighted avg       0.71      0.72      0.71      1000

--------------------------------------
Metric - Original 的结果:
              precision    recall  f1-score   support

           0       0.69      0.98      0.81       689
           1       0.26      0.02      0.03       311

    accuracy                           0.68      1000
   macro avg       0.48      0.50      0.42      1000
weighted avg       0.56      0.68      0.57      1000

--------------------------------------
Metric - New 的结果:
              precision    recall  f1-score   support

           0       0.66      0.63      0.65       689
           1       0.26      0.29      0.27       311

    accuracy                           0.52  

### SVM

In [8]:
result_compare( "svm")

ground truth 的结果:
              precision    recall  f1-score   support

           0       0.71      0.94      0.81       689
           1       0.52      0.15      0.24       311

    accuracy                           0.69      1000
   macro avg       0.62      0.55      0.52      1000
weighted avg       0.65      0.69      0.63      1000

--------------------------------------
Metric - Original 的结果:
              precision    recall  f1-score   support

           0       0.66      0.61      0.63       689
           1       0.27      0.32      0.29       311

    accuracy                           0.52      1000
   macro avg       0.46      0.46      0.46      1000
weighted avg       0.54      0.52      0.53      1000

--------------------------------------
Metric - New 的结果:
              precision    recall  f1-score   support

           0       0.68      0.58      0.63       689
           1       0.30      0.41      0.35       311

    accuracy                           0.53  

### Logistic Regression

In [9]:
result_compare("reg")

ground truth 的结果:
              precision    recall  f1-score   support

           0       0.72      0.89      0.80       689
           1       0.50      0.24      0.33       311

    accuracy                           0.69      1000
   macro avg       0.61      0.57      0.56      1000
weighted avg       0.65      0.69      0.65      1000

--------------------------------------
Metric - Original 的结果:
              precision    recall  f1-score   support

           0       0.68      0.65      0.66       689
           1       0.30      0.33      0.31       311

    accuracy                           0.55      1000
   macro avg       0.49      0.49      0.49      1000
weighted avg       0.56      0.55      0.55      1000

--------------------------------------
Metric - New 的结果:
              precision    recall  f1-score   support

           0       0.69      0.60      0.64       689
           1       0.31      0.41      0.35       311

    accuracy                           0.54  

## 2. Statistics

### utils

In [10]:
import numpy as np


def get_kldiverge(data_1, data_2):
    if data_1.split("/")[-1] == "Adult.csv":
        A = np.array(pd.read_csv(data_1, sep=',')).astype(float)
    else:
        A = np.array(pd.read_csv(data_1, sep=';')).astype(float)
        
    if data_2.split("/")[-1] == "Adult.csv":
        B = np.array(pd.read_csv(data_2, sep=',')).astype(float)
    else:
        B = np.array(pd.read_csv(data_2, sep=';')).astype(float)

    KL = scipy.stats.entropy(A, B) 
    
#     print(KL)
    
    res = 0
    count = 0
    for kl in KL:
        if kl != float('inf'):
            res += kl
            count += 1
            
    return res/count

### 结果

##### kl diverge


In [11]:
GROUND_TRUTH_PATH = "/Users/luminshen/Documents/代码/PycharmProjects/Research/-GAN-/Table-GAN/tableGAN/data/Adult/Adult.csv"
FAKE_ORIGINAL_PATH = "/Users/luminshen/Desktop/模型/original gan/300分钟/Adult_OI_11_00_fake.csv"
FAKE_NEW_PATH = "/Users/luminshen/Desktop/模型/new gan/0相同，10相同/600分钟 w2/Adult_OI_11_00_fake.csv"
FAKE_NEW2_PATH = "/Users/luminshen/Desktop/模型/new gan/3>, 9>/Adult_OI_11_00_fake.csv"

print(get_kldiverge(GROUND_TRUTH_PATH, FAKE_ORIGINAL_PATH))
print(get_kldiverge(GROUND_TRUTH_PATH, FAKE_NEW_PATH))
print(get_kldiverge(GROUND_TRUTH_PATH, FAKE_NEW2_PATH))




NameError: name 'scipy' is not defined