<a href="https://colab.research.google.com/github/owen198/csc_domain_adaptation/blob/main/csc_dann.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from keras.layers import Input, Dense, Activation, BatchNormalization, PReLU, Dropout
from keras.models import Model
from keras.optimizers import SGD
from keras.utils import to_categorical
from sklearn.datasets import make_blobs
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

import pandas as pd
import datetime
from sklearn import preprocessing

In [2]:
from google.colab import drive
drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [3]:
tag1 = 'W4662FM0507'
tag2 = 'W4662FM0606'
path = './gdrive/My Drive/data/CSC/W4/'

In [4]:
tag1_pd = pd.concat([pd.read_csv(path+tag1+'_202009.csv'),
                       pd.read_csv(path+tag1+'_202010.csv'),
                       pd.read_csv(path+tag1+'_202011.csv'),
                       pd.read_csv(path+tag1+'_202012.csv'),
                       pd.read_csv(path+tag1+'_202101.csv'),
                       pd.read_csv(path+tag2+'_202101.csv'),
                       pd.read_csv(path+tag2+'_202102.csv')])

tag2_pd = pd.concat([pd.read_csv(path+tag2+'_202009.csv'),
                       pd.read_csv(path+tag2+'_202010.csv'),
                       pd.read_csv(path+tag2+'_202011.csv'),
                       pd.read_csv(path+tag2+'_202012.csv'),
                       pd.read_csv(path+tag2+'_202101.csv'),
                       pd.read_csv(path+tag2+'_202102.csv')])

In [5]:
tag1_pd['datetime'] = tag1_pd['timestamp'].astype('int').astype("datetime64[s]")
tag2_pd['datetime'] = tag2_pd['timestamp'].astype('int').astype("datetime64[s]")

In [6]:
tag1_normal = tag1_pd[(tag1_pd['datetime'] > datetime.datetime(2020,9,1,0,0)) & (tag1_pd['datetime'] < datetime.datetime(2021,1,8,0,0))]
tag1_abnormal = tag1_pd[(tag1_pd['datetime'] > datetime.datetime(2021,1,9,0,0)) & (tag1_pd['datetime'] < datetime.datetime(2021,2,1,0,0))]

tag2_normal = tag2_pd[(tag2_pd['datetime'] > datetime.datetime(2020,9,1,0,0)) & (tag2_pd['datetime'] < datetime.datetime(2021,1,8,0,0))]
tag2_abnormal = tag2_pd[(tag2_pd['datetime'] > datetime.datetime(2021,1,9,0,0)) & (tag2_pd['datetime'] < datetime.datetime(2021,2,1,0,0))]

In [7]:
tag1_normal_dt = tag1_normal['datetime']
tag1_abnormal_dt = tag1_abnormal['datetime']

tag2_normal_dt = tag2_normal['datetime']
tag2_abnormal_dt = tag2_abnormal['datetime']

original_list = [s for s in list(tag1_normal) if (s.startswith('CREST') or 
                                                      s.startswith('KURT_') or 
                                                      s.startswith('SKEW') or 
                                                      s.startswith('RMS_'))]

tag1_normal = tag1_normal[original_list]
tag1_abnormal = tag1_abnormal[original_list]

tag2_normal = tag2_normal[original_list]
tag2_abnormal = tag2_abnormal[original_list]

In [8]:
Xs = tag1_normal.append(tag1_abnormal)
Xt = tag2_normal.append(tag2_abnormal)

ys = pd.DataFrame(index=range(0,len(tag1_normal)),columns=['label'], dtype='int', data=0).append(pd.DataFrame(index=range(0,len(tag1_abnormal)),columns=['label'], dtype='int', data=1))
yt = pd.DataFrame(index=range(0,len(tag2_normal)),columns=['label'], dtype='int', data=0).append(pd.DataFrame(index=range(0,len(tag2_abnormal)),columns=['label'], dtype='int', data=1))

In [9]:
import random

shape_min = min (Xs.shape[0], yt.shape[0])
shape_max = max (Xs.shape[0], yt.shape[0])

index = sorted(random.sample(range(0, shape_max), shape_min))

In [10]:
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler = min_max_scaler.fit(Xs)

Xs = pd.DataFrame(min_max_scaler.transform(Xs))
Xt = pd.DataFrame(min_max_scaler.transform(Xt))


In [11]:
print(len(Xt), len(Xs))

if len(Xt) > len(Xs):
    Xt = Xt.iloc[index]
    yt = yt.iloc[index]
else:
    Xs = Xs.iloc[index]
    ys = ys.iloc[index]

51396 75576


In [12]:
def build_models(n_neurons):
    """Creates three different models, one used for source only training, two used for domain adaptation"""
    inputs = Input(shape=(104,)) 
    x4 = Dense(n_neurons, activation='linear')(inputs)
    x4 = BatchNormalization()(x4)
    x4 = Activation("elu")(x4)  

    source_classifier = Dense(2, activation='softmax', name="mo")(x4)  
    domain_classifier = Dense(32, activation='linear', name="do4")(x4)
    domain_classifier = BatchNormalization(name="do5")(domain_classifier)
    domain_classifier = Activation("elu", name="do6")(domain_classifier)
    domain_classifier = Dropout(0.5)(domain_classifier)

    domain_classifier = Dense(2, activation='softmax', name="do")(domain_classifier)

    comb_model = Model(inputs=inputs, outputs=[source_classifier, domain_classifier])
    comb_model.compile(optimizer="Adam",
              loss={'mo': 'categorical_crossentropy', 'do': 'categorical_crossentropy'},
              loss_weights={'mo': 1, 'do': 2}, metrics=['accuracy'], )

    source_classification_model = Model(inputs=inputs, outputs=[source_classifier])
    source_classification_model.compile(optimizer="Adam",
              loss={'mo': 'categorical_crossentropy'}, metrics=['accuracy'], )


    domain_classification_model = Model(inputs=inputs, outputs=[domain_classifier])
    domain_classification_model.compile(optimizer="Adam",
                  loss={'do': 'categorical_crossentropy'}, metrics=['accuracy'])
    
    
    embeddings_model = Model(inputs=inputs, outputs=[x4])
    embeddings_model.compile(optimizer="Adam",loss = 'categorical_crossentropy', metrics=['accuracy'])
                        
                        
    return comb_model, source_classification_model, domain_classification_model, embeddings_model

def batch_generator(data, batch_size):
    """Generate batches of data.

    Given a list of numpy data, it iterates over the list and returns batches of the same size
    This
    """
    all_examples_indices = len(data[0])
    while True:
        mini_batch_indices = np.random.choice(all_examples_indices, size=batch_size, replace=False)
        tbr = [k[mini_batch_indices] for k in data]
        yield tbr

In [13]:
#SAMPLING_ITERATIONS = 3000

def train(Xs, ys, Xt, yt,  enable_dann = True, n_iterations = 15000):
    
    batch_size = 256
    
    model, source_classification_model, domain_classification_model, embeddings_model = build_models(2)

    y_class_dummy = np.ones((len(Xt), 2))
    y_adversarial_1 = to_categorical(np.array(([1] * batch_size + [0] * batch_size)))
    
    sample_weights_class = np.array(([1] * batch_size + [0] * batch_size))
    sample_weights_adversarial = np.ones((batch_size * 2,))

    S_batches = batch_generator([Xs, to_categorical(ys)], batch_size)
    T_batches = batch_generator([Xt, np.zeros(shape = (len(Xt),2))], batch_size)
    
    for i in range(n_iterations):
        # # print(y_class_dummy.shape, ys.shape)
        y_adversarial_2 = to_categorical(np.array(([0] * batch_size + [1] * batch_size)))

        X0, y0 = next(S_batches)
        X1, y1 = next(T_batches)


        X_adv = np.concatenate([X0, X1])
        y_class = np.concatenate([y0, np.zeros_like(y0)])

        adv_weights = []
        for layer in model.layers:
            if (layer.name.startswith("do")):
                adv_weights.append(layer.get_weights())

        if(enable_dann):
            # note - even though we save and append weights, the batchnorms moving means and variances
            # are not saved throught this mechanism 
            stats = model.train_on_batch(X_adv, [y_class, y_adversarial_1],
                                     sample_weight=[sample_weights_class, sample_weights_adversarial])
            
            k = 0
            for layer in model.layers:
                if (layer.name.startswith("do")):
                    layer.set_weights(adv_weights[k])
                    k += 1

            class_weights = []
            
        
            for layer in model.layers:
                if (not layer.name.startswith("do")):
                    class_weights.append(layer.get_weights())

            #print(type(X_adv), type(y_adversarial_2))
            
            stats2 = domain_classification_model.train_on_batch(X_adv, y_adversarial_2)

            k = 0
            for layer in model.layers:
                if (not layer.name.startswith("do")):
                    layer.set_weights(class_weights[k])
                    k += 1

        else:
            source_classification_model.train_on_batch(X0,y0)
            
       
        if ((i + 1) % 1000 == 0):
            # print(i, stats)
            y_test_hat_t = source_classification_model.predict(Xt).argmax(1)
            y_test_hat_s = source_classification_model.predict(Xs).argmax(1)
            print("Iteration %d, source accuracy =  %.3f, target accuracy = %.3f"%(i, accuracy_score(ys, y_test_hat_s), accuracy_score(yt, y_test_hat_t)))
    return embeddings_model

In [14]:
embs = train(Xs.values, ys.values, Xt.values, yt.values, enable_dann = False)

Iteration 999, source accuracy =  0.844, target accuracy = 0.595
Iteration 1999, source accuracy =  0.826, target accuracy = 0.528
Iteration 2999, source accuracy =  0.904, target accuracy = 0.239
Iteration 3999, source accuracy =  0.869, target accuracy = 0.526
Iteration 4999, source accuracy =  0.371, target accuracy = 0.206
Iteration 5999, source accuracy =  0.541, target accuracy = 0.424
Iteration 6999, source accuracy =  0.959, target accuracy = 0.620
Iteration 7999, source accuracy =  0.856, target accuracy = 0.648
Iteration 8999, source accuracy =  0.817, target accuracy = 0.622
Iteration 9999, source accuracy =  0.921, target accuracy = 0.516
Iteration 10999, source accuracy =  0.915, target accuracy = 0.470
Iteration 11999, source accuracy =  0.938, target accuracy = 0.428
Iteration 12999, source accuracy =  0.849, target accuracy = 0.693
Iteration 13999, source accuracy =  0.874, target accuracy = 0.676
Iteration 14999, source accuracy =  0.822, target accuracy = 0.840


In [15]:

embs = train(Xs.values, ys.values, Xt.values, yt.values, enable_dann = True)

Iteration 999, source accuracy =  0.459, target accuracy = 0.328
Iteration 1999, source accuracy =  0.853, target accuracy = 0.830
Iteration 2999, source accuracy =  0.853, target accuracy = 0.837
Iteration 3999, source accuracy =  0.852, target accuracy = 0.854
Iteration 4999, source accuracy =  0.853, target accuracy = 0.849
Iteration 5999, source accuracy =  0.851, target accuracy = 0.848
Iteration 6999, source accuracy =  0.723, target accuracy = 0.791
Iteration 7999, source accuracy =  0.850, target accuracy = 0.854
Iteration 8999, source accuracy =  0.817, target accuracy = 0.855
Iteration 9999, source accuracy =  0.879, target accuracy = 0.797
Iteration 10999, source accuracy =  0.865, target accuracy = 0.838
Iteration 11999, source accuracy =  0.898, target accuracy = 0.824
Iteration 12999, source accuracy =  0.882, target accuracy = 0.815
Iteration 13999, source accuracy =  0.789, target accuracy = 0.675
Iteration 14999, source accuracy =  0.884, target accuracy = 0.848
