In [42]:
import pandas as pd
import os
import numpy as np
from loguru import logger
import sys
# Load dateloader
from scipy.stats import gaussian_kde
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
%matplotlib notebook

In [43]:
class CS_Classifier:
    def __init__(self, experiment_dir, dataset_name):
        self.experiment_dir = experiment_dir
        self.dataset_name = dataset_name
        self.dataset_path = self.experiment_dir + "/csd_result/" + dataset_name + ".pkl"
        self.csd_dataset_plot_dir = self.experiment_dir + "/csd_result/plot/"
        os.makedirs(self.csd_dataset_plot_dir, exist_ok=True)
        self.csc_logger = logger

        # Dataset
        self.csd_data_df = None
        self.csd_data_dict = None

        # Classifier
        self.lb = None
        self.classifier = None

        # Dataset information
        self.all_classes = None
        self.num_classes = None

        # Train the classifier
        self.load_data()
        self.train_classifier(features="dist")

        self.get_dataset_information()

    def load_data(self):
        # load data to dict, because processing of dataframe takes too much time
        self.csd_data_df = pd.read_pickle(self.dataset_path)
        self.csd_data_dict = self.csd_data_df.to_dict()

    def get_traj_index_by_labels(self, label):
        traj_index_dict = dict()
        for key, value in self.csd_data_dict['label'].items():
            if label in value:
                traj_index_dict[key] = value

        labels = list(set(traj_index_dict.values()))

        return traj_index_dict, labels

    def merge_feature_by_labels(self, traj_index_dict=None, feature="dist", labels=None):
        feature_values = dict()
        for label in labels:
            feature_values[label] = []
            traj_index_list = [key for key, value in traj_index_dict.items() if value == label]
            for traj_index in traj_index_list:
                feature_values[label].append(self.csd_data_dict[feature][traj_index])
        return feature_values

    def train_classifier(self, features="dist"):
        self.lb = preprocessing.LabelBinarizer()
        X = []
        y = []
        for index, row in self.csd_data_df.iterrows():
            X.append(row[features])
            # use the first two string values as label
            y.append(row["label"][:2])
        X = np.array(X)
        self.lb.fit(y)
        y = self.lb.transform(y)
        num_labels = np.unique(y).shape[0]+1
        self.classifier = KNeighborsClassifier(num_labels).fit(X, y)

    def get_dataset_information(self):
        self.all_classes = self.lb.classes_
        self.num_classes = len(self.all_classes)
        self.csc_logger.info("All classes from the dataset {} are {}: ", self.dataset_name, self.all_classes)

    def predict(self, input_data):
        result = self.classifier.predict(input_data)
        label = self.lb.inverse_transform(result)
        return result, label

In [44]:
experiment_dir = "/home/lx/experiments/lx/local_experiments/1908_hfv/"
cs_classifier = CS_Classifier(experiment_dir=experiment_dir, dataset_name="RoboticsProject2510")

  return self._fit(X, y)
2021-10-25 17:44:46.724 | INFO     | __main__:get_dataset_information:69 - All classes from the dataset RoboticsProject2510 are ['CS']: 


In [46]:
cs_classifier.csd_data_dict["label"]

{0: 'CS6',
 1: 'CS6',
 2: 'CS6',
 3: 'CS6',
 4: 'CS6',
 5: 'CS3',
 6: 'CS3',
 7: 'CS3',
 8: 'CS3',
 9: 'CS3',
 10: 'CS5',
 11: 'CS5',
 12: 'CS5',
 13: 'CS5',
 14: 'CS5',
 15: 'CS1',
 16: 'CS1',
 17: 'CS1',
 18: 'CS1',
 19: 'CS1'}

In [29]:
cs_classifier.predict(input_data=np.ones([1, 12]))

(array([0]), array(['CS'], dtype='<U2'))