In [1]:
import os
import time
import json
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
from scipy import stats
from pylab import rcParams
from sklearn.utils import check_random_state
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, \
    GradientBoostingClassifier

In [5]:
from ipynb.fs.full.preprocess import *

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gmjsl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
train_data  = pd.read_csv('data/train.tsv', sep='\t', encoding = "ISO-8859-1")\
             .rename(columns={'Score1': 'EssayScore'})

X_test_data = pd.read_csv('data/public_leaderboard.tsv', sep='\t', encoding = "ISO-8859-1")  
y_test_data = pd.read_csv('data/public_leaderboard_solution.csv', sep=',', encoding = "ISO-8859-1")\
             .rename(columns={'id': 'Id', 'essay_set': 'EssaySet', 'essay_score': 'EssayScore' })\
            [['Id', 'EssaySet', 'EssayScore']]

In [8]:
train_data.head(3)

Unnamed: 0,Id,EssaySet,EssayScore,Score2,EssayText
0,1,1,1,1,Some additional information that we would need...
1,2,1,1,1,"After reading the expirement, I realized that ..."
2,3,1,1,1,"What you need is more trials, a control set up..."


In [9]:
X_test_data.head(3)

Unnamed: 0,Id,EssaySet,EssayText
0,1673,1,The procedures I think they should have includ...
1,1674,1,"In order to replicate this experiment, you wou..."
2,1675,1,"In order to replicate their experiment, you wo..."


In [10]:
y_test_data.head(3)

Unnamed: 0,Id,EssaySet,EssayScore
0,1673,1,1
1,1674,1,1
2,1675,1,3


In [11]:
class BaseModel(object):

    def __init__(self):
        pass

    def fit_predict(self):
        pass

In [12]:
class SvmModel(BaseModel):

    model_type = 'Support Vector Machine with linear Kernel'
    
    def fit_predict(self, X_train, y_train, X_val, X_test, c_weight):
        print ('training svm...')
        self.classifier = SVC(C=1, kernel='linear', probability=True,
                              class_weight=c_weight)
        self.classifier.fit(X_train, y_train)
        self.test_y_predicted = self.classifier.predict(X_test)
        self.val_y_predicted = self.classifier.predict(X_val)
        return (X_train, X_val, X_test, self.val_y_predicted,
                self.test_y_predicted)

In [13]:
class LogModel(BaseModel):

    model_type = 'Multinominal Logistic Regression' 
    
    def fit_predict(self, X_train, y_train, X_val, X_test, c_weight):
        print ('training multinomial logistic regression')
        train_samples = X_train.shape[0]
        self.classifier = LogisticRegression(
            C=50. / train_samples,
            multi_class='multinomial',
            penalty='l1',
            solver='saga',
            tol=0.1,
            class_weight=c_weight,
            )
        self.classifier.fit(X_train, y_train)
        self.test_y_predicted = self.classifier.predict(X_test)
        self.val_y_predicted = self.classifier.predict(X_val)
        return (X_train, X_val, X_test, self.val_y_predicted,
                self.test_y_predicted)

In [14]:
class RfModel(BaseModel):

    model_type = 'Random Forest'
    
    def fit_predict(self, X_train, y_train, X_val, X_test, c_weight):
        print ('training random forest...')
        self.classifier = RandomForestClassifier(n_estimators=500, class_weight=c_weight)
        self.classifier.fit(X_train, y_train)
        self.test_y_predicted = self.classifier.predict(X_test)
        self.val_y_predicted = self.classifier.predict(X_val)
        return (X_train, X_val, X_test, self.val_y_predicted, self.test_y_predicted)

In [15]:
class TrainModel:

    def __init__(self, model_object):        
        self.accuracies = []
        self.model_object = model_object()        

    def print_model_type(self):
        print (self.model_object.model_type)

    # Train normally and get probabilities for the validation set. 
    # i.e.Use the probabilities to select the most uncertain samples
    def train(self, X_train, y_train, X_val, X_test, c_weight):
        print ('Train set:', X_train.shape, 'y:', y_train.shape)
        print ('Val set:', X_val.shape)
        print ('Test set:', X_test.shape)
        t0 = time.time()
        (X_train, X_val, X_test, self.val_y_predicted,
         self.test_y_predicted) = \
            self.model_object.fit_predict(X_train, y_train, X_val, X_test, c_weight)
        self.run_time = time.time() - t0
        return (X_train, X_val, X_test)  # return them in case we use PCA, with all the other algorithms, this is not needed.

    # Calculate accuracy only for the test set
    def get_test_accuracy(self, i, y_test):
        classif_rate = np.mean(self.test_y_predicted.ravel() == y_test.ravel()) * 100
        self.accuracies.append(classif_rate)               
        print('--------------------------------')
        print('Iteration:',i)
        print('--------------------------------')
        print('y-test set:',y_test.shape)
        print('Example run in %.3f s' % self.run_time,'\n')
        print("Accuracy rate for %f " % (classif_rate))    
        print("Classification report for classifier %s:\n%s\n" % (self.model_object.classifier, metrics.classification_report(y_test, self.test_y_predicted)))
        print("Confusion matrix:\n%s" % metrics.confusion_matrix(y_test, self.test_y_predicted))
        print('--------------------------------')

In [7]:
class BaseSelectionFunction(object):

    def __init__(self):
        pass

    def select(self):
        pass

In [8]:
class RandomSelection(BaseSelectionFunction):

    @staticmethod
    def select(probas_val, initial_labeled_samples):
        random_state = check_random_state(0)
        selection = np.random.choice(probas_val.shape[0], initial_labeled_samples, replace=False)

#     print('uniques chosen:',np.unique(selection).shape[0],'<= should be equal to:',initial_labeled_samples)

        return selection

In [9]:
class EntropySelection(BaseSelectionFunction):

    @staticmethod
    def select(probas_val, initial_labeled_samples):
        e = (-probas_val * np.log2(probas_val)).sum(axis=1)
        selection = (np.argsort(e)[::-1])[:initial_labeled_samples]
        return selection

In [10]:
class MarginSamplingSelection(BaseSelectionFunction):

    @staticmethod
    def select(probas_val, initial_labeled_samples):
        rev = np.sort(probas_val, axis=1)[:, ::-1]
        values = rev[:, 0] - rev[:, 1]
        selection = np.argsort(values)[:initial_labeled_samples]
        return selection

In [11]:
class Normalize(object):
    
    def normalize(self, X_train, X_val, X_test):
        self.scaler = MinMaxScaler()
        X_train = self.scaler.fit_transform(X_train)
        X_val   = self.scaler.transform(X_val)
        X_test  = self.scaler.transform(X_test)
        return (X_train, X_val, X_test) 
    
    def inverse(self, X_train, X_val, X_test):
        X_train = self.scaler.inverse_transform(X_train)
        X_val   = self.scaler.inverse_transform(X_val)
        X_test  = self.scaler.inverse_transform(X_test)
        return (X_train, X_val, X_test)

In [None]:
# TO DO 
# create a class called data to include the following 3 methods
# get_train_data_per_prompt, get_test_data_per_prompt, get_k_random_samples

In [104]:
def get_train_data_per_prompt(i, trainset):
    X_train_cols = ['Id', 'EssaySet', 'EssayText']
    y_train_cols = ['EssayScore']
    
    X_train_full_per_prompt = trainset.loc[trainset['EssaySet']==i][X_train_cols]
    y_train_full_per_prompt = trainset.loc[trainset['EssaySet']==i][y_train_cols]
    
    X_train_full_per_prompt.reset_index(drop=True, inplace=True)
    y_train_full_per_prompt.reset_index(drop=True, inplace=True)
    
    return (X_train_full_per_prompt, y_train_full_per_prompt)

In [129]:
def get_test_data_per_prompt(i, X_test_data, y_test_data):
    X_test_full_per_prompt = X_test_data.loc[X_test_data['EssaySet']==i]
    y_test_full_per_prompt = y_test_data.loc[y_test_data['EssaySet']==i]
    return (X_test_full_per_prompt, y_test_full_per_prompt)

In [130]:
def get_k_random_samples(i, initial_labeled_samples, trainset):
    
    X_train_per_prompt, y_train_per_prompt = get_train_data_per_prompt(i=i, trainset=trainset)

    trainset_size = len(X_train_per_prompt)
    random_state = check_random_state(0)
    random_idx = np.random.choice(trainset_size-1,
                                   initial_labeled_samples,
                                   replace=False)

    X_train = X_train_per_prompt[X_train_per_prompt.index.isin(random_idx)]
    y_train = y_train_per_prompt[y_train_per_prompt.index.isin(random_idx)]
    
    # Reset the train set index
    X_train.reset_index(drop=True, inplace=True)
    y_train.reset_index(drop=True, inplace=True)
    
    return (random_idx, X_train, y_train)

In [133]:
class TheAlgorithm(object):

    accuracies = []

    def __init__(self, prompt, initial_labeled_samples, model_object, selection_function):
        self.prompt = prompt
        self.initial_labeled_samples = initial_labeled_samples
        self.model_object = model_object
        self.sample_selection_function = selection_function

    def run(self, train_data, X_test_data, y_test_data):

        # initialize process by applying base learner to labeled training data set to obtain Classifier

        (random_idx, X_train, y_train) = get_k_random_samples(self.prompt, self.initial_labeled_samples, train_data)
        (X_test, y_test) = get_test_data_per_prompt(self.prompt, X_test_data, y_test_data)
        
        self.queried = self.initial_labeled_samples
        self.samplecount = [self.initial_labeled_samples]

        # assign the val set the rest of the 'unlabelled' training data
        
        # initial the val set
        X_val = np.array([])
        y_val = np.array([])
        
        # val set = unlabeled pool set - initial labeled set
        X_val = np.copy(X_train_full)
        X_val = np.delete(X_val, permutation, axis=0)
        
        y_val = np.copy(y_train_full)
        y_val = np.delete(y_val, permutation, axis=0)
        
        print('val set:', X_val.shape, y_val.shape, permutation.shape)
 
        # normalize data
        normalizer = Normalize()
        X_train, X_val, X_test = normalizer.normalize(X_train, X_val, X_test)   
        
        self.clf_model = TrainModel(self.model_object)
        (X_train, X_val, X_test) = self.clf_model.train(X_train, y_train, X_val, X_test, 'balanced')
        active_iteration = 1
        self.clf_model.get_test_accuracy(1, y_test)

        while self.queried < max_queried:

            active_iteration += 1

            # get validation probabilities
            probas_val = self.clf_model.model_object.classifier.predict_proba(X_val)

            # select samples using a selection function
            uncertain_samples = self.sample_selection_function.select(probas_val, self.initial_labeled_samples)

            # normalization needs to be inversed and recalculated based on the new train and test set.
            X_train, X_val, X_test = normalizer.inverse(X_train, X_val, X_test)   

            # get the uncertain samples from the validation set
            print ('trainset before', X_train.shape, y_train.shape)
            X_train = np.concatenate((X_train, X_val[uncertain_samples]))
            y_train = np.concatenate((y_train, y_val[uncertain_samples]))
            print ('trainset after', X_train.shape, y_train.shape)
            self.samplecount.append(X_train.shape[0])

            X_val = np.delete(X_val, uncertain_samples, axis=0)
            y_val = np.delete(y_val, uncertain_samples, axis=0)
            print ('val set:', X_val.shape, y_val.shape)
            print ()

            # normalize again after creating the 'new' train/test sets
            normalizer = Normalize()
            X_train, X_val, X_test = normalizer.normalize(X_train, X_val, X_test)               

            self.queried += self.initial_labeled_samples
            (X_train, X_val, X_test) = self.clf_model.train(X_train, y_train, X_val, X_test, 'balanced')
            self.clf_model.get_test_accuracy(active_iteration, y_test)

        print('final active learning accuracies', self.clf_model.accuracies)

In [134]:
def pickle_save(fname, data):
  filehandler = open(fname,"wb")
  pickle.dump(data,filehandler)
  filehandler.close() 
  print('saved', fname, os.getcwd(), os.listdir())

def pickle_load(fname):
  print(os.getcwd(), os.listdir())
  file = open(fname,'rb')
  data = pickle.load(file)
  file.close()
  print(data)
  return data


def experiment(d, prompt, models, selection_functions, Ks, repeats, contfrom):
    algos_temp = []
    print ('stopping at:', max_queried)
    count = 0
    
    for model_object in models:
      if model_object.__name__ not in d:
          d[model_object.__name__] = {}
      
      for selection_function in selection_functions:
        if selection_function.__name__ not in d[model_object.__name__]:
            d[model_object.__name__][selection_function.__name__] = {}
        
        for k in Ks:
            d[model_object.__name__][selection_function.__name__][str(k)] = []           
            
            for i in range(0, repeats):
                count+=1
                if count >= contfrom:
                    print ('Count = %s, using model = %s, selection_function = %s, k = %s, iteration = %s.' % (count, model_object.__name__, selection_function.__name__, k, i))
                    alg = TheAlgorithm(prompt,
                                       k, 
                                       model_object, 
                                       selection_function
                                       )
                    alg.run(train_data, X_test_data, y_test_data)
                    d[model_object.__name__][selection_function.__name__][str(k)].append(alg.clf_model.accuracies)
                    fname = 'Active-learning-experiment-' + str(count) + '.pkl'
                    pickle_save(fname, d)
                    if count % 5 == 0:
                        print(json.dumps(d, indent=2, sort_keys=True))
                    print ()
                    print ('---------------------------- FINISHED ---------------------------')
                    print ()
    return d


max_queried = 2000 

repeats = 1

prompt = 5

models = [SvmModel, RfModel, LogModel] 

selection_functions = [RandomSelection, MarginSamplingSelection, EntropySelection] 

Ks = [250,125,50,25,10] 

d = {}
stopped_at = -1 

d = experiment(d, prompt, models, selection_functions, Ks, repeats, stopped_at+1)
print (d)
results = json.loads(json.dumps(d, indent=2, sort_keys=True))
print(results)

stopping at: 2000
Count = 1, using model = SvmModel, selection_function = RandomSelection, k = 250, iteration = 0.
val set: (17197, 3) (17197, 1) (10,)


ValueError: could not convert string to float: 'First thing the mRNA does is go to the ribosome.  Next, the tRNA comes and decodes the first three nucleotides.  Then, the tRNA brings over the amino acid that matches the codon.  Last, the tRNA shifts over and the amino acids form peptide bonds with one another.'