## Week 5 - Session 1: Semi-Supervised Learning

In [1]:
import pandas as pd
import numpy as np
import sklearn
import random 
from sklearn import svm
from sklearn import preprocessing
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, roc_auc_score, f1_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model
from frameworks.SelfLearning import *
from sklearn.semi_supervised import LabelPropagation

### Data
 * https://archive.ics.uci.edu/ml/datasets/statlog+(australian+credit+approval)

* There are 6 numerical and 8 categorical attributes. The labels have been changed for the convenience of the statistical algorithms. For example, attribute 4 originally had 3 labels p,g,gg and these have been changed to labels 1,2,3.

        A1: 0,1 CATEGORICAL (formerly: a,b)
        A2: continuous.
        A3: continuous.
        A4: 1,2,3 CATEGORICAL (formerly: p,g,gg)
        A5: 1, 2,3,4,5, 6,7,8,9,10,11,12,13,14 CATEGORICAL (formerly: ff,d,i,k,j,aa,m,c,w, e, q, r,cc, x)
        A6: 1, 2,3, 4,5,6,7,8,9 CATEGORICAL (formerly: ff,dd,j,bb,v,n,o,h,z)
        A7: continuous.
        A8: 1, 0 CATEGORICAL (formerly: t, f)
        A9: 1, 0 CATEGORICAL (formerly: t, f)
        A10: continuous.
        A11: 1, 0 CATEGORICAL (formerly t, f)
        A12: 1, 2, 3 CATEGORICAL (formerly: s, g, p)
        A13: continuous.
        A14: continuous.
        A15: 1,2 class attribute (formerly: +,-); note that the CSV loaded below encodes the class as 0/1 (1 = positive, 0 = negative)

In [2]:
# load data and preprocessing
def load_data(path="australian_credit.csv"):
    """Load the credit data set and return a fully numeric feature frame.

    Parameters
    ----------
    path : str, optional
        Location of the tab-separated, header-less data file with 14 attribute
        columns followed by the class column. Defaults to the file name the
        notebook has always used.

    Returns
    -------
    pandas.DataFrame
        The multi-valued categorical attributes (A4, A5, A6, A12) are one-hot
        encoded, every remaining attribute is min-max scaled, and the
        ``label`` column is placed last.
    """
    df = pd.read_csv(path, header=None, sep='\t')
    df.columns = ['A' + str(i + 1) for i in range(14)] + ['label']
    cat_feat = ['A4', 'A5', 'A6', 'A12']
    num_feat = [f for f in df.columns[:-1] if f not in cat_feat]

    # One-hot encoding for (multi-) categorical data.
    # dtype=float keeps the indicator columns numeric; recent pandas would
    # otherwise emit bool columns and `.values` downstream would become an
    # object array.
    df = pd.get_dummies(df, columns=cat_feat, dtype=float)
    # Keep the label as the last column: callers slice features with iloc[:, :-1].
    df = df[[c for c in df.columns if c != 'label'] + ['label']]

    # Normalization for numerical data.
    # NOTE(review): the scaler is fit on every row, i.e. before any train/test
    # split happens, so test-set minima/maxima influence the training features.
    x_scaled = preprocessing.MinMaxScaler().fit_transform(df[num_feat].values)
    # Replace the columns outright instead of `df.loc[:, num_feat] = ...`:
    # writing floats into integer-typed columns through .loc is a silent
    # upcast that recent pandas versions deprecate.
    df[num_feat] = x_scaled

    return df

# Split data into the training and test data with the given split_ratio, 
# and set labeled/ unlabeled data with the given percent
def getLabeledData(df, split_ratio, percent, shuffle=False, random_state=None):
    """Stratified train/test split plus a labeled/unlabeled split of the training part.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame whose last column is ``label`` with values 1 (positive)
        and 0 (negative).
    split_ratio : float
        Fraction of each class that goes to the training set.
    percent : float
        Fraction of each class's training rows that keep their label.
    shuffle : bool, optional
        When True, rows are shuffled within each class before splitting.
        Defaults to False, which takes rows in file order exactly as before.
    random_state : int or None, optional
        Seed forwarded to ``DataFrame.sample`` when ``shuffle`` is True.

    Returns
    -------
    tuple
        ``(X_labeled, y_labeled, X_train_total, y_train_total, X_test, y_test)``
        as NumPy arrays. ``X_train_total`` holds the labeled rows followed by
        the unlabeled rows; in ``y_train_total`` the unlabeled rows carry -1.
    """
    # Data split with stratified sampling: handle each class separately.
    posdf = df[df.label == 1].reset_index(drop=True)
    negdf = df[df.label == 0].reset_index(drop=True)

    if shuffle:
        posdf = posdf.sample(frac=1, random_state=random_state).reset_index(drop=True)
        negdf = negdf.sample(frac=1, random_state=random_state).reset_index(drop=True)

    print("pos: {}, neg:{}".format(posdf.shape, negdf.shape))

    # 1. split data into train and test
    numPosTrain = int(len(posdf) * split_ratio)
    numNegTrain = int(len(negdf) * split_ratio)
    posTrain, posTest = posdf.iloc[:numPosTrain, :], posdf.iloc[numPosTrain:, :]
    negTrain, negTest = negdf.iloc[:numNegTrain, :], negdf.iloc[numNegTrain:, :]

    # 2. split the training data into labeled and unlabeled parts
    numPosLabeled = int(numPosTrain * percent)
    numNegLabeled = int(numNegTrain * percent)

    labeled = pd.concat(
        [posTrain.iloc[:numPosLabeled, :], negTrain.iloc[:numNegLabeled, :]],
        axis=0, sort=False)
    unlabeled = pd.concat(
        [posTrain.iloc[numPosLabeled:, :], negTrain.iloc[numNegLabeled:, :]],
        axis=0, sort=False)

    X_labeled = labeled.iloc[:, :-1].values
    y_labeled = labeled.iloc[:, -1].values
    X_unlabeled = unlabeled.iloc[:, :-1].values

    X_train_total = np.concatenate((X_labeled, X_unlabeled), axis=0)
    # For unlabeled data, set the labels to -1. The explicit integer dtype
    # keeps y_train_total integral even when there are no unlabeled rows
    # (an empty list would otherwise become a float array).
    hidden_labels = np.array([-1] * X_unlabeled.shape[0], dtype=int)
    y_train_total = np.concatenate((y_labeled, hidden_labels), axis=0)

    testdf = pd.concat([posTest, negTest], axis=0, sort=False)
    X_test = testdf.iloc[:, :-1].values
    y_test = testdf.iloc[:, -1].values

    return X_labeled, y_labeled, X_train_total, y_train_total, X_test, y_test

### Model training

In [3]:
def supervised_svm(X_labeled, y_labeled, X_test, y_test):
    """Supervised baseline: fit an SVC on the labeled rows only, then score it.

    The classifier is built with gamma=0.1 and probability estimates enabled
    (``evaluate`` calls ``predict_proba``). Returns whatever ``evaluate``
    returns for the "Supervised SVM" tag.
    """
    classifier = svm.SVC(gamma=0.1, probability=True)
    classifier.fit(X_labeled, y_labeled)
    scores = evaluate(classifier, X_test, y_test, "Supervised SVM")
    return scores


def self_learning(base_model, X_train_total, y_train_total, X_test, y_test):
    """Wrap ``base_model`` in a SelfLearningModel, fit it and score it.

    The wrapper is configured with prob_threshold=0.6 and max_iter=300 and is
    fitted on the combined labeled + unlabeled training arrays. Returns
    whatever ``evaluate`` returns for the "Self learning" tag.
    """
    wrapped = SelfLearningModel(base_model, max_iter=300, prob_threshold=0.6)
    wrapped.fit(X_train_total, y_train_total)
    scores = evaluate(wrapped, X_test, y_test, "Self learning")
    return scores


def label_propagation(X_train_total, y_train_total, X_test, y_test):
    """Fit scikit-learn's LabelPropagation (max_iter=5000) and score it.

    Trains on the combined labeled + unlabeled training arrays and returns
    whatever ``evaluate`` returns for the "Label propag." tag.
    """
    propagator = LabelPropagation(max_iter=5000)
    propagator.fit(X_train_total, y_train_total)
    scores = evaluate(propagator, X_test, y_test, "Label propag.")
    return scores

### Looking at the above code answer the following:

1. Describe SVM in a couple sentences.
2. Describe self learning in a couple sentences.
3. Describe label propagation in a couple sentences.

### Evaluation

In [4]:
def evaluate(model, X_test, y_test, category=''):
    """Score a fitted binary classifier on the test set and print a summary line.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict`` and ``predict_proba``.
    X_test, y_test : array-like
        Test features and true labels; label 1 is treated as the positive class.
    category : str, optional
        Tag printed in front of the metrics.

    Returns
    -------
    list
        ``[accuracy, precision, recall, f_measure, auc_roc]``.
    """
    test_predicted = np.asarray(model.predict(X_test))
    test_predicted_prob = np.asarray(model.predict_proba(X_test))

    accuracy = accuracy_score(y_test, test_predicted)

    # Precision is TP / (TP + FP), so it is undefined only when nothing was
    # predicted positive; report 0 in that case. A model that predicts
    # everything positive still has a well-defined precision.
    if not np.any(test_predicted == 1):
        precision = 0
    else:
        precision = precision_score(y_test, test_predicted)

    recall = recall_score(y_test, test_predicted)
    f_measure = f1_score(y_test, test_predicted)

    # Second probability column is used as the positive-class score.
    # NOTE(review): assumes predict_proba orders its columns as [class 0, class 1].
    positive_scores = test_predicted_prob[:, 1]
    fpr, tpr, _ = metrics.roc_curve(y_test, positive_scores, pos_label=1)
    auc_roc = metrics.auc(fpr, tpr)

    print("{}\tf: {:.3f}, recall:{:.3f}\tprec:{:.3f}\tacc:{:.3f}\tauc:{:.3f}".
          format(category, f_measure, recall, precision, accuracy, auc_roc))

    return [accuracy, precision, recall, f_measure, auc_roc]

### Questions:

1. Which measure of evaluation would be most appropriate for the credit database?  
2. Which measure would be inappropriate?

### Compare Semi-Supervised Learning with Supervised Learning

In [5]:
def _fit_supervised(model, X_labeled, y_labeled, X_test, y_test):
    """Fit ``model`` on the labeled rows only and score it under the 'Supervised' tag."""
    model.fit(X_labeled, y_labeled)
    return evaluate(model, X_test, y_test, 'Supervised')


def compare_methods(fold, df, labeled_percent):
    """Run every semi-supervised / supervised comparison for one repetition.

    For each labeled fraction the data is split with ``getLabeledData`` and,
    for SVM, decision tree, logistic regression and naive Bayes, a
    self-learning model and a supervised baseline are evaluated; label
    propagation is evaluated last. One row per evaluation is appended to the
    results frame.

    Parameters
    ----------
    fold : int
        Repetition index stored in the ``fold`` column.
    df : pandas.DataFrame
        Pre-processed data, label in the last column.
    labeled_percent : iterable of float
        Fractions of the training data that keep their labels.

    Returns
    -------
    pandas.DataFrame
        The module-level ``res`` frame, after appending this fold's rows.

    Notes
    -----
    Reads the module-level names ``split_ratio`` and ``res``; both must exist
    before this function is called, and ``res`` is modified in place.

    NOTE(review): ``getLabeledData`` is called with identical arguments on
    every fold, so unless it shuffles, every fold sees the same split and
    only estimator-internal randomness differs between folds.
    """
    # (Model column value, printed section title, factory for a fresh estimator)
    base_models = [
        ('SVM', 'SVM', lambda: sklearn.svm.SVC(probability=True, gamma=0.1)),
        ('DT', 'Decision Tree', DecisionTreeClassifier),
        ('LR', 'Logistic Regression', linear_model.LogisticRegression),
        ('NB', 'Naive Bayes', GaussianNB),
    ]

    for p in labeled_percent:

        X_labeled, y_labeled, X_train_total, y_train_total, X_test, y_test = getLabeledData(df, split_ratio, p)

        print("\n**** LABELED PERCENTAGE: {}".format(p*100))

        print("X_labeled: {}\tX_train_total: {}\tX_test: {}\ny_labeled: {}  \ty_train_total: {}\t\ty_test: {}".format(
            X_labeled.shape, X_train_total.shape, X_test.shape, y_labeled.shape, y_train_total.shape, y_test.shape))

        for tag, title, make_model in base_models:
            print("\n --- {}".format(title))

            eval_res = self_learning(make_model(), X_train_total, y_train_total, X_test, y_test)
            res.loc[len(res)] = [fold, p, tag, 'SelfLearn'] + eval_res

            # Every supervised baseline gets a fresh estimator, never the
            # instance that was handed to the self-learning wrapper.
            if tag == 'SVM':
                eval_res = supervised_svm(X_labeled, y_labeled, X_test, y_test)
            else:
                eval_res = _fit_supervised(make_model(), X_labeled, y_labeled, X_test, y_test)
            res.loc[len(res)] = [fold, p, tag, 'Supervised'] + eval_res

        print("\n --- Label Propagation")
        eval_res = label_propagation(X_train_total, y_train_total, X_test, y_test)
        # Label propagation has no supervised counterpart, hence no Approach value.
        res.loc[len(res)] = [fold, p, 'Label_Prop', np.nan] + eval_res

    return res

In [6]:
if __name__ == "__main__":
    # Experiment configuration. `split_ratio` and `res` are read as
    # module-level names inside compare_methods, so they must be defined here.
    split_ratio = 0.7
    labeled_percent = [0.01, 0.02, 0.1, 0.15]
    models = ['SVM', 'DT', 'LR', 'NB', 'Label_Prop']
    approaches = ['SelfLearn', 'Supervised']
    random.seed(100)
    # scikit-learn estimators created without random_state draw from NumPy's
    # global RNG, which random.seed() does not touch; seed it as well so that
    # a fresh run reproduces the same numbers.
    np.random.seed(100)

    # load and preprocess data
    df = load_data()

    # train and evaluate
    res = pd.DataFrame(columns=['fold', 'Labeled', 'Model', 'Approach', 'Accuracy', 'Precision',
                                'Recall', 'F-measure', 'AUC'])
    for i in range(20):
        res = compare_methods(i, df, labeled_percent)

    # get the average of evaluation metrics over all folds
    metric_cols = res.columns[4:]
    repeatRes = pd.DataFrame(columns=res.columns[1:])

    for pct in labeled_percent:
        for model_name in models:
            model_rows = res[(res.Labeled == pct) & (res.Model == model_name)]
            if model_name != 'Label_Prop':
                for approach in approaches:
                    means = model_rows[model_rows.Approach == approach][metric_cols].mean().values.tolist()
                    repeatRes.loc[len(repeatRes)] = [pct, model_name, approach] + means
            else:
                # Label propagation rows carry no Approach value; average them all.
                means = model_rows[metric_cols].mean().values.tolist()
                repeatRes.loc[len(repeatRes)] = [pct, model_name, ''] + means

pos: (307, 39), neg:(383, 39)

**** LABELED PERCENTAGE: 1.0
X_labeled: (4, 38)	X_train_total: (482, 38)	X_test: (208, 38)
y_labeled: (4,)  	y_train_total: (482,)		y_test: (208,)

 --- SVM
Self learning	f: 0.619, recall:0.516	prec:0.774	acc:0.716	auc:0.805
Supervised SVM	f: 0.658, recall:0.570	prec:0.779	acc:0.736	auc:0.196

 --- Decision Tree
Self learning	f: 0.254, recall:0.183	prec:0.415	acc:0.519	auc:0.487
Supervised	f: 0.254, recall:0.183	prec:0.415	acc:0.519	auc:0.487

 --- Logistic Regression
Self learning	f: 0.754, recall:0.710	prec:0.805	acc:0.793	auc:0.883
Supervised	f: 0.730, recall:0.699	prec:0.765	acc:0.769	auc:0.826

 --- Naive Bayes
Self learning	f: 0.391, recall:0.269	prec:0.714	acc:0.625	auc:0.680
Supervised	f: 0.703, recall:0.839	prec:0.605	acc:0.683	auc:0.718

 --- Label Propagation
Label propag.	f: 0.693, recall:0.656	prec:0.735	acc:0.740	auc:0.816
pos: (307, 39), neg:(383, 39)

**** LABELED PERCENTAGE: 2.0
X_labeled: (9, 38)	X_train_total: (482, 38)	X_test: (208, 38

In [7]:
# Overall comparison: every averaged metric, rounded to three decimals
repeatRes.round(decimals=3)

Unnamed: 0,Labeled,Model,Approach,Accuracy,Precision,Recall,F-measure,AUC
0,0.01,SVM,SelfLearn,0.716,0.773,0.517,0.62,0.805
1,0.01,SVM,Supervised,0.736,0.779,0.57,0.658,0.196
2,0.01,DT,SelfLearn,0.692,0.755,0.476,0.545,0.672
3,0.01,DT,Supervised,0.622,0.65,0.355,0.44,0.596
4,0.01,LR,SelfLearn,0.793,0.805,0.71,0.754,0.883
5,0.01,LR,Supervised,0.769,0.765,0.699,0.73,0.826
6,0.01,NB,SelfLearn,0.625,0.714,0.269,0.391,0.68
7,0.01,NB,Supervised,0.683,0.605,0.839,0.703,0.718
8,0.01,Label_Prop,,0.74,0.735,0.656,0.693,0.816
9,0.02,SVM,SelfLearn,0.788,0.777,0.762,0.761,0.891


### Look at the above table and answer the following questions:

1. Which method performed best under which circumstances?
2. What metrics did you use to make the above determination?

In [8]:
# Averaged metrics for the SVM rows only
repeatRes.loc[repeatRes.Model == 'SVM'].round(decimals=3)

Unnamed: 0,Labeled,Model,Approach,Accuracy,Precision,Recall,F-measure,AUC
0,0.01,SVM,SelfLearn,0.716,0.773,0.517,0.62,0.805
1,0.01,SVM,Supervised,0.736,0.779,0.57,0.658,0.196
9,0.02,SVM,SelfLearn,0.788,0.777,0.762,0.761,0.891
10,0.02,SVM,Supervised,0.779,0.862,0.602,0.709,0.617
18,0.1,SVM,SelfLearn,0.76,0.852,0.561,0.673,0.887
19,0.1,SVM,Supervised,0.822,0.818,0.774,0.796,0.891
27,0.15,SVM,SelfLearn,0.827,0.82,0.785,0.802,0.885
28,0.15,SVM,Supervised,0.875,0.838,0.892,0.865,0.918


In [9]:
# Averaged metrics for the logistic-regression rows only
repeatRes.loc[repeatRes.Model == 'LR'].round(decimals=3)

Unnamed: 0,Labeled,Model,Approach,Accuracy,Precision,Recall,F-measure,AUC
4,0.01,LR,SelfLearn,0.793,0.805,0.71,0.754,0.883
5,0.01,LR,Supervised,0.769,0.765,0.699,0.73,0.826
13,0.02,LR,SelfLearn,0.812,0.855,0.699,0.769,0.897
14,0.02,LR,Supervised,0.798,0.849,0.667,0.747,0.89
22,0.1,LR,SelfLearn,0.827,0.82,0.785,0.802,0.909
23,0.1,LR,Supervised,0.822,0.811,0.785,0.798,0.887
31,0.15,LR,SelfLearn,0.827,0.82,0.785,0.802,0.912
32,0.15,LR,Supervised,0.861,0.84,0.849,0.845,0.905


In [10]:
# Averaged metrics for the decision-tree rows only
repeatRes.loc[repeatRes.Model == 'DT'].round(decimals=3)

Unnamed: 0,Labeled,Model,Approach,Accuracy,Precision,Recall,F-measure,AUC
2,0.01,DT,SelfLearn,0.692,0.755,0.476,0.545,0.672
3,0.01,DT,Supervised,0.622,0.65,0.355,0.44,0.596
11,0.02,DT,SelfLearn,0.76,0.794,0.637,0.697,0.748
12,0.02,DT,Supervised,0.788,0.8,0.715,0.739,0.781
20,0.1,DT,SelfLearn,0.823,0.8,0.806,0.803,0.821
21,0.1,DT,Supervised,0.828,0.809,0.808,0.808,0.826
29,0.15,DT,SelfLearn,0.809,0.811,0.748,0.777,0.803
30,0.15,DT,Supervised,0.798,0.812,0.716,0.76,0.79


In [11]:
# Averaged metrics for the naive-Bayes rows only
repeatRes.loc[repeatRes.Model == 'NB'].round(decimals=3)

Unnamed: 0,Labeled,Model,Approach,Accuracy,Precision,Recall,F-measure,AUC
6,0.01,NB,SelfLearn,0.625,0.714,0.269,0.391,0.68
7,0.01,NB,Supervised,0.683,0.605,0.839,0.703,0.718
15,0.02,NB,SelfLearn,0.562,0.507,0.763,0.609,0.613
16,0.02,NB,Supervised,0.769,0.747,0.731,0.739,0.825
24,0.1,NB,SelfLearn,0.534,0.477,0.441,0.458,0.583
25,0.1,NB,Supervised,0.649,0.596,0.667,0.629,0.672
33,0.15,NB,SelfLearn,0.615,0.568,0.581,0.574,0.663
34,0.15,NB,Supervised,0.716,0.631,0.882,0.735,0.83


In [12]:
# Averaged metrics for the label-propagation rows only
repeatRes.loc[repeatRes.Model == 'Label_Prop'].round(decimals=3)

Unnamed: 0,Labeled,Model,Approach,Accuracy,Precision,Recall,F-measure,AUC
8,0.01,Label_Prop,,0.74,0.735,0.656,0.693,0.816
17,0.02,Label_Prop,,0.75,0.766,0.634,0.694,0.841
26,0.1,Label_Prop,,0.731,0.718,0.656,0.685,0.804
35,0.15,Label_Prop,,0.731,0.691,0.72,0.705,0.816


## After running the above cells, answer these questions:
1. Compare the performance of these semi-supervised models against the supervised base classifiers such as: SVM, NB, DT, and LR trained on the same set of labeled data.
2. How do the results change as the percent of labeled data increases? 
3. Why do you think such results are produced?