In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Prediction with ensemble models on the TEST set

In [2]:
# Configuration for the test-set ensemble: input locations, column names
# and the output path.
test_csv_folder = "./test_files"
test_file_list = "./test_fnames.txt"
output_file = "./test_ensemble_prediction.csv"

pred_id_col = "exam_ids"
prob_col_names = [
    "predicted_class",
    "prob_class1",
    "prob_class2",
    "prob_class3",
    "prob_class1_norm",
    "prob_class3_norm",
]


In [3]:
# Read the list of prediction files (one file name per line) and load the
# first of them to inspect its layout.
filename = test_file_list
with open(filename) as f_object:
    # Skip blank lines (e.g. a trailing empty line at the end of the list) so
    # they do not turn into empty file names.
    flist = [line.strip() for line in f_object if line.strip()]

if not flist:
    raise ValueError("No file names found in " + filename)

datafile = test_csv_folder + "/" + flist[0]
first_df = pd.read_csv(datafile, index_col=pred_id_col)
first_df.head(5)

Unnamed: 0_level_0,exam_ids.1,predicted_class,prob_class1,prob_class2,prob_class3,prob_class1_norm,prob_class3_norm
exam_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
21284,21284,1.0,0.999083,4e-06,0.000914,0.999086,0.000914
924386,924386,1.0,0.997932,9e-06,0.002059,0.997941,0.002059
2939231,2939231,1.0,0.937724,0.001926,0.06035,0.939534,0.060466
761745,761745,1.0,0.999108,1.1e-05,0.000881,0.999119,0.000881
713438,713438,1.0,0.999596,6e-06,0.000398,0.999602,0.000398


In [4]:
# Pull the three class-probability columns of the first file into an array
# and preview the element-wise sum of the array with itself.
class_probs = np.array(first_df.loc[:, prob_col_names[1:4]])
np.add(class_probs, class_probs)[:5]

array([[1.99816513e+00, 7.92826631e-06, 1.82701158e-03],
       [1.99586439e+00, 1.82799595e-05, 4.11718665e-03],
       [1.87544799e+00, 3.85240442e-03, 1.20699629e-01],
       [1.99821627e+00, 2.26167376e-05, 1.76110945e-03],
       [1.99919164e+00, 1.17662039e-05, 7.96594599e-04]])

In [5]:
def get_ensemble_prediction(file_list, folder_name, prob_cols=None, id_col="exam_ids"):
    """Average the class probabilities predicted by several models.

    Every file named in ``file_list`` is read from ``folder_name`` as a CSV.
    The selected probability columns are summed row by row across the files
    and the sum is divided by the number of files.

    Parameters
    ----------
    file_list : sequence of str
        Names of the CSV files to average.
    folder_name : str
        Folder that contains the files.
    prob_cols : sequence of str, optional
        Probability columns to average. Defaults to the module-level
        ``prob_col_names[1:4]`` (the three class-probability columns).
    id_col : str or None, optional
        Column holding the row identifier. When a file contains this column
        it is used to verify that all files list the same ids in the same
        order, because the averaging is positional. Pass ``None`` to skip
        the check.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(rows, len(prob_cols))`` with the mean probabilities.

    Raises
    ------
    ValueError
        If ``file_list`` is empty, if the files do not all have the same
        number of rows, or if their id columns disagree.
    """
    file_list = list(file_list)
    n = len(file_list)
    if n == 0:
        raise ValueError("file_list is empty: there is nothing to average")

    if prob_cols is None:
        prob_cols = prob_col_names[1:4]
    prob_cols = list(prob_cols)

    prob_array = None
    reference_ids = None
    n_loaded = 0
    for fname in file_list:
        new_df = pd.read_csv(folder_name + "/" + fname)
        new_class_probs = np.array(new_df[prob_cols], dtype=float)

        if prob_array is None:
            # The first file fixes the expected shape (each file is read once).
            prob_array = np.zeros(new_class_probs.shape)
        elif new_class_probs.shape != prob_array.shape:
            # Without this check a one-row file would silently broadcast.
            raise ValueError(
                "%s has shape %s, expected %s"
                % (fname, new_class_probs.shape, prob_array.shape)
            )

        if id_col is not None and id_col in new_df.columns:
            ids = np.array(new_df[id_col])
            if reference_ids is None:
                reference_ids = ids
            elif not np.array_equal(ids, reference_ids):
                # Rows are combined by position, so differing ids would mix
                # predictions that belong to different exams.
                raise ValueError(
                    "%s does not list the same %s values in the same order "
                    "as the previous files" % (fname, id_col)
                )

        prob_array += new_class_probs
        n_loaded += 1

    # Kept from the original so the recorded cell output stays the same.
    print(n, "equals", n_loaded)
    return prob_array / n

In [6]:
# Average the per-model probabilities over all test-set prediction files
ensemble_prediction = get_ensemble_prediction(
    file_list=flist,
    folder_name=test_csv_folder,
)

10 equals 10


In [7]:
# Preview the first five rows of the averaged probabilities
ensemble_prediction[0:5]

array([[9.98640019e-01, 2.87283594e-05, 1.33124404e-03],
       [9.96641403e-01, 4.03913041e-05, 3.31817497e-03],
       [9.58497208e-01, 3.96972367e-03, 3.75330842e-02],
       [9.99123269e-01, 3.01216513e-05, 8.46596880e-04],
       [9.99147558e-01, 5.62836184e-05, 7.96155233e-04]])

In [8]:
# Renormalise the class 1 and class 3 probabilities so that the two of them
# sum to one (class 2 is left out of the denominator)
prob_sum = np.add(ensemble_prediction[:, 0], ensemble_prediction[:, 2])
prob_class1_norm = np.divide(ensemble_prediction[:, 0], prob_sum)
prob_class3_norm = np.divide(ensemble_prediction[:, 2], prob_sum)

# Predicted class = 1-based position of the largest averaged probability
exam_ids = first_df.index
y_pred = np.argmax(ensemble_prediction, axis=1) + 1

In [9]:
# Overview of the prediction: one row per predicted class, with its count
unique, counts = np.unique(y_pred, return_counts=True)
pred_frequencies = np.column_stack((unique, counts))
print(pred_frequencies)

[[     1 194603]
 [     2   3806]]


In [10]:
# Save a csv file with the prediction from the ensemble models
df = pd.DataFrame({
    pred_id_col: exam_ids,
    prob_col_names[0]: y_pred,
    prob_col_names[1]: ensemble_prediction[:, 0],
    prob_col_names[2]: ensemble_prediction[:, 1],
    prob_col_names[3]: ensemble_prediction[:, 2],
    prob_col_names[4]: prob_class1_norm,
    prob_col_names[5]: prob_class3_norm,
})
# Index on the configured id column rather than a hard-coded 'exam_ids', so
# this cell keeps working if pred_id_col is changed. drop=False keeps the id
# as a regular column as well, so the written CSV carries it twice (index and
# column).
df = df.set_index(pred_id_col, drop=False)
df.to_csv(output_file)

In [11]:
# Preview the first five rows of the saved test-set prediction
df.head(n=5)

Unnamed: 0_level_0,exam_ids,predicted_class,prob_class1,prob_class2,prob_class3,prob_class1_norm,prob_class3_norm
exam_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
21284,21284,1,0.99864,2.9e-05,0.001331,0.998669,0.001331
924386,924386,1,0.996641,4e-05,0.003318,0.996682,0.003318
2939231,2939231,1,0.958497,0.00397,0.037533,0.962317,0.037683
761745,761745,1,0.999123,3e-05,0.000847,0.999153,0.000847
713438,713438,1,0.999148,5.6e-05,0.000796,0.999204,0.000796


### Prediction with ensemble models on the VALIDATION set 

In [12]:
# Configuration for the validation-set ensemble: input locations, column
# names and the output path.
valid_csv_folder = "./valid_files"
valid_file_list = "./valid_fnames.txt"
output_file_v = "./valid_ensemble_prediction.csv"

pred_id_col = "exam_ids"
prob_col_names = [
    "predicted_class",
    "prob_class1",
    "prob_class2",
    "prob_class3",
    "prob_class1_norm",
    "prob_class3_norm",
]

In [13]:
# Read the list of prediction files (one file name per line) and load the
# first of them to inspect its layout.
filename_v = valid_file_list
with open(filename_v) as f:
    # Skip blank lines (e.g. a trailing empty line at the end of the list) so
    # they do not turn into empty file names.
    flist_v = [line.strip() for line in f if line.strip()]

if not flist_v:
    raise ValueError("No file names found in " + filename_v)

datafile_v = valid_csv_folder + "/" + flist_v[0]
second_df = pd.read_csv(datafile_v, index_col=pred_id_col)
print(second_df.shape)
second_df.head(5)

(68854, 7)


Unnamed: 0_level_0,exam_ids.1,predicted_class,prob_class1,prob_class2,prob_class3,prob_class1_norm,prob_class3_norm
exam_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1384433,1384433,1.0,0.995756,0.000106,0.004138,0.995861,0.004139
454057,454057,1.0,0.998857,7.1e-05,0.001072,0.998928,0.001072
817538,817538,1.0,0.997015,8.7e-05,0.002898,0.997101,0.002899
970996,970996,1.0,0.997422,5.5e-05,0.002524,0.997476,0.002524
200611,200611,2.0,0.030406,0.927939,0.041655,0.421947,0.578053


In [14]:
# Average the per-model probabilities over all validation-set prediction files
valid_ensemble_prediction = get_ensemble_prediction(
    file_list=flist_v,
    folder_name=valid_csv_folder,
)

10 equals 10


In [15]:
# Renormalise the class 1 and class 3 probabilities so that the two of them
# sum to one (class 2 is left out of the denominator)
prob_sum_v = np.add(valid_ensemble_prediction[:, 0], valid_ensemble_prediction[:, 2])
prob_class1_norm_v = np.divide(valid_ensemble_prediction[:, 0], prob_sum_v)
prob_class3_norm_v = np.divide(valid_ensemble_prediction[:, 2], prob_sum_v)

# Predicted class = 1-based position of the largest averaged probability
valid_exam_ids = second_df.index
valid_y_pred = np.argmax(valid_ensemble_prediction, axis=1) + 1

In [16]:
# Overview of the prediction: one row per predicted class, with its count
unique_v, counts_v = np.unique(valid_y_pred, return_counts=True)
pred_frequencies_v = np.column_stack((unique_v, counts_v))
print(pred_frequencies_v)

[[    1 64642]
 [    2  4212]]


In [17]:
# Save a csv file with the prediction from the ensemble models
v_df = pd.DataFrame({
    pred_id_col: valid_exam_ids,
    prob_col_names[0]: valid_y_pred,
    prob_col_names[1]: valid_ensemble_prediction[:, 0],
    prob_col_names[2]: valid_ensemble_prediction[:, 1],
    prob_col_names[3]: valid_ensemble_prediction[:, 2],
    prob_col_names[4]: prob_class1_norm_v,
    prob_col_names[5]: prob_class3_norm_v,
})
# Index on the configured id column rather than a hard-coded 'exam_ids', so
# this cell keeps working if pred_id_col is changed. drop=False keeps the id
# as a regular column as well, so the written CSV carries it twice (index and
# column).
v_df = v_df.set_index(pred_id_col, drop=False)
v_df.to_csv(output_file_v)

In [18]:
# Preview the first five rows of the saved validation-set prediction
v_df.head(n=5)

Unnamed: 0_level_0,exam_ids,predicted_class,prob_class1,prob_class2,prob_class3,prob_class1_norm,prob_class3_norm
exam_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1384433,1384433,1,0.995274,0.00027,0.004457,0.995542,0.004458
454057,454057,1,0.999336,5.4e-05,0.000611,0.999389,0.000611
817538,817538,1,0.994707,0.000224,0.005069,0.99493,0.00507
970996,970996,1,0.997472,0.000102,0.002426,0.997573,0.002427
200611,200611,2,0.034731,0.936955,0.028314,0.550891,0.449109
