# Imports

In [None]:
import pandas as pd
import numpy as np
import random

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

import tensorflow as tf
import sklearn

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

import pickle

In [None]:
# Report the version of every core library this notebook relies on.
for template, module in (('Pandas: {}', pd),
                         ('Numpy: {}', np),
                         ('Matplotlib: {}', matplotlib),
                         ('Tensorflow: {}', tf),
                         ('SKLearn: {}', sklearn)):
    print(template.format(module.__version__))

# Load Data

In [None]:
# Read both expression tables and place them side by side
# (axis=1 concatenates along columns, not rows).
csv_paths = ['data_set_ALL_AML_train.csv',
             'data_set_ALL_AML_independent.csv']
df = pd.concat([pd.read_csv(path) for path in csv_paths], axis=1)

### Clean & Transpose

In [None]:
# Keep every column whose name does not start with 'call'.
patients = [i for i in df.columns if i[:4] != 'call']
df = df.loc[:,patients]
# Set the gene descriptions aside before removing them from the expression table.
gene_description = df['Gene Description']
df.drop('Gene Description',axis=1,inplace=True)
# Index the rows by the first remaining column, then drop 'Gene Accession Number'
# (presumably that same column -- TODO confirm against the CSV layout).
df.set_index(df.iloc[:,0],inplace=True)
df.drop('Gene Accession Number',axis=1,inplace=True)
# Transpose so the former column labels become the row index.
df = df.transpose()
# Cast the row labels to int64 and sort the rows by that integer label.
df.index = df.index.astype('int64')
df.sort_index(inplace=True)
# NOTE(review): this cell rebinds and mutates df in place, so re-running it
# without reloading df first will likely fail -- confirm before re-executing.

### Load Target

In [None]:
# Load the per-patient labels, index them by integer patient id, and sort by that id.
target = pd.read_csv('actual.csv')
target.set_index('patient',inplace=True)
target.index = target.index.astype('int64')
target.sort_index(inplace=True)
# Clear the index name by assignment. `del target.index.name` raises
# AttributeError on pandas >= 1.0, where Index.name is a property.
target.index.name = None

In [None]:
#Binary-encode target variable ('ALL' -> 1, anything else -> 0)

def impute_target(x):
    """Map a cancer label to a binary code: 1 for 'ALL', 0 for anything else."""
    return 1 if x == 'ALL' else 0

In [None]:
# Encode every 'cancer' label with impute_target and store the result as a new column.
target['One-hot'] = target['cancer'].apply(impute_target)

# Neural Net Framework

In [None]:
#Split into train and test sets
# The first 38 rows form the training set; every remaining row is held out.
train, test = df.iloc[:38], df.iloc[38:]

# Labels are taken from column position 1 of `target`, split at the same row.
target_train, target_test = target.iloc[:38, 1], target.iloc[38:, 1]

In [None]:
#Return subset of data and corresponding target values

def subset(rows,columns,data,target):
    """Positionally select `rows` x `columns` from `data` and the same `rows` from `target`.

    Returns a 2-tuple ``(data_subset, target_subset)``.
    """
    picked_data = data.iloc[rows,columns]
    picked_target = target.iloc[rows]
    return picked_data, picked_target

In [None]:
#Set size of subset (num_rows x num_cols)
num_rows, num_cols = 30, 50

# Dimensions of the training frame (row count, column count)
train_rows, train_cols = train.shape

# Number of rows in the held-out frame
test_rows = test.shape[0]

In [None]:
#Load or initialize genes, predictions, and accuracies lists

# To resume from an earlier session, uncomment the block below.
# NOTE: pickle.load can execute arbitrary code; only load files you wrote yourself.
#with open("genes.txt", "rb") as fp:
#    genes = pickle.load(fp)
#
#with open("predictions.txt", "rb") as fp:
#    predictions = pickle.load(fp)
#
#with open("accuracies.txt", "rb") as fp:
#    accuracies = pickle.load(fp)
#
#
# Fresh start: three independent, empty result lists.
genes, predictions, accuracies = [], [], []

In [None]:
#Set number of models to train
reps = 1

#Suppress output except for error messages.
#The setting does not change between iterations, so it is applied once up front.
tf.logging.set_verbosity(tf.logging.ERROR)

for iters in range(reps):
    #Select Rows and Cols: a fresh random sample of training rows and gene columns per model
    rows = random.sample(range(train_rows),num_rows)
    cols = random.sample(range(train_cols),num_cols)

    #Save Genes used for model
    model_genes = train.columns[cols]
    genes.append(model_genes)

    #feature columns: one numeric column per sampled gene
    feature_columns = [tf.feature_column.numeric_column(gene) for gene in model_genes]

    #Model Initialization
    classifier = tf.estimator.DNNClassifier(
        feature_columns=feature_columns,
        hidden_units=[25, 10, 5],
        optimizer=tf.train.AdamOptimizer(1e-2),
        n_classes=2,
        dropout=0.2,
    )

    #Input Function: build the sampled training subset once and reuse both halves
    x_train_sub, y_train_sub = subset(rows,cols,train,target_train)
    train_input_fn = tf.estimator.inputs.pandas_input_fn(
        x=x_train_sub,
        y=y_train_sub,
        num_epochs=10,
        batch_size=num_rows,
        shuffle=True
    )

    #model training
    classifier.train(input_fn=train_input_fn, steps=1000)

    #eval function: all held-out rows, restricted to the sampled gene columns
    test_input_fn = tf.estimator.inputs.pandas_input_fn(
        x=test.iloc[:,cols],
        y=target_test.iloc[:],
        num_epochs=1,
        shuffle=False
    )

    #predictions: take the first element of each prediction's "classes" entry as an int
    preds = [int(p["classes"][0]) for p in classifier.predict(test_input_fn)]

    predictions.append(preds)

    accuracies.append(accuracy_score(target_test,preds))

In [None]:
#Store updated lists

#with open('genes.txt', 'wb') as fp:
#    pickle.dump(genes, fp)
#
#with open('predictions.txt', 'wb') as fp:
#    pickle.dump(predictions, fp)
#    
#with open('accuracies.txt', 'wb') as fp:
#    pickle.dump(accuracies, fp)

# Weighting Models

In [None]:
#Convert Predictions to -1/+1
# Each per-model list is edited in place: every 0 becomes -1, other values are untouched.
for model_preds in predictions:
    for position, label in enumerate(model_preds):
        if label == 0:
            model_preds[position] = -1

In [None]:
#Convert Target Test Final Values
# Same recoding as for the predictions: 0 becomes -1, every other label is kept.
target_final = [label if label != 0 else -1 for label in target_test]

In [None]:
def final_preds(pred_list,weights):
    """Combine per-model predictions into one weighted-vote prediction per sample.

    For each sample position, every model's prediction is multiplied by that
    model's weight and the products are summed. The ensembled prediction is
    the sign of that sum.

    Parameters
    ----------
    pred_list : sequence of sequences
        ``pred_list[j][i]`` is model ``j``'s prediction for sample ``i``.
    weights : sequence of numbers
        ``weights[j]`` is the weight given to model ``j``.

    Returns
    -------
    list
        One entry per sample: the weighted sum divided by its absolute value
        (+1 or -1), or the zero sum itself when the weighted votes cancel out.
        An empty ``pred_list`` yields an empty list.
    """
    #No models means nothing to vote on (indexing pred_list[0] would raise IndexError)
    if len(pred_list) == 0:
        return []

    num_models = len(pred_list)
    ensembled = []
    for i in range(len(pred_list[0])):
        pred_sum = 0
        for j in range(num_models):
            pred_sum += pred_list[j][i]*weights[j]
        if pred_sum == 0:
            #Tie: keep the zero rather than dividing by zero
            ensembled.append(pred_sum)
        else:
            ensembled.append(pred_sum/abs(pred_sum))
    return ensembled

### Give every model equal weight

In [None]:
#Every model is treated equally
weights1 = [1 for acc in accuracies]

#Final Predictions
final_preds1 = final_preds(pred_list=predictions, weights=weights1)

#Accuracy
accuracy_score(y_true=target_final, y_pred=final_preds1)

### Weight each model by its accuracy

In [None]:
#Every model is weighted by its accuracy
final_preds2 = final_preds(pred_list=predictions, weights=accuracies)

#Accuracy
accuracy_score(y_true=target_final, y_pred=final_preds2)

### Model weight equals max{0, accuracy - mode(accuracy)}

In [None]:
#Cut off all models that predicted majority class only or performed worse
# `mode` is the accuracy value that occurs most often among the trained models.
mode = max(set(accuracies), key=accuracies.count)

# Weight is 0 at or below the mode, otherwise the margin above it.
weights3 = [0 if acc <= mode else acc - mode for acc in accuracies]

In [None]:
#Weight each model by above logic
final_preds3 = final_preds(pred_list=predictions, weights=weights3)

#Accuracy
accuracy_score(y_true=target_final, y_pred=final_preds3)

### Pass above weights through exponential function

In [None]:
#Apply exponential to weights3
# exp(w) - 1 leaves a zero weight at zero and grows the positive ones.
weights4 = [np.exp(w) - 1 for w in weights3]

In [None]:
#Weight each model by scaled weight
final_preds4 = final_preds(pred_list=predictions, weights=weights4)

#Accuracy
accuracy_score(y_true=target_final, y_pred=final_preds4)

### Plot ensemble accuracy as a function of the number of models

In [None]:
#Returns ensembled prediction accuracy as a function of the number of models
def ensemble(predictions,weights,target):
    """Score the ensemble built from the first 1, 2, ..., len(predictions) models.

    Entry ``k`` of the returned list is the accuracy_score, against `target`,
    of the weighted ensemble of the first ``k + 1`` models.
    """
    return [
        accuracy_score(target, final_preds(predictions[:k], weights[:k]))
        for k in range(1, len(predictions) + 1)
    ]

In [None]:
# One accuracy curve per weighting scheme, in the order: equal, accuracy, linear cut-off, exponential.
acc_list1, acc_list2, acc_list3, acc_list4 = (
    ensemble(predictions, scheme, target_final)
    for scheme in (weights1, accuracies, weights3, weights4)
)

In [None]:
# Ensemble accuracy against the number of models included, one line per weighting scheme.
fig, ax = plt.subplots(figsize=(16,8))

ax.plot(acc_list1,color='red',label='Equal')
ax.plot(acc_list2,'b--',label='Accuracy')
ax.plot(acc_list3,color='green',label='ReLu_Lin')
ax.plot(acc_list4,color='yellow',label='ReLu_Exp')

ax.set_title('Accuracy vs. # of Models')
ax.set_xlabel('# of Models')
ax.set_ylabel('Accuracy')
ax.set_ylim(bottom=0.4)
ax.legend()

# Scoring Genes

In [None]:
# One score slot per gene column, all starting at zero.
num_genes = df.shape[1]

gene_ave = pd.Series(np.zeros(num_genes), index=df.columns)

In [None]:
#If a gene is used in a model, add that model's weight to the gene's score and average over number of occurrences

for gene in gene_ave.index:
    running_mean = gene_ave[gene]
    occurrences = 0
    for model_idx, model_genes in enumerate(genes):
        if gene not in model_genes:
            continue
        # Incremental mean: fold this model's weight into the average so far
        running_mean = ((running_mean*occurrences) + weights4[model_idx]) / (occurrences + 1)
        occurrences += 1
    # Only genes that appeared in at least one model get their score written back
    if occurrences:
        gene_ave[gene] = running_mean

# Linear Classifiers of Top 50

In [None]:
#Get indices of 50 highest scoring genes
# nlargest(50) keeps the 50 largest scores; .index takes their labels from gene_ave's index.
top50 = gene_ave.nlargest(50).index

In [None]:
# Show the labels of the selected top-scoring genes.
print(top50)

# Optimize KNN model

In [None]:
def knn_pipeline(n, max_neighbors=None, cv=5):
    """Cross-validate KNN classifiers on the `n` highest-scoring genes.

    The training frame is restricted to the `n` genes with the largest
    `gene_ave` score. For every k in 1..max_neighbors a
    KNeighborsClassifier(n_neighbors=k) is scored with `cv`-fold
    cross-validation on the training data only.

    Parameters
    ----------
    n : int
        Number of top-scoring genes to use as features.
    max_neighbors : int, optional
        Largest neighbor count to evaluate. Defaults to `n`, which reproduces
        the original coupling between gene count and neighbor range.
        NOTE: scikit-learn rejects n_neighbors larger than the number of
        samples it was fitted on, so keep this below the size of a CV
        training fold.
    cv : int, optional
        Number of cross-validation folds (default 5).

    Returns
    -------
    list of float
        Mean cross-validation score for k = 1, 2, ..., max_neighbors.
    """
    if max_neighbors is None:
        max_neighbors = n

    top = gene_ave.nlargest(n).index
    #Only the training rows are needed here; the held-out set is never touched
    train_set = train.loc[:,top]

    knn_acc = []
    for k in range(1, max_neighbors + 1):
        KNN_model = KNeighborsClassifier(n_neighbors=k)
        cross_val = cross_val_score(KNN_model,train_set,target_train,cv=cv)
        knn_acc.append(cross_val.mean())

    return knn_acc

In [None]:
# Cross-validated KNN accuracy curves for the top 1, 3, 5 and 10 genes.
KNN_acc1, KNN_acc3, KNN_acc5, KNN_acc10 = (
    knn_pipeline(gene_count) for gene_count in (1, 3, 5, 10)
)
#KNN_acc50 = knn_pipeline(50)

In [None]:
# Cross-validated accuracy against the number of neighbors, one line per gene count.
fig, ax = plt.subplots(figsize=(12,6))

for curve, line_color, gene_count in ((KNN_acc1, 'red', '1'),
                                      (KNN_acc3, 'blue', '3'),
                                      (KNN_acc5, 'yellow', '5'),
                                      (KNN_acc10, 'green', '10')):
    ax.plot(curve, color=line_color, marker='o', label=gene_count)
#plt.plot(KNN_acc50, color='orange',marker='o', label='50')

ax.set_title('Accuracy vs. # of Neighbors')
ax.set_xlabel('# of Neighbors')
ax.set_ylabel('Accuracy')
ax.set_ylim(bottom=0.825)
ax.legend()

# Stats on genes

In [None]:
from scipy import stats

In [None]:
p_vals = []

for gene in gene_ave.nlargest(10).index:
    # Weights of every model whose sampled genes include this gene
    weight_arr = [weights4[j] for j, model_genes in enumerate(genes) if gene in model_genes]
    # One-sample t-test of those weights against 0; element 1 of the result is kept
    p_vals.append(stats.ttest_1samp(weight_arr,0)[1])

In [None]:
# Show the collected t-test results, one per gene among the 10 highest-scoring.
print(p_vals)