In [1]:
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import numpy as np
import os.path
import csv
from sklearn.svm import SVC
from sklearn.svm import NuSVC
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn import grid_search
import sklearn.linear_model as linear_model
import sklearn.metrics as metrics
from sklearn import preprocessing
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from random import randint
from sklearn import tree
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import math
import time
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

from metrics.Correlation import Correlation
from IO.Write import Write
from IO.Read import Read
from metrics.RSquare import RSquare
from DataSet.Dataset import DataSet


def setIdToName(aList):
    """Number the items of aList consecutively, starting at 0.

    Returns a dict holding two lookup tables:
      'nameToId' : item -> id (a repeated item keeps the id of its last occurrence)
      'idToName' : id -> item
    """
    forward = {}
    backward = {}

    for position, name in enumerate(aList):
        forward[name] = position
        backward[position] = name

    return {'nameToId': forward, 'idToName': backward}

def createNewIds(oldsnps,snps):
    """Re-number a subset of SNPs with consecutive ids starting at 0.

    oldsnps : dict whose ['snps']['idToName'] table maps old id -> name
    snps    : indexable sequence of old ids, in the order they should be kept

    Returns {'nameToId': name -> new id, 'idToName': new id -> name}.
    """
    previousNames = oldsnps['snps']['idToName']

    # Names of the kept SNPs, in their new order.
    keptNames = [previousNames[snps[pos]] for pos in range(len(snps))]

    return {
        'nameToId': {name: pos for pos, name in enumerate(keptNames)},
        'idToName': dict(enumerate(keptNames)),
    }

def setSnpsCode(patients,chromosomes):
    """Call snpCode(chromosomes) on every patient object and return the same dict."""
    for patient in patients.values():
        patient.snpCode(chromosomes)

    return patients
    

def tables(sampleX,sampleY,k):
    """Assemble the train/test tables for each of the k cross-validation runs.

    sampleX : dict fold -> 2-D array of features (fold 1 must exist; it fixes
              the number of columns)
    sampleY : dict fold -> 1-D array of labels, aligned with sampleX
    k       : number of runs; runs are numbered 1..k

    For run r, fold r is the test set and all other folds in sampleX, in
    dictionary order, are stacked into the training set.

    Returns dict run -> {'trainX', 'trainY', 'testX', 'testY'}.
    The training arrays are integer typed, as before.
    """
    samples = {}
    nColumns = len(sampleX[1].T)

    for run in range(1, k + 1):

        dataTestX = sampleX[run]
        dataTestY = sampleY[run]

        trainFolds = [fold for fold in sampleX if fold != run]
        n = sum(len(sampleX[fold]) for fold in trainFolds)

        dataTrainX = np.zeros((n, nColumns), dtype=int)
        dataTrainY = np.zeros((n,), dtype=int)

        # Copy each fold as one block instead of element by element.
        start = 0
        for fold in trainFolds:
            rows = len(sampleX[fold])
            stop = start + rows
            dataTrainX[start:stop, :] = sampleX[fold]
            dataTrainY[start:stop] = np.asarray(sampleY[fold])[:rows]
            start = stop

        samples[run] = {
            'trainX': dataTrainX,
            'trainY': dataTrainY,
            'testX': dataTestX,
            'testY': dataTestY,
        }

    return samples
    
def kSampleData(k,X,Y):
    """Randomly split the rows of X / Y into k disjoint folds and build the CV tables.

    X : 2-D numpy array of features
    Y : 1-D sequence of labels aligned with the rows of X
    k : number of folds; the first k-1 folds get len(X) // k rows each and
        the last fold gets all remaining rows

    Rows are drawn without replacement using randint, so every row lands in
    exactly one fold. Fold data is stored as integers.

    Returns the result of tables(sampleX, sampleY, k).
    """
    X = np.asarray(X)
    Y = np.asarray(Y)

    total = len(X)
    foldSize = total // k
    taken = np.zeros((total,), dtype=bool)

    sampleX = {}
    sampleY = {}

    for fold in range(1, k + 1):

        if fold == k:
            # The last fold absorbs the rows left over by the integer division.
            foldSize = total - ((k - 1) * foldSize)

        chosen = []
        while len(chosen) < foldSize:
            candidate = randint(0, total - 1)
            # Redraw until we hit a row that no fold has used yet.
            while taken[candidate]:
                candidate = randint(0, total - 1)
            taken[candidate] = True
            chosen.append(candidate)

        rows = np.asarray(chosen, dtype=np.intp)

        # Copy the selected rows in one step instead of element by element.
        dataX = np.zeros((foldSize, len(X.T)), dtype=int)
        dataY = np.zeros((foldSize,), dtype=int)
        dataX[:, :] = X[rows]
        dataY[:] = Y[rows]

        sampleX[fold] = dataX
        sampleY[fold] = dataY

    return tables(sampleX,sampleY,k)




def createNewTable(snps,X):
    """Return an integer copy of X restricted to the columns listed in snps.

    snps : indexable collection (positions 0..len(snps)-1) of column indices
    X    : 2-D numpy array

    Column j of the result is column snps[j] of X. The new shape is printed.
    """
    columns = np.asarray([snps[i] for i in range(len(snps))], dtype=np.intp)

    # Every column is overwritten, so no placeholder fill is needed.
    newX = np.zeros((len(X), len(columns)), dtype=int)
    newX[:, :] = X[:, columns]

    print("new shape = ",newX.shape)

    return newX



def featuresIds(oldSnps,snps):
    """Map each position 0..len(snps)-1 to the entry of snps at that position.

    oldSnps is accepted but not used.
    """
    return {position: snps[position] for position in range(len(snps))}



def crossValidiation(X, Y, k = 1, continious = True, classifier = None,OLS = False,Logistic = False):
    """Run k-fold cross-validation and return the mean classification scores.

    X, Y       : feature matrix and 0/1 labels, split into folds by kSampleData
    k          : number of folds; every score is averaged over k
    continious : when True, predictions are snapped to 0 or 1, whichever is
                 closer (a prediction of exactly 0.5 becomes 0)
    classifier : object with fit / predict (and predict_proba when Logistic is
                 True); re-fitted in place on every fold
    OLS        : when True, a statsmodels OLS model is fitted on every fold
                 and used instead of classifier
    Logistic   : when True, predictions are replaced by
                 predict_proba(...)[:, 1] >= 0.8

    Returns a dict with keys 'accuracy', 'auc', 'recall', 'precision', 'f1'.
    Prints a message and returns None when no classifier is given and OLS is
    False.
    """
    # Compare against None rather than truthiness: estimators that define
    # __len__ can be falsy or raise when tested as a boolean. OLS builds its
    # own model, so it does not need a classifier.
    if classifier is None and not OLS:
        print("wrong!!!!!!! you have to choise a classifier")
        return

    scoreNames = ('accuracy', 'auc', 'recall', 'precision', 'f1')
    foldScores = {name: [] for name in scoreNames}

    samples = kSampleData(k,X,Y)

    for run in range(1, k + 1):

        trainX = samples[run]['trainX']
        trainY = samples[run]['trainY']

        #trainX,trainY = balancedData(trainX,trainY)

        testX = samples[run]['testX']
        testY = samples[run]['testY']

        if OLS:
            classifier = sm.OLS(trainY,trainX)
            yPredict = classifier.fit().predict(testX)
        else:
            classifier.fit(trainX, trainY)
            yPredict = classifier.predict(testX)

        if continious:
            # Snap to the nearer of 0 and 1, in place so the dtype is kept.
            closerToZero = (np.abs(0 - yPredict) - np.abs(1 - yPredict)) <= 1e-10
            yPredict[closerToZero] = 0
            yPredict[~closerToZero] = 1

        if Logistic:
            probabilities = classifier.predict_proba(testX)
            yPredict[:] = np.where(probabilities[:, 1] >= 0.8, 1, 0)

        # NOTE(review): the ROC curve is built from the hard 0/1 predictions,
        # not from scores, so 'auc' reflects a single operating point.
        fpr, tpr, thresholds = metrics.roc_curve(testY,yPredict)

        foldScores['accuracy'].append(metrics.accuracy_score(testY,yPredict))
        foldScores['auc'].append(metrics.auc(fpr,tpr))
        foldScores['recall'].append(metrics.recall_score(testY,yPredict))
        foldScores['precision'].append(metrics.precision_score(testY,yPredict))
        foldScores['f1'].append(f1_score(testY, yPredict, average='binary'))

    results = {}
    for name in scoreNames:
        results[name] = sum(foldScores[name]) / k

    return results

def _writeResultsReport(path, name, results, suffix):
    """Create a fresh dated folder under path and write results to <name><suffix>.txt.

    The folder is named '<name><suffix> ( dd-mm-YYYY ) '. If it already
    exists, '_1', '_2', ... is appended until an unused name is found.
    """
    timee = time.strftime("%d-%m-%Y")

    base = path + name + suffix + " ( " + timee + " ) "
    # os.path.join(..., '') appends the platform's separator, so the layout
    # works on every OS instead of only where '\\' separates directories.
    folder = os.path.join(base, '')

    i = 1
    while os.path.exists(folder):
        folder = os.path.join(base + '_' + str(i), '')
        i += 1

    os.makedirs(folder)

    file = os.path.join(folder, name + suffix + '.txt')

    # 'with' closes the file even if formatting a value raises.
    with open(file,'w') as out:

        out.write(timee + '\n'+ '\n')
        out.write(name + '\n')

        for category in results:

            out.write("Category = " + str(category) + '\n')

            for key in results[category]:
                out.write(str(key) + " = " + str(results[category][key]) + '\n')

            out.write('\n')
            out.write('\n')


def writeResultConf(path,name,results):
    """Write the hold-out (confusion-matrix) results to a new dated folder.

    path    : directory prefix, expected to end with a path separator
    name    : report name; the folder and file are called '<name>Confu'
    results : dict category -> dict metric -> value
    """
    _writeResultsReport(path, name, results, 'Confu')


def writeResult(path,name,results):
    """Write the cross-validation results to a new dated folder.

    path    : directory prefix, expected to end with a path separator
    name    : report name; the folder and file are called '<name>'
    results : dict category -> dict metric -> value
    """
    _writeResultsReport(path, name, results, '')

def balancedData(X,Y):
    """Down-sample the controls so there are at most as many as cases.

    X : 2-D array of features
    Y : 1-D sequence of labels; 1 marks a case, 0 a control (rows with any
        other label are dropped)

    All cases are kept, plus the first controls in row order up to the
    number of cases. The original row order is preserved.

    Returns (Xbalanced, Ybalanced) as float arrays and prints their shapes.
    When there are fewer controls than cases, only the rows that exist are
    returned; no all-zero padding rows are added.
    """
    features = np.asarray(X)
    labels = np.asarray(Y)

    isCase = labels == 1
    isControl = labels == 0
    cases = int(isCase.sum())

    # Running count of controls: keep a control only while the count <= cases.
    controlRank = np.cumsum(isControl)
    keep = isCase | (isControl & (controlRank <= cases))

    Xbalanced = features[keep].astype(float)
    Ybalanced = labels[keep].astype(float)

    print("len x = ", Xbalanced.shape)
    print("len y = ", Ybalanced.shape)

    return Xbalanced, Ybalanced
            



In [2]:

# Data directory. The commented alternatives point at the same data set
# filtered with other p-value thresholds.
#path = 'C:\\Users\\ANTONIS\\Desktop\\newSet\\maf\\maf = 0.05\\pvalue = 0.001\\'
#path = 'C:\\Users\\ANTONIS\\Desktop\\newSet\\maf\\maf = 0.05\\pvalue = 0.01\\'
#path = 'C:\\Users\\ANTONIS\\Desktop\\newSet\\maf\\maf = 0.05\\pvalue = 0.005\\'
#path = 'C:\\Users\\ANTONIS\\Desktop\\newSet\\maf\\maf = 0.05\\pvalue = 0.05\\'

path = 'D:\\newSet\\maf\\maf = 0.05\\pvalue = 0.001\\'

numberOfChromosomes = 22  # number of chromosomes
patientsTrain = {}
patientsTest = {}
allPatients = {}

chromosomes = {}

# Project I/O helpers, both bound to the data directory and chromosome count.
read = Read(path,numberOfChromosomes)
write = Write(path,numberOfChromosomes)

# Load the patients and the SNPs, then write both lists back out through
# the Write helper. NOTE(review): the formats of 'Patients.txt' and the
# ".assoc" input are defined by Read and are not visible here.
patients = read.readPatients('Patients.txt')
chromosomes = read.readSnps(".assoc")
write.writePatientsList(patients,'patient.txt')
write.writeSnpsList(chromosomes)


# run train_lgen bat and test_leg bat

In [3]:

# Build integer-id lookup tables for patients and SNPs, then load the
# genotype matrix X and the labels Y.
snps = read.getListOfSnps()
ids = {
    'patients': setIdToName(list(patients.keys())),
    'snps': setIdToName(snps),
}

# The SNP-code file is generated only when it does not exist yet.
if os.path.exists(path + 'snpCode.txt'):
    print("mphka")
else:
    write.writeSnpLog(read.getNumberOfPatients(),read.getNumberOfSnps(),chromosomes)

X, Y = read.readSnpsCode(patients,ids)

X.shape

mphka
mphka2


(4980, 7799)

# Correlation

In [4]:
# Hold out 10% of the samples. The seed is drawn at random, so the split
# differs from run to run.
xTraining, xTest, yTraining1, yTest = train_test_split(
    X, Y, test_size=0.1, random_state=randint(0,2018))

for label, table in (("mergex = ", X),
                     ("xTrain = ", xTraining),
                     ("xTest = ", xTest),
                     ("yTrain = ", yTraining1),
                     ("yTest = ", yTest)):
    print(label, table.shape)

mergex =  (4980, 7799)
xTrain =  (4482, 7799)
xTest =  (498, 7799)
yTrain =  (4482,)
yTest =  (498,)


In [5]:
# Correlation helper built on the full matrix X.
# NOTE(review): X includes the rows later held out as xTest; the commented
# alternatives below use xTraining only, or the RSquare helper instead.
cor = Correlation(X)
#cor = RSquare(X)

#cor = Correlation(xTraining)
#cor = RSquare(xTraining)
# Selected SNP column indices, keyed by category ('high' / 'low').
snpReduc = {}

# Correlation high

In [None]:
# Select the 'high' correlation SNP set (threshold 0.7, down=100, up=100) and
# write the selection out through the Write helper.
# NOTE(review): the meaning of down / up is defined inside Correlation — confirm there.
snpReduc['high'] = cor.getHighCorrelationSnps(0.7,down=100,up=100)
write.writeSnpsUsed(snpReduc['high'],ids['snps']['idToName'],chromosomes,'high')

# Correlation Low

In [45]:
# Parameters of the 'low' correlation selection. They stay module-level
# because the model cells copy down / up / threshold into their result dicts.
down = 97
up = 100 
threshold = 0.7
# Select the 'low' correlation SNP set and write the selection out.
snpReduc['low'] = cor.getLowCorrelationSnps(threshold, down=down,up=up)
write.writeSnpsUsed(snpReduc['low'],ids['snps']['idToName'],chromosomes,'low')

snpsIds =  7515
idToName =  7799


In [None]:
# Disabled exploration of the correlation matrix. Each snippet is a bare
# string literal, so nothing in this cell executes.
'''print(ids['snps']['nameToId']['rs75570604'])
get = cor.getCorrMatrix()

summ = 0 

for i in range(len(get)):
    if get[6555,i] - 0.7 < 1e-10 :
        summ += 1'''

'''for i in range(len(get)):
    sum1 = 0
    for j in range(len(get.T)):
        if get[i,j] - 0.7 < 1e-10 :
            sum1 += 1
    if sum1 == len(get.T)-1:
        summ += 1
'''

'''for i in range(len(get)):
    sum1 = 0
    for j in range(len(get.T)):
        if get[i,j] - 0.7 > 1e-10 :
            sum1 += 1
    if sum1 > summ:
        summ = sum1'''


'''print(summ)
print(len(get))'''

In [31]:
# Correlation categories to evaluate; only 'low' is active.
#categories = ['low','high']
categories = ['low']

# Per-category result dicts filled in by the model cells:
# writeResults holds cross-validation scores, writeResults2 hold-out scores.
writeResults = {}
writeResults2 = {}

In [8]:
# Disabled: kept as a string literal, so nothing here executes.
# NOTE(review): featuresIds takes two parameters (oldSnps, snps); this call
# passes only one and would raise TypeError if re-enabled as written.
'''featuresIdss = {}
featuresIdss['low'] = featuresIds(snpReduc['low'])'''


"featuresIdss = {}\nfeaturesIdss['low'] = featuresIds(snpReduc['low'])"

In [9]:
# Disabled experiment (string literal, never executed): chi2 SelectKBest
# feature selection with k = 1000.
# NOTE(review): the same selector is re-fitted separately on the training
# set, the test set and the full data, so the three tables are not
# guaranteed to keep the same columns.
'''from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest



test = SelectKBest(score_func=chi2,k = 1000)
fit1 = test.fit(xTraining, yTraining1)
xTraining1 = fit1.transform(xTraining)

fit2 = test.fit(xTest, yTest)
xTest1 = fit2.transform(xTest)

fit3 = test.fit(X, Y)
XX = fit3.transform(X)

print("xtest", xTest1.shape)
print("xtraining", xTraining1.shape)
print("X", XX.shape)'''

'from sklearn.feature_selection import chi2\nfrom sklearn.feature_selection import SelectKBest\n\n\n\ntest = SelectKBest(score_func=chi2,k = 1000)\nfit1 = test.fit(xTraining, yTraining1)\nxTraining1 = fit1.transform(xTraining)\n\nfit2 = test.fit(xTest, yTest)\nxTest1 = fit2.transform(xTest)\n\nfit3 = test.fit(X, Y)\nXX = fit3.transform(X)\n\nprint("xtest", xTest1.shape)\nprint("xtraining", xTraining1.shape)\nprint("X", XX.shape)'

# SVM KERNEL

In [32]:
from sklearn.svm import NuSVC

# Linear-kernel SVM: score the held-out split, then run 10-fold
# cross-validation, once per correlation category.
for i in categories:

    print("Category = ",i)

    # Restrict every table to the SNP columns selected for this category.
    XX = createNewTable(snpReduc[i],X)
    xTraining1 = createNewTable(snpReduc[i],xTraining)
    xTest1 = createNewTable(snpReduc[i],xTest)

    # One-hot encode each SNP column into 3 indicator columns. fit_transform
    # already fits the encoder, so no separate fit() call is needed.
    enc = OneHotEncoder(n_values =3)
    XX = enc.fit_transform(XX).toarray()
    print(XX.shape)
    xTraining1 = enc.fit_transform(xTraining1).toarray()
    print(xTraining1.shape)
    xTest1 = enc.fit_transform(xTest1).toarray()
    print(xTest1.shape)

    clf = SVC(kernel='linear')
    #clf = NuSVC(kernel='rbf',nu=0.01)
    clf.fit(xTraining1, yTraining1)
    yPredict2 = clf.predict(xTest1)

    # Hold-out metrics: computed once, then both printed and stored.
    # NOTE(review): the ROC curve is built from predicted labels, not scores.
    fpr, tpr, thresholds = metrics.roc_curve(yTest,yPredict2)

    r1 = {}
    r1['down'] = down
    r1['up'] = up
    r1['thres'] = threshold
    r1['confu'] = metrics.confusion_matrix(yTest,yPredict2)
    r1['accu'] = metrics.accuracy_score(yTest,yPredict2)
    r1['auc'] = metrics.auc(fpr,tpr)
    r1['recal'] = metrics.recall_score(yTest,yPredict2)
    r1['precision'] = metrics.precision_score(yTest,yPredict2)
    r1['f1'] = f1_score(yTest, yPredict2, average='binary')

    error2 = mean_squared_error(yTest, yPredict2)
    RMSE2 = error2**0.5

    print(r1['accu'])
    print(r1['confu'])
    print("error 2 = ",error2)
    print("RMSE2 = ",RMSE2)
    print("AUC = ", r1['auc'])
    print("recal = ",r1['recal'])
    print("precision = ",r1['precision'])
    print("f1Score = ",r1['f1'])

    writeResults2[i] = r1
    print()

    results = crossValidiation(XX, Y, k = 10, classifier = clf, continious = False)
    results['down'] = down
    results['up'] = up
    results['thres'] = threshold
    print("accuracy = ",results['accuracy'])
    print("AUC = ", results['auc'])
    print("recal = ",results['recall'])
    print("precision = ",results['precision'])
    print("f1 = ",results['f1'])
    # Record the real number of SNPs used instead of a hard-coded 500.
    results['len snps'] = len(snpReduc[i])
    writeResults[i] = results

    print()

writeResult(path,'Linear',writeResults)
writeResultConf(path,'Linear',writeResults2)

Category =  low
new shape =  (4980, 883)
new shape =  (4482, 883)
new shape =  (498, 883)
(4980, 2649)
(4482, 2649)
(498, 2649)
0.887550200803
[[360  21]
 [ 35  82]]
error 2 =  0.112449799197
RMSE2 =  0.335335353336
AUC =  0.822868295309
recal =  0.700854700855
precision =  0.796116504854
f1Score =  0.745454545455

accuracy =  0.908835341365
AUC =  0.854407646747
recal =  0.76303172625
precision =  0.782145072687
f1 =  0.771993399797



In [33]:
#re = cross_val_score(clf, XX, Y, cv=10)
#print(sum(re)/10)

# LINEAR LOGISTIC REGRESSION 

In [34]:

# Cross-validated logistic regression: score the held-out split, then run
# 10-fold cross-validation, once per correlation category.
for i in categories:

    print("Category = ",i)

    # Restrict every table to the SNP columns selected for this category.
    XX = createNewTable(snpReduc[i],X)
    xTraining1 = createNewTable(snpReduc[i],xTraining)
    xTest1 = createNewTable(snpReduc[i],xTest)

    # One-hot encode each SNP column into 3 indicator columns. fit_transform
    # already fits the encoder, so no separate fit() call is needed.
    enc = OneHotEncoder(n_values =3)
    XX = enc.fit_transform(XX).toarray()
    print(XX.shape)
    xTraining1 = enc.fit_transform(xTraining1).toarray()
    print(xTraining1.shape)
    xTest1 = enc.fit_transform(xTest1).toarray()
    print(xTest1.shape)

    #lr_clf = linear_model.LogisticRegression()

    lr_clf = linear_model.LogisticRegressionCV()
    lr_clf.fit(xTraining1, yTraining1)

    yPredict4 = lr_clf.predict(xTest1)

    # Hold-out metrics: computed once, then both printed and stored.
    # NOTE(review): the ROC curve is built from predicted labels, not scores.
    fpr, tpr, thresholds = metrics.roc_curve(yTest,yPredict4)

    r1 = {}
    r1['confu'] = metrics.confusion_matrix(yTest,yPredict4)
    r1['accu'] = metrics.accuracy_score(yTest,yPredict4)
    r1['auc'] = metrics.auc(fpr,tpr)
    r1['recal'] = metrics.recall_score(yTest,yPredict4)
    r1['precision'] = metrics.precision_score(yTest,yPredict4)
    r1['f1'] = f1_score(yTest, yPredict4, average='binary')

    error4 = mean_squared_error(yTest, yPredict4)
    RMSE4 = error4**0.5

    print(r1['accu'])
    print(r1['confu'])
    print("error 4 = ",error4)
    print("RMSE4 = ",RMSE4)
    print("AUC = ", r1['auc'])
    print("recal = ",r1['recal'])
    print("precision = ",r1['precision'])
    print("f1Score = ",r1['f1'])

    writeResults2[i] = r1

    print()

    results = crossValidiation(XX, Y, k = 10, classifier = lr_clf, continious = False)
    results['down'] = down
    results['up'] = up
    results['thres'] = threshold
    print("accuracy = ",results['accuracy'])
    print("AUC = ", results['auc'])
    print("recal = ",results['recall'])
    print("precision = ",results['precision'])
    print("f1 = ",results['f1'])
    # Record the real number of SNPs used instead of a hard-coded 500.
    results['len snps'] = len(snpReduc[i])
    writeResults[i] = results

    print()


writeResult(path,'llrcv',writeResults)
writeResultConf(path,'llrcv',writeResults2)

Category =  low
new shape =  (4980, 883)
new shape =  (4482, 883)
new shape =  (498, 883)
(4980, 2649)
(4482, 2649)
(498, 2649)
0.919678714859
[[373   8]
 [ 32  85]]
error 4 =  0.0803212851406
RMSE4 =  0.283410100633
AUC =  0.852749175584
recal =  0.726495726496
precision =  0.913978494624
f1Score =  0.809523809524

accuracy =  0.927510040161
AUC =  0.859788201521
recal =  0.745278574643
precision =  0.881341598527
f1 =  0.807131142732



In [82]:
#XX = createNewTable(snpReduc['low'],X)

# Fit a cross-validated logistic regression on all SNPs and collect the
# column indices of the 30 largest coefficients.
lr_clf = linear_model.LogisticRegressionCV()
lr_clf.fit(X, Y)

coefs=lr_clf.coef_[0]

# Disabled variant: rank by absolute value instead of signed value.
#for i in range(len(coefs)):
    #coefs[i] = abs(coefs[i])

# nameToId : coefficient value -> list of column indices with that value
#            (equal coefficients share one list, in ascending index order)
# idToName : column index -> coefficient value
idToName = {}
nameToId = {}

for i in range(len(coefs)):
    nameToId.setdefault(coefs[i], []).append(i)
    idToName[i] = coefs[i]

# ids has no 'coef' entry unless one was created earlier; create it on
# demand so the assignments below cannot raise KeyError.
ids.setdefault('coef', {})
ids['coef']['nameToId'] = nameToId
ids['coef']['idToName'] = idToName


sc = sorted(coefs )

top_30 = []

for i in sc[-30:]:

    # Take the first unused column for this coefficient value, so equal
    # coefficients map to distinct columns.
    snp = ids['coef']['nameToId'][i].pop(0)
    #top_30.append(snpReduc['low'][snp])
    top_30.append(snp)

#write.writeSnpsUsed(top_30,ids['snps']['idToName'],chromosomes,'top_30_0.7_down=100_up=100_notabs')
#write.writeSnpsUsed(top_30,ids['snps']['idToName'],chromosomes,'top_30_all_notabs')

In [36]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes: score the held-out split, then run 10-fold
# cross-validation, once per correlation category.
for i in categories:

    print("Category = ",i)

    # Restrict every table to the SNP columns selected for this category.
    XX = createNewTable(snpReduc[i],X)
    xTraining1 = createNewTable(snpReduc[i],xTraining)
    xTest1 = createNewTable(snpReduc[i],xTest)

    # One-hot encode each SNP column into 3 indicator columns. fit_transform
    # already fits the encoder, so no separate fit() call is needed.
    enc = OneHotEncoder(n_values =3)
    XX = enc.fit_transform(XX).toarray()
    print(XX.shape)
    xTraining1 = enc.fit_transform(xTraining1).toarray()
    print(xTraining1.shape)
    xTest1 = enc.fit_transform(xTest1).toarray()
    print(xTest1.shape)

    gnb = GaussianNB()
    gnb.fit(xTraining1, yTraining1)
    yPredict6 = gnb.predict(xTest1)

    # Hold-out metrics: computed once, then both printed and stored.
    # NOTE(review): the ROC curve is built from predicted labels, not scores.
    fpr, tpr, thresholds = metrics.roc_curve(yTest,yPredict6)

    r1 = {}
    r1['confu'] = metrics.confusion_matrix(yTest,yPredict6)
    r1['accu'] = metrics.accuracy_score(yTest,yPredict6)
    r1['auc'] = metrics.auc(fpr,tpr)
    r1['recal'] = metrics.recall_score(yTest,yPredict6)
    r1['precision'] = metrics.precision_score(yTest,yPredict6)
    r1['f1'] = f1_score(yTest, yPredict6, average='binary')

    error6 = mean_squared_error(yTest, yPredict6)
    RMSE6 = error6**0.5

    print(r1['accu'])
    print(r1['confu'])
    print("error 6 = ",error6)
    print("RMSE6 = ",RMSE6)
    print("AUC = ", r1['auc'])
    print("recal = ",r1['recal'])
    print("precision = ",r1['precision'])
    print("f1Score = ",r1['f1'])

    writeResults2[i] = r1

    print()

    results = crossValidiation(XX, Y, k = 10,classifier = gnb, continious = False)
    results['down'] = down
    results['up'] = up
    results['thres'] = threshold
    print("accuracy = ",results['accuracy'])
    print("AUC = ", results['auc'])
    print("recal = ",results['recall'])
    print("precision = ",results['precision'])
    print("f1 = ",results['f1'])
    print()
    # Record the real number of SNPs used instead of a hard-coded 500.
    results['len snps'] = len(snpReduc[i])
    writeResults[i] = results

    print()


writeResult(path,'gnb',writeResults)
writeResultConf(path,'gnb',writeResults2)

Category =  low
new shape =  (4980, 883)
new shape =  (4482, 883)
new shape =  (498, 883)
(4980, 2649)
(4482, 2649)
(498, 2649)
0.423694779116
[[107 274]
 [ 13 104]]
error 6 =  0.576305220884
RMSE6 =  0.759147693722
AUC =  0.584864391951
recal =  0.888888888889
precision =  0.275132275132
f1Score =  0.420202020202

accuracy =  0.439558232932
AUC =  0.616544720749
recal =  0.915952188927
precision =  0.256216137769
f1 =  0.399768819262




In [37]:
# Quick check of the training-label shape (shown as the cell's output).
#xTraining.shape
yTraining1.shape

(4482,)

# BERNOULLI


In [38]:

# Bernoulli naive Bayes: score the held-out split, then run 10-fold
# cross-validation, once per correlation category.
for i in categories:

    print("Category = ",i)

    # Restrict every table to the SNP columns selected for this category.
    XX = createNewTable(snpReduc[i],X)
    xTraining1 = createNewTable(snpReduc[i],xTraining)
    xTest1 = createNewTable(snpReduc[i],xTest)

    # One-hot encode each SNP column into 3 indicator columns. fit_transform
    # already fits the encoder, so no separate fit() call is needed.
    enc = OneHotEncoder(n_values =3)
    XX = enc.fit_transform(XX).toarray()
    print(XX.shape)
    xTraining1 = enc.fit_transform(xTraining1).toarray()
    print(xTraining1.shape)
    xTest1 = enc.fit_transform(xTest1).toarray()
    print(xTest1.shape)

    bern = BernoulliNB()
    bern.fit(xTraining1, yTraining1)
    yPredict6 = bern.predict(xTest1)

    # Hold-out metrics: computed once, then both printed and stored.
    # NOTE(review): the ROC curve is built from predicted labels, not scores.
    fpr, tpr, thresholds = metrics.roc_curve(yTest,yPredict6)

    r1 = {}
    r1['confu'] = metrics.confusion_matrix(yTest,yPredict6)
    r1['accu'] = metrics.accuracy_score(yTest,yPredict6)
    r1['auc'] = metrics.auc(fpr,tpr)
    r1['recal'] = metrics.recall_score(yTest,yPredict6)
    r1['precision'] = metrics.precision_score(yTest,yPredict6)
    r1['f1'] = f1_score(yTest, yPredict6, average='binary')

    error6 = mean_squared_error(yTest, yPredict6)
    RMSE6 = error6**0.5

    print(r1['accu'])
    print(r1['confu'])
    print("error 6 = ",error6)
    print("RMSE6 = ",RMSE6)
    print("AUC = ", r1['auc'])
    print("recal = ",r1['recal'])
    print("precision = ",r1['precision'])
    print("f1Score = ",r1['f1'])

    writeResults2[i] = r1

    print()

    results = crossValidiation(XX, Y, k = 10,classifier = bern, continious = False)
    results['down'] = down
    results['up'] = up
    results['thres'] = threshold
    print("accuracy = ",results['accuracy'])
    print("AUC = ", results['auc'])
    print("recal = ",results['recall'])
    print("precision = ",results['precision'])
    print("f1 = ",results['f1'])
    print()
    # Record the real number of SNPs used instead of a hard-coded 500.
    results['len snps'] = len(snpReduc[i])
    writeResults[i] = results

    print()


writeResult(path,'bernoulli',writeResults)
writeResultConf(path,'bernoulli',writeResults2)

Category =  low
new shape =  (4980, 883)
new shape =  (4482, 883)
new shape =  (498, 883)
(4980, 2649)
(4482, 2649)
(498, 2649)
0.947791164659
[[369  12]
 [ 14 103]]
error 6 =  0.0522088353414
RMSE6 =  0.228492527977
AUC =  0.924422908675
recal =  0.880341880342
precision =  0.895652173913
f1Score =  0.887931034483

accuracy =  0.9359437751
AUC =  0.900737176602
recal =  0.84106429326
precision =  0.842731785609
f1 =  0.84121991563




In [39]:
# Disabled scratch test of np.argpartition on a small list (string literal,
# never executed).
'''l= [5,6,7,2,3,4,10,6,7,9,8,5]
top_3 = np.argpartition(l,3)[:3]
print(top_3)
removedSnps=[]
for i in range(len(l)):
    if i not in top_3:
        removedSnps.append(l[i])
print(removedSnps)'''

'l= [5,6,7,2,3,4,10,6,7,9,8,5]\ntop_3 = np.argpartition(l,3)[:3]\nprint(top_3)\nremovedSnps=[]\nfor i in range(len(l)):\n    if i not in top_3:\n        removedSnps.append(l[i])\nprint(removedSnps)'

In [40]:
# Disabled comparison (string literal, never executed): 10-fold
# cross_val_score of bern on the reduced SNP table and then on the full X.
'''XX = createNewTable(snpReduc['low'],X)
#enc = OneHotEncoder(n_values =3) 
#enc.fit(XX) 
#XX = enc.fit_transform(XX)
#XX = XX.toarray()
print(XX.shape)

re = cross_val_score(bern, XX, Y, cv=10)
print(sum(re)/10)

print()
XX = X
#enc = OneHotEncoder(n_values =3) 
#enc.fit(XX) 
#XX = enc.fit_transform(XX)
#XX = XX.toarray()
print(XX.shape)
re = cross_val_score(bern, XX, Y, cv=10)
print(sum(re)/10)
'''

"XX = createNewTable(snpReduc['low'],X)\n#enc = OneHotEncoder(n_values =3) \n#enc.fit(XX) \n#XX = enc.fit_transform(XX)\n#XX = XX.toarray()\nprint(XX.shape)\n\nre = cross_val_score(bern, XX, Y, cv=10)\nprint(sum(re)/10)\n\nprint()\nXX = X\n#enc = OneHotEncoder(n_values =3) \n#enc.fit(XX) \n#XX = enc.fit_transform(XX)\n#XX = XX.toarray()\nprint(XX.shape)\nre = cross_val_score(bern, XX, Y, cv=10)\nprint(sum(re)/10)\n"

# TREE

In [41]:
from sklearn import tree

# Decision tree: score the held-out split, then run 10-fold
# cross-validation, once per correlation category.
for i in categories:

    print("Category = ",i)

    # Restrict every table to the SNP columns selected for this category.
    XX = createNewTable(snpReduc[i],X)
    xTraining1 = createNewTable(snpReduc[i],xTraining)
    xTest1 = createNewTable(snpReduc[i],xTest)

    # One-hot encode each SNP column into 3 indicator columns.
    enc = OneHotEncoder(n_values =3)
    XX = enc.fit_transform(XX).toarray()
    print(XX.shape)
    xTraining1 = enc.fit_transform(xTraining1).toarray()
    print(xTraining1.shape)
    xTest1 = enc.fit_transform(xTest1).toarray()
    print(xTest1.shape)

    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(xTraining1, yTraining1)
    yPredict4 = clf.predict(xTest1)

    # Hold-out metrics, stored first and printed from the stored values.
    fpr, tpr, thresholds = metrics.roc_curve(yTest,yPredict4)

    r1 = {}
    r1['confu'] = metrics.confusion_matrix(yTest,yPredict4)
    r1['accu'] = metrics.accuracy_score(yTest,yPredict4)
    r1['auc'] = metrics.auc(fpr,tpr)
    r1['recal'] = metrics.recall_score(yTest,yPredict4)
    r1['precision'] = metrics.precision_score(yTest,yPredict4)
    r1['f1'] = f1_score(yTest, yPredict4, average='binary')

    error4 = mean_squared_error(yTest, yPredict4)
    RMSE4 = error4**0.5

    print(r1['accu'])
    print(r1['confu'])
    print("error 4 = ",error4)
    print("RMSE4 = ",RMSE4)
    print("AUC = ", r1['auc'])
    print("recal = ",r1['recal'])
    print("precision = ",r1['precision'])
    print("f1Score = ",r1['f1'])

    writeResults2[i] = r1

    print()

    results = crossValidiation(XX, Y, k = 10, classifier = clf, continious = False)
    results['down'] = down
    results['up'] = up
    results['thres'] = threshold
    print("accuracy = ",results['accuracy'])
    print("AUC = ", results['auc'])
    print("recal = ",results['recall'])
    print("precision = ",results['precision'])
    print("f1 = ",results['f1'])
    results['len snps'] = len(snpReduc[i])
    writeResults[i] = results

    print()


writeResult(path,'tree',writeResults)
writeResultConf(path,'tree',writeResults2)

Category =  low
new shape =  (4980, 883)
new shape =  (4482, 883)
new shape =  (498, 883)
(4980, 2649)
(4482, 2649)
(498, 2649)
0.654618473896
[[304  77]
 [ 95  22]]
error 4 =  0.345381526104
RMSE4 =  0.587691693071
AUC =  0.492967225251
recal =  0.188034188034
precision =  0.222222222222
f1Score =  0.203703703704

accuracy =  0.693172690763
AUC =  0.52872468675
recal =  0.250655618299
precision =  0.251017229008
f1 =  0.250275992945



In [42]:
import graphviz

# Feature labels for the exported tree: every id kept in snpReduc['low'] is
# looked up in the ids['snps']['idToName'] mapping.
featuresName = [ids['snps']['idToName'][snp] for snp in snpReduc['low']]

# Restrict the training table to the 'low' SNP subset and fit a shallow tree.
xTraining1 = createNewTable(snpReduc['low'], xTraining)
clf = tree.DecisionTreeClassifier(max_depth=6)
clf = clf.fit(xTraining1, yTraining1)

# With out_file=None export_graphviz returns the DOT source as a string.
# Passing an open file instead makes it write to that file and return None,
# which left graphviz.Source() below with nothing to render.
dot_data = tree.export_graphviz(clf, out_file=None,
                                feature_names=featuresName)

# Still persist the DOT text to disk; the context manager guarantees the
# handle is flushed and closed (it was previously left open).
with open(path + 'treeLayers.dot', 'w') as treeee:
    treeee.write(dot_data)

graph = graphviz.Source(dot_data)


new shape =  (4482, 883)


# ALL


In [21]:

# Bernoulli naive Bayes on all SNPs: one hold-out evaluation followed by a
# 10-fold cross-validation, both written out under the 'bernoulli_all' label.
r1 = {}
writeResults2 = {}
writeResults = {}

# n_values=3 fixes the number of indicator columns per input column, so the
# column layout does not depend on which split the encoder is fitted on.
# Fit once on the training split and only transform the other tables; the
# encoder used to be fitted twice per table (fit + fit_transform) and was
# refitted on the test split, which would misalign columns if n_values were
# ever changed to a data-dependent setting.
enc = OneHotEncoder(n_values=3)
enc.fit(xTraining)

XX = enc.transform(X).toarray()
print(XX.shape)

xTraining1 = enc.transform(xTraining).toarray()
print(xTraining1.shape)

xTest1 = enc.transform(xTest).toarray()
print(xTest1.shape)

bern = BernoulliNB()
bern.fit(xTraining1, yTraining1)
yPredict6 = bern.predict(xTest1)

# Hold-out metrics: each one is computed once, stored, and printed from the
# stored value.
# NOTE(review): the ROC curve is built from predicted labels, not from
# class scores, so the AUC reflects a single operating point.
fpr, tpr, thresholds = metrics.roc_curve(yTest, yPredict6)
error6 = mean_squared_error(yTest, yPredict6)
RMSE6 = error6 ** 0.5

r1['confu'] = metrics.confusion_matrix(yTest, yPredict6)
r1['accu'] = metrics.accuracy_score(yTest, yPredict6)
r1['auc'] = metrics.auc(fpr, tpr)
r1['recal'] = metrics.recall_score(yTest, yPredict6)
r1['precision'] = metrics.precision_score(yTest, yPredict6)
r1['f1'] = f1_score(yTest, yPredict6, average='binary')

print(r1['accu'])
print(r1['confu'])
print("error 6 = ", error6)
print("RMSE6 = ", RMSE6)
print("AUC = ", r1['auc'])
print("recal = ", r1['recal'])
print("precision = ", r1['precision'])
print("f1Score = ", r1['f1'])

writeResults2['all'] = r1

print()

# crossValidiation is defined elsewhere in the notebook; its result is
# indexed below by 'accuracy', 'auc', 'recall', 'precision' and 'f1'.
results = crossValidiation(XX, Y, k=10, classifier=bern, continious=False)
results['down'] = down
results['up'] = up
results['thres'] = threshold
print("accuracy = ", results['accuracy'])
print("AUC = ", results['auc'])
print("recal = ", results['recall'])
print("precision = ", results['precision'])
print("f1 = ", results['f1'])

# Number of columns of the one-hot encoded matrix.
results['len snps'] = len(XX.T)
writeResults['all'] = results

print()

writeResult(path, 'bernoulli_all', writeResults)
writeResultConf(path, 'bernoulli_all', writeResults2)


(4980, 23397)
(4482, 23397)
(498, 23397)
0.753012048193
[[291  90]
 [ 33  84]]
error 6 =  0.246987951807
RMSE6 =  0.496978824305
AUC =  0.740864122754
recal =  0.717948717949
precision =  0.48275862069
f1Score =  0.577319587629

accuracy =  0.760843373494
AUC =  0.739856964989
recal =  0.704656209922
precision =  0.446645186114
f1 =  0.54627205251



In [22]:
# Linear-kernel SVC on all SNPs: one hold-out evaluation followed by a
# 10-fold cross-validation, both written out under the 'svm_all' label.
r1 = {}
writeResults2 = {}
writeResults = {}

# n_values=3 fixes the number of indicator columns per input column, so the
# column layout does not depend on which split the encoder is fitted on.
# Fit once on the training split and only transform the other tables; the
# encoder used to be fitted twice per table (fit + fit_transform) and was
# refitted on the test split.
enc = OneHotEncoder(n_values=3)
enc.fit(xTraining)

XX = enc.transform(X).toarray()
print(XX.shape)

xTraining1 = enc.transform(xTraining).toarray()
print(xTraining1.shape)

xTest1 = enc.transform(xTest).toarray()
print(xTest1.shape)

# The variable keeps the name 'bern' used by the neighbouring cells even
# though it holds an SVC here.
bern = SVC(kernel='linear')
bern.fit(xTraining1, yTraining1)
yPredict6 = bern.predict(xTest1)

# Hold-out metrics: each one is computed once, stored, and printed from the
# stored value.
# NOTE(review): the ROC curve is built from predicted labels, not from
# decision scores, so the AUC reflects a single operating point.
fpr, tpr, thresholds = metrics.roc_curve(yTest, yPredict6)
error6 = mean_squared_error(yTest, yPredict6)
RMSE6 = error6 ** 0.5

r1['confu'] = metrics.confusion_matrix(yTest, yPredict6)
r1['accu'] = metrics.accuracy_score(yTest, yPredict6)
r1['auc'] = metrics.auc(fpr, tpr)
r1['recal'] = metrics.recall_score(yTest, yPredict6)
r1['precision'] = metrics.precision_score(yTest, yPredict6)
r1['f1'] = f1_score(yTest, yPredict6, average='binary')

print(r1['accu'])
print(r1['confu'])
print("error 6 = ", error6)
print("RMSE6 = ", RMSE6)
print("AUC = ", r1['auc'])
print("recal = ", r1['recal'])
print("precision = ", r1['precision'])
print("f1Score = ", r1['f1'])

writeResults2['all'] = r1

print()

# crossValidiation is defined elsewhere in the notebook; its result is
# indexed below by 'accuracy', 'auc', 'recall', 'precision' and 'f1'.
results = crossValidiation(XX, Y, k=10, classifier=bern, continious=False)
results['down'] = down
results['up'] = up
results['thres'] = threshold
print("accuracy = ", results['accuracy'])
print("AUC = ", results['auc'])
print("recal = ", results['recall'])
print("precision = ", results['precision'])
print("f1 = ", results['f1'])

# Number of columns of the one-hot encoded matrix.
results['len snps'] = len(XX.T)
writeResults['all'] = results

print()

writeResult(path, 'svm_all', writeResults)
writeResultConf(path, 'svm_all', writeResults2)

(4980, 23397)
(4482, 23397)
(498, 23397)
0.937751004016
[[371  10]
 [ 21  96]]
error 6 =  0.0622489959839
RMSE6 =  0.249497486929
AUC =  0.897133050676
recal =  0.820512820513
precision =  0.905660377358
f1Score =  0.860986547085

accuracy =  0.939156626506
AUC =  0.896973778998
recal =  0.825758409953
precision =  0.869292461782
f1 =  0.846532501061



In [23]:
# Cross-validated logistic regression on all SNPs: one hold-out evaluation
# followed by a 10-fold cross-validation, written out as 'llr_all_CV'.
r1 = {}
writeResults2 = {}
writeResults = {}

# n_values=3 fixes the number of indicator columns per input column, so the
# column layout does not depend on which split the encoder is fitted on.
# Fit once on the training split and only transform the other tables; the
# encoder used to be fitted twice per table (fit + fit_transform) and was
# refitted on the test split.
enc = OneHotEncoder(n_values=3)
enc.fit(xTraining)

XX = enc.transform(X).toarray()
print(XX.shape)

xTraining1 = enc.transform(xTraining).toarray()
print(xTraining1.shape)

xTest1 = enc.transform(xTest).toarray()
print(xTest1.shape)

# The variable keeps the name 'bern' used by the neighbouring cells even
# though it holds a LogisticRegressionCV here.
bern = linear_model.LogisticRegressionCV()
bern.fit(xTraining1, yTraining1)
yPredict6 = bern.predict(xTest1)

# Hold-out metrics: each one is computed once, stored, and printed from the
# stored value.
# NOTE(review): the ROC curve is built from predicted labels, not from
# class probabilities, so the AUC reflects a single operating point.
fpr, tpr, thresholds = metrics.roc_curve(yTest, yPredict6)
error6 = mean_squared_error(yTest, yPredict6)
RMSE6 = error6 ** 0.5

r1['confu'] = metrics.confusion_matrix(yTest, yPredict6)
r1['accu'] = metrics.accuracy_score(yTest, yPredict6)
r1['auc'] = metrics.auc(fpr, tpr)
r1['recal'] = metrics.recall_score(yTest, yPredict6)
r1['precision'] = metrics.precision_score(yTest, yPredict6)
r1['f1'] = f1_score(yTest, yPredict6, average='binary')

print(r1['accu'])
print(r1['confu'])
print("error 6 = ", error6)
print("RMSE6 = ", RMSE6)
print("AUC = ", r1['auc'])
print("recal = ", r1['recal'])
print("precision = ", r1['precision'])
print("f1Score = ", r1['f1'])

# NOTE(review): unlike the sibling cells, r1 is not stored in writeResults2
# here (the assignment was disabled), so writeResultConf below receives an
# empty dict -- confirm this is intended.
#writeResults2['all'] = r1

print()

# crossValidiation is defined elsewhere in the notebook; its result is
# indexed below by 'accuracy', 'auc', 'recall', 'precision' and 'f1'.
results = crossValidiation(XX, Y, k=10, classifier=bern, continious=False)
results['down'] = down
results['up'] = up
results['thres'] = threshold
print("accuracy = ", results['accuracy'])
print("AUC = ", results['auc'])
print("recal = ", results['recall'])
print("precision = ", results['precision'])
print("f1 = ", results['f1'])

# Number of columns of the one-hot encoded matrix.
results['len snps'] = len(XX.T)
writeResults['all'] = results

print()

writeResult(path, 'llr_all_CV', writeResults)
writeResultConf(path, 'llr_all_CV', writeResults2)

(4980, 23397)
(4482, 23397)
(498, 23397)
0.933734939759
[[370  11]
 [ 22  95]]
error 6 =  0.066265060241
RMSE6 =  0.25742000746
AUC =  0.891547210445
recal =  0.811965811966
precision =  0.896226415094
f1Score =  0.85201793722

accuracy =  0.94156626506
AUC =  0.896041471809
recal =  0.819039803167
precision =  0.885473063227
f1 =  0.849701779438



In [24]:
# Decision tree on all SNPs: one hold-out evaluation followed by a 10-fold
# cross-validation, both written out under the 'tree_all' label.
#
# No encoding happens in this cell: XX, xTraining1 and xTest1 are whatever
# an earlier cell left in the notebook namespace.
r1 = {}
writeResults2 = {}
writeResults = {}

# The variable keeps the name 'bern' used by the neighbouring cells even
# though it holds a DecisionTreeClassifier here.
bern = tree.DecisionTreeClassifier()
bern.fit(xTraining1, yTraining1)
yPredict6 = bern.predict(xTest1)

# Hold-out report. Values are printed in the order: accuracy, confusion
# matrix, MSE, RMSE, AUC, recall, precision, F1.
print(metrics.accuracy_score(yTest, yPredict6))
print(metrics.confusion_matrix(yTest, yPredict6))

error6 = mean_squared_error(yTest, yPredict6)
RMSE6 = mean_squared_error(yTest, yPredict6) ** 0.5
print("error 6 = ", error6)
print("RMSE6 = ", RMSE6)

# The ROC curve is derived from the predicted labels themselves.
fpr, tpr, thresholds = metrics.roc_curve(yTest, yPredict6)
print("AUC = ", metrics.auc(fpr, tpr))
print("recal = ", metrics.recall_score(yTest, yPredict6))
print("precision = ", metrics.precision_score(yTest, yPredict6))
print("f1Score = ", f1_score(yTest, yPredict6, average='binary'))

# Same metrics again, this time kept for the confusion-matrix report.
r1['confu'] = metrics.confusion_matrix(yTest, yPredict6)
r1['accu'] = metrics.accuracy_score(yTest, yPredict6)
r1['auc'] = metrics.auc(fpr, tpr)
r1['recal'] = metrics.recall_score(yTest, yPredict6)
r1['precision'] = metrics.precision_score(yTest, yPredict6)
r1['f1'] = f1_score(yTest, yPredict6, average='binary')
writeResults2['all'] = r1

print()

# crossValidiation is defined elsewhere in the notebook; its result is
# indexed below by 'accuracy', 'auc', 'recall', 'precision' and 'f1'.
results = crossValidiation(
    XX, Y,
    k=10,
    classifier=bern,
    continious=False,
)
results['down'] = down
results['up'] = up
results['thres'] = threshold

print("accuracy = ", results['accuracy'])
print("AUC = ", results['auc'])
print("recal = ", results['recall'])
print("precision = ", results['precision'])
print("f1 = ", results['f1'])

# Number of columns of the feature matrix used for cross-validation.
results['len snps'] = len(XX.T)
writeResults['all'] = results

print()

writeResult(path, 'tree_all', writeResults)
writeResultConf(path, 'tree_all', writeResults2)

0.670682730924
[[303  78]
 [ 86  31]]
error 6 =  0.329317269076
RMSE6 =  0.573861715988
AUC =  0.530116427754
recal =  0.264957264957
precision =  0.284403669725
f1Score =  0.274336283186

accuracy =  0.704417670683
AUC =  0.550334731143
recal =  0.289408506863
precision =  0.28174803139
f1 =  0.284723190924

