In [27]:
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import numpy as np
import os.path
import csv
from sklearn.svm import SVC
from sklearn.svm import NuSVC
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn import grid_search
import sklearn.linear_model as linear_model
import sklearn.metrics as metrics
from sklearn import preprocessing
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from random import randint
from sklearn import tree
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import math
import time

from metrics.Correlation import Correlation
from IO.Write import Write
from IO.Read import Read
from metrics.RSquare import RSquare
from DataSet.Dataset import DataSet


def setIdToName(aList):
    
    ids = {}
    nameToId = {}
    idToName = {}
    count = 0
    
    for i in aList:
        
        nameToId[i] = count
        idToName[count] = i
        count += 1
        
    ids['nameToId'] = nameToId
    ids['idToName'] = idToName
    
    return ids


def setSnpsCode(patients,chromosomes):
    
    for i in patients.keys():
        patients[i].snpCode(chromosomes)
        
    return patients
    



def tables(sampleX,sampleY,k):
  
    samples = {}
    
    for run in range(1,k+1):
        
        d1 = {}
        

        dataTestX = sampleX[run]
        dataTestY = sampleY[run]

        n = 0

        for i in sampleX.keys():

            if i != run:

                n += len(sampleX[i])

        dataTrainX = np.zeros((n,len(sampleX[1].T)),dtype = int)
        dataTrainY = np.zeros((n,),dtype = int)

        count = 0

        for sample in sampleX.keys():

            if sample != run:

                 for i in range(len(sampleX[sample])):
                    for j in range(len(sampleX[sample].T)):
                        dataTrainX[count,j] = sampleX[sample][i,j]

                    dataTrainY[count] = sampleY[sample][i]
                    count += 1

        d1['trainX'] = dataTrainX
        d1['trainY'] = dataTrainY
        d1['testX'] = dataTestX
        d1['testY'] = dataTestY
        
        samples[run] = d1
    
    return samples
    
def kSampleData(k,X,Y):
    
    x = int (len(X) / k)
    allElements = np.zeros((len(X),),dtype = int)
    
    count1 = 1
    sampleX = {}
    sampleY = {}
   
    
    while count1 <= k:
        count2 = 1
        sampleData = []
        
        if count1 == k:
            x =  len(X) - ((k-1) * x)
        
        dataX = np.zeros((x,len(X.T)),dtype = int)
        dataY = np.zeros((x,),dtype = int)
        
        while count2 <= x:
            
            aRand = randint(0,len(X)-1)
            
            while allElements[aRand] == 1:
                
                aRand = randint(0,len(X)-1)
            
            allElements[aRand] = 1
            sampleData.append(aRand)
            count2 += 1
            
        for i in range(len(sampleData)):
            for j in range(len(X.T)):
                dataX[i,j] = X[sampleData[i],j]
            
            dataY[i] = Y[sampleData[i]]
            
        sampleX[count1] = dataX
        sampleY[count1] = dataY
        count1 +=1
        
    return tables(sampleX,sampleY,k)




def createNewTable(snps,X):
    
    newX = np.zeros((len(X),len(snps)),dtype = int)
    
    for i in range(len(newX)):
        for j in range(len(newX.T)):
            newX[i,j] = -1
    
    for i in range(len(snps)):
        #for j in range(len(X)):
            
        newX[:,i] = X[:,snps[i]]
            
    print("new shape = ",newX.shape)
            
    return newX



def crossValidiation(X, Y, k = 1, continious = True, classifier = None,OLS = False,Logistic = False):
    
    if not classifier:
        print("wrong!!!!!!! you have to choise a classifier")
        return
    
    results = {}
    accuracy = {}
    auc = {}
    recall = {}
    precision = {}
    f1Score = {}
    
    sumResults = 0.0
    sumAccuracy = 0.0
    sumAuc = 0.0
    sumRecall = 0.0
    sumPrecision = 0.0
    sumF1Score = 0.0
    
    samples = kSampleData(k,X,Y)
    
    for run in range(1, k + 1):
        
        trainX = samples[run]['trainX']
        trainY = samples[run]['trainY']
        
        testX = samples[run]['testX']
        testY = samples[run]['testY']
        
        
        if OLS:
            classifier = sm.OLS(trainY,trainX)
            yPredict = classifier.fit().predict(testX)
        else:

            classifier.fit(trainX, trainY)
            yPredict = classifier.predict(testX)
        
        if continious:
            
            for i in range(len(yPredict)):
                
                if (abs(0 - yPredict[i]) - abs(1 - yPredict[i])) <= 1e-10 :
                    yPredict[i] = 0
                else:
                    yPredict[i] = 1
                    
        if Logistic:
            
            probabilities = classifier.predict_proba(testX)
            
            for i in range(len(probabilities)):
                if probabilities[i][1] >= 0.8:
                    yPredict[i] = 1
                else:
                    yPredict[i] = 0
          
        accuracy[run] = metrics.accuracy_score(testY,yPredict)#(yPredict,testY)#
        fpr, tpr, thresholds = metrics.roc_curve(testY,yPredict)
        auc[run] = metrics.auc(fpr,tpr)
        recall[run] = metrics.recall_score(testY,yPredict)
        precision[run] = metrics.precision_score(testY,yPredict)
        f1Score[run] = f1_score(testY, yPredict, average='binary')
        
    
    for i in accuracy.keys():
        sumAccuracy = sumAccuracy + accuracy[i]
        sumAuc = sumAuc + auc[i]
        sumRecall = sumRecall + recall[i]
        sumPrecision = sumPrecision + precision[i]
        sumF1Score = sumF1Score + f1Score[i]
    
    results['accuracy'] = sumAccuracy / k
    results['auc'] = sumAuc / k
    results['recall'] = sumRecall / k
    results['precision'] = sumPrecision / k
    results['f1'] = sumF1Score / k
    
    return results

def writeResultConf(path,name,results):
    
    folder = path + name + 'Confu' + time.strftime("%d/%m/%Y")
    #file = path + name + 'Confu' + '.txt'
    
    i=1
    while os.path.exists(folder):
        folder = path + name + 'Confu' + time.strftime("%d/%m/%Y") + str(i)
       # file = path + name + 'Confu' + str(i) + '.txt'
        i += 1
        
    os.makedirs(folder)
    
    file = folder + name + 'Confu' + '.txt' 
    
    write = open(file,'w')
    
    write.write(name + '\n')
    
    for i in results:
        
        results1 = results[i]
        write.write("Category = " + i + '\n')
        
        for j in results1:
            
            write.write(j + " = " + str(results1[j]) + '\n')
            
        write.write('\n')
        write.write('\n')
        
    write.close()

    

def writeResult(path,name,results):
    
   folder = path + name  + time.strftime("%d/%m/%Y")
    #file = path + name  + '.txt'
    
    i=1
    while os.path.exists(folder):
        folder = path + name + time.strftime("%d/%m/%Y") + str(i)
       # file = path + name  + str(i) + '.txt'
        i += 1
        
    os.makedirs(folder)
    
    file = folder + name + time.strftime("%d/%m/%Y")
    
    write = open(file,'w')
    
    write.write(name + '\n')
    
    for i in results:
        
        results1 = results[i]
        write.write("Category = " + i + '\n')
        
        for j in results1:
            
            write.write(j + " = " + str(results1[j]) + '\n')
            
        write.write('\n')
        write.write('\n')
        
    write.close()


IndentationError: unexpected indent (<ipython-input-27-fe6241103af8>, line 298)

In [2]:
path = 'D:\\newSet\\p = 0.0001\\' #bake ton fakelo pou 8a epe3ergas8eis
#path = 'D:\\newSet\\fisher\\pvalue = 0.001\\'
#path = 'D:\\newSet\\fisher\\pvalue = 0.0001\\'
#path = 'D:\\newSet\\maf = 0.05\\pvalue = 0.001\\'
#path = 'D:\\newSet\\assoc\\pvalue = 0.0001\\'
#path = 'D:\\newSet\\assoc\\pvalue = 1e-05\\'

numberOfChromosomes = 22#'ari8mos twn xromoswmatwn'
patientsTrain = {}
patientsTest = {}
allPatients = {}

chromosomes = {}

read = Read(path,numberOfChromosomes)
write = Write(path,numberOfChromosomes)

patients = read.readPatients('Patients.txt')
chromosomes = read.readSnps(".assoc")
write.writePatientsList(patients,'patient.txt')
write.writeSnpsList(chromosomes)


# run train_lgen bat and test_leg bat

In [3]:

snps = read.getListOfSnps()
ids = {} 
idsTest = {}

ids['patients'] = setIdToName(list(patients.keys()))
ids['snps'] = setIdToName(snps)


if os.path.exists(path + 'snpCode.txt'):
    print("mphka")
   # patients = read.readSnpsCode(patients,ids)
    X, Y = read.readSnpsCode(patients,ids)
    
else:
    
    write.writeSnpLog(read.getNumberOfPatients(),read.getNumberOfSnps(),chromosomes)
    
    
X.shape

mphka
mphka2


(4980, 5415)

# Correlation

In [4]:
xTraining, xTest, yTraining1, yTest = train_test_split(X, Y, test_size=0.7, random_state=randint(0,2017))
print("mergex = ",X.shape)
print("xTrain = ",xTraining.shape)
print("xTest = ",xTest.shape)
print("yTrain = ",yTraining1.shape)
print("yTest = ",yTest.shape)

mergex =  (4980, 5415)
xTrain =  (1494, 5415)
xTest =  (3486, 5415)
yTrain =  (1494,)
yTest =  (3486,)


In [5]:
cor = Correlation(X)
#cor = RSquare(X)

#cor = Correlation(xTraining)
#cor = RSquare(xTraining)

In [6]:
snpReduc = {}

# Correlation high

In [7]:
snpReduc['high'] = cor.getHighCorrelationSnps(0.7)

count =  3309
len snpsRed =  3309


# Correlation Low

In [8]:
snpsRed = []
snpReduc['low'] = cor.getLowCorrelationSnps(0.7)

count =  2106
len snpsRed =  2106


# Sepearte cases and Controls

In [9]:
cases = []
controls = []
yCo = []
yCa = []
idsCa = {}
idsCos = {}

caIds = {}
cosIds = {}

for i in range(len(yTraining1)):
    if yTraining1[i] == 0 :
        controls.append(i)
        
    elif yTraining1[i] == 1:
        cases.append(i)
        
count = 0

for i in controls:
    idsCos[i] = count
    cosIds[count] = i
    count += 1
    
count = 0

for i in cases:
    idsCa[i] = count
    caIds[count] = i
    count += 1
        
control = np.zeros((len(controls),len(xTraining.T)))
case = np.zeros((len(cases),len(xTraining.T)))

for i in controls:
    pos = idsCos[i]
    control[pos,:] = xTraining[i,:]
   # yCo.append(Y[pos])
    
for i in cases:
    pos = idsCa[i]
    case[pos,:] = xTraining[i,:]
   # yCa.append(Y[pos])

print("cases = ",case.shape)
print("controls = ",control.shape)


cases =  (324, 5415)
controls =  (1170, 5415)


In [10]:
#r21 = RSquare(control)
#r22 = RSquare(case)

r21 = Correlation(control)
r22 = Correlation(case)


In [11]:
#co = r21.getRMatrix()
#ca = r22.getRMatrix()

co = r21.getCorrMatrix()
ca = r22.getCorrMatrix()

countCo ={}
countCa = {}

count1 = 0
count2 = 0

c1 = []
c2 = []
print("co.T = ",len(co.T))
for i in range(len(co.T)):
    countCo[i] = 0
    countCa[i] = 0
    
for i in range(len(co)):
    for j in range(i+1,len(co.T)):
        
        if (co[i,j]) - 0.7 >= 1e-12:
            countCo[i] = countCo[i] + 1
            countCo[j] = countCo[j] + 1
            
        if (ca[i,j]) - 0.7 >= 1e-12:
            
            countCa[i] = countCa[i] + 1
            countCa[j] = countCa[j] + 1
    
for i in countCo.keys():
    if countCo[i]>0:
        c1.append(i)
        count1 += 1
        
    if countCa[i]>0:
        c2.append(i)
        count2 += 1
        
print("count1 = ",count1)
print("count1 = ",count2)
print("c1 =",len(c1))
print("c2 =",len(c2))

co.T =  5415
count1 =  2018
count1 =  4111
c1 = 2018
c2 = 4111


# Case U CONTROL

In [12]:
snpsRed = []

c1H = list(set(c1) - set(c2))
c2H = list(set(c2) - set(c1))

for i in c1H:
    snpsRed.append(i)
    
for i in c2H:
    snpsRed.append(i)

print("c1 =",len(c1))
print("c2 =",len(c2))
print("c1H =",len(c1H))
print("c2H =",len(c2H))
print("len snpsRed = ",len(snpsRed))

snpReduc['union'] = snpsRed

c1 = 2018
c2 = 4111
c1H = 101
c2H = 2194
len snpsRed =  2295


# Case & Control

In [13]:
snpsRed = []
count = 0
for i in c1:
    if i in c2:
        count += 1
snpsRed = list(set(c1) & set(c2))
print("c1 =",len(c1))
print("c2 =",len(c2))
print("len snpsRed = ",len(snpsRed))
print("count = ",count)
snpReduc['tomi'] = snpsRed

c1 = 2018
c2 = 4111
len snpsRed =  1917
count =  1917


# 1 - ((Case U Control) + (Case & Control))

In [14]:
snpsRed = []
c1H = []
c2H = []
c3H = []
helpList = []

c1H = list(set(c1) - set(c2))
c2H = list(set(c2) - set(c1))
c3H = list(set(c1) & set(c2))

for i in c1H:
    helpList.append(i)
    
for i in c2H:
    helpList.append(i)
    
for i in c3H:
    helpList.append(i)

for i in range(len(co.T)):
    
    if i not in helpList:
        snpsRed.append(i)
        
print("c1H =",len(c1H))
print("c2H =",len(c2H))
print("c3H =",len(c3H))
print("c1 =",len(c1))
print("c2 =",len(c2))
print("helpList =",len(helpList)) 
print("len snpsRed = ",len(snpsRed))
print("5415 - len(helpList) = ", len(co.T) - len(helpList))

snpReduc['ektosuniontomi'] = snpsRed

c1H = 101
c2H = 2194
c3H = 1917
c1 = 2018
c2 = 4111
helpList = 4212
len snpsRed =  1203
5415 - len(helpList) =  1203


# 1 - (Case & Control)

In [15]:
snpsRed = []
count = 0
for i in c1:
    if i in c2:
        count += 1
cHelp = list(set(c1) & set(c2))

for i in range(len(co.T)):
    if i not in cHelp:
        snpsRed.append(i)

print("c1 =",len(c1))
print("c2 =",len(c2))
print("len cHelp = ",len(cHelp))
print("count = ",count)
print("len snpsRed = ",len(snpsRed))
snpReduc['ektostomi'] = snpsRed

c1 = 2018
c2 = 4111
len cHelp =  1917
count =  1917
len snpsRed =  3498


In [18]:
categories = ['low','high','union','tomi','ektosuniontomi','ektostomi']
writeResults = {}
writeResults2 = {}
for i in categories:
    writeResults[i] = {}
    writeResults2[i] = {}

# SVM KERNEL

In [19]:
for i in categories:
    
    r1 = {}
    
    print("Category = ",i)
    
    XX = createNewTable(snpReduc[i],X)
    xTraining1 = createNewTable(snpReduc[i],xTraining)
    xTest1 = createNewTable(snpReduc[i],xTest)
    
    enc = OneHotEncoder(n_values =3) 
    enc.fit(XX) 
    XX = enc.fit_transform(XX)
    XX = XX.toarray()
    print(XX.shape)

    enc.fit(xTraining1) 
    xTraining1 = enc.fit_transform(xTraining1)
    xTraining1 = xTraining1.toarray()
    print(xTraining1.shape)

    enc.fit(xTest1) 
    xTest1 = enc.fit_transform(xTest1)
    xTest1 = xTest1.toarray()
    print(xTest1.shape)
    
    
    clf = SVC(kernel='linear')
    clf.fit(xTraining1, yTraining1)
    yPredict2 = clf.predict(xTest1)
    print(metrics.accuracy_score(yTest,yPredict2))
    print(metrics.confusion_matrix(yTest,yPredict2))
    error2 = mean_squared_error(yTest, yPredict2)
    print("error 2 = ",error2)
    RMSE2 = mean_squared_error(yTest,yPredict2)**0.5
    print("RMSE2 = ",RMSE2)
    fpr, tpr, thresholds = metrics.roc_curve(yTest,yPredict2)
    print("AUC = ", metrics.auc(fpr,tpr))
    print("recal = ",metrics.recall_score(yTest,yPredict2))
    print("precision = ",metrics.precision_score(yTest,yPredict2))
    print("f1Score = ",f1_score(yTest, yPredict2, average='binary'))
    
    r1['confu'] = metrics.confusion_matrix(yTest,yPredict2)
    r1['accu'] = metrics.accuracy_score(yTest,yPredict2)
    r1['auc'] = metrics.auc(fpr,tpr)
    r1['recal'] = metrics.recall_score(yTest,yPredict2)
    r1['precision'] = metrics.precision_score(yTest,yPredict2)
    r1['f1'] = f1_score(yTest, yPredict2, average='binary')
    
    writeResults2[i] = r1
    print()

    results = crossValidiation(XX, Y, k = 10, classifier = clf, continious = False)
    print("accuracy = ",results['accuracy'])
    print("AUC = ", results['auc'])
    print("recal = ",results['recall'])
    print("precision = ",results['precision'])
    print("f1 = ",results['f1'])
    results['len snps'] = len(snpReduc[i])
    writeResults[i] = results
    
    print()
    
    
writeResult(path,'kernel',writeResults)
writeResultConf(path,'kernel',writeResults2)

Category =  low
new shape =  (4980, 2106)
new shape =  (1494, 2106)
new shape =  (3486, 2106)
(4980, 6318)
(1494, 6318)
(3486, 6318)
0.914228341939
[[2742   50]
 [ 249  445]]
error 2 =  0.0857716580608
RMSE2 =  0.292867987429
AUC =  0.811651032592
recal =  0.64121037464
precision =  0.89898989899
f1Score =  0.748528174937

accuracy =  0.96265060241
AUC =  0.916637043354
recal =  0.83843949199
precision =  0.973499555672
f1 =  0.900350075954

Category =  high
new shape =  (4980, 3309)
new shape =  (1494, 3309)
new shape =  (3486, 3309)
(4980, 9927)
(1494, 9927)
(3486, 9927)
0.839070567986
[[2603  189]
 [ 372  322]]
error 2 =  0.160929432014
RMSE2 =  0.401160107705
AUC =  0.698141767751
recal =  0.463976945245
precision =  0.630136986301
f1Score =  0.534439834025

accuracy =  0.881526104418
AUC =  0.797274154643
recal =  0.654899334327
precision =  0.735568468513
f1 =  0.691921224342

Category =  union
new shape =  (4980, 2295)
new shape =  (1494, 2295)
new shape =  (3486, 2295)
(4980, 6

In [20]:
#re = cross_val_score(clf, XX, Y, cv=10)
#print(sum(re)/10)

# LINEAR LOGISTIC REGRESSION 

In [21]:
for i in categories:
    
    r1 = {}
    
    print("Category = ",i)
    
    XX = createNewTable(snpReduc[i],X)
    xTraining1 = createNewTable(snpReduc[i],xTraining)
    xTest1 = createNewTable(snpReduc[i],xTest)
    
    enc = OneHotEncoder(n_values =3) 
    enc.fit(XX) 
    XX = enc.fit_transform(XX)
    XX = XX.toarray()
    print(XX.shape)

    enc.fit(xTraining1) 
    xTraining1 = enc.fit_transform(xTraining1)
    xTraining1 = xTraining1.toarray()
    print(xTraining1.shape)

    enc.fit(xTest1) 
    xTest1 = enc.fit_transform(xTest1)
    xTest1 = xTest1.toarray()
    print(xTest1.shape)
    
    

    lr_clf = linear_model.LogisticRegression()  
    lr_clf.fit(xTraining1, yTraining1)
    yPredict4 = lr_clf.predict(xTest1)

    print(metrics.accuracy_score(yTest,yPredict4))
    print(metrics.confusion_matrix(yTest,yPredict4))
    error4 = mean_squared_error(yTest, yPredict4)
    print("error 4 = ",error4)
    RMSE4 = mean_squared_error(yTest,yPredict4)**0.5
    print("RMSE4 = ",RMSE4)

    fpr, tpr, thresholds = metrics.roc_curve(yTest,yPredict4)
    print("AUC = ", metrics.auc(fpr,tpr))
    print("recal = ",metrics.recall_score(yTest,yPredict4))
    print("precision = ",metrics.precision_score(yTest,yPredict4))
    print("f1Score = ",f1_score(yTest, yPredict4, average='binary'))
    
    r1['confu'] = metrics.confusion_matrix(yTest,yPredict4)
    r1['accu'] = metrics.accuracy_score(yTest,yPredict4)
    r1['auc'] = metrics.auc(fpr,tpr)
    r1['recal'] = metrics.recall_score(yTest,yPredict4)
    r1['precision'] = metrics.precision_score(yTest,yPredict4)
    r1['f1'] = f1_score(yTest, yPredict4, average='binary')
    writeResults2[i] = r1
    
    print()

    results = crossValidiation(XX, Y, k = 10, classifier = lr_clf, continious = False)
    print("accuracy = ",results['accuracy'])
    print("AUC = ", results['auc'])
    print("recal = ",results['recall'])
    print("precision = ",results['precision'])
    print("f1 = ",results['f1'])
    results['len snps'] = len(snpReduc[i])
    writeResults[i] = results
    
    print()
    
    
writeResult(path,'llr',writeResults)
writeResultConf(path,'llr',writeResults2)

Category =  low
new shape =  (4980, 2106)
new shape =  (1494, 2106)
new shape =  (3486, 2106)
(4980, 6318)
(1494, 6318)
(3486, 6318)
0.887550200803
[[2730   62]
 [ 330  364]]
error 4 =  0.112449799197
RMSE4 =  0.335335353336
AUC =  0.751144686754
recal =  0.524495677233
precision =  0.854460093897
f1Score =  0.65

accuracy =  0.93313253012
AUC =  0.851330259132
recal =  0.712356210763
precision =  0.947927064733
f1 =  0.812538827112

Category =  high
new shape =  (4980, 3309)
new shape =  (1494, 3309)
new shape =  (3486, 3309)
(4980, 9927)
(1494, 9927)
(3486, 9927)
0.83620195066
[[2620  172]
 [ 399  295]]
error 4 =  0.16379804934
RMSE4 =  0.404719717014
AUC =  0.681733730791
recal =  0.42507204611
precision =  0.631691648822
f1Score =  0.508182601206

accuracy =  0.883333333333
AUC =  0.778505525822
recal =  0.600390523197
precision =  0.780941739181
f1 =  0.677320664044

Category =  union
new shape =  (4980, 2295)
new shape =  (1494, 2295)
new shape =  (3486, 2295)
(4980, 6885)
(1494,

In [22]:
#re = cross_val_score(lr_clf, XX, Y, cv=10)
#print(sum(re)/10)

In [23]:
from sklearn.naive_bayes import GaussianNB

for i in categories:
    
    r1 = {}
    
    print("Category = ",i)
    
    XX = createNewTable(snpReduc[i],X)
    xTraining1 = createNewTable(snpReduc[i],xTraining)
    xTest1 = createNewTable(snpReduc[i],xTest)
    
    enc = OneHotEncoder(n_values =3) 
    enc.fit(XX) 
    XX = enc.fit_transform(XX)
    XX = XX.toarray()
    print(XX.shape)

    enc.fit(xTraining1) 
    xTraining1 = enc.fit_transform(xTraining1)
    xTraining1 = xTraining1.toarray()
    print(xTraining1.shape)

    enc.fit(xTest1) 
    xTest1 = enc.fit_transform(xTest1)
    xTest1 = xTest1.toarray()
    print(xTest1.shape)
    

    gnb = GaussianNB()
    gnb.fit(xTraining1, yTraining1)
    yPredict6 = gnb.predict(xTest1)
    print(metrics.accuracy_score(yTest,yPredict6))
    print(metrics.confusion_matrix(yTest,yPredict6))
    error6 = mean_squared_error(yTest, yPredict6)
    print("error 6 = ",error6)
    RMSE6 = mean_squared_error(yTest,yPredict6)**0.5
    print("RMSE6 = ",RMSE6)

    fpr, tpr, thresholds = metrics.roc_curve(yTest,yPredict6)
    print("AUC = ", metrics.auc(fpr,tpr))
    print("recal = ",metrics.recall_score(yTest,yPredict6))
    print("precision = ",metrics.precision_score(yTest,yPredict6))
    print("f1Score = ",f1_score(yTest, yPredict6, average='binary'))
    
    r1['confu'] = metrics.confusion_matrix(yTest,yPredict6)
    r1['accu'] = metrics.accuracy_score(yTest,yPredict6)
    r1['auc'] = metrics.auc(fpr,tpr)
    r1['recal'] = metrics.recall_score(yTest,yPredict6)
    r1['precision'] = metrics.precision_score(yTest,yPredict6)
    r1['f1'] = f1_score(yTest, yPredict6, average='binary')
    
    writeResults2[i] = r1
    
    print()

    results = crossValidiation(XX, Y, k = 10,classifier = gnb, continious = False)
    
    print("accuracy = ",results['accuracy'])
    print("AUC = ", results['auc'])
    print("recal = ",results['recall'])
    print("precision = ",results['precision'])
    print("f1 = ",results['f1'])
    print()
    results['len snps'] = len(snpReduc[i])
    writeResults[i] = results
    
    print()
    
    
writeResult(path,'gnb',writeResults)
writeResultConf(path,'gnb',writeResults2)

Category =  low
new shape =  (4980, 2106)
new shape =  (1494, 2106)
new shape =  (3486, 2106)
(4980, 6318)
(1494, 6318)
(3486, 6318)
0.884968445209
[[2407  385]
 [  16  678]]
error 6 =  0.115031554791
RMSE6 =  0.339163020966
AUC =  0.919525631074
recal =  0.976945244957
precision =  0.637817497648
f1Score =  0.771770062607

accuracy =  0.972891566265
AUC =  0.979840018385
recal =  0.991718464326
precision =  0.888032789318
f1 =  0.936909916742


Category =  high
new shape =  (4980, 3309)
new shape =  (1494, 3309)
new shape =  (3486, 3309)
(4980, 9927)
(1494, 9927)
(3486, 9927)
0.90017211704
[[2624  168]
 [ 180  514]]
error 6 =  0.0998278829604
RMSE6 =  0.315955507881
AUC =  0.840231042996
recal =  0.740634005764
precision =  0.75366568915
f1Score =  0.747093023256

accuracy =  0.936345381526
AUC =  0.875848091904
recal =  0.773721588473
precision =  0.900629057846
f1 =  0.831260668825


Category =  union
new shape =  (4980, 2295)
new shape =  (1494, 2295)
new shape =  (3486, 2295)
(498

# BERNOULLI


In [24]:
from sklearn.naive_bayes import BernoulliNB

for i in categories:
    
    r1 = {}
    
    print("Category = ",i)
    
    XX = createNewTable(snpReduc[i],X)
    xTraining1 = createNewTable(snpReduc[i],xTraining)
    xTest1 = createNewTable(snpReduc[i],xTest)
    
    enc = OneHotEncoder(n_values =3) 
    enc.fit(XX) 
    XX = enc.fit_transform(XX)
    XX = XX.toarray()
    print(XX.shape)

    enc.fit(xTraining1) 
    xTraining1 = enc.fit_transform(xTraining1)
    xTraining1 = xTraining1.toarray()
    print(xTraining1.shape)

    enc.fit(xTest1) 
    xTest1 = enc.fit_transform(xTest1)
    xTest1 = xTest1.toarray()
    print(xTest1.shape)
    

    bern = BernoulliNB()
    bern.fit(xTraining1, yTraining1)
    yPredict6 = bern.predict(xTest1)
    print(metrics.accuracy_score(yTest,yPredict6))
    print(metrics.confusion_matrix(yTest,yPredict6))
    error6 = mean_squared_error(yTest, yPredict6)
    print("error 6 = ",error6)
    RMSE6 = mean_squared_error(yTest,yPredict6)**0.5
    print("RMSE6 = ",RMSE6)

    fpr, tpr, thresholds = metrics.roc_curve(yTest,yPredict6)
    print("AUC = ", metrics.auc(fpr,tpr))
    print("recal = ",metrics.recall_score(yTest,yPredict6))
    print("precision = ",metrics.precision_score(yTest,yPredict6))
    print("f1Score = ",f1_score(yTest, yPredict6, average='binary'))
    
    r1['confu'] = metrics.confusion_matrix(yTest,yPredict6)
    r1['accu'] = metrics.accuracy_score(yTest,yPredict6)
    r1['auc'] = metrics.auc(fpr,tpr)
    r1['recal'] = metrics.recall_score(yTest,yPredict6)
    r1['precision'] = metrics.precision_score(yTest,yPredict6)
    r1['f1'] = f1_score(yTest, yPredict6, average='binary')
    
    writeResults2[i] = r1
    
    print()

    results = crossValidiation(XX, Y, k = 10,classifier = bern, continious = False)
    
    print("accuracy = ",results['accuracy'])
    print("AUC = ", results['auc'])
    print("recal = ",results['recall'])
    print("precision = ",results['precision'])
    print("f1 = ",results['f1'])
    print()
    results['len snps'] = len(snpReduc[i])
    writeResults[i] = results
    
    print()
    
    
writeResult(path,'bernoulli',writeResults)
writeResultConf(path,'bernoulli',writeResults2)

Category =  low
new shape =  (4980, 2106)
new shape =  (1494, 2106)
new shape =  (3486, 2106)
(4980, 6318)
(1494, 6318)
(3486, 6318)
0.942340791738
[[2792    0]
 [ 201  493]]
error 6 =  0.0576592082616
RMSE6 =  0.240123318863
AUC =  0.855187319885
recal =  0.710374639769
precision =  1.0
f1Score =  0.830665543387

accuracy =  0.991164658635
AUC =  0.978179568219
recal =  0.956359136439
precision =  1.0
f1 =  0.977549472895


Category =  high
new shape =  (4980, 3309)
new shape =  (1494, 3309)
new shape =  (3486, 3309)
(4980, 9927)
(1494, 9927)
(3486, 9927)
0.683591508893
[[1985  807]
 [ 296  398]]
error 6 =  0.316408491107
RMSE6 =  0.562501992092
AUC =  0.642223458544
recal =  0.5734870317
precision =  0.330290456432
f1Score =  0.419167983149

accuracy =  0.68875502008
AUC =  0.66123206061
recal =  0.61443854893
precision =  0.350379286046
f1 =  0.445157844965


Category =  union
new shape =  (4980, 2295)
new shape =  (1494, 2295)
new shape =  (3486, 2295)
(4980, 6885)
(1494, 6885)
(34

In [25]:
XX = createNewTable(snpReduc['low'],X)
enc = OneHotEncoder(n_values =3) 
enc.fit(XX) 
XX = enc.fit_transform(XX)
XX = XX.toarray()
print(XX.shape)

re = cross_val_score(bern, XX, Y, cv=10)
print(sum(re)/10)

new shape =  (4980, 2106)
(4980, 6318)
0.990562636833


In [26]:
for i in categories:
    
    r1 = {}
    
    print("Category = ",i)
   
    enc = OneHotEncoder(n_values =3) 
    enc.fit(X) 
    XX = enc.fit_transform(X)
    XX = XX.toarray()
    print(XX.shape)

    enc.fit(xTraining) 
    xTraining1 = enc.fit_transform(xTraining)
    xTraining1 = xTraining1.toarray()
    print(xTraining1.shape)

    enc.fit(xTest) 
    xTest1 = enc.fit_transform(xTest)
    xTest1 = xTest1.toarray()
    print(xTest1.shape)
    

    bern = BernoulliNB()
    bern.fit(xTraining1, yTraining1)
    yPredict6 = bern.predict(xTest1)
    print(metrics.accuracy_score(yTest,yPredict6))
    print(metrics.confusion_matrix(yTest,yPredict6))
    error6 = mean_squared_error(yTest, yPredict6)
    print("error 6 = ",error6)
    RMSE6 = mean_squared_error(yTest,yPredict6)**0.5
    print("RMSE6 = ",RMSE6)

    fpr, tpr, thresholds = metrics.roc_curve(yTest,yPredict6)
    print("AUC = ", metrics.auc(fpr,tpr))
    print("recal = ",metrics.recall_score(yTest,yPredict6))
    print("precision = ",metrics.precision_score(yTest,yPredict6))
    print("f1Score = ",f1_score(yTest, yPredict6, average='binary'))
    
    r1['confu'] = metrics.confusion_matrix(yTest,yPredict6)
    r1['accu'] = metrics.accuracy_score(yTest,yPredict6)
    r1['auc'] = metrics.auc(fpr,tpr)
    r1['recal'] = metrics.recall_score(yTest,yPredict6)
    r1['precision'] = metrics.precision_score(yTest,yPredict6)
    r1['f1'] = f1_score(yTest, yPredict6, average='binary')
    
    writeResults2[i] = r1
    
    print()

    results = crossValidiation(XX, Y, k = 10,classifier = bern, continious = False)
    
    print("accuracy = ",results['accuracy'])
    print("AUC = ", results['auc'])
    print("recal = ",results['recall'])
    print("precision = ",results['precision'])
    print("f1 = ",results['f1'])
    print()
    results['len snps'] = len(snpReduc[i])
    writeResults[i] = results
    
    print()

Category =  low
(4980, 16245)
(1494, 16245)
(3486, 16245)
0.756167527252
[[2220  572]
 [ 278  416]]
error 6 =  0.243832472748
RMSE6 =  0.493793957788
AUC =  0.697276285476
recal =  0.599423631124
precision =  0.421052631579
f1Score =  0.494649227111

accuracy =  0.755823293173
AUC =  0.738293407295
recal =  0.707322853085
precision =  0.440477331948
f1 =  0.54099376221


Category =  high
(4980, 16245)
(1494, 16245)
(3486, 16245)
0.756167527252
[[2220  572]
 [ 278  416]]
error 6 =  0.243832472748
RMSE6 =  0.493793957788
AUC =  0.697276285476
recal =  0.599423631124
precision =  0.421052631579
f1Score =  0.494649227111

accuracy =  0.753212851406
AUC =  0.733312874378
recal =  0.699537384368
precision =  0.436323833182
f1 =  0.536676871306


Category =  union
(4980, 16245)
(1494, 16245)
(3486, 16245)
0.756167527252
[[2220  572]
 [ 278  416]]
error 6 =  0.243832472748
RMSE6 =  0.493793957788
AUC =  0.697276285476
recal =  0.599423631124
precision =  0.421052631579
f1Score =  0.49464922711