In [1]:
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import numpy as np
import os.path
import csv
from sklearn.svm import SVC
from sklearn.svm import NuSVC
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn import grid_search
import sklearn.linear_model as linear_model
import sklearn.metrics as metrics
from sklearn import preprocessing
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from random import randint
from sklearn import tree
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import math
import time

from metrics.Correlation import Correlation
from IO.Write import Write
from IO.Read import Read
from metrics.RSquare import RSquare
from DataSet.Dataset import DataSet


def setIdToName(aList):
    """Number the items of ``aList`` and return both lookup directions.

    Items are numbered from 0 in iteration order.

    Returns a dict with two entries:
      'nameToId' -- item -> position (a repeated item keeps its last position)
      'idToName' -- position -> item
    """
    idToName = dict(enumerate(aList))
    nameToId = {name: position for position, name in idToName.items()}

    return {'nameToId': nameToId, 'idToName': idToName}


def setSnpsCode(patients,chromosomes):
    """Call ``snpCode(chromosomes)`` on every value of ``patients``.

    The same ``patients`` dict is returned so the call can be chained.
    """
    for patient in patients.values():
        patient.snpCode(chromosomes)

    return patients
    



def tables(sampleX,sampleY,k):
    """Build the train/test tables for each of the ``k`` cross-validation runs.

    Parameters
    ----------
    sampleX : dict mapping a fold id to a 2-D array of features.
    sampleY : dict mapping the same fold ids to 1-D label sequences.
    k : number of runs; fold ids 1..k must be present in ``sampleX``/``sampleY``.

    Returns
    -------
    dict mapping each run (1..k) to a dict with keys 'trainX', 'trainY',
    'testX' and 'testY'.  The test entries are the fold's own objects (not
    copies).  The train entries are new integer arrays made of every other
    fold, stacked in the iteration order of ``sampleX``.
    """
    samples = {}

    for run in range(1,k+1):

        dataTestX = sampleX[run]
        dataTestY = sampleY[run]

        trainFolds = [fold for fold in sampleX if fold != run]

        if trainFolds:
            # Stack whole folds at once instead of copying cell by cell.
            dataTrainX = np.concatenate(
                [np.asarray(sampleX[fold]) for fold in trainFolds]).astype(int)
            # Only as many labels as the fold has feature rows are used.
            dataTrainY = np.concatenate(
                [np.asarray(sampleY[fold])[:len(sampleX[fold])]
                 for fold in trainFolds]).astype(int)
        else:
            # A single fold leaves nothing to train on: keep the column count
            # so downstream code still sees a well-formed (0, n_columns) table.
            dataTrainX = np.zeros((0,len(sampleX[1].T)),dtype = int)
            dataTrainY = np.zeros((0,),dtype = int)

        samples[run] = {
            'trainX': dataTrainX,
            'trainY': dataTrainY,
            'testX': dataTestX,
            'testY': dataTestY,
        }

    return samples
    
def kSampleData(k,X,Y):
    """Randomly partition the rows of ``X``/``Y`` into ``k`` folds.

    Every row is assigned to exactly one fold.  The first ``k - 1`` folds get
    ``int(len(X) / k)`` rows each and the last fold gets the remainder.  The
    fold contents are copied into integer arrays and handed to ``tables`` to
    build the per-run train/test tables, which is what this function returns.

    Parameters
    ----------
    k : number of folds.
    X : 2-D array of features (rows are samples).
    Y : labels, indexable by row position.
    """
    # Local import: only randint is imported from random at file level.
    from random import sample

    nRows = len(X)
    foldSize = int(nRows / k)

    # One random permutation replaces drawing random rows and retrying on
    # every already-used one; each row still lands in exactly one fold.
    order = sample(range(nRows), nRows)

    features = np.asarray(X)
    labels = np.asarray(Y)

    sampleX = {}
    sampleY = {}
    start = 0

    for fold in range(1, k + 1):

        # The last fold absorbs the rows left over by the integer division.
        size = foldSize if fold < k else nRows - ((k-1) * foldSize)
        rows = order[start:start + size]
        start += size

        sampleX[fold] = features[rows, :].astype(int)
        sampleY[fold] = labels[rows].astype(int)

    return tables(sampleX,sampleY,k)




def createNewTable(snps,X):
    """Return a new integer table holding only the columns of ``X`` in ``snps``.

    Parameters
    ----------
    snps : sequence of column indices; output column ``i`` is ``X[:, snps[i]]``.
    X : 2-D array with one row per sample.

    Returns
    -------
    A new ``(len(X), len(snps))`` integer array (never a view of ``X``).
    The resulting shape is printed as a progress message.
    """
    columns = np.asarray(snps, dtype = int).reshape(-1)

    # A single column selection; astype() makes the integer copy.
    newX = np.asarray(X)[:, columns].astype(int)

    print("new shape = ",newX.shape)

    return newX



def crossValidiation(X, Y, k = 1, continious = True, classifier = None,OLS = False,Logistic = False):
    """Run k-fold cross-validation and return the mean binary-classification scores.

    The folds come from ``kSampleData(k, X, Y)``.  For every run the model is
    trained on that run's 'trainX'/'trainY' and scored on its 'testX'/'testY'.

    Parameters
    ----------
    X, Y : feature table and labels, forwarded to ``kSampleData``.
    k : number of folds.
    continious : when true, every prediction is replaced by 0 or 1, whichever
        is nearer (a prediction exactly half-way becomes 0).
    classifier : object offering ``fit(X, y)`` and ``predict(X)`` (plus
        ``predict_proba(X)`` when ``Logistic`` is true).  ``fit`` is called on
        it for every fold.  Not needed when ``OLS`` is true.
    OLS : when true, a ``sm.OLS(trainY, trainX)`` model is fitted per fold and
        used instead of ``classifier``.
    Logistic : when true, predictions are overwritten with 1 where
        ``predict_proba`` gives the second class a probability >= 0.8, else 0.

    Returns
    -------
    dict with keys 'accuracy', 'auc', 'recall', 'precision' and 'f1', each the
    sum of the per-fold scores divided by ``k``; ``None`` (after printing a
    message) when no classifier was supplied and ``OLS`` is false.  The AUC is
    computed from the final 0/1 predictions, not from scores.
    """
    # Compare against None instead of truth-testing the estimator: objects
    # that define __len__ (some scikit-learn ensembles do) are not reliably
    # truthy.  The OLS path builds its own model, so it needs no classifier.
    if classifier is None and not OLS:
        print("wrong!!!!!!! you have to choise a classifier")
        return

    accuracy = []
    auc = []
    recall = []
    precision = []
    f1Score = []

    samples = kSampleData(k,X,Y)

    for run in range(1, k + 1):

        trainX = samples[run]['trainX']
        trainY = samples[run]['trainY']

        testX = samples[run]['testX']
        testY = samples[run]['testY']

        if OLS:
            classifier = sm.OLS(trainY,trainX)
            yPredict = classifier.fit().predict(testX)
        else:
            classifier.fit(trainX, trainY)
            yPredict = classifier.predict(testX)

        if continious:
            # Snap each continuous prediction to the nearer of the two labels.
            for i in range(len(yPredict)):
                if (abs(0 - yPredict[i]) - abs(1 - yPredict[i])) <= 1e-10 :
                    yPredict[i] = 0
                else:
                    yPredict[i] = 1

        if Logistic:
            # Require a probability of at least 0.8 before predicting class 1.
            probabilities = classifier.predict_proba(testX)

            for i in range(len(probabilities)):
                if probabilities[i][1] >= 0.8:
                    yPredict[i] = 1
                else:
                    yPredict[i] = 0

        fpr, tpr, thresholds = metrics.roc_curve(testY,yPredict)

        accuracy.append(metrics.accuracy_score(testY,yPredict))
        auc.append(metrics.auc(fpr,tpr))
        recall.append(metrics.recall_score(testY,yPredict))
        precision.append(metrics.precision_score(testY,yPredict))
        f1Score.append(f1_score(testY, yPredict, average='binary'))

    results = {}
    results['accuracy'] = sum(accuracy, 0.0) / k
    results['auc'] = sum(auc, 0.0) / k
    results['recall'] = sum(recall, 0.0) / k
    results['precision'] = sum(precision, 0.0) / k
    results['f1'] = sum(f1Score, 0.0) / k

    return results

def writeResultConf(path,name,results):
    """Write per-category hold-out results to ``<name>Confu.txt`` in a new folder.

    The folder is ``path + name + 'Confu' + " ( dd-mm-YYYY ) "`` followed by a
    backslash; if it already exists, ``_1``, ``_2``, ... is appended until an
    unused name is found.  The separator is a hard-coded backslash, so
    ``path`` is expected to be a Windows-style prefix ending in one.

    Parameters
    ----------
    path : string prefix the folder name is appended to.
    name : label used in the folder name, the file name and the first line.
    results : dict mapping a category name (str) to a dict of
        ``metric name (str) -> value``; values are written with ``str()``.
    """
    prefix = path + name + 'Confu' + " ( " + time.strftime("%d-%m-%Y") + " ) "
    folder = prefix + '\\'

    i=1
    while os.path.exists(folder):
        folder = prefix + '_' + str(i) + '\\'
        i += 1

    os.makedirs(folder)

    file = folder + name + 'Confu' + '.txt'

    # The context manager closes the file even if a write fails part-way.
    with open(file,'w') as out:

        out.write(name + '\n')

        for category in results:

            results1 = results[category]
            out.write("Category = " + category + '\n')

            for j in results1:
                out.write(j + " = " + str(results1[j]) + '\n')

            out.write('\n')
            out.write('\n')

    

def writeResult(path,name,results):
    """Write per-category cross-validation results to ``<name>.txt`` in a new folder.

    The folder is ``path + name + " ( dd-mm-YYYY ) "`` followed by a
    backslash; if it already exists, ``_1``, ``_2``, ... is appended until an
    unused name is found.  The separator is a hard-coded backslash, so
    ``path`` is expected to be a Windows-style prefix ending in one.

    Parameters
    ----------
    path : string prefix the folder name is appended to.
    name : label used in the folder name, the file name and the first line.
    results : dict mapping a category name (str) to a dict of
        ``metric name (str) -> value``; values are written with ``str()``.
    """
    prefix = path + name + " ( " + time.strftime("%d-%m-%Y") + " ) "
    folder = prefix + '\\'

    i=1
    while os.path.exists(folder):
        folder = prefix + '_' + str(i) + '\\'
        i += 1

    os.makedirs(folder)

    file = folder + name + '.txt'

    # The context manager closes the file even if a write fails part-way.
    with open(file,'w') as out:

        out.write(name + '\n')

        for category in results:

            results1 = results[category]
            out.write("Category = " + category + '\n')

            for j in results1:
                out.write(j + " = " + str(results1[j]) + '\n')

            out.write('\n')
            out.write('\n')




In [2]:
# Data folder for this run: exactly one `path` assignment below is active,
# the others are alternative data sets kept for reference.
#path = 'D:\\newSet\\p = 0.0001\\' # set the folder that will be processed
#path = 'D:\\newSet\\fisher\\pvalue = 0.001\\'
#path = 'D:\\newSet\\fisher\\pvalue = 0.0001\\'
#path = 'D:\\newSet\\maf = 0.05\\pvalue = 0.001\\'
path = 'D:\\newSet\\assoc\\pvalue = 0.01\\'
#path = 'D:\\newSet\\assoc\\pvalue = 1e-05\\'
#path = 'D:\\newSet\\assoc\\pvalue = 5e-08\\'
#path = 'C:\\Users\\ANTONIS\\Desktop\\p = 0.0001\\'
numberOfChromosomes = 22  # number of chromosomes
# NOTE(review): the three dicts below are not read anywhere in the visible
# part of the notebook -- confirm they are still needed.
patientsTrain = {}
patientsTest = {}
allPatients = {}

chromosomes = {}

# Project I/O helpers, both bound to the same folder and chromosome count.
read = Read(path,numberOfChromosomes)
write = Write(path,numberOfChromosomes)

# Load the patients and the SNPs, then write both lists back out.
# NOTE(review): presumably the written lists feed the external batch step
# mentioned under this cell -- confirm.
patients = read.readPatients('Patients.txt')
chromosomes = read.readSnps(".assoc")
write.writePatientsList(patients,'patient.txt')
write.writeSnpsList(chromosomes)


# run train_lgen bat and test_leg bat

In [None]:

# Build name <-> index lookups for patients and SNPs, then load the feature
# table X and the labels Y if the SNP-code file already exists.
snps = read.getListOfSnps()
ids = {} 
idsTest = {}

ids['patients'] = setIdToName(list(patients.keys()))
ids['snps'] = setIdToName(snps)


if os.path.exists(path + 'snpCode.txt'):
    print("mphka")
   # patients = read.readSnpsCode(patients,ids)
    X, Y = read.readSnpsCode(patients,ids)
    
else:
    # NOTE(review): X and Y are not assigned on this branch, so the
    # `X.shape` line below raises NameError when snpCode.txt is missing.
    # Presumably snpCode.txt has to be produced externally and this cell
    # re-run -- confirm.
    write.writeSnpLog(read.getNumberOfPatients(),read.getNumberOfSnps(),chromosomes)
    
    
X.shape

# Correlation

In [4]:
# Hold out 10% of the rows as a test set.  The seed is drawn at random on
# every execution, so the split differs from run to run.
splitSeed = randint(0,2017)
xTraining, xTest, yTraining1, yTest = train_test_split(
    X, Y, test_size=0.1, random_state=splitSeed)

# Report the shape of the full table and of each part of the split.
for label, table in (("mergex = ", X),
                     ("xTrain = ", xTraining),
                     ("xTest = ", xTest),
                     ("yTrain = ", yTraining1),
                     ("yTest = ", yTest)):
    print(label, table.shape)

mergex =  (4980, 7799)
xTrain =  (4482, 7799)
xTest =  (498, 7799)
yTrain =  (4482,)
yTest =  (498,)


In [5]:
# Correlation helper (project class metrics.Correlation) built on the full
# table X; the commented lines are the alternative metric / input choices.
# NOTE(review): X contains the held-out test rows as well, so any SNP subset
# derived from `cor` has seen the test data; the xTraining variants below
# would avoid that -- confirm which is intended.
cor = Correlation(X)
#cor = RSquare(X)

#cor = Correlation(xTraining)
#cor = RSquare(xTraining)

In [6]:
# Maps a subset label ('high', 'low', 'union', ...) to the list of SNP column
# indices that subset keeps; filled in by the cells that follow.
snpReduc = {}

# Correlation high

In [7]:
# 'high' subset, taken from the project Correlation helper with 0.7 passed in.
# NOTE(review): 0.7 is presumably the correlation threshold -- confirm
# against Correlation.getHighCorrelationSnps.
snpReduc['high'] = cor.getHighCorrelationSnps(0.7)

count =  6916
len snpsRed =  6916


# Correlation Low

In [32]:
# 'low' subset, taken from the project Correlation helper.
# NOTE(review): 0.7 is presumably the correlation threshold and the meaning
# of c=40 is not visible from here -- confirm against
# Correlation.getLowCorrelationSnps.
snpsRed = []
snpReduc['low'] = cor.getLowCorrelationSnps(0.7,c=40)

count =  7799
len snpsRed =  7799


# Separate cases and controls

In [9]:
# Split the training rows by label: 0 -> control, 1 -> case.  Rows carrying
# any other label are left out of both groups.
yCo = []
yCa = []

trainLabels = np.asarray(yTraining1)

# Row positions (within xTraining) of each group, in ascending order.
controls = np.flatnonzero(trainLabels == 0).tolist()
cases = np.flatnonzero(trainLabels == 1).tolist()

# Two-way lookups between a training-row position and its position inside
# the group table.
cosIds = dict(enumerate(controls))
idsCos = {row: pos for pos, row in cosIds.items()}

caIds = dict(enumerate(cases))
idsCa = {row: pos for pos, row in caIds.items()}

# Float copies of the selected rows, one table per group.
control = np.array(xTraining[controls,:], dtype = float)
case = np.array(xTraining[cases,:], dtype = float)

print("cases = ",case.shape)
print("controls = ",control.shape)


cases =  (903, 7799)
controls =  (3579, 7799)


In [10]:
# One correlation helper per group: r21 for the controls, r22 for the cases.
# The commented lines switch both to the project RSquare class instead.
#r21 = RSquare(control)
#r22 = RSquare(case)

r21 = Correlation(control)
r22 = Correlation(case)


In [11]:
# For controls and cases separately, count for every SNP how many other SNPs
# it is paired with at a matrix value above 0.7, then list the SNPs that have
# at least one such partner (c1 for controls, c2 for cases).
#co = r21.getRMatrix()
#ca = r22.getRMatrix()

co = r21.getCorrMatrix()
ca = r22.getCorrMatrix()

print("co.T = ",len(co.T))


def _countHighPairs(matrix, nRows, nCols, threshold = 0.7):
    """Count, per column index, the pairs (i, j) with i < j whose value exceeds ``threshold``.

    Only the part of ``matrix`` above the diagonal is read.  A pair counts
    when ``matrix[i, j] - threshold >= 1e-12`` and is credited to both ``i``
    and ``j``.  Returns a dict ``index -> count`` for every index below
    ``nCols``.
    """
    values = np.asarray(matrix)
    counts = np.zeros((nCols,), dtype = int)

    # One vectorised comparison per row instead of one per matrix cell.
    for i in range(min(nRows, nCols)):
        hits = (values[i, i+1:nCols] - threshold) >= 1e-12
        counts[i] += hits.sum()
        counts[i+1:nCols] += hits

    return dict(enumerate(counts.tolist()))


# Both matrices are scanned over the index ranges of `co`.
countCo = _countHighPairs(co, len(co), len(co.T))
countCa = _countHighPairs(ca, len(co), len(co.T))

c1 = [i for i in countCo if countCo[i] > 0]
c2 = [i for i in countCa if countCa[i] > 0]

count1 = len(c1)
count2 = len(c2)

print("count1 = ",count1)
print("count2 = ",count2)
print("c1 =",len(c1))
print("c2 =",len(c2))

co.T =  7799
count1 =  6911
count1 =  6925
c1 = 6911
c2 = 6925


# Case U CONTROL

In [12]:
# SNPs flagged in exactly one of the two groups: those only in c1 (controls)
# followed by those only in c2 (cases).  Stored under the 'union' label.
c1H = list(set(c1) - set(c2))
c2H = list(set(c2) - set(c1))

snpsRed = c1H + c2H

print("c1 =",len(c1))
print("c2 =",len(c2))
print("c1H =",len(c1H))
print("c2H =",len(c2H))
print("len snpsRed = ",len(snpsRed))

snpReduc['union'] = snpsRed

c1 = 6911
c2 = 6925
c1H = 8
c2H = 22
len snpsRed =  30


# Case & Control

In [13]:
# SNPs flagged in both groups (intersection of c1 and c2), stored as 'tomi'.
c2Set = set(c2)

# Cross-check: number of c1 entries that also occur in c2.  Membership is
# tested against a set so the check is linear instead of quadratic.
count = sum(1 for i in c1 if i in c2Set)

snpsRed = list(set(c1) & c2Set)
print("c1 =",len(c1))
print("c2 =",len(c2))
print("len snpsRed = ",len(snpsRed))
print("count = ",count)
snpReduc['tomi'] = snpsRed

c1 = 6911
c2 = 6925
len snpsRed =  6903
count =  6903


# 1 - ((Case U Control) + (Case & Control))

In [14]:
# SNPs flagged in neither group: every column index that is not in c1 or c2.
# Stored under the 'ektosuniontomi' label.
c1H = list(set(c1) - set(c2))
c2H = list(set(c2) - set(c1))
c3H = list(set(c1) & set(c2))

# Only-in-c1, only-in-c2 and in-both together cover every flagged SNP.
helpList = c1H + c2H + c3H

# Set lookup keeps the scan over all columns linear.
flagged = set(helpList)
snpsRed = [i for i in range(len(co.T)) if i not in flagged]

print("c1H =",len(c1H))
print("c2H =",len(c2H))
print("c3H =",len(c3H))
print("c1 =",len(c1))
print("c2 =",len(c2))
print("helpList =",len(helpList)) 
print("len snpsRed = ",len(snpsRed))
# The label names the quantity itself; a literal column count goes stale as
# soon as the data set changes.
print("len(co.T) - len(helpList) = ", len(co.T) - len(helpList))

snpReduc['ektosuniontomi'] = snpsRed

c1H = 8
c2H = 22
c3H = 6903
c1 = 6911
c2 = 6925
helpList = 6933
len snpsRed =  866
5415 - len(helpList) =  866


# 1 - (Case & Control)

In [15]:
# Every column index that is NOT flagged in both groups (complement of the
# intersection of c1 and c2).  Stored under the 'ektostomi' label.
c2Set = set(c2)

# Cross-check: number of c1 entries that also occur in c2.
count = sum(1 for i in c1 if i in c2Set)

cHelp = list(set(c1) & c2Set)

# Set lookup keeps the scan over all columns linear.
inBoth = set(cHelp)
snpsRed = [i for i in range(len(co.T)) if i not in inBoth]

print("c1 =",len(c1))
print("c2 =",len(c2))
print("len cHelp = ",len(cHelp))
print("count = ",count)
print("len snpsRed = ",len(snpsRed))
snpReduc['ektostomi'] = snpsRed

c1 = 6911
c2 = 6925
len cHelp =  6903
count =  6903
len snpsRed =  896


In [16]:
# SNP subset labels evaluated by every classifier cell, in reporting order.
categories = ['low','high','union','tomi','ektosuniontomi','ektostomi']

# One independent, initially empty result dict per category:
# writeResults holds cross-validation scores, writeResults2 hold-out scores.
writeResults = {category: {} for category in categories}
writeResults2 = {category: {} for category in categories}

# SVM KERNEL

In [17]:
# Linear-kernel SVM: for every SNP subset, score the fixed hold-out split and
# a 10-fold cross-validation, then write both result sets to disk.
for i in categories:

    print("Category = ",i)

    XX = createNewTable(snpReduc[i],X)
    xTraining1 = createNewTable(snpReduc[i],xTraining)
    xTest1 = createNewTable(snpReduc[i],xTest)

    # fit_transform() both fits and transforms, so no separate fit() call is
    # needed.  n_values=3 gives three indicator columns per SNP.
    enc = OneHotEncoder(n_values =3)
    XX = enc.fit_transform(XX).toarray()
    print(XX.shape)

    xTraining1 = enc.fit_transform(xTraining1).toarray()
    print(xTraining1.shape)

    xTest1 = enc.fit_transform(xTest1).toarray()
    print(xTest1.shape)

    clf = SVC(kernel='linear')
    clf.fit(xTraining1, yTraining1)
    yPredict2 = clf.predict(xTest1)

    # Hold-out scores: each one is computed once, then printed and stored.
    fpr, tpr, thresholds = metrics.roc_curve(yTest,yPredict2)
    error2 = mean_squared_error(yTest, yPredict2)
    RMSE2 = error2**0.5

    r1 = {
        'confu': metrics.confusion_matrix(yTest,yPredict2),
        'accu': metrics.accuracy_score(yTest,yPredict2),
        'auc': metrics.auc(fpr,tpr),
        'recal': metrics.recall_score(yTest,yPredict2),
        'precision': metrics.precision_score(yTest,yPredict2),
        'f1': f1_score(yTest, yPredict2, average='binary'),
    }

    print(r1['accu'])
    print(r1['confu'])
    print("error 2 = ",error2)
    print("RMSE2 = ",RMSE2)
    print("AUC = ", r1['auc'])
    print("recal = ",r1['recal'])
    print("precision = ",r1['precision'])
    print("f1Score = ",r1['f1'])

    writeResults2[i] = r1
    print()

    # crossValidiation calls clf.fit again on every fold of the full table.
    results = crossValidiation(XX, Y, k = 10, classifier = clf, continious = False)
    print("accuracy = ",results['accuracy'])
    print("AUC = ", results['auc'])
    print("recal = ",results['recall'])
    print("precision = ",results['precision'])
    print("f1 = ",results['f1'])
    results['len snps'] = len(snpReduc[i])
    writeResults[i] = results

    print()


writeResult(path,'kernel',writeResults)
writeResultConf(path,'kernel',writeResults2)

Category =  low
new shape =  (4980, 883)
new shape =  (4482, 883)
new shape =  (498, 883)
(4980, 2649)
(4482, 2649)
(498, 2649)
0.90562248996
[[364  19]
 [ 28  87]]
error 2 =  0.0943775100402
RMSE2 =  0.307209228442
AUC =  0.85345669202
recal =  0.75652173913
precision =  0.820754716981
f1Score =  0.787330316742

accuracy =  0.906626506024
AUC =  0.858554354257
recal =  0.776933920764
precision =  0.769972440226
f1 =  0.77244680583

Category =  high
new shape =  (4980, 6916)
new shape =  (4482, 6916)
new shape =  (498, 6916)
(4980, 20748)
(4482, 20748)
(498, 20748)
0.873493975904
[[368  15]
 [ 48  67]]
error 2 =  0.126506024096
RMSE2 =  0.355676853473
AUC =  0.771722102395
recal =  0.582608695652
precision =  0.817073170732
f1Score =  0.680203045685

accuracy =  0.885542168675
AUC =  0.81822878043
recal =  0.704449734841
precision =  0.726299794343
f1 =  0.714585017675

Category =  union
new shape =  (4980, 30)
new shape =  (4482, 30)
new shape =  (498, 30)
(4980, 90)
(4482, 90)
(498, 

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


accuracy =  0.795582329317
AUC =  0.5
recal =  0.0
precision =  0.0
f1 =  0.0

Category =  tomi
new shape =  (4980, 6903)
new shape =  (4482, 6903)
new shape =  (498, 6903)
(4980, 20709)
(4482, 20709)
(498, 20709)
0.875502008032
[[367  16]
 [ 46  69]]
error 2 =  0.124497991968
RMSE2 =  0.352842729793
AUC =  0.77911227154
recal =  0.6
precision =  0.811764705882
f1Score =  0.69

accuracy =  0.884337349398
AUC =  0.815352326323
recal =  0.69839297326
precision =  0.726984939935
f1 =  0.711822130158

Category =  ektosuniontomi
new shape =  (4980, 866)
new shape =  (4482, 866)
new shape =  (498, 866)
(4980, 2598)
(4482, 2598)
(498, 2598)
0.901606425703
[[361  22]
 [ 27  88]]
error 2 =  0.0983935742972
RMSE2 =  0.313677500464
AUC =  0.85388806902
recal =  0.765217391304
precision =  0.8
f1Score =  0.782222222222

accuracy =  0.908032128514
AUC =  0.858627463515
recal =  0.775103741846
precision =  0.774363310169
f1 =  0.773915101662

Category =  ektostomi
new shape =  (4980, 896)
new shape 

In [18]:
#re = cross_val_score(clf, XX, Y, cv=10)
#print(sum(re)/10)

# LINEAR LOGISTIC REGRESSION 

In [19]:
# Logistic regression: for every SNP subset, score the fixed hold-out split
# and a 10-fold cross-validation, then write both result sets to disk.
for i in categories:

    print("Category = ",i)

    XX = createNewTable(snpReduc[i],X)
    xTraining1 = createNewTable(snpReduc[i],xTraining)
    xTest1 = createNewTable(snpReduc[i],xTest)

    # fit_transform() both fits and transforms, so no separate fit() call is
    # needed.  n_values=3 gives three indicator columns per SNP.
    enc = OneHotEncoder(n_values =3)
    XX = enc.fit_transform(XX).toarray()
    print(XX.shape)

    xTraining1 = enc.fit_transform(xTraining1).toarray()
    print(xTraining1.shape)

    xTest1 = enc.fit_transform(xTest1).toarray()
    print(xTest1.shape)

    lr_clf = linear_model.LogisticRegression()
    lr_clf.fit(xTraining1, yTraining1)
    yPredict4 = lr_clf.predict(xTest1)

    # Hold-out scores: each one is computed once, then printed and stored.
    fpr, tpr, thresholds = metrics.roc_curve(yTest,yPredict4)
    error4 = mean_squared_error(yTest, yPredict4)
    RMSE4 = error4**0.5

    r1 = {
        'confu': metrics.confusion_matrix(yTest,yPredict4),
        'accu': metrics.accuracy_score(yTest,yPredict4),
        'auc': metrics.auc(fpr,tpr),
        'recal': metrics.recall_score(yTest,yPredict4),
        'precision': metrics.precision_score(yTest,yPredict4),
        'f1': f1_score(yTest, yPredict4, average='binary'),
    }

    print(r1['accu'])
    print(r1['confu'])
    print("error 4 = ",error4)
    print("RMSE4 = ",RMSE4)
    print("AUC = ", r1['auc'])
    print("recal = ",r1['recal'])
    print("precision = ",r1['precision'])
    print("f1Score = ",r1['f1'])

    writeResults2[i] = r1

    print()

    # crossValidiation calls lr_clf.fit again on every fold of the full table.
    results = crossValidiation(XX, Y, k = 10, classifier = lr_clf, continious = False)
    print("accuracy = ",results['accuracy'])
    print("AUC = ", results['auc'])
    print("recal = ",results['recall'])
    print("precision = ",results['precision'])
    print("f1 = ",results['f1'])
    results['len snps'] = len(snpReduc[i])
    writeResults[i] = results

    print()


writeResult(path,'llr',writeResults)
writeResultConf(path,'llr',writeResults2)

Category =  low
new shape =  (4980, 883)
new shape =  (4482, 883)
new shape =  (498, 883)
(4980, 2649)
(4482, 2649)
(498, 2649)
0.929718875502
[[371  12]
 [ 23  92]]
error 4 =  0.070281124498
RMSE4 =  0.26510587413
AUC =  0.884334203655
recal =  0.8
precision =  0.884615384615
f1Score =  0.840182648402

accuracy =  0.918674698795
AUC =  0.860331197978
recal =  0.761553134368
precision =  0.827683655545
f1 =  0.792152859446

Category =  high
new shape =  (4980, 6916)
new shape =  (4482, 6916)
new shape =  (498, 6916)
(4980, 20748)
(4482, 20748)
(498, 20748)
0.881526104418
[[372  11]
 [ 48  67]]
error 4 =  0.118473895582
RMSE4 =  0.34420037127
AUC =  0.77694403451
recal =  0.582608695652
precision =  0.858974358974
f1Score =  0.694300518135

accuracy =  0.891365461847
AUC =  0.821327557909
recal =  0.702591054116
precision =  0.751119787394
f1 =  0.725760226138

Category =  union
new shape =  (4980, 30)
new shape =  (4482, 30)
new shape =  (498, 30)
(4980, 90)
(4482, 90)
(498, 90)
0.7650

In [20]:
#re = cross_val_score(lr_clf, XX, Y, cv=10)
#print(sum(re)/10)

In [21]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes: for every SNP subset, score the fixed hold-out split
# and a 10-fold cross-validation, then write both result sets to disk.
for i in categories:

    print("Category = ",i)

    XX = createNewTable(snpReduc[i],X)
    xTraining1 = createNewTable(snpReduc[i],xTraining)
    xTest1 = createNewTable(snpReduc[i],xTest)

    # fit_transform() both fits and transforms, so no separate fit() call is
    # needed.  n_values=3 gives three indicator columns per SNP.
    enc = OneHotEncoder(n_values =3)
    XX = enc.fit_transform(XX).toarray()
    print(XX.shape)

    xTraining1 = enc.fit_transform(xTraining1).toarray()
    print(xTraining1.shape)

    xTest1 = enc.fit_transform(xTest1).toarray()
    print(xTest1.shape)

    gnb = GaussianNB()
    gnb.fit(xTraining1, yTraining1)
    yPredict6 = gnb.predict(xTest1)

    # Hold-out scores: each one is computed once, then printed and stored.
    fpr, tpr, thresholds = metrics.roc_curve(yTest,yPredict6)
    error6 = mean_squared_error(yTest, yPredict6)
    RMSE6 = error6**0.5

    r1 = {
        'confu': metrics.confusion_matrix(yTest,yPredict6),
        'accu': metrics.accuracy_score(yTest,yPredict6),
        'auc': metrics.auc(fpr,tpr),
        'recal': metrics.recall_score(yTest,yPredict6),
        'precision': metrics.precision_score(yTest,yPredict6),
        'f1': f1_score(yTest, yPredict6, average='binary'),
    }

    print(r1['accu'])
    print(r1['confu'])
    print("error 6 = ",error6)
    print("RMSE6 = ",RMSE6)
    print("AUC = ", r1['auc'])
    print("recal = ",r1['recal'])
    print("precision = ",r1['precision'])
    print("f1Score = ",r1['f1'])

    writeResults2[i] = r1

    print()

    # crossValidiation calls gnb.fit again on every fold of the full table.
    results = crossValidiation(XX, Y, k = 10,classifier = gnb, continious = False)

    print("accuracy = ",results['accuracy'])
    print("AUC = ", results['auc'])
    print("recal = ",results['recall'])
    print("precision = ",results['precision'])
    print("f1 = ",results['f1'])
    print()
    results['len snps'] = len(snpReduc[i])
    writeResults[i] = results

    print()


writeResult(path,'gnb',writeResults)
writeResultConf(path,'gnb',writeResults2)

Category =  low
new shape =  (4980, 883)
new shape =  (4482, 883)
new shape =  (498, 883)
(4980, 2649)
(4482, 2649)
(498, 2649)
0.427710843373
[[109 274]
 [ 11 104]]
error 6 =  0.572289156627
RMSE6 =  0.756497955467
AUC =  0.594471563174
recal =  0.904347826087
precision =  0.275132275132
f1Score =  0.421906693712

accuracy =  0.432530120482
AUC =  0.609521284731
recal =  0.90860519027
precision =  0.252796699268
f1 =  0.394928662656


Category =  high
new shape =  (4980, 6916)
new shape =  (4482, 6916)
new shape =  (498, 6916)
(4980, 20748)
(4482, 20748)
(498, 20748)
0.481927710843
[[133 250]
 [  8 107]]
error 6 =  0.518072289157
RMSE6 =  0.719772387048
AUC =  0.638846634124
recal =  0.930434782609
precision =  0.299719887955
f1Score =  0.453389830508

accuracy =  0.512449799197
AUC =  0.651871103621
recal =  0.887581627747
precision =  0.281714384667
f1 =  0.426884930339


Category =  union
new shape =  (4980, 30)
new shape =  (4482, 30)
new shape =  (498, 30)
(4980, 90)
(4482, 90)
(

# BERNOULLI


In [22]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli naive Bayes: for every SNP subset, score the fixed hold-out split
# and a 10-fold cross-validation, then write both result sets to disk.
for i in categories:

    print("Category = ",i)

    XX = createNewTable(snpReduc[i],X)
    xTraining1 = createNewTable(snpReduc[i],xTraining)
    xTest1 = createNewTable(snpReduc[i],xTest)

    # fit_transform() both fits and transforms, so no separate fit() call is
    # needed.  n_values=3 gives three indicator columns per SNP.
    enc = OneHotEncoder(n_values =3)
    XX = enc.fit_transform(XX).toarray()
    print(XX.shape)

    xTraining1 = enc.fit_transform(xTraining1).toarray()
    print(xTraining1.shape)

    xTest1 = enc.fit_transform(xTest1).toarray()
    print(xTest1.shape)

    bern = BernoulliNB()
    bern.fit(xTraining1, yTraining1)
    yPredict6 = bern.predict(xTest1)

    # Hold-out scores: each one is computed once, then printed and stored.
    fpr, tpr, thresholds = metrics.roc_curve(yTest,yPredict6)
    error6 = mean_squared_error(yTest, yPredict6)
    RMSE6 = error6**0.5

    r1 = {
        'confu': metrics.confusion_matrix(yTest,yPredict6),
        'accu': metrics.accuracy_score(yTest,yPredict6),
        'auc': metrics.auc(fpr,tpr),
        'recal': metrics.recall_score(yTest,yPredict6),
        'precision': metrics.precision_score(yTest,yPredict6),
        'f1': f1_score(yTest, yPredict6, average='binary'),
    }

    print(r1['accu'])
    print(r1['confu'])
    print("error 6 = ",error6)
    print("RMSE6 = ",RMSE6)
    print("AUC = ", r1['auc'])
    print("recal = ",r1['recal'])
    print("precision = ",r1['precision'])
    print("f1Score = ",r1['f1'])

    writeResults2[i] = r1

    print()

    # crossValidiation calls bern.fit again on every fold of the full table.
    results = crossValidiation(XX, Y, k = 10,classifier = bern, continious = False)

    print("accuracy = ",results['accuracy'])
    print("AUC = ", results['auc'])
    print("recal = ",results['recall'])
    print("precision = ",results['precision'])
    print("f1 = ",results['f1'])
    print()
    results['len snps'] = len(snpReduc[i])
    writeResults[i] = results

    print()


writeResult(path,'bernoulli',writeResults)
writeResultConf(path,'bernoulli',writeResults2)

Category =  low
new shape =  (4980, 883)
new shape =  (4482, 883)
new shape =  (498, 883)
(4980, 2649)
(4482, 2649)
(498, 2649)
0.947791164659
[[373  10]
 [ 16  99]]
error 6 =  0.0522088353414
RMSE6 =  0.228492527977
AUC =  0.917379952321
recal =  0.860869565217
precision =  0.908256880734
f1Score =  0.883928571429

accuracy =  0.935742971888
AUC =  0.902231382684
recal =  0.845559848611
precision =  0.840396953352
f1 =  0.842630804385


Category =  high
new shape =  (4980, 6916)
new shape =  (4482, 6916)
new shape =  (498, 6916)
(4980, 20748)
(4482, 20748)
(498, 20748)
0.726907630522
[[282 101]
 [ 35  80]]
error 6 =  0.273092369478
RMSE6 =  0.522582404486
AUC =  0.715972301056
recal =  0.695652173913
precision =  0.441988950276
f1Score =  0.540540540541

accuracy =  0.74437751004
AUC =  0.727198994106
recal =  0.698332061221
precision =  0.424099872758
f1 =  0.527068390862


Category =  union
new shape =  (4980, 30)
new shape =  (4482, 30)
new shape =  (498, 30)
(4980, 90)
(4482, 90)


In [23]:
# Cross-check of the hand-written cross-validation: score the 'low' subset
# with scikit-learn's cross_val_score and print the mean fold score.
nFolds = 10

XX = createNewTable(snpReduc['low'],X)

# fit_transform() both fits and transforms, so no separate fit() call is needed.
enc = OneHotEncoder(n_values =3)
XX = enc.fit_transform(XX).toarray()
print(XX.shape)

# `bern` is the estimator left over from the Bernoulli cell.
re = cross_val_score(bern, XX, Y, cv=nFolds)
print(sum(re)/nFolds)

new shape =  (4980, 883)
(4980, 2649)
0.934742077952


In [24]:
# Bernoulli naive Bayes on ALL SNPs (no subset selection).
# NOTE(review): the features below do not depend on the category, so the
# hold-out scores are identical for every category; only 'len snps' differs
# and it reports the size of snpReduc[i], not the number of SNPs actually
# used -- confirm this loop is meant as a baseline.

# The encoded tables are the same for every category, so build them once.
# fit_transform() both fits and transforms; no separate fit() call is needed.
enc = OneHotEncoder(n_values =3)
XX = enc.fit_transform(X).toarray()
xTraining1 = enc.fit_transform(xTraining).toarray()
xTest1 = enc.fit_transform(xTest).toarray()

for i in categories:

    print("Category = ",i)

    print(XX.shape)
    print(xTraining1.shape)
    print(xTest1.shape)

    bern = BernoulliNB()
    bern.fit(xTraining1, yTraining1)
    yPredict6 = bern.predict(xTest1)

    # Hold-out scores: each one is computed once, then printed and stored.
    fpr, tpr, thresholds = metrics.roc_curve(yTest,yPredict6)
    error6 = mean_squared_error(yTest, yPredict6)
    RMSE6 = error6**0.5

    r1 = {
        'confu': metrics.confusion_matrix(yTest,yPredict6),
        'accu': metrics.accuracy_score(yTest,yPredict6),
        'auc': metrics.auc(fpr,tpr),
        'recal': metrics.recall_score(yTest,yPredict6),
        'precision': metrics.precision_score(yTest,yPredict6),
        'f1': f1_score(yTest, yPredict6, average='binary'),
    }

    print(r1['accu'])
    print(r1['confu'])
    print("error 6 = ",error6)
    print("RMSE6 = ",RMSE6)
    print("AUC = ", r1['auc'])
    print("recal = ",r1['recal'])
    print("precision = ",r1['precision'])
    print("f1Score = ",r1['f1'])

    writeResults2[i] = r1

    print()

    # crossValidiation draws new random folds on every call, so this part
    # genuinely differs between iterations.
    results = crossValidiation(XX, Y, k = 10,classifier = bern, continious = False)

    print("accuracy = ",results['accuracy'])
    print("AUC = ", results['auc'])
    print("recal = ",results['recall'])
    print("precision = ",results['precision'])
    print("f1 = ",results['f1'])
    print()
    results['len snps'] = len(snpReduc[i])
    writeResults[i] = results

    print()

Category =  low
(4980, 23397)
(4482, 23397)
(498, 23397)
0.74297188755
[[287  96]
 [ 32  83]]
error 6 =  0.25702811245
RMSE6 =  0.50697940042
AUC =  0.73554319446
recal =  0.721739130435
precision =  0.463687150838
f1Score =  0.56462585034

accuracy =  0.760040160643
AUC =  0.739641324157
recal =  0.705066384849
precision =  0.445295334589
f1 =  0.545550735723


Category =  high
(4980, 23397)
(4482, 23397)
(498, 23397)
0.74297188755
[[287  96]
 [ 32  83]]
error 6 =  0.25702811245
RMSE6 =  0.50697940042
AUC =  0.73554319446
recal =  0.721739130435
precision =  0.463687150838
f1Score =  0.56462585034

accuracy =  0.763855421687
AUC =  0.742578393061
recal =  0.706664887637
precision =  0.450343638268
f1 =  0.548902055946


Category =  union
(4980, 23397)
(4482, 23397)
(498, 23397)
0.74297188755
[[287  96]
 [ 32  83]]
error 6 =  0.25702811245
RMSE6 =  0.50697940042
AUC =  0.73554319446
recal =  0.721739130435
precision =  0.463687150838
f1Score =  0.56462585034

accuracy =  0.76124497992
