In [None]:
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import numpy as np
import os.path
import csv
from sklearn.svm import SVC
from sklearn.svm import NuSVC
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn import grid_search
import sklearn.linear_model as linear_model
import sklearn.metrics as metrics
from sklearn import preprocessing
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from random import randint
from sklearn import tree
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import math
import time
from sklearn.naive_bayes import BernoulliNB

from metrics.Correlation import Correlation
from IO.Write import Write
from IO.Read import Read
from metrics.RSquare import RSquare
from DataSet.Dataset import DataSet


def setIdToName(aList):
    """Build both directions of a name <-> position lookup for ``aList``.

    Returns a dict with two entries:
      * ``'nameToId'`` - maps each element to its position in ``aList``
        (for a repeated element the last position wins),
      * ``'idToName'`` - maps each position back to the element stored there.
    """
    idToName = dict(enumerate(aList))
    # Built from idToName in position order so later duplicates overwrite earlier ones.
    nameToId = {name: position for position, name in idToName.items()}

    return {'nameToId': nameToId, 'idToName': idToName}


def setSnpsCode(patients,chromosomes):
    """Call ``snpCode(chromosomes)`` on every value of ``patients``.

    ``patients`` is a dict whose values expose a ``snpCode`` method; the same
    dict object is returned after all calls have been made.
    """
    for patient in patients.values():
        patient.snpCode(chromosomes)

    return patients
    



def tables(sampleX,sampleY,k):
    """Turn ``k`` folds into ``k`` train/test table sets.

    Parameters
    ----------
    sampleX, sampleY : dict
        Fold number (1..k) -> 2-D feature array / 1-D label sequence of that fold.
    k : int
        Number of folds.

    Returns
    -------
    dict
        ``run -> {'trainX', 'trainY', 'testX', 'testY'}`` where the test tables
        are fold ``run`` itself and the train tables stack every other fold
        (in ``sampleX`` key order) into freshly allocated integer arrays.
    """
    samples = {}

    for run in range(1,k+1):

        # Every fold except the held-out one contributes to the training tables.
        trainFolds = [fold for fold in sampleX.keys() if fold != run]
        n = sum(len(sampleX[fold]) for fold in trainFolds)

        dataTrainX = np.zeros((n,len(sampleX[1].T)),dtype = int)
        dataTrainY = np.zeros((n,),dtype = int)

        count = 0

        for fold in trainFolds:
            rows = len(sampleX[fold])
            cols = len(sampleX[fold].T)
            # Whole-fold slice assignment instead of copying one cell at a time
            # in Python; values are cast to the integer table as before.
            dataTrainX[count:count + rows, :cols] = sampleX[fold]
            dataTrainY[count:count + rows] = sampleY[fold][:rows]
            count += rows

        samples[run] = {
            'trainX': dataTrainX,
            'trainY': dataTrainY,
            'testX': sampleX[run],
            'testY': sampleY[run],
        }

    return samples
    
def kSampleData(k,X,Y):
    """Randomly partition the rows of ``X``/``Y`` into ``k`` folds.

    Rows are drawn without replacement using ``randint`` (a row that was
    already taken is simply re-drawn).  Every fold gets ``int(len(X) / k)``
    rows except the last one, which also receives the remainder.  Fold tables
    are integer arrays.

    Returns whatever ``tables`` builds from the folds, i.e. one train/test
    table set per fold.
    """
    total = len(X)
    x = int (total / k)
    # 1 marks a row that has already been assigned to some fold.
    allElements = np.zeros((total,),dtype = int)

    sampleX = {}
    sampleY = {}

    for fold in range(1, k + 1):

        if fold == k:
            # The last fold absorbs the rows left over by the integer division.
            x =  total - ((k-1) * x)

        sampleData = []

        while len(sampleData) < x:

            aRand = randint(0,total-1)

            while allElements[aRand] == 1:
                aRand = randint(0,total-1)

            allElements[aRand] = 1
            sampleData.append(aRand)

        dataX = np.zeros((x,len(X.T)),dtype = int)
        dataY = np.zeros((x,),dtype = int)

        for row, source in enumerate(sampleData):
            # Copy a whole row at once rather than one cell at a time.
            dataX[row,:] = X[source,:]
            dataY[row] = Y[source]

        sampleX[fold] = dataX
        sampleY[fold] = dataY

    return tables(sampleX,sampleY,k)




def createNewTable(snps,X):
    """Return an integer copy of ``X`` restricted to the columns listed in ``snps``.

    Column ``i`` of the result is column ``snps[i]`` of ``X``; the row count is
    unchanged.  The resulting shape is printed before the table is returned.
    """
    # Every column is overwritten below, so no sentinel pre-fill is needed.
    newX = np.zeros((len(X),len(snps)),dtype = int)

    for i in range(len(snps)):
        newX[:,i] = X[:,snps[i]]

    print("new shape = ",newX.shape)

    return newX

def featuresIds(snps):
    """Map each position ``0 .. len(snps) - 1`` to the element of ``snps`` stored there."""
    return {position: snps[position] for position in range(len(snps))}



def crossValidiation(X, Y, k = 1, continious = True, classifier = None,OLS = False,Logistic = False):
    """Run a random ``k``-fold cross validation and average the binary metrics.

    Parameters
    ----------
    X, Y : array
        Feature table and 0/1 labels; split into folds by ``kSampleData``.
    k : int
        Number of folds.
    continious : bool
        When true, each prediction is snapped to 0 or 1, whichever is nearer.
    classifier : estimator
        Object with ``fit``/``predict`` (and ``predict_proba`` when ``Logistic``
        is set).  Required even when ``OLS`` is true.
    OLS : bool
        When true, a statsmodels ``OLS`` model is fitted on each fold instead
        of ``classifier``.
    Logistic : bool
        When true, predictions are replaced by ``P(class 1) >= 0.8``.

    Returns
    -------
    dict or None
        Mean ``accuracy``, ``auc``, ``recall``, ``precision`` and ``f1`` over
        the folds; ``None`` (after printing a message) if no classifier was given.
    """
    # Compare with None explicitly: truth-testing an estimator calls its __len__
    # when it has one (ensembles, pipelines), which can raise before fitting.
    if classifier is None:
        print("wrong!!!!!!! you have to choise a classifier")
        return

    accuracy = {}
    auc = {}
    recall = {}
    precision = {}
    f1Score = {}

    samples = kSampleData(k,X,Y)

    for run in range(1, k + 1):

        trainX = samples[run]['trainX']
        trainY = samples[run]['trainY']

        #trainX,trainY = balancedData(trainX,trainY)

        testX = samples[run]['testX']
        testY = samples[run]['testY']

        if OLS:
            classifier = sm.OLS(trainY,trainX)
            yPredict = classifier.fit().predict(testX)
        else:
            classifier.fit(trainX, trainY)
            yPredict = classifier.predict(testX)

        if continious:
            # Snap real-valued predictions to the nearer label (0 on a tie).
            for i in range(len(yPredict)):
                if (abs(0 - yPredict[i]) - abs(1 - yPredict[i])) <= 1e-10 :
                    yPredict[i] = 0
                else:
                    yPredict[i] = 1

        if Logistic:
            # Only call a sample positive when the model is at least 80% sure.
            probabilities = classifier.predict_proba(testX)

            for i in range(len(probabilities)):
                if probabilities[i][1] >= 0.8:
                    yPredict[i] = 1
                else:
                    yPredict[i] = 0

        accuracy[run] = metrics.accuracy_score(testY,yPredict)
        fpr, tpr, thresholds = metrics.roc_curve(testY,yPredict)
        auc[run] = metrics.auc(fpr,tpr)
        recall[run] = metrics.recall_score(testY,yPredict)
        precision[run] = metrics.precision_score(testY,yPredict)
        f1Score[run] = f1_score(testY, yPredict, average='binary')

    # Fold means, stored in the order the result writers iterate them.
    results = {}
    results['accuracy'] = sum(accuracy.values()) / k
    results['auc'] = sum(auc.values()) / k
    results['recall'] = sum(recall.values()) / k
    results['precision'] = sum(precision.values()) / k
    results['f1'] = sum(f1Score.values()) / k

    return results

def writeResultConf(path,name,results):
    """Write the single-split results of one model to ``<name>Confu.txt``.

    A new folder ``<path><name>Confu ( dd-mm-YYYY ) \\`` is created for the
    file; if it already exists, ``_1``, ``_2``, ... is appended until an unused
    name is found.  The file holds ``name`` on the first line, then one
    ``Category = <key>`` section per entry of ``results`` listing every
    ``metric = value`` pair, each section followed by two blank lines.

    NOTE: the path separator is a hard-coded backslash, so the layout is only
    a real folder hierarchy on Windows.
    """
    base = path + name + 'Confu' + " ( " + time.strftime("%d-%m-%Y") + " ) "
    folder = base + '\\'
    #file = path + name + 'Confu' + '.txt'

    attempt = 1
    while os.path.exists(folder):
        folder = base + '_' + str(attempt) + '\\'
        attempt += 1

    os.makedirs(folder)

    file = folder + name + 'Confu' + '.txt'

    # The context manager closes the file even if a value fails to format.
    with open(file,'w') as write:

        write.write(name + '\n')

        for category in results:

            write.write("Category = " + category + '\n')

            for key in results[category]:
                write.write(key + " = " + str(results[category][key]) + '\n')

            write.write('\n')
            write.write('\n')

    

def writeResult(path,name,results):
    """Write the cross-validation results of one model to ``<name>.txt``.

    A new folder ``<path><name> ( dd-mm-YYYY ) \\`` is created for the file; if
    it already exists, ``_1``, ``_2``, ... is appended until an unused name is
    found.  The file holds ``name`` on the first line, then one
    ``Category = <key>`` section per entry of ``results`` listing every
    ``metric = value`` pair, each section followed by two blank lines.

    NOTE: the path separator is a hard-coded backslash, so the layout is only
    a real folder hierarchy on Windows.
    """
    base = path + name  + " ( " + time.strftime("%d-%m-%Y") + " ) "
    folder = base + '\\'
    #file = path + name  + '.txt'

    attempt = 1
    while os.path.exists(folder):
        folder = base + '_' + str(attempt) + '\\'
        attempt += 1

    os.makedirs(folder)

    file = folder + name + '.txt'

    # The context manager closes the file even if a value fails to format.
    with open(file,'w') as write:

        write.write(name + '\n')

        for category in results:

            write.write("Category = " + category + '\n')

            for key in results[category]:
                write.write(key + " = " + str(results[category][key]) + '\n')

            write.write('\n')
            write.write('\n')

def balancedData(X,Y):
    """Down-sample the controls so that they do not outnumber the cases.

    Keeps every row labelled 1 and the first ``#cases`` rows labelled 0, in
    their original order; rows with any other label are dropped.  Both outputs
    are float arrays and their shapes are printed.

    When there are fewer controls than cases the result simply contains all of
    them - it is no longer padded with all-zero rows labelled 0.

    Returns
    -------
    (Xbalanced, Ybalanced)
    """
    labels = np.asarray(Y)

    caseRows = np.flatnonzero(labels == 1)
    # At most as many controls as there are cases, taken from the top.
    controlRows = np.flatnonzero(labels == 0)[:len(caseRows)]

    # Sorting restores the original interleaving of cases and controls.
    keep = np.sort(np.concatenate((caseRows, controlRows)))

    Xbalanced = np.zeros((len(keep),len(X.T)))
    Ybalanced = np.zeros(len(keep))
    Xbalanced[:,:] = X[keep,:]
    Ybalanced[:] = labels[keep]

    print("len x = ", Xbalanced.shape)
    print("len y = ", Ybalanced.shape)

    return Xbalanced, Ybalanced
            

In [None]:

# Working folder of the run. The folder names show one data set per p-value
# cut-off; switch cut-off by (un)commenting the matching line.
path = 'C:\\Users\\ANTONIS\\Desktop\\newSet\\assoc\\pvalue = 0.0001\\'
#path = 'C:\\Users\\ANTONIS\\Desktop\\newSet\\assoc\\pvalue = 5e-08\\'
#path = 'C:\\Users\\ANTONIS\\Desktop\\newSet\\assoc\\pvalue = 1e-05\\'
#path = 'C:\\Users\\ANTONIS\\Desktop\\newSet\\assoc\\pvalue = 0.001\\'
#path = 'C:\\Users\\ANTONIS\\Desktop\\newSet\\assoc\\pvalue = 0.01\\'

numberOfChromosomes = 22# number of chromosomes
# Empty containers; nothing in this cell fills them.
patientsTrain = {}
patientsTest = {}
allPatients = {}

chromosomes = {}

# Project I/O helpers bound to the working folder (their file formats are
# defined outside this file).
read = Read(path,numberOfChromosomes)
write = Write(path,numberOfChromosomes)

# Load the patients and the SNPs, then write both lists back out through the
# Write helper.
patients = read.readPatients('Patients.txt')
chromosomes = read.readSnps(".assoc")
write.writePatientsList(patients,'patient.txt')
write.writeSnpsList(chromosomes)


# run train_lgen bat and test_leg bat

In [None]:

snps = read.getListOfSnps()

# Two-way name <-> index lookups for the patients and for the SNPs.
ids = {
    'patients': setIdToName(list(patients.keys())),
    'snps': setIdToName(snps),
}

# The SNP log is only written when no snpCode.txt exists yet; the coded
# matrix is read afterwards in either case.
if os.path.exists(path + 'snpCode.txt'):
    print("mphka")
else:
    write.writeSnpLog(read.getNumberOfPatients(),read.getNumberOfSnps(),chromosomes)

X, Y = read.readSnpsCode(patients,ids)

X.shape

In [None]:
# Disabled exploratory code, kept as a bare string literal so it never runs:
# it recodes label 0 to -1 in Y and then counts how many labels are 0, 1, -1
# or anything else.
'''
for i in range(len(Y)):
    if Y[i] == 0 :
        Y[i] = -1

sum0 = 0
sum1 = 0
sum2 = 0
sumelse = 0
for i in range(len(Y)):
    if Y[i] == 0 :
        sum0 += 1
    elif Y[i] == 1:
        sum1 +=1
    elif Y[i] == -1:
        sum2 +=1
    else:
        sumelse +=1
print("sum0 = ", sum0)
print("sum1 = ", sum1)
print("sum-1 = ", sum2)
print("sumelse = ", sumelse)'''

# Correlation

In [None]:
# Hold out 10% of the rows for testing. The seed is itself drawn at random,
# so every execution of this cell produces a different split.
splitSeed = randint(0,2017)
xTraining, xTest, yTraining1, yTest = train_test_split(X, Y, test_size=0.1, random_state=splitSeed)

for caption, table in (("mergex = ", X),
                       ("xTrain = ", xTraining),
                       ("xTest = ", xTest),
                       ("yTrain = ", yTraining1),
                       ("yTest = ", yTest)):
    print(caption, table.shape)

In [None]:
# Correlation helper (project class) built over the full matrix X.
# The commented lines are the alternatives that were tried: the RSquare
# helper, and building either helper from the training split only.
cor = Correlation(X)
#cor = RSquare(X)

#cor = Correlation(xTraining)
#cor = RSquare(xTraining)

In [None]:
# Selection name -> list of SNP column indices; filled by the cells below.
snpReduc = dict()

# Correlation high

In [None]:
# SNPs reported by the correlation helper for the 0.7 threshold ("high" side;
# presumably column indices of X - confirm against Correlation), stored and
# written out under the label 'high'.
highSnps = cor.getHighCorrelationSnps(0.7)
snpReduc['high'] = highSnps
write.writeSnpsUsed(highSnps,ids['snps']['idToName'],chromosomes,'high')

# Correlation Low

In [None]:
snpsRed = []
# SNPs reported by the correlation helper for the 0.7 threshold ("low" side),
# stored and written out under the label 'low'.
lowSnps = cor.getLowCorrelationSnps(0.7)
snpReduc['low'] = lowSnps
write.writeSnpsUsed(lowSnps,ids['snps']['idToName'],chromosomes,'low')

# Sepearte cases and Controls

In [None]:
# Row numbers of the training split by label: 0 -> control, 1 -> case.
controls = [row for row in range(len(yTraining1)) if yTraining1[row] == 0]
cases = [row for row in range(len(yTraining1)) if yTraining1[row] == 1]
yCo = []
yCa = []

# Training-row number <-> position inside the per-group table, both directions.
idsCos = {row: position for position, row in enumerate(controls)}
cosIds = dict(enumerate(controls))
idsCa = {row: position for position, row in enumerate(cases)}
caIds = dict(enumerate(cases))

# Float tables holding the feature rows of each group.
control = np.zeros((len(controls),len(xTraining.T)))
case = np.zeros((len(cases),len(xTraining.T)))

for row, position in idsCos.items():
    control[position,:] = xTraining[row,:]

for row, position in idsCa.items():
    case[position,:] = xTraining[row,:]

print("cases = ",case.shape)
print("controls = ",control.shape)


In [None]:
# One correlation helper per group: r21 over the controls, r22 over the cases.
# (RSquare(control) / RSquare(case) were the alternatives tried here.)
r21, r22 = Correlation(control), Correlation(case)


In [None]:
# Low/high SNP lists per group at the same 0.7 threshold used for the full
# matrix: *Co come from the controls helper, *Ca from the cases helper.
lowCo, highCo = r21.getLowCorrelationSnps(0.7), r21.getHighCorrelationSnps(0.7)
lowCa, highCa = r22.getLowCorrelationSnps(0.7), r22.getHighCorrelationSnps(0.7)


# low &

In [None]:
# SNPs that are "low" in both groups. `tomi` (the intersection) is read again
# by the cells that follow.
tomi = list(set(lowCo) & set(lowCa))
snpsRed = tomi
print("len snpsRed = ",len(set(snpsRed)))
snpReduc['low&'] = snpsRed
write.writeSnpsUsed(snpsRed,ids['snps']['idToName'],chromosomes,'low&')

# low U 

In [None]:
snpsRed = []
allLows = []

# Set copies give O(1) membership tests; the lists themselves keep their order.
inTomi = set(tomi)

# Control-side SNPs that are not in the intersection ...
for i in lowCo:
    if i not in inTomi:
        snpsRed.append(i)

# ... followed by case-side SNPs that are neither in the intersection nor
# already collected.
alreadyAdded = set(snpsRed)
for i in lowCa:
    if i not in inTomi and i not in alreadyAdded:
        snpsRed.append(i)
        alreadyAdded.add(i)

# NOTE(review): although labelled 'lowU', this list leaves out every SNP of
# the intersection `tomi` - confirm that this is intended.
print("len snpsRed = ",len(snpsRed))
union = snpsRed
snpReduc['lowU'] = snpsRed
write.writeSnpsUsed(snpReduc['lowU'],ids['snps']['idToName'],chromosomes,'lowU')

# low - (( U ) + (&))

In [None]:

# Every low SNP of either group, controls first (duplicates kept).
alll = list(lowCo) + list(lowCa)

# One set for the exclusion test instead of scanning two lists per element.
excluded = set(union) | set(tomi)
snpsRed = [i for i in alll if i not in excluded]

# NOTE(review): if `union` and `tomi` still hold the values produced by the
# 'low&' and 'lowU' cells, every element of `alll` is in one of them and this
# selection is empty - confirm that this is intended.
print("len snsRed = ", len(snpsRed))
snpReduc['lowEktosUnionTomi'] = snpsRed
write.writeSnpsUsed(snpReduc['lowEktosUnionTomi'],ids['snps']['idToName'],chromosomes,'lowEktosUnionTomi')

# high&

In [None]:
# SNPs that are "high" in both groups.
snpsRed = list(set(highCa) & set(highCo))
print("len snpsRed = ",len(set(snpsRed)))
snpReduc['high&'] = snpsRed
write.writeSnpsUsed(snpsRed,ids['snps']['idToName'],chromosomes,'high&')

# highU

In [None]:
snpsRed = []
tomi = list(set(highCo) & set(highCa))

# Set copies give O(1) membership tests; the lists themselves keep their order.
inTomi = set(tomi)

# Control-side SNPs that are not in the intersection ...
for i in highCo:
    if i not in inTomi:
        snpsRed.append(i)

# ... followed by case-side SNPs that are neither in the intersection nor
# already collected.
alreadyAdded = set(snpsRed)
for i in highCa:
    if i not in inTomi and i not in alreadyAdded:
        snpsRed.append(i)
        alreadyAdded.add(i)

# NOTE(review): although labelled 'highU', this list leaves out every SNP of
# the intersection `tomi` - confirm that this is intended.
print("len snpsRed = ",len(snpsRed))
print("set len snpsRed = ",len(set(snpsRed)))
union = snpsRed
snpReduc['highU'] = snpsRed
write.writeSnpsUsed(snpReduc['highU'],ids['snps']['idToName'],chromosomes,'highU')

In [None]:
# Number of distinct SNPs that are in 'highU' but not in 'lowU'.
len(set(snpReduc['highU']).difference(snpReduc['lowU']))

# high - ((U) + (&)) 

In [None]:
# NOTE(review): `tomi` is rebound to whatever `snpsRed` currently holds. Right
# after the 'highU' cell that is the same list as `union`, not the
# intersection, so the test below only excludes `union`. The 'low' counterpart
# of this cell keeps the intersection in `tomi` - confirm which one is meant.
tomi = snpsRed

# Every high SNP of either group, controls first (duplicates kept).
alll = list(highCo) + list(highCa)

# One set for the exclusion test instead of scanning two lists per element.
excluded = set(union) | set(tomi)
snpsRed = [i for i in alll if i not in excluded]

print("len snsRed = ", len(snpsRed))
snpReduc['highEktosUnionTomi'] = snpsRed
write.writeSnpsUsed(snpReduc['highEktosUnionTomi'],ids['snps']['idToName'],chromosomes,'highEktosUnionTomi')

In [None]:
# SNP selections evaluated by the model cells. Other sets that were used:
#   ['low','high','low&','lowU','high&','highU']
#   ['low','high']
categories = ['low','lowReduced']

# One (initially empty) result dict per category: cross-validation results
# and single-split results respectively.
writeResults = {category: {} for category in categories}
writeResults2 = {category: {} for category in categories}

In [None]:
# Position -> SNP lookup for the 'low' selection.
featuresIdss = {'low': featuresIds(snpReduc['low'])}


# SVM KERNEL

In [None]:
# Linear-kernel SVM: one single-split evaluation and one 10-fold cross
# validation per SNP category; both result sets are written to disk at the end.
for i in categories:

    print("Category = ",i)

    # Keep only this category's SNP columns in the full matrix and both splits.
    XX = createNewTable(snpReduc[i],X)
    xTraining1 = createNewTable(snpReduc[i],xTraining)
    xTest1 = createNewTable(snpReduc[i],xTest)

    # One-hot encode each table. fit_transform() fits the encoder itself, so
    # no separate fit() call is needed before it.
    # NOTE(review): every table is encoded on its own; the column layouts
    # presumably agree only because n_values is fixed at 3 - confirm.
    enc = OneHotEncoder(n_values =3)
    XX = enc.fit_transform(XX).toarray()
    print(XX.shape)

    xTraining1 = enc.fit_transform(xTraining1).toarray()
    print(xTraining1.shape)

    xTest1 = enc.fit_transform(xTest1).toarray()
    print(xTest1.shape)

    clf = SVC(kernel='linear')
    clf.fit(xTraining1, yTraining1)
    yPredict2 = clf.predict(xTest1)

    # Compute each metric once, then print and store the same value.
    fpr, tpr, thresholds = metrics.roc_curve(yTest,yPredict2)
    error2 = mean_squared_error(yTest, yPredict2)
    RMSE2 = error2**0.5
    r1 = {
        'confu': metrics.confusion_matrix(yTest,yPredict2),
        'accu': metrics.accuracy_score(yTest,yPredict2),
        'auc': metrics.auc(fpr,tpr),
        'recal': metrics.recall_score(yTest,yPredict2),
        'precision': metrics.precision_score(yTest,yPredict2),
        'f1': f1_score(yTest, yPredict2, average='binary'),
    }

    print(r1['accu'])
    print(r1['confu'])
    print("error 2 = ",error2)
    print("RMSE2 = ",RMSE2)
    print("AUC = ", r1['auc'])
    print("recal = ",r1['recal'])
    print("precision = ",r1['precision'])
    print("f1Score = ",r1['f1'])

    writeResults2[i] = r1
    print()

    results = crossValidiation(XX, Y, k = 10, classifier = clf, continious = False)
    print("accuracy = ",results['accuracy'])
    print("AUC = ", results['auc'])
    print("recal = ",results['recall'])
    print("precision = ",results['precision'])
    print("f1 = ",results['f1'])
    results['len snps'] = len(snpReduc[i])
    writeResults[i] = results

    print()

writeResult(path,'kernel',writeResults)
writeResultConf(path,'kernel',writeResults2)

In [None]:
#re = cross_val_score(clf, XX, Y, cv=10)
#print(sum(re)/10)

# LINEAR LOGISTIC REGRESSION 

In [None]:

# Logistic regression (LogisticRegressionCV; plain LogisticRegression was the
# alternative tried): one single-split evaluation and one 10-fold cross
# validation per SNP category. The raw tables are used here - the one-hot
# encoding step applied by the other model cells was disabled in this cell.
for i in categories:

    print("Category = ",i)

    # Keep only this category's SNP columns in the full matrix and both splits.
    XX = createNewTable(snpReduc[i],X)

    xTraining1 = createNewTable(snpReduc[i],xTraining)
    xTest1 = createNewTable(snpReduc[i],xTest)

    lr_clf = linear_model.LogisticRegressionCV()
    lr_clf.fit(xTraining1, yTraining1)
    yPredict4 = lr_clf.predict(xTest1)

    # Compute each metric once, then print and store the same value.
    fpr, tpr, thresholds = metrics.roc_curve(yTest,yPredict4)
    error4 = mean_squared_error(yTest, yPredict4)
    RMSE4 = error4**0.5
    r1 = {
        'confu': metrics.confusion_matrix(yTest,yPredict4),
        'accu': metrics.accuracy_score(yTest,yPredict4),
        'auc': metrics.auc(fpr,tpr),
        'recal': metrics.recall_score(yTest,yPredict4),
        'precision': metrics.precision_score(yTest,yPredict4),
        'f1': f1_score(yTest, yPredict4, average='binary'),
    }

    print(r1['accu'])
    print(r1['confu'])
    print("error 4 = ",error4)
    print("RMSE4 = ",RMSE4)

    print("AUC = ", r1['auc'])
    print("recal = ",r1['recal'])
    print("precision = ",r1['precision'])
    print("f1Score = ",r1['f1'])

    writeResults2[i] = r1

    print()

    results = crossValidiation(XX, Y, k = 10, classifier = lr_clf, continious = False)
    print("accuracy = ",results['accuracy'])
    print("AUC = ", results['auc'])
    print("recal = ",results['recall'])
    print("precision = ",results['precision'])
    print("f1 = ",results['f1'])
    results['len snps'] = len(snpReduc[i])
    writeResults[i] = results

    print()


writeResult(path,'llr',writeResults)
writeResultConf(path,'llr',writeResults2)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes: one single-split evaluation and one 10-fold cross
# validation per SNP category; both result sets are written to disk at the end.
for i in categories:

    print("Category = ",i)

    # Keep only this category's SNP columns in the full matrix and both splits.
    XX = createNewTable(snpReduc[i],X)
    xTraining1 = createNewTable(snpReduc[i],xTraining)
    xTest1 = createNewTable(snpReduc[i],xTest)

    # One-hot encode each table. fit_transform() fits the encoder itself, so
    # no separate fit() call is needed before it.
    # NOTE(review): every table is encoded on its own; the column layouts
    # presumably agree only because n_values is fixed at 3 - confirm.
    enc = OneHotEncoder(n_values =3)
    XX = enc.fit_transform(XX).toarray()
    print(XX.shape)

    xTraining1 = enc.fit_transform(xTraining1).toarray()
    print(xTraining1.shape)

    xTest1 = enc.fit_transform(xTest1).toarray()
    print(xTest1.shape)

    gnb = GaussianNB()
    gnb.fit(xTraining1, yTraining1)
    yPredict6 = gnb.predict(xTest1)

    # Compute each metric once, then print and store the same value.
    fpr, tpr, thresholds = metrics.roc_curve(yTest,yPredict6)
    error6 = mean_squared_error(yTest, yPredict6)
    RMSE6 = error6**0.5
    r1 = {
        'confu': metrics.confusion_matrix(yTest,yPredict6),
        'accu': metrics.accuracy_score(yTest,yPredict6),
        'auc': metrics.auc(fpr,tpr),
        'recal': metrics.recall_score(yTest,yPredict6),
        'precision': metrics.precision_score(yTest,yPredict6),
        'f1': f1_score(yTest, yPredict6, average='binary'),
    }

    print(r1['accu'])
    print(r1['confu'])
    print("error 6 = ",error6)
    print("RMSE6 = ",RMSE6)

    print("AUC = ", r1['auc'])
    print("recal = ",r1['recal'])
    print("precision = ",r1['precision'])
    print("f1Score = ",r1['f1'])

    writeResults2[i] = r1

    print()

    results = crossValidiation(XX, Y, k = 10,classifier = gnb, continious = False)

    print("accuracy = ",results['accuracy'])
    print("AUC = ", results['auc'])
    print("recal = ",results['recall'])
    print("precision = ",results['precision'])
    print("f1 = ",results['f1'])
    print()
    results['len snps'] = len(snpReduc[i])
    writeResults[i] = results

    print()


writeResult(path,'gnb',writeResults)
writeResultConf(path,'gnb',writeResults2)

# BERNOULLI


In [None]:

# --- Build the 'lowReduced' selection -------------------------------------
# Fit a logistic regression on the 'low' columns of the training split and
# drop the 50 SNPs with the largest (signed) coefficients.
xTraining1 = createNewTable(snpReduc['low'],xTraining)
lr_clf = linear_model.LogisticRegressionCV()
lr_clf.fit(xTraining1, yTraining1)

coefs=lr_clf.coef_[0]
# Positions of the 50 largest coefficients (the name says 20, the count is 50).
top_20 = np.argpartition(coefs, -50)[-50:]
#top_20 = np.argpartition(coefs, 50)[:50]

# Set copy so each position is tested in O(1) instead of scanning the array.
topPositions = set(top_20.tolist())
# Despite its name this list holds the SNPs that are KEPT.
removedSnps = [snpReduc['low'][position]
               for position in range(len(snpReduc['low']))
               if position not in topPositions]

snpReduc['lowReduced'] = removedSnps

# --- Bernoulli naive Bayes per category -----------------------------------
for i in categories:

    print("Category = ",i)
    print("snpReduc = ", len(snpReduc[i]))

    # Keep only this category's SNP columns in the full matrix and both splits.
    XX = createNewTable(snpReduc[i],X)
    xTraining1 = createNewTable(snpReduc[i],xTraining)
    xTest1 = createNewTable(snpReduc[i],xTest)

    # One-hot encode each table. fit_transform() fits the encoder itself, so
    # no separate fit() call is needed before it.
    # NOTE(review): every table is encoded on its own; the column layouts
    # presumably agree only because n_values is fixed at 3 - confirm.
    enc = OneHotEncoder(n_values =3)
    XX = enc.fit_transform(XX).toarray()
    print(XX.shape)

    xTraining1 = enc.fit_transform(xTraining1).toarray()
    print(xTraining1.shape)

    xTest1 = enc.fit_transform(xTest1).toarray()
    print(xTest1.shape)

    bern = BernoulliNB()
    bern.fit(xTraining1, yTraining1)
    yPredict6 = bern.predict(xTest1)

    # Compute each metric once, then print and store the same value.
    fpr, tpr, thresholds = metrics.roc_curve(yTest,yPredict6)
    error6 = mean_squared_error(yTest, yPredict6)
    RMSE6 = error6**0.5
    r1 = {
        'confu': metrics.confusion_matrix(yTest,yPredict6),
        'accu': metrics.accuracy_score(yTest,yPredict6),
        'auc': metrics.auc(fpr,tpr),
        'recal': metrics.recall_score(yTest,yPredict6),
        'precision': metrics.precision_score(yTest,yPredict6),
        'f1': f1_score(yTest, yPredict6, average='binary'),
    }

    print(r1['accu'])
    print(r1['confu'])
    print("error 6 = ",error6)
    print("RMSE6 = ",RMSE6)

    print("AUC = ", r1['auc'])
    print("recal = ",r1['recal'])
    print("precision = ",r1['precision'])
    print("f1Score = ",r1['f1'])

    writeResults2[i] = r1

    print()

    results = crossValidiation(XX, Y, k = 10,classifier = bern, continious = False)

    print("accuracy = ",results['accuracy'])
    print("AUC = ", results['auc'])
    print("recal = ",results['recall'])
    print("precision = ",results['precision'])
    print("f1 = ",results['f1'])
    print()
   # results['len snps'] = len(snpReduc[i])
    writeResults[i] = results

    print()


writeResult(path,'bernoulli',writeResults)
writeResultConf(path,'bernoulli',writeResults2)

In [None]:
# Disabled scratch code, kept as a bare string literal so it never runs: it
# tries np.argpartition on a small list and collects the elements whose
# positions were not selected.
'''l= [5,6,7,2,3,4,10,6,7,9,8,5]
top_3 = np.argpartition(l,3)[:3]
print(top_3)
removedSnps=[]
for i in range(len(l)):
    if i not in top_3:
        removedSnps.append(l[i])
print(removedSnps)'''

In [None]:
# Check with scikit-learn's own 10-fold cross_val_score, using whatever model
# `bern` currently holds: first on the 'low' SNP columns, then on all columns.
# Both tables are used raw; the one-hot encoding step is disabled here.
XX = createNewTable(snpReduc['low'],X)
print(XX.shape)

re = cross_val_score(bern, XX, Y, cv=10)
meanScore = sum(re)/10
print(meanScore)

print()

XX = X
print(XX.shape)

re = cross_val_score(bern, XX, Y, cv=10)
meanScore = sum(re)/10
print(meanScore)


# TREE

In [None]:
from sklearn import tree

# Decision tree: one single-split evaluation and one 10-fold cross validation
# per SNP category, on the raw (not one-hot encoded) tables. No tree is fitted
# on the full data up front any more: that model was replaced inside the loop
# before it was ever used.
for i in categories:

    print("Category = ",i)

    # Keep only this category's SNP columns in the full matrix and both splits.
    XX = createNewTable(snpReduc[i],X)
    xTraining1 = createNewTable(snpReduc[i],xTraining)
    xTest1 = createNewTable(snpReduc[i],xTest)

    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(xTraining1, yTraining1)
    yPredict4 = clf.predict(xTest1)

    # Compute each metric once, then print and store the same value.
    fpr, tpr, thresholds = metrics.roc_curve(yTest,yPredict4)
    error4 = mean_squared_error(yTest, yPredict4)
    RMSE4 = error4**0.5
    r1 = {
        'confu': metrics.confusion_matrix(yTest,yPredict4),
        'accu': metrics.accuracy_score(yTest,yPredict4),
        'auc': metrics.auc(fpr,tpr),
        'recal': metrics.recall_score(yTest,yPredict4),
        'precision': metrics.precision_score(yTest,yPredict4),
        'f1': f1_score(yTest, yPredict4, average='binary'),
    }

    print(r1['accu'])
    print(r1['confu'])
    print("error 4 = ",error4)
    print("RMSE4 = ",RMSE4)

    print("AUC = ", r1['auc'])
    print("recal = ",r1['recal'])
    print("precision = ",r1['precision'])
    print("f1Score = ",r1['f1'])

    writeResults2[i] = r1

    print()

    results = crossValidiation(XX, Y, k = 10, classifier = clf, continious = False)
    print("accuracy = ",results['accuracy'])
    print("AUC = ", results['auc'])
    print("recal = ",results['recall'])
    print("precision = ",results['precision'])
    print("f1 = ",results['f1'])
    results['len snps'] = len(snpReduc[i])
    writeResults[i] = results

    print()


# Writing the tree results to disk is switched off.
#writeResult(path,'tree',writeResults)
#writeResultConf(path,'tree',writeResults2)

# ALL


In [None]:

# Bernoulli naive Bayes on ALL SNP columns (no selection): one single-split
# evaluation plus a 10-fold cross validation, written out as 'bernoulli_all'.
writeResults2={}
writeResults={}

# One-hot encode each table. fit_transform() fits the encoder itself, so no
# separate fit() call is needed before it.
# NOTE(review): every table is encoded on its own; the column layouts
# presumably agree only because n_values is fixed at 3 - confirm.
enc = OneHotEncoder(n_values =3)
XX = enc.fit_transform(X).toarray()
print(XX.shape)

xTraining1 = enc.fit_transform(xTraining).toarray()
print(xTraining1.shape)

xTest1 = enc.fit_transform(xTest).toarray()
print(xTest1.shape)

bern = BernoulliNB()
bern.fit(xTraining1, yTraining1)
yPredict6 = bern.predict(xTest1)

# Compute each metric once, then print and store the same value.
fpr, tpr, thresholds = metrics.roc_curve(yTest,yPredict6)
error6 = mean_squared_error(yTest, yPredict6)
RMSE6 = error6**0.5
r1 = {
    'confu': metrics.confusion_matrix(yTest,yPredict6),
    'accu': metrics.accuracy_score(yTest,yPredict6),
    'auc': metrics.auc(fpr,tpr),
    'recal': metrics.recall_score(yTest,yPredict6),
    'precision': metrics.precision_score(yTest,yPredict6),
    'f1': f1_score(yTest, yPredict6, average='binary'),
}

print(r1['accu'])
print(r1['confu'])
print("error 6 = ",error6)
print("RMSE6 = ",RMSE6)

print("AUC = ", r1['auc'])
print("recal = ",r1['recal'])
print("precision = ",r1['precision'])
print("f1Score = ",r1['f1'])

writeResults2['all'] = r1

print()

results = crossValidiation(XX, Y, k = 10,classifier = bern, continious = False)

print("accuracy = ",results['accuracy'])
print("AUC = ", results['auc'])
print("recal = ",results['recall'])
print("precision = ",results['precision'])
print("f1 = ",results['f1'])

# NOTE(review): XX is the one-hot encoded table, so this records the number of
# encoded columns rather than the number of SNPs - confirm this is wanted.
results['len snps'] = len(XX.T)
writeResults['all'] = results

print()


writeResult(path,'bernoulli_all',writeResults)
writeResultConf(path,'bernoulli_all',writeResults2)


In [None]:
# Linear-kernel SVM on ALL SNP columns (no selection): one single-split
# evaluation plus a 10-fold cross validation, written out as 'svm_all'.
writeResults2={}
writeResults={}

# One-hot encode each table. fit_transform() fits the encoder itself, so no
# separate fit() call is needed before it.
# NOTE(review): every table is encoded on its own; the column layouts
# presumably agree only because n_values is fixed at 3 - confirm.
enc = OneHotEncoder(n_values =3)
XX = enc.fit_transform(X).toarray()
print(XX.shape)

xTraining1 = enc.fit_transform(xTraining).toarray()
print(xTraining1.shape)

xTest1 = enc.fit_transform(xTest).toarray()
print(xTest1.shape)

# The model keeps the notebook-wide name `bern` although it is an SVC here.
bern = SVC(kernel='linear')
bern.fit(xTraining1, yTraining1)
yPredict6 = bern.predict(xTest1)

# Compute each metric once, then print and store the same value.
fpr, tpr, thresholds = metrics.roc_curve(yTest,yPredict6)
error6 = mean_squared_error(yTest, yPredict6)
RMSE6 = error6**0.5
r1 = {
    'confu': metrics.confusion_matrix(yTest,yPredict6),
    'accu': metrics.accuracy_score(yTest,yPredict6),
    'auc': metrics.auc(fpr,tpr),
    'recal': metrics.recall_score(yTest,yPredict6),
    'precision': metrics.precision_score(yTest,yPredict6),
    'f1': f1_score(yTest, yPredict6, average='binary'),
}

print(r1['accu'])
print(r1['confu'])
print("error 6 = ",error6)
print("RMSE6 = ",RMSE6)

print("AUC = ", r1['auc'])
print("recal = ",r1['recal'])
print("precision = ",r1['precision'])
print("f1Score = ",r1['f1'])

writeResults2['all'] = r1

print()

results = crossValidiation(XX, Y, k = 10,classifier = bern, continious = False)

print("accuracy = ",results['accuracy'])
print("AUC = ", results['auc'])
print("recal = ",results['recall'])
print("precision = ",results['precision'])
print("f1 = ",results['f1'])

# NOTE(review): XX is the one-hot encoded table, so this records the number of
# encoded columns rather than the number of SNPs - confirm this is wanted.
results['len snps'] = len(XX.T)
writeResults['all'] = results

print()


writeResult(path,'svm_all',writeResults)
writeResultConf(path,'svm_all',writeResults2)

In [None]:
# Logistic regression on ALL SNP columns (no selection): one single-split
# evaluation plus a 10-fold cross validation, written out as 'llr_all'.
writeResults2={}
writeResults={}

# One-hot encode each table. fit_transform() fits the encoder itself, so no
# separate fit() call is needed before it.
# NOTE(review): every table is encoded on its own; the column layouts
# presumably agree only because n_values is fixed at 3 - confirm.
enc = OneHotEncoder(n_values =3)
XX = enc.fit_transform(X).toarray()
print(XX.shape)

xTraining1 = enc.fit_transform(xTraining).toarray()
print(xTraining1.shape)

xTest1 = enc.fit_transform(xTest).toarray()
print(xTest1.shape)

# The model keeps the notebook-wide name `bern` although it is a
# LogisticRegression here.
bern = linear_model.LogisticRegression()
bern.fit(xTraining1, yTraining1)
yPredict6 = bern.predict(xTest1)

# Compute each metric once, then print and store the same value.
fpr, tpr, thresholds = metrics.roc_curve(yTest,yPredict6)
error6 = mean_squared_error(yTest, yPredict6)
RMSE6 = error6**0.5
r1 = {
    'confu': metrics.confusion_matrix(yTest,yPredict6),
    'accu': metrics.accuracy_score(yTest,yPredict6),
    'auc': metrics.auc(fpr,tpr),
    'recal': metrics.recall_score(yTest,yPredict6),
    'precision': metrics.precision_score(yTest,yPredict6),
    'f1': f1_score(yTest, yPredict6, average='binary'),
}

print(r1['accu'])
print(r1['confu'])
print("error 6 = ",error6)
print("RMSE6 = ",RMSE6)

print("AUC = ", r1['auc'])
print("recal = ",r1['recal'])
print("precision = ",r1['precision'])
print("f1Score = ",r1['f1'])

writeResults2['all'] = r1

print()

results = crossValidiation(XX, Y, k = 10,classifier = bern, continious = False)

print("accuracy = ",results['accuracy'])
print("AUC = ", results['auc'])
print("recal = ",results['recall'])
print("precision = ",results['precision'])
print("f1 = ",results['f1'])

# NOTE(review): XX is the one-hot encoded table, so this records the number of
# encoded columns rather than the number of SNPs - confirm this is wanted.
results['len snps'] = len(XX.T)
writeResults['all'] = results

print()


writeResult(path,'llr_all',writeResults)
writeResultConf(path,'llr_all',writeResults2)

In [None]:
# Decision tree on ALL SNP columns (no selection): one single-split evaluation
# plus a 10-fold cross validation, written out as 'tree_all'.
writeResults2={}
writeResults={}

# The one-hot encoding step is disabled for the tree, so bind the raw tables
# explicitly. Otherwise the cell would silently reuse whatever XX / xTraining1
# / xTest1 an earlier cell left behind.
XX = X
xTraining1 = xTraining
xTest1 = xTest

# The model keeps the notebook-wide name `bern` although it is a decision
# tree here.
bern = tree.DecisionTreeClassifier()
bern.fit(xTraining1, yTraining1)
yPredict6 = bern.predict(xTest1)

# Compute each metric once, then print and store the same value.
fpr, tpr, thresholds = metrics.roc_curve(yTest,yPredict6)
error6 = mean_squared_error(yTest, yPredict6)
RMSE6 = error6**0.5
r1 = {
    'confu': metrics.confusion_matrix(yTest,yPredict6),
    'accu': metrics.accuracy_score(yTest,yPredict6),
    'auc': metrics.auc(fpr,tpr),
    'recal': metrics.recall_score(yTest,yPredict6),
    'precision': metrics.precision_score(yTest,yPredict6),
    'f1': f1_score(yTest, yPredict6, average='binary'),
}

print(r1['accu'])
print(r1['confu'])
print("error 6 = ",error6)
print("RMSE6 = ",RMSE6)

print("AUC = ", r1['auc'])
print("recal = ",r1['recal'])
print("precision = ",r1['precision'])
print("f1Score = ",r1['f1'])

writeResults2['all'] = r1

print()

results = crossValidiation(XX, Y, k = 10,classifier = bern, continious = False)

print("accuracy = ",results['accuracy'])
print("AUC = ", results['auc'])
print("recal = ",results['recall'])
print("precision = ",results['precision'])
print("f1 = ",results['f1'])


results['len snps'] = len(XX.T)
writeResults['all'] = results

print()


# Written under their own label; the copy-pasted 'llr_all' would have filed the
# tree results as logistic-regression output.
writeResult(path,'tree_all',writeResults)
writeResultConf(path,'tree_all',writeResults2)