In [None]:
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import numpy as np
import os.path
import csv
from sklearn.svm import SVC
from sklearn.svm import NuSVC
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn import grid_search
import sklearn.linear_model as linear_model
import sklearn.metrics as metrics
from sklearn import preprocessing
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from random import randint
from sklearn import tree
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import math
import time
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

from metrics.Correlation import Correlation
from IO.Write import Write
from IO.Read import Read
from metrics.RSquare import RSquare
from DataSet.Dataset import DataSet


def setIdToName(aList):
    """Build two lookup tables between the items of ``aList`` and their positions.

    Parameters
    ----------
    aList : iterable
        Hashable items; each one is numbered by its position, starting at 0.

    Returns
    -------
    dict
        ``{'nameToId': {item: position}, 'idToName': {position: item}}``.
        If an item occurs more than once, ``nameToId`` keeps its last position.
    """
    forward = {}
    backward = {}

    for position, name in enumerate(aList):
        forward[name] = position
        backward[position] = name

    return {'nameToId': forward, 'idToName': backward}

def createNewIds(oldsnps,snps):
    """Renumber a subset of SNPs from 0 while keeping their names.

    Parameters
    ----------
    oldsnps : dict
        Must provide ``oldsnps['snps']['idToName']``, mapping an old id to a name.
    snps : sequence
        Old ids to keep; element ``p`` receives the new id ``p``.

    Returns
    -------
    dict
        ``{'nameToId': {name: new_id}, 'idToName': {new_id: name}}``.
    """
    previousNames = oldsnps['snps']['idToName']
    newIds = {'nameToId': {}, 'idToName': {}}

    for position in range(len(snps)):
        name = previousNames[snps[position]]
        newIds['nameToId'][name] = position
        newIds['idToName'][position] = name

    return newIds

def setSnpsCode(patients,chromosomes):
    """Call ``snpCode(chromosomes)`` on every patient and return the same mapping.

    Parameters
    ----------
    patients : dict
        Values must expose a ``snpCode`` method.
    chromosomes : object
        Passed unchanged to each ``snpCode`` call.

    Returns
    -------
    dict
        The ``patients`` mapping that was passed in (not a copy).
    """
    for patient in patients.values():
        patient.snpCode(chromosomes)

    return patients
    

def tables(sampleX,sampleY,k):
    """Turn ``k`` folds into ``k`` train/test splits (leave-one-fold-out).

    Parameters
    ----------
    sampleX : dict
        Fold number (1..k) -> 2-D feature array of that fold.
    sampleY : dict
        Fold number (1..k) -> 1-D label array of that fold.
    k : int
        Number of folds; runs are numbered 1..k.

    Returns
    -------
    dict
        ``run -> {'trainX', 'trainY', 'testX', 'testY'}``. The test entries are
        the fold arrays themselves; the train entries are new integer arrays
        holding every other fold stacked in ``sampleX`` key order.
    """
    samples = {}

    for run in range(1,k+1):

        otherFolds = [fold for fold in sampleX.keys() if fold != run]
        n = sum(len(sampleX[fold]) for fold in otherFolds)

        # The column count is taken from fold 1, as before.
        dataTrainX = np.zeros((n,len(sampleX[1].T)),dtype = int)
        dataTrainY = np.zeros((n,),dtype = int)

        # Copy each remaining fold as one slice instead of cell by cell.
        start = 0
        for fold in otherFolds:
            stop = start + len(sampleX[fold])
            dataTrainX[start:stop, :] = sampleX[fold]
            dataTrainY[start:stop] = sampleY[fold]
            start = stop

        samples[run] = {
            'trainX': dataTrainX,
            'trainY': dataTrainY,
            'testX': sampleX[run],
            'testY': sampleY[run],
        }

    return samples
    
def kSampleData(k,X,Y):
    """Randomly partition the rows of ``X``/``Y`` into ``k`` folds and build the splits.

    Every row is drawn exactly once (rejection sampling with ``randint``). The
    first ``k - 1`` folds hold ``int(len(X) / k)`` rows each and the last fold
    takes whatever remains. Values are copied into integer arrays.

    Parameters
    ----------
    k : int
        Number of folds.
    X : array
        2-D feature table, indexed as ``X[row, col]``.
    Y : sequence
        Labels, indexed as ``Y[row]``.

    Returns
    -------
    dict
        Whatever ``tables(sampleX, sampleY, k)`` returns for the drawn folds.
    """
    total = len(X)
    foldSize = int (total / k)
    taken = np.zeros((total,),dtype = int)

    sampleX = {}
    sampleY = {}

    for fold in range(1, k + 1):

        # The last fold absorbs the remainder of the integer division.
        if fold == k:
            foldSize = total - ((k-1) * foldSize)

        width = len(X.T)
        dataX = np.zeros((foldSize,width),dtype = int)
        dataY = np.zeros((foldSize,),dtype = int)

        chosen = []
        for _ in range(foldSize):
            candidate = randint(0,total-1)
            # Re-draw until an unused row comes up.
            while taken[candidate] == 1:
                candidate = randint(0,total-1)
            taken[candidate] = 1
            chosen.append(candidate)

        for row, source in enumerate(chosen):
            for col in range(width):
                dataX[row,col] = X[source,col]
            dataY[row] = Y[source]

        sampleX[fold] = dataX
        sampleY[fold] = dataY

    return tables(sampleX,sampleY,k)




def createNewTable(snps,X):
    """Return an integer copy of ``X`` restricted to the columns listed in ``snps``.

    Parameters
    ----------
    snps : sequence of int
        Column indices of ``X`` to keep; output column ``p`` is ``X[:, snps[p]]``.
    X : numpy.ndarray
        2-D table to select from.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(X), len(snps))`` with integer dtype. Its shape is
        also printed.
    """
    # One indexed copy replaces the earlier -1 pre-fill (every column was
    # overwritten afterwards, so the fill never survived) and the column loop.
    columns = np.asarray([snps[i] for i in range(len(snps))], dtype=np.intp)
    newX = X[:, columns].astype(int)

    print("new shape = ",newX.shape)

    return newX



def featuresIds(oldSnps,snps):
    """Map each position in ``snps`` to the value stored at that position.

    Parameters
    ----------
    oldSnps : object
        Not used by the function; kept so existing calls keep working.
    snps : sequence
        Indexed as ``snps[0] .. snps[len(snps) - 1]``.

    Returns
    -------
    dict
        ``{position: snps[position]}``.
    """
    return {position: snps[position] for position in range(len(snps))}



def crossValidiation(X, Y, k = 1, continious = True, classifier = None,OLS = False,Logistic = False):
    """Run ``k``-fold cross-validation and return the mean of five binary metrics.

    The folds come from ``kSampleData(k, X, Y)``. For every run the model is
    fitted on the training part and scored on the held-out part.

    Parameters
    ----------
    X, Y : arrays
        Feature table and labels; passed to ``kSampleData``.
    k : int
        Number of folds.
    continious : bool
        When true, predictions are rounded to labels: a value at least as close
        to 0 as to 1 becomes 0, anything else becomes 1.
    classifier : estimator or None
        Object with ``fit``/``predict`` (and ``predict_proba`` when ``Logistic``
        is set). Required unless ``OLS`` is true.
    OLS : bool
        When true a ``statsmodels`` OLS model is built for every run and used
        in place of ``classifier``.
    Logistic : bool
        When true the labels are re-derived from ``predict_proba``: class 1 is
        assigned only when its probability is at least 0.8.

    Returns
    -------
    dict or None
        Means over the runs under the keys ``'accuracy'``, ``'auc'``,
        ``'recall'``, ``'precision'`` and ``'f1'``; ``None`` (after printing a
        message) when no model was supplied.
    """
    # Compare against None explicitly: truth-testing an estimator goes through
    # __len__ when the object defines one, which is not what is being asked here.
    # The OLS path builds its own model, so it does not need a classifier.
    if classifier is None and not OLS:
        print("wrong!!!!!!! you have to choise a classifier")
        return

    totals = {'accuracy': 0.0, 'auc': 0.0, 'recall': 0.0, 'precision': 0.0, 'f1': 0.0}

    samples = kSampleData(k,X,Y)

    for run in range(1, k + 1):

        trainX = samples[run]['trainX']
        trainY = samples[run]['trainY']

        testX = samples[run]['testX']
        testY = samples[run]['testY']

        if OLS:
            classifier = sm.OLS(trainY,trainX)
            yPredict = classifier.fit().predict(testX)
        else:
            classifier.fit(trainX, trainY)
            yPredict = classifier.predict(testX)

        if continious:
            for i in range(len(yPredict)):
                # distance to 0 minus distance to 1: non-positive means "closer to 0"
                if (abs(0 - yPredict[i]) - abs(1 - yPredict[i])) <= 1e-10 :
                    yPredict[i] = 0
                else:
                    yPredict[i] = 1

        if Logistic:
            probabilities = classifier.predict_proba(testX)
            for i in range(len(probabilities)):
                if probabilities[i][1] >= 0.8:
                    yPredict[i] = 1
                else:
                    yPredict[i] = 0

        fpr, tpr, thresholds = metrics.roc_curve(testY,yPredict)

        totals['accuracy'] += metrics.accuracy_score(testY,yPredict)
        totals['auc'] += metrics.auc(fpr,tpr)
        totals['recall'] += metrics.recall_score(testY,yPredict)
        totals['precision'] += metrics.precision_score(testY,yPredict)
        totals['f1'] += f1_score(testY, yPredict, average='binary')

    return {name: total / k for name, total in totals.items()}

def writeResultConf(path,name,results):
    """Write per-category result tables to a dated ``...Confu.txt`` file.

    The file is created as
    ``<path><name>Confu ( <dd-mm-YYYY> ) _<name>Confu.txt``. If that file
    already exists, ``_1_``, ``_2_``, ... is used in place of ``_`` until a free
    name is found, so earlier results are not overwritten.

    Parameters
    ----------
    path : str
        Prefix prepended to the file name (include the trailing separator).
    name : str
        Label used in the file name and written as the second header line.
    results : dict
        ``category -> {metric_name: value}``; category and metric names must be
        strings because they are concatenated into the output.
    """
    timee = time.strftime("%d-%m-%Y")
    stem = name + 'Confu'

    folder = path + stem + " ( " + timee + " ) " + '_'
    file = folder + stem + '.txt'

    # Probe the file that will actually be written; checking only the prefix
    # never matched anything, so reruns replaced the previous output.
    i=1
    while os.path.exists(file):
        folder = path + stem + " ( " + timee + " ) " + '_' + str(i) + '_'
        file = folder + stem + '.txt'
        i += 1

    # The context manager closes the handle even if a value cannot be written.
    with open(file,'w') as write:

        write.write(timee + '\n'+ '\n')
        write.write(name + '\n')

        for category in results:

            write.write("Category = " + category + '\n')

            for metric in results[category]:
                write.write(metric + " = " + str(results[category][metric]) + '\n')

            write.write('\n')
            write.write('\n')

    

def writeResult(path,name,results,category):
    """Write the results of one category to a dated text file.

    The file is created as ``<path><name> ( <dd-mm-YYYY> ) _<name>.txt``. If
    that file already exists, ``_1_``, ``_2_``, ... is used in place of ``_``
    until a free name is found, so earlier results are not overwritten.

    Parameters
    ----------
    path : str
        Prefix prepended to the file name (include the trailing separator).
    name : str
        Label used in the file name and written as the second header line.
    results : dict
        ``category -> {metric_name: value}``; metric names must be strings.
    category : str
        Key of ``results`` to write.
    """
    timee = time.strftime("%d-%m-%Y")

    folder = path + name  + " ( " + timee + " ) " + '_'
    file = folder + name + '.txt'

    # Probe the file that will actually be written; checking only the prefix
    # never matched anything, so reruns replaced the previous output.
    i=1
    while os.path.exists(file):
        folder = path + name + " ( " + timee + " ) " + '_' + str(i) + '_'
        file = folder + name + '.txt'
        i += 1

    # The context manager closes the handle even if the category is missing.
    with open(file,'w') as write:

        write.write(timee + '\n'+ '\n')
        write.write(name + '\n')
        write.write("Category = " + category + '\n')

        for metric in results[category]:
            write.write(metric + " = " + str(results[category][metric]) + '\n')

def balancedData(X,Y):
    """Build a balanced sample: every case plus the same number of random controls.

    Rows whose label equals 1 are cases; all other rows are controls. Controls
    are drawn at random without replacement.

    Parameters
    ----------
    X : numpy.ndarray
        2-D feature table, one row per sample.
    Y : sequence
        Labels aligned with the rows of ``X``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``Xbalanced`` with ``2 * cases`` rows and ``Ybalanced`` with the matching
        labels, both float arrays. Cases come first in their original order,
        followed by the drawn controls. Both shapes are printed.

    Raises
    ------
    ValueError
        If there are fewer controls than cases, so no balanced sample exists.
    """
    caseRows = [i for i in range(len(Y)) if Y[i] == 1]
    controlPool = [i for i in range(len(Y)) if Y[i] != 1]
    cases = len(caseRows)

    if len(controlPool) < cases:
        raise ValueError(
            "balancedData needs at least as many controls as cases "
            "(%d cases, %d controls)" % (cases, len(controlPool))
        )

    Xbalanced = np.zeros((2*cases,len(X.T)))
    Ybalanced = np.zeros(2*cases)
    count = 0

    for i in caseRows:
        Xbalanced[count,:] = X[i,:]
        Ybalanced[count] = Y[i]
        count += 1

    # Draw from the control rows only and remove each pick from the pool. The
    # earlier bookkeeping stored the row index instead of a flag, so cases could
    # be drawn again as "controls" and the same control could be drawn twice.
    for _ in range(cases):
        aRand = controlPool.pop(randint(0,len(controlPool)-1))
        Xbalanced[count,:] = X[aRand,:]
        Ybalanced[count] = Y[aRand]
        count += 1

    print("len x = ", Xbalanced.shape)
    print("len y = ", Ybalanced.shape)

    return Xbalanced, Ybalanced
            

In [None]:

#path = 'C:\\Users\\ANTONIS\\Desktop\\newSet\\maf\\maf = 0.05\\pvalue = 0.001\\'
#path = 'C:\\Users\\ANTONIS\\Desktop\\newSet\\maf\\maf = 0.05\\pvalue = 0.01\\'
#path = 'C:\\Users\\ANTONIS\\Desktop\\newSet\\maf\\maf = 0.05\\pvalue = 0.005\\'
#path = 'C:\\Users\\ANTONIS\\Desktop\\newSet\\maf\\maf = 0.05\\pvalue = 0.05\\'

# Every backslash is now escaped. The original literal contained '\m', which is
# not a valid escape sequence and only produced the intended text because
# Python leaves unknown escapes untouched. The resulting string is unchanged.
path = 'D:\\newSet\\maf\\maf = 0.05\\assoc\\pvalue = 0.001\\'

numberOfChromosomes = 22  # number of chromosomes
patientsTrain = {}
patientsTest = {}
allPatients = {}

chromosomes = {}

# Project I/O helpers, both bound to the same data directory.
read = Read(path,numberOfChromosomes)
write = Write(path,numberOfChromosomes)

# Load the inputs, then write the patient and SNP lists back out.
patients = read.readPatients('Patients.txt')
chromosomes = read.readSnps(".assoc")
write.writePatientsList(patients,'patient.txt')
write.writeSnpsList(chromosomes)


# run train_lgen bat and test_leg bat

In [None]:

snps = read.getListOfSnps()

# Positional id <-> name tables for patients and for SNPs.
ids = {
    'patients': setIdToName(list(patients.keys())),
    'snps': setIdToName(snps),
}

# The SNP log is only written when no snpCode.txt exists yet; the genotype
# table is read in either case.
if os.path.exists(path + 'snpCode.txt'):
    print("mphka")
else:
    write.writeSnpLog(read.getNumberOfPatients(),read.getNumberOfSnps(),chromosomes)

X, Y = read.readSnpsCode(patients,ids)

X.shape

# Correlation

In [None]:
# Replace X and Y with the balanced sample. The inputs are overwritten, so
# running this cell a second time rebalances the already balanced data.
X,Y =balancedData(X,Y)

# Hold out 10% of the rows. random_state is itself drawn at random, so the
# split is different on every execution.
xTraining, xTest, yTraining1, yTest = train_test_split(X, Y, test_size=0.1, random_state=randint(0,2018))
print("mergex = ",X.shape)
print("xTrain = ",xTraining.shape)
print("xTest = ",xTest.shape)
print("yTrain = ",yTraining1.shape)
print("yTest = ",yTest.shape)

In [None]:
# Correlation helper built over the whole (balanced) table X; the alternatives
# that were tried are left commented out.
cor = Correlation(X)
#cor = RSquare(X)

#cor = Correlation(xTraining)
#cor = RSquare(xTraining)

# SNP selections produced by the following cells, keyed by a label.
snpReduc = {}

# Correlation high

In [None]:
# Store the "high correlation" selection and write it out under the label 'high'.
# NOTE(review): 0.7 is presumably the correlation threshold and down/up are
# presumably percentage bounds — confirm against the Correlation class.
snpReduc['high'] = cor.getHighCorrelationSnps(0.7,down=100,up=100)
write.writeSnpsUsed(snpReduc['high'],ids['snps']['idToName'],chromosomes,'high')

# Correlation Low

In [None]:
# Selection stored as 'low100' and written with the label '55%'.
# down, up and threshold are notebook globals: the model cells further down
# read them again when they record their results.
down = 55
up = 100 
threshold = 0.7
#snpReduc['low'] = cor.getLowCorrelationSnps(threshold, down=down,up=up)
snpReduc['low100'] = cor.getLowCorrelationSnps(threshold, down=down,up=up)
write.writeSnpsUsed(snpReduc['low100'],ids['snps']['idToName'],chromosomes,'55%')

In [None]:
# Selection stored as 'low97' and written with the label '50%'.
# This overwrites the globals down/up/threshold set by the previous cell, so
# whichever of the two cells ran last decides the values the model cells record.
down = 50
up = 100 
threshold = 0.7
#snpReduc['low'] = cor.getLowCorrelationSnps(threshold, down=down,up=up)
snpReduc['low97'] = cor.getLowCorrelationSnps(threshold, down=down,up=up)
write.writeSnpsUsed(snpReduc['low97'],ids['snps']['idToName'],chromosomes,'50%')

In [None]:
# Number of SNPs kept by each of the two selections.
print(len(snpReduc['low97']))
print(len(snpReduc['low100']))

In [None]:
# Disabled scratch code for inspecting the correlation matrix. Everything in
# this cell is held in bare string literals, so none of it is executed.
'''print(ids['snps']['nameToId']['rs75570604'])
get = cor.getCorrMatrix()

summ = 0 

for i in range(len(get)):
    if get[6555,i] - 0.7 < 1e-10 :
        summ += 1'''

'''for i in range(len(get)):
    sum1 = 0
    for j in range(len(get.T)):
        if get[i,j] - 0.7 < 1e-10 :
            sum1 += 1
    if sum1 == len(get.T)-1:
        summ += 1
'''

'''for i in range(len(get)):
    sum1 = 0
    for j in range(len(get.T)):
        if get[i,j] - 0.7 > 1e-10 :
            sum1 += 1
    if sum1 > summ:
        summ = sum1'''


'''print(summ)
print(len(get))'''

In [None]:
# Labels of the SNP selections (keys of snpReduc) that the model cells loop over.
categories = []
#categories = ['low','high']
#categories = ['low']
categories = ['low97','low100']
# Result collectors: writeResults2 is filled with hold-out metrics per category
# by the model cells; writeResults is re-created inside each of their loops.
writeResults2={}
writeResults={}

In [None]:
# Disabled code kept as a string literal; nothing here is executed.
'''featuresIdss = {}
featuresIdss['low'] = featuresIds(snpReduc['low'])'''


In [None]:
# Disabled chi-squared feature-selection experiment, kept as a string literal;
# nothing here is executed.
'''from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest



test = SelectKBest(score_func=chi2,k = 1000)
fit1 = test.fit(xTraining, yTraining1)
xTraining1 = fit1.transform(xTraining)

fit2 = test.fit(xTest, yTest)
xTest1 = fit2.transform(xTest)

fit3 = test.fit(X, Y)
XX = fit3.transform(X)

print("xtest", xTest1.shape)
print("xtraining", xTraining1.shape)
print("X", XX.shape)'''

In [None]:
# Count the entries of the correlation matrix that are positive (above 1e-10)
# and not above 0.7 (within a 1e-10 tolerance).
dok = cor.getCorrMatrix()
summ = 0

for i in range(len(dok)):
    
    for j in range(len(dok.T)):
        
        if dok[i,j] > 1e-10 and dok[i,j] - 0.7 <= 1e-10:
            # Show only the first few matches (while the running count is <= 10).
            if summ<=10:
                print(dok[i,j])
            summ += 1
            
            
print(summ)
    

# RFR

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier

# Random forest with grid search: for every SNP selection, tune on the training
# split, score the hold-out split, then cross-validate on the whole table.
for i in categories:
    r1={}
    writeResults={}
    XX = createNewTable(snpReduc[i],X)
    xTraining1 = createNewTable(snpReduc[i],xTraining)
    xTest1 = createNewTable(snpReduc[i],xTest)

    clf = RandomForestClassifier()

    param_grid = {
              "n_estimators" : [9, 18, 27, 36, 45, 54, 63],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"],
               "warm_start":[True,False]}

    # run grid search
    # NOTE(review): this rebinds the name `grid_search`, which the import cell
    # binds to a scikit-learn module; kept because later cells may use it.
    grid_search = GridSearchCV(estimator=clf, param_grid=param_grid,cv = 10,verbose=1)

    grid_search.fit(xTraining1, yTraining1)

    print()

    yPredict4 = grid_search.predict(xTest1)

    # Score the hold-out split once; the same values are stored and printed.
    error4 = mean_squared_error(yTest, yPredict4)
    RMSE4 = error4**0.5
    fpr, tpr, thresholds = metrics.roc_curve(yTest,yPredict4)

    r1['confu'] = metrics.confusion_matrix(yTest,yPredict4)
    r1['accu'] = metrics.accuracy_score(yTest,yPredict4)
    r1['auc'] = metrics.auc(fpr,tpr)
    r1['recal'] = metrics.recall_score(yTest,yPredict4)
    r1['precision'] = metrics.precision_score(yTest,yPredict4)
    r1['f1'] = f1_score(yTest, yPredict4, average='binary')
    writeResults2[i] = r1

    print(r1['accu'])
    print(r1['confu'])
    print("error 4 = ",error4)
    print("RMSE4 = ",RMSE4)
    print("AUC = ", r1['auc'])
    print("recal = ",r1['recal'])
    print("precision = ",r1['precision'])
    print("f1Score = ",r1['f1'])

    print()

    # The grid search object itself is cross-validated, so it is refitted on
    # the training part of every fold.
    results = crossValidiation(XX, Y, k = 10, classifier = grid_search, continious = False)
    # NOTE(review): down/up/threshold are the globals left by whichever
    # correlation cell ran last, so they may not describe category i.
    results['down'] = down
    results['up'] = up
    results['thres'] = threshold
    print("accuracy = ",results['accuracy'])
    print("AUC = ", results['auc'])
    print("recal = ",results['recall'])
    print("precision = ",results['precision'])
    print("f1 = ",results['f1'])
    results['len_snps'] = len(snpReduc[i])
   # results['len snps'] = 500
    writeResults[i] = results

    print()

    #writeResult(path,'rfr'+i,writeResults,i)
   # writeResultConf(path,'rfr'+i,writeResults2)
    

In [None]:
# Fit and score `clf` on the hold-out split.
# NOTE: clf, xTraining1 and xTest1 are not defined in this cell; they are
# whatever the last iteration of a previously executed model loop left behind.
clf.fit(xTraining1, yTraining1)
yPredict4 = clf.predict(xTest1)


print(metrics.accuracy_score(yTest,yPredict4))
print(metrics.confusion_matrix(yTest,yPredict4))
error4 = mean_squared_error(yTest, yPredict4)
print("error 4 = ",error4)
RMSE4 = mean_squared_error(yTest,yPredict4)**0.5
print("RMSE4 = ",RMSE4)

# The ROC curve is computed from the predicted labels, not from scores.
fpr, tpr, thresholds = metrics.roc_curve(yTest,yPredict4)
print("AUC = ", metrics.auc(fpr,tpr))
print("recal = ",metrics.recall_score(yTest,yPredict4))
print("precision = ",metrics.precision_score(yTest,yPredict4))
print("f1Score = ",f1_score(yTest, yPredict4, average='binary'))

# SVM KERNEL

In [None]:
from sklearn.svm import NuSVC

# Linear SVM: for every SNP selection, one-hot encode the genotypes, fit on the
# training split, score the hold-out split, then cross-validate.
for i in categories:

    r1 = {}
    # NOTE(review): down/up/threshold are the globals left by whichever
    # correlation cell ran last, so they may not describe category i.
    r1['down'] = down
    r1['up'] = up
    r1['thres'] = threshold
    writeResults={}
    print("Category = ",i)

    XX = createNewTable(snpReduc[i],X)
    xTraining1 = createNewTable(snpReduc[i],xTraining)
    xTest1 = createNewTable(snpReduc[i],xTest)

    # Each table is encoded with its own fit_transform, exactly as before; the
    # separate fit() that used to precede every fit_transform() was redundant.
    enc = OneHotEncoder(n_values =3)
    XX = enc.fit_transform(XX).toarray()
    print(XX.shape)

    xTraining1 = enc.fit_transform(xTraining1).toarray()
    print(xTraining1.shape)

    xTest1 = enc.fit_transform(xTest1).toarray()
    print(xTest1.shape)

    clf = SVC(kernel='linear')
    #clf = NuSVC(kernel='rbf',nu=0.01)
    clf.fit(xTraining1, yTraining1)
    yPredict2 = clf.predict(xTest1)

    # Score the hold-out split once; the same values are stored and printed.
    error2 = mean_squared_error(yTest, yPredict2)
    RMSE2 = error2**0.5
    fpr, tpr, thresholds = metrics.roc_curve(yTest,yPredict2)

    r1['confu'] = metrics.confusion_matrix(yTest,yPredict2)
    r1['accu'] = metrics.accuracy_score(yTest,yPredict2)
    r1['auc'] = metrics.auc(fpr,tpr)
    r1['recal'] = metrics.recall_score(yTest,yPredict2)
    r1['precision'] = metrics.precision_score(yTest,yPredict2)
    r1['f1'] = f1_score(yTest, yPredict2, average='binary')

    print(r1['accu'])
    print(r1['confu'])
    print("error 2 = ",error2)
    print("RMSE2 = ",RMSE2)
    print("AUC = ", r1['auc'])
    print("recal = ",r1['recal'])
    print("precision = ",r1['precision'])
    print("f1Score = ",r1['f1'])

    writeResults2[i] = r1
    print()

    results = crossValidiation(XX, Y, k = 10, classifier = clf, continious = False)
    results['down'] = down
    results['up'] = up
    results['thres'] = threshold
    print("accuracy = ",results['accuracy'])
    print("AUC = ", results['auc'])
    print("recal = ",results['recall'])
    print("precision = ",results['precision'])
    print("f1 = ",results['f1'])
    results['len_snps'] = len(snpReduc[i])
   # results['len snps'] = 500
    writeResults[i] = results

    print()

    #writeResult(path,'svm'+i,writeResults,i)
    #writeResultConf(path,'svm'+i,writeResults2)

# LINEAR LOGISTIC REGRESSION 

In [None]:

# Logistic regression (LogisticRegressionCV): for every SNP selection, one-hot
# encode the genotypes, fit on the training split, score the hold-out split,
# then cross-validate.
for i in categories:

    r1 = {}
    writeResults={}
    print("Category = ",i)

    XX = createNewTable(snpReduc[i],X)
    xTraining1 = createNewTable(snpReduc[i],xTraining)
    xTest1 = createNewTable(snpReduc[i],xTest)

    # Each table is encoded with its own fit_transform, exactly as before; the
    # separate fit() that used to precede every fit_transform() was redundant.
    enc = OneHotEncoder(n_values =3)
    XX = enc.fit_transform(XX).toarray()
    print(XX.shape)

    xTraining1 = enc.fit_transform(xTraining1).toarray()
    print(xTraining1.shape)

    xTest1 = enc.fit_transform(xTest1).toarray()
    print(xTest1.shape)

    #lr_clf = linear_model.LogisticRegression()

    lr_clf = linear_model.LogisticRegressionCV()
    lr_clf.fit(xTraining1, yTraining1)

    yPredict4 = lr_clf.predict(xTest1)

    # Score the hold-out split once; the same values are stored and printed.
    error4 = mean_squared_error(yTest, yPredict4)
    RMSE4 = error4**0.5
    fpr, tpr, thresholds = metrics.roc_curve(yTest,yPredict4)

    r1['confu'] = metrics.confusion_matrix(yTest,yPredict4)
    r1['accu'] = metrics.accuracy_score(yTest,yPredict4)
    r1['auc'] = metrics.auc(fpr,tpr)
    r1['recal'] = metrics.recall_score(yTest,yPredict4)
    r1['precision'] = metrics.precision_score(yTest,yPredict4)
    r1['f1'] = f1_score(yTest, yPredict4, average='binary')
    writeResults2[i] = r1

    print(r1['accu'])
    print(r1['confu'])
    print("error 4 = ",error4)
    print("RMSE4 = ",RMSE4)
    print("AUC = ", r1['auc'])
    print("recal = ",r1['recal'])
    print("precision = ",r1['precision'])
    print("f1Score = ",r1['f1'])

    print()

    results = crossValidiation(XX, Y, k = 10, classifier = lr_clf, continious = False)
    # NOTE(review): down/up/threshold are the globals left by whichever
    # correlation cell ran last, so they may not describe category i.
    results['down'] = down
    results['up'] = up
    results['thres'] = threshold
    print("accuracy = ",results['accuracy'])
    print("AUC = ", results['auc'])
    print("recal = ",results['recall'])
    print("precision = ",results['precision'])
    print("f1 = ",results['f1'])
    results['len_snps'] = len(snpReduc[i])
   # results['len snps'] = 500
    writeResults[i] = results

    print()

    #writeResult(path,'llrcv'+i,writeResults,i)
    #writeResultConf(path,'llrcv'+i,writeResults2)

In [None]:
# For every SNP selection: fit a logistic regression on the selected columns of
# X and write out the 30 SNPs with the largest (signed) coefficients.
for iCat in categories:
    
    # Reset on every iteration, so only the last category's tables survive the loop.
    ids['coef']={}
    XX = createNewTable(snpReduc[iCat],X)

    lr_clf = linear_model.LogisticRegressionCV()
    lr_clf.fit(XX, Y)

    coefs=lr_clf.coef_[0]
    '''  print("before = ",len(set(coefs)))
    for i in range(len(coefs)):
        coefs[i] = abs(coefs[i])

    print("after = ",len(set(coefs)))'''

    # nameToId: coefficient value -> list of column positions having that value
    # idToName: column position -> coefficient value
    idToName = {}
    nameToId = {}

    for i in range(len(coefs)):
        nameToId[coefs[i]] = []

    for i in range(len(coefs)):
        nameToId[coefs[i]].append(i)
        idToName[i] = coefs[i]


    ids['coef']['nameToId'] = nameToId
    ids['coef']['idToName'] = idToName


    # Ascending order, so the last 30 entries are the largest coefficients.
    sc = sorted(coefs )

    top_30 = []

    for i in sc[-30:]:

        # Take the first unused column with this coefficient value and remove it,
        # so equal coefficients map to different columns.
        snp = ids['coef']['nameToId'][i][0]
        ids['coef']['nameToId'][i].remove(snp)
        # Translate the position inside the selection back to the entry of snpReduc[iCat].
        top_30.append(snpReduc[iCat][snp])
     #   top_30.append(snp)

    #snpReduc['low'] = top_30

    write.writeSnpsUsed(top_30,ids['snps']['idToName'],chromosomes,iCat+'_not_abs_balanced')
    #write.writeSnpsUsed(top_30,ids['snps']['idToName'],chromosomes,'top_30_all_abs')
    #write.writeSnpsUsed(top_30,ids['snps']['idToName'],chromosomes,'top_30_all_abs')

In [None]:
# Fit a logistic regression on ALL columns of X and write out the 30 SNPs with
# the largest absolute coefficients.
# NOTE(review): iCat is only referenced in commented-out lines, so every
# iteration repeats the same fit and writes under the same '_all_abs' label.
for iCat in categories:
    
    ids['coef']={}
  #  XX = createNewTable(snpReduc[iCat],X)

    lr_clf = linear_model.LogisticRegressionCV()
    lr_clf.fit(X, Y)

    coefs=lr_clf.coef_[0]
    print("before = ",len(set(coefs)))
    # NOTE(review): coefs comes from indexing lr_clf.coef_; if that is a NumPy
    # view, this in-place abs() also changes the fitted model — confirm.
    for i in range(len(coefs)):
        coefs[i] = abs(coefs[i])

    print("after = ",len(set(coefs)))

    # nameToId: coefficient value -> list of column positions having that value
    # idToName: column position -> coefficient value
    idToName = {}
    nameToId = {}

    for i in range(len(coefs)):
        nameToId[coefs[i]] = []

    for i in range(len(coefs)):
        nameToId[coefs[i]].append(i)
        idToName[i] = coefs[i]


    ids['coef']['nameToId'] = nameToId
    ids['coef']['idToName'] = idToName


    # Ascending order, so the last 30 entries are the largest values.
    sc = sorted(coefs )

    top_30 = []

    for i in sc[-30:]:

        # Take the first unused column with this coefficient value and remove it,
        # so equal coefficients map to different columns.
        snp = ids['coef']['nameToId'][i][0]
        ids['coef']['nameToId'][i].remove(snp)
       # top_30.append(snpReduc[iCat][snp])
        # Positions are columns of X itself here, so they are stored directly.
        top_30.append(snp)

    #snpReduc['low'] = top_30

  #  write.writeSnpsUsed(top_30,ids['snps']['idToName'],chromosomes,iCat+'_abs_balanced')
    write.writeSnpsUsed(top_30,ids['snps']['idToName'],chromosomes,'_all_abs')


In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes: for every SNP selection, one-hot encode the genotypes,
# fit on the training split, score the hold-out split, then cross-validate.
for i in categories:

    r1 = {}
    writeResults={}
    print("Category = ",i)

    XX = createNewTable(snpReduc[i],X)
    xTraining1 = createNewTable(snpReduc[i],xTraining)
    xTest1 = createNewTable(snpReduc[i],xTest)

    # Each table is encoded with its own fit_transform, exactly as before; the
    # separate fit() that used to precede every fit_transform() was redundant.
    enc = OneHotEncoder(n_values =3)
    XX = enc.fit_transform(XX).toarray()
    print(XX.shape)

    xTraining1 = enc.fit_transform(xTraining1).toarray()
    print(xTraining1.shape)

    xTest1 = enc.fit_transform(xTest1).toarray()
    print(xTest1.shape)

    gnb = GaussianNB()
    gnb.fit(xTraining1, yTraining1)
    yPredict6 = gnb.predict(xTest1)

    # Score the hold-out split once; the same values are stored and printed.
    error6 = mean_squared_error(yTest, yPredict6)
    RMSE6 = error6**0.5
    fpr, tpr, thresholds = metrics.roc_curve(yTest,yPredict6)

    r1['confu'] = metrics.confusion_matrix(yTest,yPredict6)
    r1['accu'] = metrics.accuracy_score(yTest,yPredict6)
    r1['auc'] = metrics.auc(fpr,tpr)
    r1['recal'] = metrics.recall_score(yTest,yPredict6)
    r1['precision'] = metrics.precision_score(yTest,yPredict6)
    r1['f1'] = f1_score(yTest, yPredict6, average='binary')

    print(r1['accu'])
    print(r1['confu'])
    print("error 6 = ",error6)
    print("RMSE6 = ",RMSE6)
    print("AUC = ", r1['auc'])
    print("recal = ",r1['recal'])
    print("precision = ",r1['precision'])
    print("f1Score = ",r1['f1'])

    writeResults2[i] = r1

    print()

    results = crossValidiation(XX, Y, k = 10,classifier = gnb, continious = False)
    # NOTE(review): down/up/threshold are the globals left by whichever
    # correlation cell ran last, so they may not describe category i.
    results['down'] = down
    results['up'] = up
    results['thres'] = threshold
    print("accuracy = ",results['accuracy'])
    print("AUC = ", results['auc'])
    print("recal = ",results['recall'])
    print("precision = ",results['precision'])
    print("f1 = ",results['f1'])
    print()
    results['len_snps'] = len(snpReduc[i])
  #  results['len snps'] = 500
    writeResults[i] = results

    print()

    #writeResult(path,'gnb'+i,writeResults,i)
    #writeResultConf(path,'gnb'+i,writeResults2)

In [None]:
# Quick check of the size of the training label vector.
#xTraining.shape
yTraining1.shape

# BERNOULLI


In [None]:

# Bernoulli naive Bayes: for every SNP selection, one-hot encode the genotypes,
# fit on the training split, score the hold-out split, then cross-validate.
for i in categories:

    r1 = {}
    writeResults={}
    print("Category = ",i)

    XX = createNewTable(snpReduc[i],X)
    xTraining1 = createNewTable(snpReduc[i],xTraining)
    xTest1 = createNewTable(snpReduc[i],xTest)

    # Each table is encoded with its own fit_transform, exactly as before; the
    # separate fit() that used to precede every fit_transform() was redundant.
    enc = OneHotEncoder(n_values =3)
    XX = enc.fit_transform(XX).toarray()
    print(XX.shape)

    xTraining1 = enc.fit_transform(xTraining1).toarray()
    print(xTraining1.shape)

    xTest1 = enc.fit_transform(xTest1).toarray()
    print(xTest1.shape)

    bern = BernoulliNB()
    bern.fit(xTraining1, yTraining1)
    yPredict6 = bern.predict(xTest1)

    # Score the hold-out split once; the same values are stored and printed.
    error6 = mean_squared_error(yTest, yPredict6)
    RMSE6 = error6**0.5
    fpr, tpr, thresholds = metrics.roc_curve(yTest,yPredict6)

    r1['confu'] = metrics.confusion_matrix(yTest,yPredict6)
    r1['accu'] = metrics.accuracy_score(yTest,yPredict6)
    r1['auc'] = metrics.auc(fpr,tpr)
    r1['recal'] = metrics.recall_score(yTest,yPredict6)
    r1['precision'] = metrics.precision_score(yTest,yPredict6)
    r1['f1'] = f1_score(yTest, yPredict6, average='binary')

    print(r1['accu'])
    print(r1['confu'])
    print("error 6 = ",error6)
    print("RMSE6 = ",RMSE6)
    print("AUC = ", r1['auc'])
    print("recal = ",r1['recal'])
    print("precision = ",r1['precision'])
    print("f1Score = ",r1['f1'])

    writeResults2[i] = r1

    print()

    results = crossValidiation(XX, Y, k = 10,classifier = bern, continious = False)
    # NOTE(review): down/up/threshold are the globals left by whichever
    # correlation cell ran last, so they may not describe category i.
    results['down'] = down
    results['up'] = up
    results['thres'] = threshold
    print("accuracy = ",results['accuracy'])
    print("AUC = ", results['auc'])
    print("recal = ",results['recall'])
    print("precision = ",results['precision'])
    print("f1 = ",results['f1'])
    print()
    results['len_snps'] = len(snpReduc[i])
  #  results['len snps'] = 500
    writeResults[i] = results

    print()

    #writeResult(path,'bernoulli'+i,writeResults,i)
    #writeResultConf(path,'bernoulli'+i,writeResults2)

In [None]:
# Disabled np.argpartition experiment, kept as a string literal; nothing here
# is executed.
'''l= [5,6,7,2,3,4,10,6,7,9,8,5]
top_3 = np.argpartition(l,3)[:3]
print(top_3)
removedSnps=[]
for i in range(len(l)):
    if i not in top_3:
        removedSnps.append(l[i])
print(removedSnps)'''

In [None]:
# Disabled cross_val_score comparison (selected SNPs vs. all SNPs), kept as a
# string literal; nothing here is executed.
'''XX = createNewTable(snpReduc['low'],X)
#enc = OneHotEncoder(n_values =3) 
#enc.fit(XX) 
#XX = enc.fit_transform(XX)
#XX = XX.toarray()
print(XX.shape)

re = cross_val_score(bern, XX, Y, cv=10)
print(sum(re)/10)

print()
XX = X
#enc = OneHotEncoder(n_values =3) 
#enc.fit(XX) 
#XX = enc.fit_transform(XX)
#XX = XX.toarray()
print(XX.shape)
re = cross_val_score(bern, XX, Y, cv=10)
print(sum(re)/10)
'''

# TREE

In [None]:
from sklearn import tree
from sklearn.ensemble import BaggingClassifier

# For every category: build the category-specific tables with createNewTable,
# one-hot encode them, fit a decision tree on the training split, report hold-out
# metrics, then run crossValidiation (k = 10) on the full encoded table.
# Relies on notebook globals defined in earlier cells (categories, snpReduc, X, Y,
# xTraining, xTest, yTraining1, yTest, writeResults2, down, up, threshold).
for category in categories:

    print("Category = ", category)
    writeResults = {}

    selected_snps = snpReduc[category]
    XX = createNewTable(selected_snps, X)
    xTraining1 = createNewTable(selected_snps, xTraining)
    xTest1 = createNewTable(selected_snps, xTest)

    # Disabled experiment: chi2 SelectKBest(k = 500) fitted separately on each table.
    # test = SelectKBest(score_func=chi2,k = 500)
    # xTraining1 = test.fit(xTraining1, yTraining1).transform(xTraining1)
    # xTest1 = test.fit(xTest1, yTest).transform(xTest1)
    # XX = test.fit(XX, Y).transform(XX)

    # NOTE(review): the encoder is re-fitted on each table, including the test
    # split; the column layouts presumably stay aligned only because n_values is
    # fixed at 3 -- confirm before changing the encoder settings.
    enc = OneHotEncoder(n_values=3)
    XX = enc.fit_transform(XX).toarray()
    print(XX.shape)

    xTraining1 = enc.fit_transform(xTraining1).toarray()
    print(xTraining1.shape)

    xTest1 = enc.fit_transform(xTest1).toarray()
    print(xTest1.shape)

    clf = tree.DecisionTreeClassifier()

    #clf = BaggingClassifier(clf1, n_estimators=100, max_samples=2,
    #                   random_state=1)

    clf.fit(xTraining1, yTraining1)
    yPredict4 = clf.predict(xTest1)

    # Hold-out metrics are computed once, stored in r1, and printed from there.
    # NOTE(review): the ROC curve is built from hard class predictions, not scores.
    error4 = mean_squared_error(yTest, yPredict4)
    RMSE4 = error4 ** 0.5
    fpr, tpr, thresholds = metrics.roc_curve(yTest, yPredict4)
    r1 = {
        'confu': metrics.confusion_matrix(yTest, yPredict4),
        'accu': metrics.accuracy_score(yTest, yPredict4),
        'auc': metrics.auc(fpr, tpr),
        'recal': metrics.recall_score(yTest, yPredict4),
        'precision': metrics.precision_score(yTest, yPredict4),
        'f1': f1_score(yTest, yPredict4, average='binary'),
    }

    print(r1['accu'])
    print(r1['confu'])
    print("error 4 = ", error4)
    print("RMSE4 = ", RMSE4)
    print("AUC = ", r1['auc'])
    print("recal = ", r1['recal'])
    print("precision = ", r1['precision'])
    print("f1Score = ", r1['f1'])

    writeResults2[category] = r1

    print()

    # Cross-validated metrics on the full encoded table.
    results = crossValidiation(XX, Y, k=10, classifier=clf, continious=False)
    results['down'] = down
    results['up'] = up
    results['thres'] = threshold
    for _label, _key in (("accuracy = ", 'accuracy'),
                         ("AUC = ", 'auc'),
                         ("recal = ", 'recall'),
                         ("precision = ", 'precision'),
                         ("f1 = ", 'f1')):
        print(_label, results[_key])
    results['len_snps'] = len(selected_snps)
    # results['len snps'] = 500
    writeResults[category] = results

    print()

    #writeResult(path,'tree'+category,writeResults,category)
    #writeResultConf(path,'tree'+category,writeResults2)

In [None]:
import graphviz

# For every category: fit a decision tree on the category-specific training
# table, save its DOT description to <path><category>_balancedTree.dot and build
# a graphviz.Source from the same DOT text.
for iCat in categories:
    # Feature names are the SNP names looked up from the ids of this category.
    featuresName = [ids['snps']['idToName'][snpReduc[iCat][k]]
                    for k in range(len(snpReduc[iCat]))]
    xTraining1 = createNewTable(snpReduc[iCat], xTraining)
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(xTraining1, yTraining1)
    # out_file=None makes export_graphviz return the DOT text; that text is needed
    # both for the file on disk and for graphviz.Source below.
    dot_data = tree.export_graphviz(clf, out_file=None,
                                    feature_names=featuresName,
                                    leaves_parallel=True, max_depth=6)
    # The context manager guarantees the .dot file is flushed and closed.
    with open(path + iCat + '_balancedTree.dot', 'w') as treeee:
        treeee.write(dot_data)
    graph = graphviz.Source(dot_data)


# ALL: models evaluated on the full feature matrix (no per-category SNP reduction)


In [None]:

# Bernoulli naive Bayes on the unreduced inputs: hold-out metrics on
# xTraining / xTest, then crossValidiation (k = 10) on X; stored under key 'all'.

# Disabled experiment: one-hot encoding of the three matrices.
# enc = OneHotEncoder(n_values =3)
# XX = enc.fit_transform(X).toarray()
# xTraining1 = enc.fit_transform(xTraining).toarray()
# xTest1 = enc.fit_transform(xTest).toarray()

bern = BernoulliNB()
bern.fit(xTraining, yTraining1)
yPredict6 = bern.predict(xTest)

# Hold-out metrics are computed once, stored in r1, and printed from there.
# NOTE(review): the ROC curve is built from hard class predictions, not scores.
error6 = mean_squared_error(yTest, yPredict6)
RMSE6 = error6 ** 0.5
fpr, tpr, thresholds = metrics.roc_curve(yTest, yPredict6)
r1 = {
    'confu': metrics.confusion_matrix(yTest, yPredict6),
    'accu': metrics.accuracy_score(yTest, yPredict6),
    'auc': metrics.auc(fpr, tpr),
    'recal': metrics.recall_score(yTest, yPredict6),
    'precision': metrics.precision_score(yTest, yPredict6),
    'f1': f1_score(yTest, yPredict6, average='binary'),
}
writeResults2 = {'all': r1}
writeResults = {}

print(r1['accu'])
print(r1['confu'])
print("error 6 = ", error6)
print("RMSE6 = ", RMSE6)
print("AUC = ", r1['auc'])
print("recal = ", r1['recal'])
print("precision = ", r1['precision'])
print("f1Score = ", r1['f1'])

print()

# Cross-validated metrics on the full matrix.
results = crossValidiation(X, Y, k=10, classifier=bern, continious=False)
results['down'] = down
results['up'] = up
results['thres'] = threshold
for _label, _key in (("accuracy = ", 'accuracy'),
                     ("AUC = ", 'auc'),
                     ("recal = ", 'recall'),
                     ("precision = ", 'precision'),
                     ("f1 = ", 'f1')):
    print(_label, results[_key])

results['len_snps'] = len(X.T)
writeResults['all'] = results

print()

writeResult(path, 'bernoulli_all', writeResults, 'all')
#writeResultConf(path,'bernoulli_all',writeResults2)


In [None]:
# Linear-kernel SVC on the unreduced inputs: hold-out metrics on
# xTraining / xTest, then crossValidiation (k = 10) on X; stored under key 'all'.

# Disabled experiment: one-hot encoding of the three matrices.
# enc = OneHotEncoder(n_values =3)
# XX = enc.fit_transform(X).toarray()
# xTraining1 = enc.fit_transform(xTraining).toarray()
# xTest1 = enc.fit_transform(xTest).toarray()

# The estimator keeps the global name `bern` although it is an SVC here.
bern = SVC(kernel='linear')
bern.fit(xTraining, yTraining1)
yPredict6 = bern.predict(xTest)

# Hold-out metrics are computed once, stored in r1, and printed from there.
# NOTE(review): the ROC curve is built from hard class predictions, not scores.
error6 = mean_squared_error(yTest, yPredict6)
RMSE6 = error6 ** 0.5
fpr, tpr, thresholds = metrics.roc_curve(yTest, yPredict6)
r1 = {
    'confu': metrics.confusion_matrix(yTest, yPredict6),
    'accu': metrics.accuracy_score(yTest, yPredict6),
    'auc': metrics.auc(fpr, tpr),
    'recal': metrics.recall_score(yTest, yPredict6),
    'precision': metrics.precision_score(yTest, yPredict6),
    'f1': f1_score(yTest, yPredict6, average='binary'),
}
writeResults2 = {'all': r1}
writeResults = {}

print(r1['accu'])
print(r1['confu'])
print("error 6 = ", error6)
print("RMSE6 = ", RMSE6)
print("AUC = ", r1['auc'])
print("recal = ", r1['recal'])
print("precision = ", r1['precision'])
print("f1Score = ", r1['f1'])

print()

# Cross-validated metrics on the full matrix.
results = crossValidiation(X, Y, k=10, classifier=bern, continious=False)
results['down'] = down
results['up'] = up
results['thres'] = threshold
for _label, _key in (("accuracy = ", 'accuracy'),
                     ("AUC = ", 'auc'),
                     ("recal = ", 'recall'),
                     ("precision = ", 'precision'),
                     ("f1 = ", 'f1')):
    print(_label, results[_key])

results['len_snps'] = len(X.T)
writeResults['all'] = results

print()

writeResult(path, 'svm_all', writeResults, 'all')
#writeResultConf(path,'svm_all',writeResults2)

In [None]:
# LogisticRegressionCV on the unreduced inputs: hold-out metrics on
# xTraining / xTest, then crossValidiation (k = 10) on X; stored under key 'all'.

# Disabled experiment: one-hot encoding of the three matrices.
# enc = OneHotEncoder(n_values =3)
# XX = enc.fit_transform(X).toarray()
# xTraining1 = enc.fit_transform(xTraining).toarray()
# xTest1 = enc.fit_transform(xTest).toarray()

# The estimator keeps the global name `bern` although it is a logistic regression here.
bern = linear_model.LogisticRegressionCV()
bern.fit(xTraining, yTraining1)
yPredict6 = bern.predict(xTest)

# Hold-out metrics are computed once, stored in r1, and printed from there.
# NOTE(review): the ROC curve is built from hard class predictions, not scores.
error6 = mean_squared_error(yTest, yPredict6)
RMSE6 = error6 ** 0.5
fpr, tpr, thresholds = metrics.roc_curve(yTest, yPredict6)
r1 = {
    'confu': metrics.confusion_matrix(yTest, yPredict6),
    'accu': metrics.accuracy_score(yTest, yPredict6),
    'auc': metrics.auc(fpr, tpr),
    'recal': metrics.recall_score(yTest, yPredict6),
    'precision': metrics.precision_score(yTest, yPredict6),
    'f1': f1_score(yTest, yPredict6, average='binary'),
}
# In this cell the hold-out metrics are not registered in writeResults2.
writeResults2 = {}
#writeResults2['all'] = r1
writeResults = {}

print(r1['accu'])
print(r1['confu'])
print("error 6 = ", error6)
print("RMSE6 = ", RMSE6)
print("AUC = ", r1['auc'])
print("recal = ", r1['recal'])
print("precision = ", r1['precision'])
print("f1Score = ", r1['f1'])

print()

# Cross-validated metrics on the full matrix.
results = crossValidiation(X, Y, k=10, classifier=bern, continious=False)
results['down'] = down
results['up'] = up
results['thres'] = threshold
for _label, _key in (("accuracy = ", 'accuracy'),
                     ("AUC = ", 'auc'),
                     ("recal = ", 'recall'),
                     ("precision = ", 'precision'),
                     ("f1 = ", 'f1')):
    print(_label, results[_key])

results['len_snps'] = len(X.T)
writeResults['all'] = results

print()

writeResult(path, 'llr_all_CV', writeResults, 'all')
#writeResultConf(path,'llr_all_CV',writeResults2)

In [None]:
# Decision tree on the unreduced inputs: hold-out metrics on
# xTraining / xTest, then crossValidiation (k = 10) on X; stored under key 'all'.

# Disabled experiment: one-hot encoding of the three matrices.
# enc = OneHotEncoder(n_values =3)
# XX = enc.fit_transform(X).toarray()
# xTraining1 = enc.fit_transform(xTraining).toarray()
# xTest1 = enc.fit_transform(xTest).toarray()

# The estimator keeps the global name `bern` although it is a decision tree here.
bern = tree.DecisionTreeClassifier()
bern.fit(xTraining, yTraining1)
yPredict6 = bern.predict(xTest)

# Hold-out metrics are computed once, stored in r1, and printed from there.
# NOTE(review): the ROC curve is built from hard class predictions, not scores.
error6 = mean_squared_error(yTest, yPredict6)
RMSE6 = error6 ** 0.5
fpr, tpr, thresholds = metrics.roc_curve(yTest, yPredict6)
r1 = {
    'confu': metrics.confusion_matrix(yTest, yPredict6),
    'accu': metrics.accuracy_score(yTest, yPredict6),
    'auc': metrics.auc(fpr, tpr),
    'recal': metrics.recall_score(yTest, yPredict6),
    'precision': metrics.precision_score(yTest, yPredict6),
    'f1': f1_score(yTest, yPredict6, average='binary'),
}
writeResults2 = {'all': r1}
writeResults = {}

print(r1['accu'])
print(r1['confu'])
print("error 6 = ", error6)
print("RMSE6 = ", RMSE6)
print("AUC = ", r1['auc'])
print("recal = ", r1['recal'])
print("precision = ", r1['precision'])
print("f1Score = ", r1['f1'])

print()

# Cross-validated metrics on the full matrix.
results = crossValidiation(X, Y, k=10, classifier=bern, continious=False)
results['down'] = down
results['up'] = up
results['thres'] = threshold
for _label, _key in (("accuracy = ", 'accuracy'),
                     ("AUC = ", 'auc'),
                     ("recal = ", 'recall'),
                     ("precision = ", 'precision'),
                     ("f1 = ", 'f1')):
    print(_label, results[_key])

results['len_snps'] = len(X.T)
writeResults['all'] = results

print()

writeResult(path, 'tree_all', writeResults, 'all')
#writeResultConf(path,'tree_all',writeResults2)

In [None]:
# Fit a decision tree on the full data (X, Y), save its DOT description to
# <path>all_balancedTree.dot and build a graphviz.Source from the same DOT text.
# `graphviz` is expected to have been imported by an earlier cell.

# Feature names are the SNP names for ids 0 .. n-1 of the id -> name mapping.
featuresName = [ids['snps']['idToName'][k]
                for k in range(len(ids['snps']['idToName']))]
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, Y)
# out_file=None makes export_graphviz return the DOT text; that text is needed
# both for the file on disk and for graphviz.Source below.
dot_data = tree.export_graphviz(clf, out_file=None,
                                feature_names=featuresName,
                                leaves_parallel=True, max_depth=6)
# The context manager guarantees the .dot file is flushed and closed.
with open(path + 'all_balancedTree.dot', 'w') as treeee:
    treeee.write(dot_data)
graph = graphviz.Source(dot_data)

In [None]:
# Random forest tuned by GridSearchCV on the unreduced matrix X; the search
# object itself is handed to crossValidiation (k = 10) as the classifier and the
# outcome is stored under key 'all'.
r1 = {}
writeResults = {}

clf = RandomForestClassifier()
#clf.fit(xTraining1, yTraining1)

# Hyper-parameter grid explored by the search.
param_grid = dict(
    n_estimators=[9, 18, 27, 36, 45, 54, 63],
    min_samples_split=[2, 3, 10],
    min_samples_leaf=[1, 3, 10],
    bootstrap=[True, False],
    criterion=["gini", "entropy"],
    warm_start=[True, False],
)

# Disabled alternative grid:
# param_grid = {
#          'n_estimators':[2000,3000,5000],
#          'max_depth':[5,15,30],
#          'min_samples_split':[2,3],
#         'min_samples_leaf':[1,2],
#         "warm_start":[True,False],
#                 "oob_score":[True,False]}

# run grid search
# NOTE(review): this name shadows the `grid_search` module imported at the top of
# the notebook (`from sklearn import grid_search`); kept unchanged because later
# cells may read this global.
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=10, verbose=1)

results = crossValidiation(X, Y, k=10, classifier=grid_search, continious=False)
results['down'] = down
results['up'] = up
results['thres'] = threshold
for _label, _key in (("accuracy = ", 'accuracy'),
                     ("AUC = ", 'auc'),
                     ("recal = ", 'recall'),
                     ("precision = ", 'precision'),
                     ("f1 = ", 'f1')):
    print(_label, results[_key])
results['len_snps'] = len(X.T)
# results['len snps'] = 500
writeResults['all'] = results

print()

writeResult(path, 'rfrall', writeResults, 'all')