In [None]:
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import numpy as np
import os.path
import csv
from sklearn.svm import SVC
from sklearn.svm import NuSVC
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn import grid_search
import sklearn.linear_model as linear_model
import sklearn.metrics as metrics
from sklearn import preprocessing
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from random import randint
from sklearn import tree
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import math

from metrics.Correlation import Correlation
from IO.Write import Write
from IO.Read import Read
from metrics.RSquare import RSquare
from DataSet.Dataset import DataSet


def setIdToName(aList):
    """Build two-way lookup tables between the items of aList and their positions.

    Every item receives the id of its position in ``aList`` (0, 1, 2, ...).

    Returns a dict with two entries:
        'nameToId' -- maps item -> position
        'idToName' -- maps position -> item
    When an item occurs more than once, 'nameToId' keeps its last position.
    """
    forward = {}
    backward = {}

    for position, name in enumerate(aList):
        forward[name] = position
        backward[position] = name

    return {'nameToId': forward, 'idToName': backward}


def setSnpsCode(patients,chromosomes):
    """Call ``snpCode(chromosomes)`` on every patient object.

    ``patients`` is a dictionary whose values expose a ``snpCode`` method.
    The same dictionary object is returned (it is not copied).
    """
    for patient in patients.values():
        patient.snpCode(chromosomes)

    return patients
    



def tables(sampleX,sampleY,k):
    """Assemble the train/test tables for each run of a k-fold split.

    sampleX and sampleY are dictionaries keyed 1..k holding the feature rows
    and the labels of every fold.  For run r, fold r is the test set and all
    other folds are stacked, in dictionary order, into integer training
    tables.

    Returns a dict keyed by run number; each value is a dict with the
    entries 'trainX', 'trainY', 'testX' and 'testY'.
    """
    folds = {}

    for run in range(1, k + 1):
        testX = sampleX[run]
        testY = sampleY[run]

        others = [key for key in sampleX.keys() if key != run]
        rows = sum(len(sampleX[key]) for key in others)

        # The column count is read from fold 1.
        trainX = np.zeros((rows, len(sampleX[1].T)), dtype=int)
        trainY = np.zeros((rows,), dtype=int)

        start = 0
        for key in others:
            size = len(sampleX[key])
            stop = start + size
            trainX[start:stop, :] = sampleX[key]
            trainY[start:stop] = sampleY[key][:size]
            start = stop

        folds[run] = {
            'trainX': trainX,
            'trainY': trainY,
            'testX': testX,
            'testY': testY,
        }

    return folds
    
def kSampleData(k,X,Y):
    """Randomly partition the rows of X (and the labels Y) into k folds.

    Rows are drawn without replacement using ``randint``.  The first k-1
    folds receive ``int(len(X) / k)`` rows each and the last fold takes the
    rows that remain.  Each fold is copied into integer arrays, and the folds
    are passed to ``tables``, whose result is returned.
    """
    total = len(X)
    foldSize = int(total / k)
    taken = np.zeros((total,), dtype=int)  # 1 marks a row already assigned

    sampleX = {}
    sampleY = {}

    for fold in range(1, k + 1):
        if fold == k:
            # The last fold absorbs the remainder of the integer division.
            foldSize = total - ((k - 1) * foldSize)

        foldX = np.zeros((foldSize, len(X.T)), dtype=int)
        foldY = np.zeros((foldSize,), dtype=int)

        picked = []
        while len(picked) < foldSize:
            candidate = randint(0, total - 1)
            # Redraw until a row that is still free comes up.
            while taken[candidate] == 1:
                candidate = randint(0, total - 1)
            taken[candidate] = 1
            picked.append(candidate)

        for row, source in enumerate(picked):
            foldX[row, :] = X[source, :]
            foldY[row] = Y[source]

        sampleX[fold] = foldX
        sampleY[fold] = foldY

    return tables(sampleX, sampleY, k)


def calculateJaccardSim(X):
    """Build a symmetric column-by-column similarity matrix for X.

    Entry [i, j] (i != j) is ``metrics.jaccard_similarity_score`` applied to
    columns i and j of X; the diagonal is set to 1.0.

    NOTE(review): ``jaccard_similarity_score`` is deprecated/removed in newer
    scikit-learn releases -- confirm the installed version still provides it.
    """
    columns = len(X.T)
    similarity = np.zeros((columns, columns), dtype=float)

    for first in range(columns):
        similarity[first, first] = 1.0

        for second in range(first + 1, columns):
            score = metrics.jaccard_similarity_score(X[:, first], X[:, second])
            similarity[first, second] = score
            similarity[second, first] = score

    return similarity

def reduceFeatures(X,a,b,c = 10,method = 'Cosine_Similarity'):
    """Select groups of feature (column) indices of X from pairwise scores.

    method = 'Cosine_Similarity'
        Cosine similarity is computed between all columns of X.  A pair
        (i, j) is counted for both columns when its similarity s satisfies
        ``s - a >= 1e-10`` and ``s - b <= 1e-10``.  From those counts:
          * 'snpsInArea'  -- columns counted at least once;
          * 'snpsOutArea' -- columns whose count does not exceed c percent
            of the number of counted columns and that have no other such
            column with similarity within 1e-2 of 1;
          * 'snpsRandom'  -- distinct columns drawn with ``randint`` from
            those not in 'snpsOutArea': as many as 'snpsOutArea' holds when
            that is fewer than half of the columns, otherwise all the rest.

    method = 'jaccard1'
        X itself is read as the pairwise score matrix (b is not used).  A
        pair (i, j) with j > i is counted for both indices when
        ``X[i, j] - a <= 1e-10``; 'snpsOutArea' lists the indices counted at
        least ``c * len(X.T) / 100`` times.  The other two lists stay empty.

    Any other method returns an empty dict.  For the two known methods the
    sizes of the three lists are printed and a dict with the keys
    'snpsOutArea', 'snpsInArea' and 'snpsRandom' is returned.
    """
    if method != 'Cosine_Similarity' and method != 'jaccard1':
        return {}

    snpsOut = []
    snpsIn = []
    snpsRandom = []

    total = len(X.T)
    print("size = ",total)
    pairCount = dict.fromkeys(range(total), 0)

    if method == 'Cosine_Similarity':

        xNew = metrics.pairwise.cosine_similarity(X.T)
        print("xNew shape ",xNew.shape)

        for i in range(len(xNew)):
            for j in range(i + 1, len(xNew.T)):
                score = xNew[i, j]
                if score - a >= 1e-10 and score - b <= 1e-10:
                    pairCount[j] += 1
                    pairCount[i] += 1

        # Number of columns that took part in at least one counted pair.
        active = sum(1 for hits in pairCount.values() if hits > 0)

        snpsIn = [idx for idx, hits in pairCount.items() if hits > 0]
        frequent = {idx for idx, hits in pairCount.items()
                    if hits > active * c / 100}
        rest = [idx for idx in range(total) if idx not in frequent]

        # Keep a remaining column only if no other remaining column is a
        # near-duplicate of it (similarity within 1e-2 of 1).
        for idx in rest:
            duplicated = any((1 - xNew[idx, other]) <= 1e-2 and other != idx
                             for other in rest)
            if not duplicated:
                snpsOut.append(idx)

        if len(snpsOut) < total / 2:
            draws = len(snpsOut)
        elif len(snpsOut) == total:
            draws = 0
        else:
            draws = total - len(snpsOut)

        used = set(snpsOut)
        for _ in range(draws):
            pick = randint(0, total - 1)
            while pick in used:
                pick = randint(0, total - 1)
            used.add(pick)
            snpsRandom.append(pick)

    else:

        xNew = X
        print("xNew shape ",xNew.shape)

        for i in range(len(xNew)):
            for j in range(i + 1, len(xNew.T)):
                if xNew[i, j] - a <= 1e-10:
                    pairCount[i] += 1
                    pairCount[j] += 1

        snpsOut = [idx for idx, hits in pairCount.items()
                   if hits >= c * total / 100]

    print("snps = ",len(snpsOut))
    print("len snpsIn = ",len(snpsIn))
    print("len snpsRandom = ",len(snpsRandom))

    return {
        'snpsOutArea': snpsOut,
        'snpsInArea': snpsIn,
        'snpsRandom': snpsRandom,
    }


def createNewTable(snps,X):
    """Copy the columns of X listed in snps into a new integer table.

    Column i of the result is column ``snps[i]`` of X, so the result has
    ``len(X)`` rows and ``len(snps)`` columns.  The shape of the new table is
    printed before it is returned.
    """
    # Start from -1 everywhere; every cell is then overwritten by the copy.
    newX = np.full((len(X), len(snps)), -1, dtype=int)

    for target, source in enumerate(snps):
        newX[:, target] = X[:, source]

    print("new shape = ",newX.shape)

    return newX



def crossValidiation(X, Y, k = 1, continious = True, classifier = None,OLS = False,Logistic = False):
    """Evaluate a model with randomised k-fold splits and average the scores.

    The folds come from ``kSampleData(k, X, Y)``.  For every fold the model is
    fitted on the training part, the test part is predicted, and accuracy,
    ROC AUC, recall, precision and F1 are computed from the predictions.

    Parameters
    ----------
    X, Y : feature table and labels, forwarded to ``kSampleData``.
    k : number of folds.  NOTE(review): with the default k = 1 the only fold
        is the test fold, so the training table built by ``tables`` is empty.
    continious : if True, each prediction is replaced by 0 or 1, whichever
        is nearer (ties go to 0).
    classifier : object with ``fit``/``predict`` (and ``predict_proba`` when
        ``Logistic`` is set).  Required even when ``OLS`` is set.
    OLS : if True, a statsmodels OLS model is built on each fold and used
        in place of ``classifier``.
    Logistic : if True, predictions are re-derived from ``predict_proba``:
        1 when the second probability column is >= 0.8, otherwise 0.

    Returns
    -------
    dict with the keys 'accuracy', 'auc', 'recall', 'precision' and 'f1'
    (each the sum over the folds divided by k), or None when no classifier
    was given.
    """
    # Compare with None instead of testing truthiness: `not estimator` calls
    # __len__ on estimators that define it (ensembles, pipelines), which can
    # raise for an unfitted model or be falsy for an empty one.
    if classifier is None:
        print("wrong!!!!!!! you have to choise a classifier")
        return

    scoreNames = ('accuracy', 'auc', 'recall', 'precision', 'f1')
    totals = dict.fromkeys(scoreNames, 0.0)

    samples = kSampleData(k,X,Y)

    for run in range(1, k + 1):

        fold = samples[run]
        trainX, trainY = fold['trainX'], fold['trainY']
        testX, testY = fold['testX'], fold['testY']

        if OLS:
            classifier = sm.OLS(trainY,trainX)
            yPredict = classifier.fit().predict(testX)
        else:
            classifier.fit(trainX, trainY)
            yPredict = classifier.predict(testX)

        yPredict = np.asarray(yPredict)

        if continious:
            # Snap every prediction to the nearer of 0 and 1 (ties -> 0).
            nearerZero = (np.abs(0 - yPredict) - np.abs(1 - yPredict)) <= 1e-10
            yPredict[nearerZero] = 0
            yPredict[~nearerZero] = 1

        if Logistic:
            # Overrides the labels above using a 0.8 probability cut-off.
            positive = np.asarray(classifier.predict_proba(testX))[:, 1] >= 0.8
            yPredict[positive] = 1
            yPredict[~positive] = 0

        totals['accuracy'] += metrics.accuracy_score(testY,yPredict)
        fpr, tpr, thresholds = metrics.roc_curve(testY,yPredict)
        totals['auc'] += metrics.auc(fpr,tpr)
        totals['recall'] += metrics.recall_score(testY,yPredict)
        totals['precision'] += metrics.precision_score(testY,yPredict)
        totals['f1'] += f1_score(testY, yPredict, average='binary')

    return {name: totals[name] / k for name in scoreNames}



In [None]:
# Set this to the folder that holds the data to be processed.
path = 'C:\\Users\\ANTONIS\\Desktop\\p = 0.0001\\'
# Alternative data folder:
#path = 'C:\\Users\\ANTONIS\\Desktop\\pValues\\pvalue = 1e-05\\'

numberOfChromosomes = 22  # number of chromosomes
patientsTrain = {}
patientsTest = {}
allPatients = {}

chromosomes = {}

# Project I/O helpers, both bound to the data folder and chromosome count.
read = Read(path,numberOfChromosomes)
write = Write(path,numberOfChromosomes)

# Load the patient list and the SNPs (".assoc" is passed to readSnps), then
# write both lists back out through the Write helper.
patients = read.readPatients('Patients.txt')
chromosomes = read.readSnps(".assoc")
write.writePatientsList(patients,'patient.txt')
write.writeSnpsList(chromosomes)


# run train_lgen bat and test_leg bat

In [None]:

# Load the per-patient SNP codes and build the name<->index lookup tables.
snps = read.getListOfSnps()
ids = {} 
idsTest = {}



# If a 'snpCode.txt' file already exists in the data folder, read the codes
# from it; otherwise read the lgen data and compute the codes per patient.
if os.path.exists(path + 'snpCode.txt'):
    print("mphka")
    patients = read.readSnpsCode(patients)
    
    
else:
    patients = read.readLgen(patients)
    
   
    patients = setSnpsCode(patients,chromosomes)
    
    


# Positional ids for patients (dictionary key order) and for SNPs (list order).
ids['patients'] = setIdToName(list(patients.keys()))
ids['snps'] = setIdToName(snps)





In [None]:
# Build the feature table X and the label vector Y from the patients.
trainSet = DataSet(patients,ids)


X = trainSet.getXTable()
Y = trainSet.getYTable()

# Only when no 'snpCode.txt' exists yet: persist the data through Write.saveData.
# NOTE(review): xTraining and xTest are first assigned by the train_test_split
# cell that follows, so on a fresh top-to-bottom run these calls reference
# undefined names; patientsTrain, patientsTest and idsTest are also still the
# empty dicts created earlier -- confirm what saveData is meant to receive.
if not os.path.exists(path + 'snpCode.txt'):
    print("mphka 3")
    write.saveData(ids,patientsTrain,'Train',xTraining,chromosomes)
    write.saveData(idsTest,patientsTest,'Test',xTest)

# Correlation

In [None]:
# Hold out 10% of the rows as a test set (fixed seed, so the split repeats).
xTraining, xTest, yTraining1, yTest = train_test_split(
    X, Y, test_size=0.1, random_state=0)

# Report the shape of the full table and of every part of the split.
for caption, table in (("mergex = ", X),
                       ("xTrain = ", xTraining),
                       ("xTest = ", xTest),
                       ("yTrain = ", yTraining1),
                       ("yTest = ", yTest)):
    print(caption, table.shape)

In [None]:
# Alternatives that were tried: a Correlation or RSquare helper over the full
# table X, or a Correlation helper over the training rows.
#cor = Correlation(X)
#cor = RSquare(X)

#cor = Correlation(xTraining)
# Active choice: the project's RSquare helper built from the training rows only.
cor = RSquare(xTraining)

In [None]:
# Pairwise matrix from the RSquare helper (cor.getCorrMatrix() is the
# alternative when `cor` is a Correlation helper).
corr = cor.getRMatrix()

# Number of SNP columns, read from the data instead of a hard-coded 5415.
nSnps = len(xTraining.T)

# snpsCount[i] = how many other SNPs j have corr[i, j] of at least 0.8
# (compared with a 1e-12 tolerance); only the upper triangle is visited.
snpsCount = {i: 0 for i in range(nSnps)}

for i in range(nSnps):
    for j in range(i + 1, nSnps):
        if corr[i,j] - 0.8 >= 1e-12:
            snpsCount[i] += 1
            snpsCount[j] += 1
            
            

# Correlation high

In [None]:
# SNPs that have at least one highly correlated partner.
c1 = [i for i in snpsCount.keys() if snpsCount[i] > 0]
count = len(c1)

snpsRed = c1
print("count = ",count)
print("len snpsRed = ",len(snpsRed))

# Correlation Low

In [None]:
# SNPs that are NOT in c1, i.e. the complement of the "correlation high" set.
# The column count comes from X instead of a hard-coded 5415, and c1 is turned
# into a set so each membership test is O(1).
flagged = set(c1)
snpsRed = [i for i in range(len(X.T)) if i not in flagged]

print("len snpsRed = ",len(snpsRed))
print("count + len(snpsRed) = ", count + len(snpsRed))

In [None]:
# Restrict the full table and both parts of the split to the selected SNP
# columns (snpsRed comes from whichever selection cell was run last).
XX = createNewTable(snpsRed,X)
xTraining1 = createNewTable(snpsRed,xTraining)
xTest1 = createNewTable(snpsRed,xTest)

# Separate cases and controls

In [None]:
# Redo the 90/10 split of the full table (same seed as before).
xTraining, xTest, yTraining1, yTest = train_test_split(
    X, Y, test_size=0.1, random_state=0)

# Report the shape of the full table and of every part of the split.
for caption, table in (("mergex = ", X),
                       ("xTrain = ", xTraining),
                       ("xTest = ", xTest),
                       ("yTrain = ", yTraining1),
                       ("yTest = ", yTest)):
    print(caption, table.shape)

In [None]:
# Split the training rows into controls (label 0) and cases (label 1).
yCo = []
yCa = []

controls = [i for i in range(len(yTraining1)) if yTraining1[i] == 0]
cases = [i for i in range(len(yTraining1)) if yTraining1[i] == 1]

# Two-way maps between a training-row index and its position in the
# control / case table.
cosIds = dict(enumerate(controls))
idsCos = {row: pos for pos, row in cosIds.items()}

caIds = dict(enumerate(cases))
idsCa = {row: pos for pos, row in caIds.items()}

count = len(cases)

control = np.zeros((len(controls),len(xTraining.T)))
case = np.zeros((len(cases),len(xTraining.T)))

for pos, row in cosIds.items():
    control[pos,:] = xTraining[row,:]

for pos, row in caIds.items():
    case[pos,:] = xTraining[row,:]

print("cases = ",case.shape)
print("controls = ",control.shape)


In [None]:
# Alternative: RSquare helpers instead of Correlation helpers.
#r21 = RSquare(control)
#r22 = RSquare(case)

# One project Correlation helper per group: r21 for controls, r22 for cases.
r21 = Correlation(control)
r22 = Correlation(case)


In [None]:
# Pairwise matrices of the two groups (use getRMatrix() instead when r21/r22
# are RSquare helpers).
co = r21.getCorrMatrix()
ca = r22.getCorrMatrix()

# Number of SNP columns, read from the data instead of a hard-coded 5415.
nSnps = len(xTraining.T)

# countCo[i] / countCa[i] = number of partners of SNP i whose absolute
# correlation is at least 0.8 (1e-12 tolerance) among controls / cases.
countCo = {i: 0 for i in range(nSnps)}
countCa = {i: 0 for i in range(nSnps)}

for i in range(nSnps):
    for j in range(i + 1, nSnps):

        if abs(co[i,j]) - 0.8 >= 1e-12:
            countCo[i] += 1
            countCo[j] += 1

        if abs(ca[i,j]) - 0.8 >= 1e-12:
            countCa[i] += 1
            countCa[j] += 1

# c1: SNPs with a highly correlated partner among controls; c2: among cases.
c1 = [i for i in countCo if countCo[i] > 0]
c2 = [i for i in countCo if countCa[i] > 0]
count1 = len(c1)
count2 = len(c2)

print("count1 = ",count1)
# The original printed count2 under the label "count1".
print("count2 = ",count2)
print("c1 =",len(c1))
print("c2 =",len(c2))

# Case XOR Control (SNPs flagged in exactly one of the two groups)

In [None]:
# SNPs flagged for controls only (c1H) or for cases only (c2H); the selection
# is the two groups one after the other.
c1H = list(set(c1) - set(c2))
c2H = list(set(c2) - set(c1))

snpsRed = c1H + c2H

print("c1 =",len(c1))
print("c2 =",len(c2))
print("c1H =",len(c1H))
print("c2H =",len(c2H))
print("len snpsRed = ",len(snpsRed))

# Case & Control

In [None]:
# SNPs flagged in both groups; `count` is a cross-check of the set size.
count = sum(1 for i in c1 if i in c2)
snpsRed = list(set(c1) & set(c2))

print("c1 =",len(c1))
print("c2 =",len(c2))
print("len snpsRed = ",len(snpsRed))
print("count = ",count)

# 1 - ((Case U Control) + (Case & Control))

In [None]:
# SNPs flagged in neither group: everything outside
# (controls only) + (cases only) + (both).
c1H = list(set(c1) - set(c2))
c2H = list(set(c2) - set(c1))
c3H = list(set(c1) & set(c2))

helpList = c1H + c2H + c3H

# Column count read from X instead of a hard-coded 5415; a set makes each
# membership test O(1) instead of a scan of helpList.
nSnps = len(X.T)
excluded = set(helpList)
snpsRed = [i for i in range(nSnps) if i not in excluded]

print("c1H =",len(c1H))
print("c2H =",len(c2H))
print("c3H =",len(c3H))
print("c1 =",len(c1))
print("c2 =",len(c2))
print("helpList =",len(helpList)) 
print("len snpsRed = ",len(snpsRed))
# Same text as before when there are 5415 columns.
print("%d - len(helpList) = " % nSnps, nSnps - len(helpList))

# 1 - (Case & Control)

In [None]:
# SNPs that are NOT flagged in both groups at once.
inCases = set(c2)
count = sum(1 for i in c1 if i in inCases)  # cross-check of len(cHelp)
cHelp = list(set(c1) & set(c2))

# Column count read from X instead of a hard-coded 5415; a set makes each
# membership test O(1) instead of a scan of cHelp.
shared = set(cHelp)
snpsRed = [i for i in range(len(X.T)) if i not in shared]

print("c1 =",len(c1))
print("c2 =",len(c2))
print("len cHelp = ",len(cHelp))
print("count = ",count)
print("len snpsRed = ",len(snpsRed))

In [None]:
# Restrict the full table and both parts of the split to the selected SNP
# columns (snpsRed comes from whichever selection cell was run last).
XX = createNewTable(snpsRed,X)
xTraining1 = createNewTable(snpsRed,xTraining)
xTest1 = createNewTable(snpsRed,xTest)

In [None]:
# Model-based feature selection with logistic regression.
# The original fitted on mergeXtable/mergeYtable, which are not defined in
# this notebook, and then called LogisticRegression.fit_transform, which
# current scikit-learn estimators do not provide.  SelectFromModel performs
# the same "fit on (X, Y), keep the important columns of X" step.
from sklearn.feature_selection import SelectFromModel

lr_clf = linear_model.LogisticRegression()
lr_clf.fit(X, Y)
xTraining = SelectFromModel(lr_clf, prefit=True).transform(X)



In [None]:
# Column-by-column similarity matrix of X; bound to both Xt and xNew.
Xt = xNew = calculateJaccardSim(X)

In [None]:
# For every SNP i, count the SNPs j whose similarity Xt[i, j] lies in
# [0.3, 0.6] (1e-10 tolerance); sn[i] lists the partners j > i.
# NOTE(review): 5415 is hard-coded here while the last loop uses len(X.T) --
# presumably both are the number of SNP columns; confirm and unify.
count = {}
sn = {}
s = []
for i in range(5415):
    count[i] = 0
    sn[i] = []
for i in range(5415):
    for j in range(i+1, 5415):
        if Xt[i,j] - 0.3 >= 1e-10 and Xt[i,j] - 0.6 <= 1e-10 :
            count[i] = count[i] + 1
            count[j] = count[j] + 1
            sn[i].append(j)


    
# c = total number of (i, j) pairs recorded in sn.
c = 0
for i in sn.keys():
    
    c = c + len((sn[i]))
              
print("c = ",c)

# Remove from every later list sn[j] the partners already listed in sn[i].
for i in range(len(X.T)):
    for j in range(i+1,len(X.T)):
         sn[j] = list(set(sn[j]) - set(sn[i]))
            
    # NOTE(review): j here is whatever the inner loop left behind (the last
    # column index; on the final i the inner loop is empty and j keeps its
    # previous value), so this only removes that one index from sn[i].
    # Confirm that this is the intended behaviour.
    l = []
    l.append(j)
    sn[i] = list(set(sn[i]) - set(l))      
            
# Disabled code kept as a string literal: counts the SNPs whose pair count
# reaches 25% of 5415.
'''c = 0
for i in count.keys():
    if count[i] >= 25 * 5415 /100:
        c +=1
   # if len(sn[i]) > 0:
    #    print("i = ",i)
    #    print(sn[i])
    #    print()
        
print("c = ",c)'''

In [None]:
# Cosine-similarity based selection on the full table: similarity window
# [0.3, 0.6], c = 10 percent.
snpsReduced = reduceFeatures(X,0.3,0.6,c = 10,method = 'Cosine_Similarity')

In [None]:
# NOTE(review): this uses snpsRed (set by the correlation cells), not the
# snpsReduced dict returned by reduceFeatures -- confirm which is intended.
XX = createNewTable(snpsRed,X)

In [None]:
# Sanity check: how many indices the three groups returned by reduceFeatures
# have in common, pair by pair.
snps1 = snpsReduced['snpsOutArea']
snps2 = snpsReduced['snpsRandom']
snps3 = snpsReduced['snpsInArea']

count = sum(1 for i in snps2 if i in snps1)
print("random - outArea = ",count)

count = sum(1 for i in snps2 if i in snps3)
print("random - inArea = ",count)

count = sum(1 for i in snps1 if i in snps3)
print("outArea - inArea = ",count)

In [None]:
# Alternative feature selection: drop low-variance columns of the full table.
from sklearn.feature_selection import VarianceThreshold

# Threshold value is .4 * (1 - .9) = 0.04.
sel = VarianceThreshold(threshold=(.4 * (1 - .9)))
# Overwrites xTraining with the reduced version of the full table X.
xTraining = sel.fit_transform(X)


In [None]:
# Alternative feature selection: keep the 2500 columns with the best
# chi-squared score against Y.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Overwrites xTraining with the reduced version of the full table X.
xTraining = SelectKBest(chi2, k=2500).fit_transform(X, Y)


In [None]:
# Alternative feature selection: keep the columns a random forest finds
# important.  Estimators no longer offer transform(), so the fitted forest is
# wrapped in SelectFromModel, which applies the importance threshold.
from sklearn.feature_selection import SelectFromModel

rfr = RandomForestRegressor(n_estimators = 100, random_state = 2016, verbose = 10,max_depth = None,n_jobs=1)
rfr.fit(X, Y)
xTraining = SelectFromModel(rfr, prefit=True).transform(X)
print(xTraining.shape)

In [None]:
# Disabled experiment kept as a string literal: one-hot encoding of XX.
'''enc = OneHotEncoder(n_values =3)
#enc.fit(xTraining) 
enc.fit(XX) 
#XX = enc.fit_transform(xTraining)
XX = enc.fit_transform(XX)
print(XX.toarray())
XX = XX.toarray()'''


In [None]:
# Disabled code kept as a string literal: 90/10 split of the reduced table XX.
'''xTraining1, xTest1, yTraining1, yTest = train_test_split(XX, Y, test_size=0.1, random_state=0)
print("mergex = ",XX.shape)
print("xTrain = ",xTraining1.shape)
print("xTest = ",xTest1.shape)
print("yTrain = ",yTraining1.shape)
print("yTest = ",yTest.shape)'''

# Random Forest

In [None]:
# Random forest regressor used as a classifier: its continuous output is
# snapped to the nearer of 0 and 1 before scoring.
rfr = RandomForestRegressor(n_estimators=100, random_state=2017, verbose=10,
                            max_depth=None, n_jobs=-1)
rfr.fit(xTraining1, yTraining1)
yPredict3 = rfr.predict(xTest1)

# Mean raw prediction; kept for inspection, the threshold below ignores it.
count = sum(yPredict3)
mo3 = count / len(yPredict3)

closerToZero = (np.abs(0 - yPredict3) - np.abs(1 - yPredict3)) < 1e-10
yPredict3[closerToZero] = 0
yPredict3[~closerToZero] = 1

# Scores on the held-out rows.
print(metrics.accuracy_score(yTest,yPredict3))
print(metrics.confusion_matrix(yTest,yPredict3))
error3 = mean_squared_error(yTest, yPredict3)
print("error 3 = ",error3)
RMSE3 = error3**0.5
print("RMSE3 = ",RMSE3)

fpr, tpr, thresholds = metrics.roc_curve(yTest,yPredict3)
print("AUC = ", metrics.auc(fpr,tpr))
print("recal = ",metrics.recall_score(yTest,yPredict3))
print("precision = ",metrics.precision_score(yTest,yPredict3))
print("f1Score = ",f1_score(yTest, yPredict3, average='binary'))
print()

# 10-fold scores on the full reduced table.
results = crossValidiation(XX,Y, k = 10, classifier = rfr)
for caption, key in (("accuracy = ", 'accuracy'),
                     ("AUC = ", 'auc'),
                     ("recal = ", 'recall'),
                     ("precision = ", 'precision'),
                     ("f1 = ", 'f1')):
    print(caption, results[key])

# SVM

In [None]:
# Disabled experiment kept as a string literal: SVC with default settings,
# scored on the held-out rows and with 10-fold cross-validation.
'''
#SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
 # kernel='rbf', max_iter=-1, probability=False, shrinking=True, tol=0.001,
 # verbose=False)


clf = SVC()
clf.fit(xTraining1, yTraining1)
yPredict2 = clf.predict(xTest1)
print(metrics.accuracy_score(yTest,yPredict2))
print(metrics.confusion_matrix(yTest,yPredict2))
error2 = mean_squared_error(yTest, yPredict2)
print("error 2 = ",error2)
RMSE2 = mean_squared_error(yTest,yPredict2)**0.5
print("RMSE2 = ",RMSE2)

#print("cros validation = ",crossValidiation(mergeXtable, mergeYtable, k = 10, classifier = clf))
fpr, tpr, thresholds = metrics.roc_curve(yTest,yPredict2)
print("AUC = ", metrics.auc(fpr,tpr))
print("recal = ",metrics.recall_score(yTest,yPredict2))
print("precision = ",metrics.precision_score(yTest,yPredict2))
print("f1Score = ",f1_score(yTest, yPredict2, average='binary'))
print()

results = crossValidiation(XX, Y, k = 10, classifier = clf,continious = False)
#results = crossValidiation1(mergeXtable, mergeYtable, k = 10, classifier = 'svm',area = 'snpsOutArea',continious = False)
print("accuracy = ",results['accuracy'])
print("AUC = ", results['auc'])
print("recal = ",results['recall'])
print("precision = ",results['precision'])
print("f1 = ",results['f1'])'''

In [None]:
# Disabled experiment kept as a string literal: SVC with a polynomial kernel,
# scored on the held-out rows and with 10-fold cross-validation.
'''clf = SVC(kernel ='poly')
clf.fit(xTraining1, yTraining1)
yPredict2 = clf.predict(xTest1)
print(metrics.accuracy_score(yTest,yPredict2))
print(metrics.confusion_matrix(yTest,yPredict2))
error2 = mean_squared_error(yTest, yPredict2)
print("error 2 = ",error2)
RMSE2 = mean_squared_error(yTest,yPredict2)**0.5
print("RMSE2 = ",RMSE2)

#print("cros validation = ",crossValidiation(mergeXtable, mergeYtable, k = 10, classifier = clf))
fpr, tpr, thresholds = metrics.roc_curve(yTest,yPredict2)
print("AUC = ", metrics.auc(fpr,tpr))
print("recal = ",metrics.recall_score(yTest,yPredict2))
print("precision = ",metrics.precision_score(yTest,yPredict2))
print("f1Score = ",f1_score(yTest, yPredict2, average='binary'))
print()

results = crossValidiation(XX, Y, k = 10, classifier = clf,continious = False)
#results = crossValidiation1(mergeXtable, mergeYtable, k = 10, classifier = 'svm',area = 'snpsOutArea',continious = False)
print("accuracy = ",results['accuracy'])
print("AUC = ", results['auc'])
print("recal = ",results['recall'])
print("precision = ",results['precision'])
print("f1 = ",results['f1'])'''

# SVM (linear kernel)

In [None]:

# Linear-kernel SVM: fit on the reduced training rows, score on the hold-out.
clf = SVC(kernel='linear')
clf.fit(xTraining1, yTraining1)
yPredict2 = clf.predict(xTest1)

print(metrics.accuracy_score(yTest,yPredict2))
print(metrics.confusion_matrix(yTest,yPredict2))
error2 = mean_squared_error(yTest, yPredict2)
print("error 2 = ",error2)
RMSE2 = error2**0.5
print("RMSE2 = ",RMSE2)

fpr, tpr, thresholds = metrics.roc_curve(yTest,yPredict2)
print("AUC = ", metrics.auc(fpr,tpr))
print("recal = ",metrics.recall_score(yTest,yPredict2))
print("precision = ",metrics.precision_score(yTest,yPredict2))
print("f1Score = ",f1_score(yTest, yPredict2, average='binary'))
print()

# 10-fold scores on the full reduced table; predictions are already labels.
results = crossValidiation(XX, Y, k = 10, classifier = clf, continious = False)
for caption, key in (("accuracy = ", 'accuracy'),
                     ("AUC = ", 'auc'),
                     ("recal = ", 'recall'),
                     ("precision = ", 'precision'),
                     ("f1 = ", 'f1')):
    print(caption, results[key])

In [None]:
#re = cross_val_score(clf, XX, Y, cv=10)
#print(sum(re)/10)

# LINEAR LOGISTIC REGRESSION 

In [None]:
# Logistic regression: fit on the reduced training rows, score on the hold-out.
lr_clf = linear_model.LogisticRegression()
lr_clf.fit(xTraining1, yTraining1)
yPredict4 = lr_clf.predict(xTest1)

print(metrics.accuracy_score(yTest,yPredict4))
print(metrics.confusion_matrix(yTest,yPredict4))
error4 = mean_squared_error(yTest, yPredict4)
print("error 4 = ",error4)
RMSE4 = error4**0.5
print("RMSE4 = ",RMSE4)

fpr, tpr, thresholds = metrics.roc_curve(yTest,yPredict4)
print("AUC = ", metrics.auc(fpr,tpr))
print("recal = ",metrics.recall_score(yTest,yPredict4))
print("precision = ",metrics.precision_score(yTest,yPredict4))
print("f1Score = ",f1_score(yTest, yPredict4, average='binary'))
print()

# 10-fold scores on the full reduced table; predictions are already labels.
results = crossValidiation(XX, Y, k = 10, classifier = lr_clf, continious = False)
for caption, key in (("accuracy = ", 'accuracy'),
                     ("AUC = ", 'auc'),
                     ("recal = ", 'recall'),
                     ("precision = ", 'precision'),
                     ("f1 = ", 'f1')):
    print(caption, results[key])

In [None]:
#re = cross_val_score(lr_clf, XX, Y, cv=10)
#print(sum(re)/10)

# Linear Perceptron

In [None]:
# Perceptron with an L1 penalty: fit on the reduced training rows, score on
# the hold-out.
# NOTE(review): newer scikit-learn releases take `max_iter` instead of
# `n_iter` -- confirm against the installed version.
# NOTE(review): with warm_start=True every later fit() starts from the
# previous coefficients, so the cross-validation folds below are presumably
# not independent of each other -- confirm this is intended.
perceptron = linear_model.Perceptron(
    penalty='l1', alpha=0.00000001, fit_intercept=True, n_iter=100,
    shuffle=True, verbose=2016, eta0=0.00000001, n_jobs=-1,
    random_state=2016, warm_start=True)

perceptron.fit(xTraining1, yTraining1)
yPredict4 = perceptron.predict(xTest1)

print(metrics.accuracy_score(yTest,yPredict4))
print(metrics.confusion_matrix(yTest,yPredict4))
error4 = mean_squared_error(yTest, yPredict4)
print("error 4 = ",error4)
RMSE4 = error4**0.5
print("RMSE4 = ",RMSE4)

fpr, tpr, thresholds = metrics.roc_curve(yTest,yPredict4)
print("AUC = ", metrics.auc(fpr,tpr))
print("recal = ",metrics.recall_score(yTest,yPredict4))
print("precision = ",metrics.precision_score(yTest,yPredict4))
print("f1Score = ",f1_score(yTest, yPredict4, average='binary'))
print()

# 10-fold scores on the full reduced table; predictions are already labels.
results = crossValidiation(XX, Y, k = 10, classifier = perceptron, continious = False)
for caption, key in (("accuracy = ", 'accuracy'),
                     ("AUC = ", 'auc'),
                     ("recal = ", 'recall'),
                     ("precision = ", 'precision'),
                     ("f1 = ", 'f1')):
    print(caption, results[key])

In [None]:
#re = cross_val_score(perceptron, XX, Y, cv=10)
#print(sum(re)/10)

In [None]:
# Gaussian naive Bayes: fit on the reduced training rows, score on the hold-out.
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(xTraining1, yTraining1)
yPredict6 = gnb.predict(xTest1)

print(metrics.accuracy_score(yTest,yPredict6))
print(metrics.confusion_matrix(yTest,yPredict6))
error6 = mean_squared_error(yTest, yPredict6)
print("error 6 = ",error6)
RMSE6 = error6**0.5
print("RMSE6 = ",RMSE6)

fpr, tpr, thresholds = metrics.roc_curve(yTest,yPredict6)
print("AUC = ", metrics.auc(fpr,tpr))
print("recal = ",metrics.recall_score(yTest,yPredict6))
print("precision = ",metrics.precision_score(yTest,yPredict6))
print("f1Score = ",f1_score(yTest, yPredict6, average='binary'))
print()

# 10-fold scores on the full reduced table; predictions are already labels.
results = crossValidiation(XX, Y, k = 10,classifier = gnb, continious = False)
for caption, key in (("accuracy = ", 'accuracy'),
                     ("AUC = ", 'auc'),
                     ("recal = ", 'recall'),
                     ("precision = ", 'precision'),
                     ("f1 = ", 'f1')):
    print(caption, results[key])

In [None]:
#re = cross_val_score(gnb, XX, Y, cv=10)
#print(sum(re)/10)

# Decision Tree

In [None]:
# Disabled experiment kept as a string literal: decision-tree classifier,
# scored on the held-out rows and with 10-fold cross-validation.
'''
dt = tree.DecisionTreeClassifier()
dt = dt.fit(xTraining1, yTraining1)

yPredict7 = dt.predict(xTest1)
print(metrics.accuracy_score(yTest,yPredict7))
print(metrics.confusion_matrix(yTest,yPredict7))
error6 = mean_squared_error(yTest, yPredict7)
print("error 6 = ",error6)
RMSE6 = mean_squared_error(yTest,yPredict7)**0.5
print("RMSE6 = ",RMSE6)

results = crossValidiation(XX,Y, k = 10, classifier = dt, continious = False)
#results = crossValidiation1(mergeXtable, mergeYtable, k = 10, classifier = 'dt',area = 'snpsOutArea',continious = False)
print("accuracy = ",results['accuracy'])
print("AUC = ", results['auc'])
print("recal = ",results['recall'])
print("precision = ",results['precision'])
print("f1 = ",results['f1'])'''