In [None]:
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import numpy as np
import os.path
import csv
from sklearn.svm import SVC
from sklearn.svm import NuSVC
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn import grid_search
import sklearn.linear_model as linear_model
import sklearn.metrics as metrics
from sklearn import preprocessing
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from random import randint
from sklearn import tree
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

class PatientPhenotype:
    """Phenotype record of one patient together with the SNPs attached to it.

    All constructor arguments are strings as read from the input file; they
    are stripped of surrounding whitespace and ``case`` is converted to int.
    """

    def __init__(self, eid, case, sex, yearBirth):
        self.eid = eid.strip()
        self.case = int(case.strip())
        self.sex = sex.strip()
        self.yearBirth = yearBirth.strip()
        # Maps SNP id -> Snp object; populated through addSnps().
        self.snps = {}

    def getEid(self):
        """Return the patient identifier."""
        return self.eid

    def getCase(self):
        """Return the case/control label as an int."""
        return self.case

    def getSex(self):
        """Return the sex field as read from the file."""
        return self.sex

    def getYearBirth(self):
        """Return the year-of-birth field as read from the file (a string)."""
        return self.yearBirth

    def addSnps(self, snpId, allele1, allele2):
        """Attach (or replace) the genotype of ``snpId`` for this patient."""
        self.snps[snpId] = Snp(snpId, allele1, allele2)

    def snpCode(self, chromosomes=None, snp='', code=-1):
        """Set the numeric code of the patient's SNPs.

        With a non-empty ``chromosomes`` mapping
        (``{'chr1': {snpId: [allele1, allele2]}, ...}``) every SNP listed in
        ``chr1`` .. ``chrN`` (N = number of keys) is coded from those
        alleles via ``Snp.setSnpCode``.  Otherwise the single SNP ``snp`` gets
        ``code`` assigned directly.

        Raises:
            KeyError: if a referenced SNP was never added to this patient, or
                if the chromosome keys are not ``chr1`` .. ``chrN``.
        """
        # ``None`` replaces the former mutable ``{}`` default; an explicit
        # empty dict still selects the single-SNP branch, as before.
        if chromosomes:
            for index in range(len(chromosomes)):
                chro = 'chr' + str(index + 1)
                for snpId, alleles in chromosomes[chro].items():
                    self.snps[snpId.strip()].setSnpCode(alleles[0].strip(),
                                                        alleles[1].strip())
        else:
            self.snps[snp.strip()].setCode(code)

    def getSnpCode(self, snpId):
        """Return the numeric code stored for ``snpId``."""
        return self.snps[snpId].getSnpCode()

    def getAllele1(self, snpId):
        """Return the first allele stored for ``snpId``."""
        return self.snps[snpId].getAllele1()

    def getAllele2(self, snpId):
        """Return the second allele stored for ``snpId``."""
        return self.snps[snpId].getAllele2()

    def getSize(self):
        """Return the number of SNPs attached to this patient."""
        return len(self.snps)
        
        
class Snp:
    """Genotype of one SNP for one patient (two alleles plus numeric codes)."""

    def __init__(self, snpId, allele1, allele2):
        self.snpId = snpId
        self.allele1 = allele1
        self.allele2 = allele2
        # -1 marks "not coded yet" for both encodings.
        self.snpCode = -1
        self.snpCode1 = -1

    def getId(self):
        """Return the SNP identifier."""
        return self.snpId

    def getAllele1(self):
        """Return the first allele as stored."""
        return self.allele1

    def getAllele2(self):
        """Return the second allele as stored."""
        return self.allele2

    def setSnpCode(self, allele1, allele2):
        """Store how many of the patient's two alleles equal ``allele1``.

        The result (0, 1 or 2) is written to ``snpCode``.  ``allele2`` is
        accepted for interface compatibility but is not used.
        """
        reference = allele1.strip()
        matches = int(self.allele1.strip() == reference)
        matches += int(self.allele2.strip() == reference)
        self.snpCode = matches

    def setSnpCode1(self, allele1, allele2):
        """Store the alternative position-wise code in ``snpCode1``.

        * first allele matches, second differs  -> 1
        * both alleles match                    -> 0
        * first differs, second matches         -> -1
        * neither matches                       -> ``snpCode1`` is left as is

        The previous implementation read a non-existent ``allele12``
        attribute (always raising AttributeError) and left the result
        unbound in the last case.
        """
        firstMatches = self.allele1.strip() == allele1.strip()
        secondMatches = self.allele2.strip() == allele2.strip()

        if firstMatches and not secondMatches:
            self.snpCode1 = 1
        elif firstMatches and secondMatches:
            self.snpCode1 = 0
        elif secondMatches:
            self.snpCode1 = -1

    def setCode(self, aCode):
        """Assign ``snpCode`` directly (used when codes are read from file)."""
        self.snpCode = aCode

    def getSnpCode(self):
        """Return the current ``snpCode``."""
        return self.snpCode
    
class Read:
    """Loads patients, per-chromosome SNP lists and genotypes from ``path``.

    ``path`` is used as a plain string prefix (file names are appended to it),
    so it must end with a path separator.  All readers are best-effort: an
    I/O or parsing problem is printed as ``error =  ...`` and whatever was
    read up to that point is returned.
    """

    def __init__(self, path, numberOfChromosomes):
        # {'chr1': {snpId: [allele1, allele2]}, ...}; filled by readSnps().
        self.chromosomes = {}
        self.numberOfSnps = 0
        self.path = path
        self.numberOfChromosomes = numberOfChromosomes

    def readPatients(self, kind):
        """Read the patient table ``path + kind`` (first line is a header).

        Columns are whitespace separated: eid, sex, year of birth, case.
        Returns a dict ``eid -> PatientPhenotype``; reading stops at the
        first malformed line.
        """
        patients = {}

        try:
            # ``with`` closes the file on every path; the old code called
            # f.close() on an unbound name when open() itself failed.
            with open(self.path + kind, 'r') as f:
                f.readline()
                for line in f:
                    fields = line.split()
                    patients[fields[0].strip()] = PatientPhenotype(fields[0], fields[3],
                                                                   fields[1], fields[2])
        except Exception as x:
            print("error = ", x)

        return patients

    def readSnps(self, fileKind):
        """Read ``path + 'chrN' + fileKind`` for every chromosome.

        Returns ``self.chromosomes``.  A chromosome whose file cannot be
        opened or parsed is reported and left out of the mapping.
        """
        for i in range(self.numberOfChromosomes):
            chro = 'chr' + str(i + 1)

            try:
                with open(self.path + chro + fileKind, 'r') as f:
                    f.readline()  # skip the header line
                    self.chromosomes[chro] = self.__readSnpsOfChromosome(f)
            except Exception as x:
                print("error = ", x)

        return self.chromosomes

    def __readSnpsOfChromosome(self, file):
        """Parse one open association file into ``{snpId: [allele1, allele2]}``.

        Column 1 is the SNP id, columns 3 and 6 are the two alleles
        (0-based, whitespace separated).  Rows whose id is ``'.'`` are
        skipped.  A row with too few columns raises IndexError, which the
        caller reports.
        """
        snps = {}

        for line in file:
            fields = line.split()
            alleles = [fields[3].strip(), fields[6].strip()]
            snpId = fields[1].strip()

            if snpId != '.':
                snps[snpId] = alleles
                self.numberOfSnps += 1

        return snps

    def readLgen(self, patients, kind=''):
        """Attach genotypes from ``path + 'chrN' + kind + '.lgen'`` files.

        Each line holds: eid, (ignored), SNP id, allele 1, allele 2.  Lines
        for unknown patients are ignored; missing files are skipped.  A
        malformed line is reported and skipped (previously the file was
        closed inside the loop, which aborted the rest of that file with a
        second error).
        """
        for i in range(self.numberOfChromosomes):
            chro = 'chr' + str(i + 1)
            path = self.path + chro + kind + '.lgen'

            if not os.path.exists(path):
                continue

            try:
                with open(path, 'r') as f:
                    for line in f:
                        try:
                            fields = line.split()
                            eid = fields[0].strip()
                            if eid in patients:
                                patients[eid].addSnps(fields[2].strip(), fields[3].strip(),
                                                      fields[4].strip())
                        except Exception as x:
                            print("error = ", x)
            except Exception as x:
                print("error = ", x)

        return patients

    def getListOfSnps(self):
        """Return all SNP ids in chromosome order (chr1 .. chrN).

        Chromosomes that could not be loaded contribute nothing instead of
        raising KeyError.
        """
        snps = []
        for i in range(self.numberOfChromosomes):
            chro = 'chr' + str(i + 1)
            snps.extend(self.chromosomes.get(chro, {}).keys())

        return snps

    def getNumberOfSnps(self):
        """Return how many SNP rows have been accepted by readSnps() so far."""
        return self.numberOfSnps

    def readSnpsCode(self, patients, kind=''):
        """Restore genotypes and codes from ``path + kind + 'snpCode.txt'``.

        The first two lines are headers; every other line is tab separated:
        patient, SNP id, code, allele 1, allele 2.  Rows for unknown
        patients or with SNP id ``'.'`` are ignored; malformed rows are
        reported and skipped.
        """
        try:
            with open(self.path + kind + 'snpCode.txt', 'r') as read:
                read.readline()
                read.readline()
                print("mphka2")
                for line in read:
                    try:
                        fields = line.split('\t')
                        patient = fields[0].strip()
                        snp = fields[1].strip()
                        code = int(fields[2].strip())
                        allele1 = fields[3].strip()
                        allele2 = fields[4].strip()
                        if patient in patients and snp != '.':
                            patients[patient].addSnps(snp, allele1, allele2)
                            patients[patient].snpCode(snp=snp, code=code)
                    except Exception as x:
                        print("error = ", x)
        except Exception as x:
            print("error = ", x)

        return patients
            
        
    
class Write:
    """Writes patient lists, SNP lists and coded genotypes below ``path``.

    ``path`` is used as a plain string prefix, so it must end with a path
    separator.  Errors while writing are printed as ``error =  ...``.
    """

    def __init__(self, path, numberOfChromosomes):
        self.path = path
        self.numberOfChromosomes = numberOfChromosomes

    def writePatientsList(self, patients, kind):
        """Write one patient id per line to ``path + kind``."""
        try:
            # ``with`` also covers a failing open(); the old handler called
            # close() on a name that was never bound in that case.
            with open(self.path + kind, 'w') as out:
                for patient in patients.keys():
                    out.write(patient.strip() + '\n')
        except Exception as x:
            print("error = ", x)

    def writeSnpsList(self, chromosomes):
        """Write ``path + 'chrN' + 'snpList.txt'`` for chr1 .. chrN."""
        for i in range(self.numberOfChromosomes):
            chro = 'chr' + str(i + 1)
            try:
                with open(self.path + chro + 'snpList.txt', 'w') as out:
                    for snp in chromosomes[chro].keys():
                        out.write(snp.strip() + '\n')
            except Exception as x:
                print("error = ", x)

    def writeSnpsUsed(self, snpsIds, idToName, chromosomes, name=None):
        """Write the selected SNPs grouped by chromosome to ``path + name``.

        ``snpsIds`` are numeric ids that ``idToName`` maps to SNP names.  For
        every chromosome containing at least one selected SNP a block
        ``chromosome<N>``, the SNP names and an empty line is written.
        Nothing is written when ``name`` is missing or the file exists.
        """
        if not name:
            print("give a name to file")
            return

        path = self.path + name

        if os.path.exists(path):
            print("the file already exists........ give another name")
            return

        snps = [idToName[i] for i in snpsIds]

        print("snpsIds = ", len(snpsIds))
        print("idToName = ", len(idToName))

        with open(path, 'w') as out:
            try:
                # Uses the configured chromosome count instead of a fixed 22.
                for i in range(1, self.numberOfChromosomes + 1):
                    chro = 'chr' + str(i)
                    chromSnps = chromosomes[chro]
                    used = [snp for snp in snps if snp in chromSnps]

                    if used:
                        # str(i): the old code added an int to a str here and
                        # therefore never wrote a single block.
                        out.write("chromosome" + str(i) + '\n')
                        for snp in used:
                            out.write(snp + '\n')
                        out.write('\n')
            except Exception as x:
                print("error = ", x)

    def saveData(self, ids, patients, patientKind, data, chroms=None):
        """Dump the coded genotype table to ``path + patientKind + 'snpCode.txt'``.

        ``chroms`` is accepted for compatibility and currently unused.
        """
        self.__snpCodeLog(ids['patients']['idToName'], ids['snps']['idToName'],
                          patients, data, patientKind)

    def __patientsLogFile(self, ids, patientKind):
        """Write the patient name -> numeric id table (count on line one)."""
        with open(self.path + patientKind + 'Ids.txt', 'w') as out:
            out.write(str(len(ids['nameToId'])) + '\n')

            for patient in ids['nameToId'].keys():
                out.write(patient.strip() + '\t' + str(ids['nameToId'][patient]).strip() + '\n')

    def __snpsLogFile(self, ids, chroms):
        """Write the SNP name -> numeric id table; no file if ``chroms`` is empty."""
        if len(chroms.keys()) > 0:
            with open(self.path + 'SnpsIds.txt', 'w') as out:
                out.write(str(len(ids['nameToId'])) + '\n')

                for chro in chroms.keys():
                    for snp in chroms[chro].keys():
                        out.write(snp.strip() + '\t' + str(ids['nameToId'][snp.strip()]).strip() + '\n')

    def __snpCodeLog(self, patientsIds, snpsIds, patients, data, patientKind):
        """Write two count lines, then one tab-separated row per table cell.

        Row layout: patient, SNP, code (``data[i, j]``), allele 1, allele 2.
        ``patientsIds`` / ``snpsIds`` map row / column index to names.
        """
        with open(self.path + patientKind + 'snpCode.txt', 'w') as out:
            out.write(str(len(patientsIds)) + '\n')
            out.write(str(len(snpsIds)) + '\n')

            for i in range(len(data)):
                for j in range(len(data.T)):
                    allele1 = patients[patientsIds[i]].getAllele1(snpsIds[j])
                    allele2 = patients[patientsIds[i]].getAllele2(snpsIds[j])
                    out.write(patientsIds[i].strip() + '\t' + snpsIds[j].strip() + '\t'
                              + str(data[i, j]).strip() + '\t'
                              + allele1.strip() + '\t' + allele2.strip() + '\n')
        
        
            
class DataSet:
    """Dense genotype matrix and label vector built from patient objects.

    ``ids`` holds, for both ``'patients'`` and ``'snps'``, the two mappings
    ``'nameToId'`` and ``'idToName'``.  Row ``i`` / column ``j`` of the table
    correspond to ``idToName[i]`` of the patients / SNPs respectively.
    """

    def __init__(self, patients, ids):
        self.n = len(ids['patients']['nameToId'].keys())
        self.m = len(ids['snps']['nameToId'].keys())
        self.patients = patients
        self.ids = ids

        # Every cell starts as -1 and is then overwritten by the fill step.
        self.xTable = np.full((self.n, self.m), -1, dtype=int)
        self.yTable = np.zeros((self.n,), dtype=int)

        self.__fillXTable()
        self.__fillYTable()

    def __fillXTable(self):
        """Copy each patient's SNP code into its (row, column) cell."""
        for row, col in np.ndindex(self.n, self.m):
            patientName = self.ids['patients']['idToName'][row]
            snpName = self.ids['snps']['idToName'][col]
            self.xTable[row, col] = self.patients[patientName].getSnpCode(snpName)

    def __fillYTable(self):
        """Copy each patient's case label into the label vector."""
        for row in range(self.n):
            patientName = self.ids['patients']['idToName'][row]
            self.yTable[row] = self.patients[patientName].getCase()

    def getXTable(self):
        """Return the (patients x SNPs) integer code matrix."""
        return self.xTable

    def getYTable(self):
        """Return the integer label vector, one entry per patient."""
        return self.yTable
    
    
class PCA:
    """Hand-written principal component analysis of the matrix ``X``.

    The columns of ``X`` are mean-centred, the scatter matrix
    ``centred.T @ centred`` is eigen-decomposed, and the leading eigenvectors
    are kept until their eigenvalues cover a fraction ``error`` of the total.

    NOTE: this class shadows ``sklearn.decomposition.PCA`` imported at the
    top of the file.
    """

    def __init__(self, X, error=0.9):
        self.matrix = X
        self.n = len(X)
        self.m = len(X.T)
        self.eigenValues = None
        self.eigenVectors = None
        self.F = None
        self.finalData = None
        # Positions (columns of eigenVectors) of the kept eigenvectors.
        self.selectedSnps = []
        # eigenvalue -> list of positions at which it occurs.
        self.eigenValuesPos = {}
        self.kEigenVectors = 0
        self.error = error
        self.meanMatrix = np.zeros((self.m, 1), dtype=float)
        self.normalizedMatrix = np.zeros((self.n, self.m), dtype=float)
        self.covMatrix = None

        self.__calculateMeanMatrix()
        self.__normalizeMatrix()
        self.__calculateCovMatrix()
        self.__calculateEigenValuesAndEigenVectors()
        self.__findKEigenVectors()
        self.__createFMatrix()
        self.__createFinalMatrix()

    def __calculateMeanMatrix(self):
        """Store the per-column means as an (m, 1) float array."""
        data = np.asarray(self.matrix, dtype=float)
        self.meanMatrix = data.mean(axis=0).reshape(self.m, 1)

    def __normalizeMatrix(self):
        """Subtract the column means from every row."""
        data = np.asarray(self.matrix, dtype=float)
        self.normalizedMatrix = data - self.meanMatrix.reshape(1, self.m)

    def __calculateCovMatrix(self):
        """Compute the (unscaled) scatter matrix of the centred data."""
        self.covMatrix = self.normalizedMatrix.T.dot(self.normalizedMatrix)
        print("cov = ", self.covMatrix.shape)

    def __calculateEigenValuesAndEigenVectors(self):
        """Eigen-decompose the scatter matrix and sort eigenvalues descending.

        ``eigenValuesPos`` remembers at which column of ``eigenVectors`` each
        eigenvalue lives, so the sorted list can be mapped back later.
        """
        eigen = np.linalg.eigh(self.covMatrix)
        self.eigenValues = eigen[0]
        self.eigenVectors = eigen[1]

        for pos, value in enumerate(self.eigenValues):
            self.eigenValuesPos.setdefault(value, []).append(pos)

        print("values = ", self.eigenValues)
        print("pos1 = ", self.eigenValuesPos[self.eigenValues[0]])

        self.eigenValues = sorted(self.eigenValues, reverse=True)
        print("values after = ", self.eigenValues)
        print("pos2 = ", self.eigenValuesPos[self.eigenValues[0]])

    def __findKEigenVectors(self):
        """Pick the number of leading eigenvalues covering ``error`` of the sum."""
        sumOfEigenValues = sum(self.eigenValues)
        summ = 0
        count = 0

        # The bound on ``count`` stops the loop once every eigenvalue has been
        # taken; without it ``error >= 1`` indexed past the end of the list.
        while (count < len(self.eigenValues)
               and (summ / sumOfEigenValues) - self.error < 1e-10):
            summ = summ + self.eigenValues[count]
            count += 1

        print("count is ", count)
        print("sum of eigenValues = ", sumOfEigenValues)
        print("sum is ", summ)

        self.kEigenVectors = count

    def __createFMatrix(self):
        """Assemble the (m, k) projection matrix from the kept eigenvectors."""
        self.F = np.zeros((self.m, self.kEigenVectors), dtype=float)

        for i in range(self.kEigenVectors):
            # Consume positions one by one so repeated eigenvalues map to
            # distinct eigenvectors.
            pos = self.eigenValuesPos[self.eigenValues[i]].pop(0)
            self.selectedSnps.append(pos)
            self.F[:, i] = self.eigenVectors[:, pos]

    def __createFinalMatrix(self):
        """Project the data onto the kept eigenvectors."""
        # NOTE(review): the original (un-centred) matrix is projected here,
        # not normalizedMatrix — confirm that this is intended.
        self.finalData = self.matrix.dot(self.F)

    def getSelectedSnps(self):
        """Return the eigenvector positions that were kept, in rank order."""
        return self.selectedSnps

    def getFinalData(self):
        """Return the projected data, shape (n, k)."""
        return self.finalData



def setIdToName(aList):
    """Number the items of ``aList`` consecutively from 0.

    Returns a dict with two mappings: ``'nameToId'`` (item -> index) and
    ``'idToName'`` (index -> item).
    """
    nameToId = {}
    idToName = {}

    for index, name in enumerate(aList):
        nameToId[name] = index
        idToName[index] = name

    return {'nameToId': nameToId, 'idToName': idToName}


def setSnpsCode(patients, chromosomes):
    """Ask every patient to code its SNPs against ``chromosomes``.

    Calls ``snpCode(chromosomes)`` on each value of ``patients`` and returns
    the same ``patients`` mapping.
    """
    for patient in patients.values():
        patient.snpCode(chromosomes)

    return patients
    



def tables(sampleX, sampleY, k):
    """Turn ``k`` folds into ``k`` train/test splits.

    ``sampleX`` / ``sampleY`` map fold number (1 .. k) to that fold's feature
    rows and labels.  For every ``run`` the returned dict holds
    ``{'trainX', 'trainY', 'testX', 'testY'}`` where the test part is fold
    ``run`` and the training part is all other folds stacked in key order.

    The training arrays keep the dtype of the folds (they used to be forced
    to ``int``, which silently truncated float features).
    """
    samples = {}

    for run in range(1, k + 1):
        otherFolds = [fold for fold in sampleX if fold != run]

        if otherFolds:
            dataTrainX = np.concatenate(
                [np.asarray(sampleX[fold]) for fold in otherFolds], axis=0)
            # Labels are cut to the fold's row count, as the row-wise copy did.
            dataTrainY = np.concatenate(
                [np.asarray(sampleY[fold])[:len(sampleX[fold])] for fold in otherFolds],
                axis=0)
        else:
            # Single fold: there is nothing to train on.
            testX = np.asarray(sampleX[run])
            dataTrainX = np.zeros((0,) + testX.shape[1:], dtype=testX.dtype)
            dataTrainY = np.zeros((0,), dtype=np.asarray(sampleY[run]).dtype)

        samples[run] = {
            'trainX': dataTrainX,
            'trainY': dataTrainY,
            'testX': sampleX[run],
            'testY': sampleY[run],
        }

    return samples


def kSampleData(k, X, Y):
    """Randomly partition ``(X, Y)`` into ``k`` folds and build the splits.

    Every row is assigned to exactly one fold.  The first ``k - 1`` folds get
    ``int(len(X) / k)`` rows, the last one gets the remainder.  Row order is
    drawn from the ``random`` module, so ``random.seed`` makes it repeatable.

    Returns the structure produced by ``tables``.  Raises ZeroDivisionError
    for ``k == 0``.
    """
    import random

    X = np.asarray(X)
    Y = np.asarray(Y)

    total = len(X)
    foldSize = int(total / k)

    # One shuffle replaces drawing random indices until an unused one turns up.
    order = random.sample(range(total), total)

    sampleX = {}
    sampleY = {}
    start = 0

    for fold in range(1, k + 1):
        size = foldSize if fold < k else total - (k - 1) * foldSize
        chosen = order[start:start + size]
        start += size

        # Fancy indexing keeps the dtype of X / Y instead of casting to int.
        sampleX[fold] = X[chosen]
        sampleY[fold] = Y[chosen]

    return tables(sampleX, sampleY, k)


def calculateJaccardSim(X):
    """Return the pairwise column similarity matrix of ``X``.

    Entry ``[i, j]`` is the fraction of rows in which columns ``i`` and ``j``
    hold the same value; the diagonal is 1.0.  The result is symmetric with
    shape ``(n_columns, n_columns)``.

    This is the value ``sklearn.metrics.jaccard_similarity_score`` returned
    for two 1-D label vectors.  It is computed with NumPy because that
    function is no longer available in current scikit-learn releases.
    """
    X = np.asarray(X)
    nColumns = len(X.T)
    xNew = np.zeros((nColumns, nColumns), dtype=float)

    for i in range(nColumns):
        # Compare column i against all columns at once (one row of the result).
        xNew[i, :] = (X[:, [i]] == X).mean(axis=0)
        xNew[i, i] = 1.0

    return xNew

def reduceFeatures(X,a,b,c = 10,method = 'Cosine_Similarity'):
    """Select column (SNP) indices of ``X`` from pairwise similarities.

    Parameters:
        X: 2-D array.  For ``'Cosine_Similarity'`` its columns are the
           features to compare; for ``'jaccard1'`` it is used directly as the
           pairwise matrix (presumably the output of calculateJaccardSim —
           verify against caller).
        a, b: similarity bounds.  ``b`` is only read in the cosine branch.
        c: percentage used in the count thresholds below.
        method: ``'Cosine_Similarity'`` or ``'jaccard1'``.

    Returns:
        dict with the index lists ``'snpsOutArea'``, ``'snpsInArea'`` and
        ``'snpsRandom'``.  Any other ``method`` returns an empty dict.
    """
    
    snpsOut = []
    snpsIn = []
    snps1 = []
    snps2 = []
    snpsRandom = []
    
    snpsReturn = {}
    
    if method == 'Cosine_Similarity':
        
        # snpsCount[i]: number of pairs involving column i whose similarity
        # lies inside the [a, b] band.
        snpsCount = {}
        print("size = ",len(X.T))
        for i in range(len(X.T)):
            
            snpsCount[i] = 0
        
        # Cosine similarity between the columns of X.
        xNew = metrics.pairwise.cosine_similarity(X.T)
        print("xNew shape ",xNew.shape)
       
        # Only the upper triangle is visited; both members of a pair are counted.
        for i in range(len(xNew)):
            for j in range(i+1,len(xNew.T)):
                    
                # In band: similarity >= a + 1e-10 and similarity <= b + 1e-10.
                if ((xNew[i,j] - a >= 1e-10) and (xNew[i,j] - b) <= 1e-10):
       
                    snpsCount[j] = snpsCount[j] + 1
                    snpsCount[i] = snpsCount[i] + 1
                    
        # Number of columns that have at least one in-band pair.
        countBigThanZero = 0  
       
        
        for i in snpsCount.keys():
            if snpsCount[i] > 0:
                countBigThanZero += 1
        
         
        for i in snpsCount.keys():
            
            # snpsIn: columns with at least one in-band pair.
            if snpsCount[i] > 0:
                snpsIn.append(i)
            
            # snps1: columns with more in-band pairs than c percent of
            # countBigThanZero.
            if snpsCount[i] > countBigThanZero * c / 100:           
                snps1.append(i)
        
        # snps2: every column that is not in snps1.
        for i in range(len(X.T)):
            if i not in snps1:
                snps2.append(i)
                
        # snpsOut: members of snps2 that have no other member of snps2 with
        # similarity >= 0.99 (1 - similarity <= 1e-2).
        for i in snps2:
            flag = 0
            for j in snps2:
                 if (1 - xNew[i,j]) <= 1e-2 and j != i:
                        flag = 1
                        break
            if flag == 0:
                snpsOut.append(i)
                
                
        # Marks columns that were already chosen (snpsOut or a random draw).
        snpsSelected = np.zeros((len(X.T),1),dtype = int)
     
        for i in snpsOut:
            snpsSelected[i] = 1
        
        # The loop below runs size - 1 times: as many draws as snpsOut has
        # entries when snpsOut covers less than half of the columns, none when
        # it covers all of them, otherwise one draw per remaining column.
        if len(snpsOut) < len(X.T)/2:
            size = len(snpsOut) + 1
        elif len(snpsOut) == len(X.T):
            size = 0
        else:
            size = len(X.T) - len(snpsOut) + 1
        
        for i in range(1,size):
            
            aRand = randint(0,len(X.T)-1)
    
            # Redraw until an unmarked column is hit.
            while(snpsSelected[aRand] == 1):

                aRand = randint(0,len(X.T)-1)
                
            snpsSelected[aRand] = 1
            snpsRandom.append(aRand)
        
       
        
        print("snps = ",len(snpsOut))
        print("len snpsIn = ",len(snpsIn))
        print("len snpsRandom = ",len(snpsRandom))
        
        snpsReturn['snpsOutArea'] = snpsOut
        snpsReturn['snpsInArea'] = snpsIn
        snpsReturn['snpsRandom'] = snpsRandom
        
    elif method == 'jaccard1':
    
        # snpsPairs[i]: number of pairs involving index i whose matrix value
        # is at most a (+ 1e-10).
        snpsPairs = {}
        print("size = ",len(X.T))
        for i in range(len(X.T)):
            snpsPairs[i] = 0
            
        
        # X itself is read as the pairwise matrix; nothing is computed here.
        xNew = X
        print("xNew shape ",xNew.shape)
       
        for i in range(len(xNew)):
            for j in range(i+1,len(xNew.T)):
                    
                if (xNew[i,j] - a <= 1e-10) :
                    
                    snpsPairs[i] = snpsPairs[i] + 1
                    snpsPairs[j] = snpsPairs[j] + 1
                    
                
        # Keep indices taking part in at least c percent of len(X.T) such pairs.
        for i in snpsPairs.keys():
            if snpsPairs[i] >= c * len(X.T) / 100:
                snpsOut.append(i)
       
                   
        
        
        print("snps = ",len(snpsOut))
        print("len snpsIn = ",len(snpsIn))
        print("len snpsRandom = ",len(snpsRandom))
        
        # snpsIn and snpsRandom are never filled in this branch (empty lists).
        snpsReturn['snpsOutArea'] = snpsOut
        snpsReturn['snpsInArea'] = snpsIn
        snpsReturn['snpsRandom'] = snpsRandom
        
    
            
    return snpsReturn


def createNewTable(snps, X):
    """Return the columns of ``X`` listed in ``snps`` as a new integer table.

    Column ``i`` of the result is column ``snps[i]`` of ``X``; the result has
    one row per row of ``X`` and is cast to ``int``.  The new shape is
    printed before returning.
    """
    newX = X[:, list(snps)].astype(int)

    print("new shape = ", newX.shape)

    return newX



def crossValidiation(X, Y, k = 1, continious = True, classifier = None,OLS = False,Logistic = False):
    """Run k-fold cross-validation and return the averaged binary metrics.

    Parameters:
        X, Y: feature matrix and 0/1 label vector.
        k: number of folds (built by ``kSampleData``).  With the default
           ``k = 1`` the single fold is the test set and the training set is
           empty, so pass ``k >= 2`` in practice.
        continious: if true, predictions are rounded to 0/1 at 0.5
           (values up to 0.5 become 0).
        classifier: estimator with ``fit`` / ``predict``; required even when
           ``OLS`` is set.
        OLS: fit ``statsmodels`` OLS on each fold instead of ``classifier``.
        Logistic: overwrite the predictions with
           ``predict_proba(...)[:, 1] >= 0.8``.

    Returns:
        dict with the fold averages ``'cross'`` (accuracy), ``'auc'``,
        ``'recall'``, ``'precision'`` and ``'f1'``; ``None`` (after printing
        a message) when no classifier was given.
    """
    # ``is None`` instead of truthiness: ensemble estimators implement
    # __len__, so ``not classifier`` fails on an unfitted forest.
    if classifier is None:
        print("wrong!!!!!!! you have to choise a classifier")
        return

    results = {}
    crossVal = {}
    auc = {}
    recall = {}
    precision = {}
    f1Score = {}

    samples = kSampleData(k,X,Y)

    for run in range(1, k + 1):

        trainX = samples[run]['trainX']
        trainY = samples[run]['trainY']

        testX = samples[run]['testX']
        testY = samples[run]['testY']

        if OLS:
            # The OLS model replaces ``classifier`` for the rest of the run,
            # exactly as before.
            classifier = sm.OLS(trainY,trainX)
            yPredict = classifier.fit().predict(testX)
        else:
            classifier.fit(trainX, trainY)
            yPredict = classifier.predict(testX)

        if continious:
            # Snap each prediction to the nearer of 0 and 1 (ties go to 0).
            for i in range(len(yPredict)):
                if (abs(0 - yPredict[i]) - abs(1 - yPredict[i])) <= 1e-10 :
                    yPredict[i] = 0
                else:
                    yPredict[i] = 1

        if Logistic:
            probabilities = classifier.predict_proba(testX)

            for i in range(len(probabilities)):
                if probabilities[i][1] >= 0.8:
                    yPredict[i] = 1
                else:
                    yPredict[i] = 0

        crossVal[run] = metrics.accuracy_score(testY,yPredict)
        fpr, tpr, thresholds = metrics.roc_curve(testY,yPredict)
        auc[run] = metrics.auc(fpr,tpr)
        recall[run] = metrics.recall_score(testY,yPredict)
        precision[run] = metrics.precision_score(testY,yPredict)
        f1Score[run] = f1_score(testY, yPredict, average='binary')

    # Average every metric over the k runs.
    results['cross'] = sum(crossVal.values(), 0.0) / k
    results['auc'] = sum(auc.values(), 0.0) / k
    results['recall'] = sum(recall.values(), 0.0) / k
    results['precision'] = sum(precision.values(), 0.0) / k
    results['f1'] = sum(f1Score.values(), 0.0) / k

    return results



In [None]:
# Input/output folder; used as a string prefix, so it must end with a separator.
path = 'C:\\Users\\ANTONIS\\Desktop\\p = 0.0001\\' # set this to the folder that will be processed
#path = 'C:\\Users\\ANTONIS\\Desktop\\pValues\\pvalue = 1e-05\\'

numberOfChromosomes = 22# number of chromosomes
patientsTrain = {}
patientsTest = {}
allPatients = {}

chromosomes = {}

read = Read(path,numberOfChromosomes)
write = Write(path,numberOfChromosomes)

# Load the patient table and the per-chromosome '.assoc' SNP lists, then
# write them back out as plain lists (patient.txt, chrNsnpList.txt).
patients = read.readPatients('Patients.txt')
chromosomes = read.readSnps(".assoc")
write.writePatientsList(patients,'patient.txt')

write.writeSnpsList(chromosomes)


# Manual step: run the train_lgen and test_lgen batch files before continuing.

In [None]:

# All SNP ids in chromosome order, as loaded by read.readSnps().
snps = read.getListOfSnps()
ids = {} 
idsTest = {}



# Use the cached code file when it exists; otherwise read the .lgen genotypes
# and code them against the alleles of the association files.
if os.path.exists(path + 'snpCode.txt'):
    print("mphka")
    patients = read.readSnpsCode(patients)
    
    
else:
    patients = read.readLgen(patients)
    
   
    patients = setSnpsCode(patients,chromosomes)
    
    


# Row / column numbering used by DataSet: name <-> index for patients and SNPs.
ids['patients'] = setIdToName(list(patients.keys()))
ids['snps'] = setIdToName(snps)





In [None]:
# Build the genotype matrix X (patients x SNPs) and the label vector Y.
trainSet = DataSet(patients,ids)


X = trainSet.getXTable()
Y = trainSet.getYTable()

# Cache the coded genotypes so the next run can load them through
# read.readSnpsCode().  The empty kind '' makes saveData write exactly
# path + 'snpCode.txt', the file tested here and read by the loader.
# The former calls passed xTraining / xTest (not defined at this point) and
# the empty patientsTrain / idsTest placeholders, so they could not succeed.
if not os.path.exists(path + 'snpCode.txt'):
    print("mphka 3")
    write.saveData(ids,patients,'',X,chromosomes)

In [None]:
from sklearn.feature_selection import SelectFromModel

# Feature selection with logistic regression: fit on the full table, then keep
# the columns whose coefficients pass SelectFromModel's default threshold.
# This replaces estimator.fit_transform(), which scikit-learn removed, and the
# fit on mergeXtable / mergeYtable, which are not defined in this notebook.
lr_clf = linear_model.LogisticRegression()
lr_clf.fit(X, Y)
xTraining = SelectFromModel(lr_clf, prefit=True).transform(X)



In [None]:
# Pairwise column similarity of X; Xt and xNew are two names for the same array.
Xt = xNew = calculateJaccardSim(X)

In [None]:
# Exploratory pass over the pairwise similarity matrix Xt: count, for every
# SNP, the pairs whose similarity lies in the 0.3 .. 0.6 band and remember the
# partners (sn[i] holds only partners with a larger index).
nSnps = len(Xt)  # was hard-coded to 5415

count = {}
sn = {}
s = []
for i in range(nSnps):
    count[i] = 0
    sn[i] = []
for i in range(nSnps):
    for j in range(i+1, nSnps):
        if Xt[i,j] - 0.3 >= 1e-10 and Xt[i,j] - 0.6 <= 1e-10 :
            count[i] = count[i] + 1
            count[j] = count[j] + 1
            sn[i].append(j)



# Total number of in-band pairs.
c = 0
for i in sn.keys():
    
    c = c + len((sn[i]))
              
print("c = ",c)

# Remove from every later partner list the partners already listed for i.
for i in range(nSnps):
    for j in range(i+1,nSnps):
         sn[j] = list(set(sn[j]) - set(sn[i]))
            
    # j keeps the value left by the inner loop (the last column index).
    l = []
    l.append(j)
    sn[i] = list(set(sn[i]) - set(l))      
            
'''c = 0
for i in count.keys():
    if count[i] >= 25 * 5415 /100:
        c +=1
   # if len(sn[i]) > 0:
    #    print("i = ",i)
    #    print(sn[i])
    #    print()
        
print("c = ",c)'''

In [None]:
# Cosine-similarity based selection with band [0.3, 0.6] and c = 10 percent.
snpsReduced = reduceFeatures(X,0.3,0.6,c = 10,method = 'Cosine_Similarity')

In [None]:
# Keep only the columns listed under 'snpsOutArea'.
XX = createNewTable(snpsReduced['snpsOutArea'],X)

In [None]:
# Sanity check: how many indices do the three selections share pairwise?
snps1 = snpsReduced['snpsOutArea']
snps2 = snpsReduced['snpsRandom']
snps3 = snpsReduced['snpsInArea']

count = sum(1 for member in snps2 if member in snps1)
print("random - outArea = ",count)

count = sum(1 for member in snps2 if member in snps3)
print("random - inArea = ",count)

count = sum(1 for member in snps1 if member in snps3)
print("outArea - inArea = ",count)

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Alternative feature selection: drop columns whose variance is at most
# .4 * (1 - .9) = 0.04.  Overwrites xTraining.
sel = VarianceThreshold(threshold=(.4 * (1 - .9)))
xTraining = sel.fit_transform(X)


In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Alternative feature selection: keep the 2500 columns with the best chi2
# score against Y.  Overwrites xTraining.
xTraining = SelectKBest(chi2, k=2500).fit_transform(X, Y)


In [None]:
from sklearn.feature_selection import SelectFromModel

# Alternative feature selection: keep the columns whose random-forest
# importance passes SelectFromModel's default threshold.  Replaces
# rfr.transform(), which scikit-learn removed from estimators.
rfr = RandomForestRegressor(n_estimators = 100, random_state = 2016, verbose = 10,max_depth = None,n_jobs=1)
rfr.fit(X, Y)
xTraining = SelectFromModel(rfr, prefit=True).transform(X)
print(xTraining.shape)

In [None]:
# One-hot encode the selected columns (three values per column) into XX.
# NOTE(review): `n_values` appears to have been dropped from OneHotEncoder in
# newer scikit-learn releases — confirm against the installed version.
enc = OneHotEncoder(n_values =3)
# This separate fit is repeated by fit_transform() on the next line.
enc.fit(xTraining) 
XX = enc.fit_transform(xTraining)
print(XX.toarray())
# Replace the encoder output by its dense array form.
XX = XX.toarray()


In [None]:
# Hold out 10% of the encoded data as a test set (fixed random_state).
xTraining1, xTest1, yTraining1, yTest = train_test_split(XX, Y, test_size=0.1, random_state=0)
print("mergex = ",XX.shape)
print("xTrain = ",xTraining1.shape)
print("xTest = ",xTest1.shape)
print("yTrain = ",yTraining1.shape)
print("yTest = ",yTest.shape)

# # RF

In [None]:
# Random-forest regressor used as a binary classifier: fit on the training
# split, predict the hold-out split and round the predictions to 0/1.
rfr = RandomForestRegressor(n_estimators = 100, random_state = 2017, verbose = 10,
                            max_depth = None,n_jobs=-1)
rfr.fit(xTraining1, yTraining1)
yPredict3 = rfr.predict(xTest1)

# Mean raw prediction; only used by the disabled threshold below.
count = 0
for i in range(len(yPredict3)):
    count += yPredict3[i]

mo3 = count / len(yPredict3)

# Snap each prediction to the nearer of 0 and 1.
for i in range(len(yPredict3)):
    #if yPredict3[i] < mo3:
    if (abs(0 - yPredict3[i]) - abs(1- yPredict3[i])) < 1e-10 :
        yPredict3[i] = 0
    else:
        yPredict3[i] = 1
        
# Hold-out metrics.
print(metrics.accuracy_score(yTest,yPredict3))
print(metrics.confusion_matrix(yTest,yPredict3))
error3 = mean_squared_error(yTest, yPredict3)
print("error 3 = ",error3)
RMSE3 = mean_squared_error(yTest,yPredict3)**0.5
print("RMSE3 = ",RMSE3)

#print("cros validation = ",crossValidiation(mergeXtable, mergeYtable, k = 10, classifier = rfr))
fpr, tpr, thresholds = metrics.roc_curve(yTest,yPredict3)
print("AUC = ", metrics.auc(fpr,tpr))
print("recal = ",metrics.recall_score(yTest,yPredict3))
print("precision = ",metrics.precision_score(yTest,yPredict3))
print("f1Score = ",f1_score(yTest, yPredict3, average='binary'))
print()
# 10-fold cross-validation on the full encoded table.  The argument is the
# Latin-letter name XX; it was typed with look-alike Greek letters before,
# which is a different, undefined name.
results = crossValidiation(XX,Y, k = 10, classifier = rfr)
print("cros validation = ",results['cross'])
print("AUC = ", results['auc'])
print("recal = ",results['recall'])
print("precision = ",results['precision'])
print("f1 = ",results['f1'])

# SVM

In [None]:

# Support-vector classifier with the library's default settings (RBF kernel,
# C=1.0), fitted on the training split and evaluated on the held-out split.
clf = SVC().fit(xTraining1, yTraining1)
yPredict2 = clf.predict(xTest1)

# Hold-out accuracy, confusion matrix, MSE and its square root.
print(metrics.accuracy_score(yTest, yPredict2))
print(metrics.confusion_matrix(yTest, yPredict2))
error2 = mean_squared_error(yTest, yPredict2)
print("error 2 = ", error2)
RMSE2 = error2 ** 0.5
print("RMSE2 = ", RMSE2)

# ROC/AUC, recall, precision and F1 on the same hold-out predictions.
fpr, tpr, thresholds = metrics.roc_curve(yTest, yPredict2)
print("AUC = ", metrics.auc(fpr, tpr))
for label, scorer in (
    ("recal = ", metrics.recall_score),
    ("precision = ", metrics.precision_score),
):
    print(label, scorer(yTest, yPredict2))
print("f1Score = ", f1_score(yTest, yPredict2, average='binary'))
print()

# The same figures as reported by the project's crossValidiation helper
# (called with k = 10) on the full encoded data set.
results = crossValidiation(XX, Y, k = 10, classifier = clf,continious = False)
for label, key in (
    ("cros validation = ", 'cross'),
    ("AUC = ", 'auc'),
    ("recal = ", 'recall'),
    ("precision = ", 'precision'),
    ("f1 = ", 'f1'),
):
    print(label, results[key])

In [None]:
# Polynomial-kernel support-vector classifier, fitted on the training split
# and evaluated on the held-out split.
clf = SVC(kernel ='poly').fit(xTraining1, yTraining1)
yPredict2 = clf.predict(xTest1)

# Hold-out accuracy, confusion matrix, MSE and its square root.
print(metrics.accuracy_score(yTest, yPredict2))
print(metrics.confusion_matrix(yTest, yPredict2))
error2 = mean_squared_error(yTest, yPredict2)
print("error 2 = ", error2)
RMSE2 = error2 ** 0.5
print("RMSE2 = ", RMSE2)

# ROC/AUC, recall, precision and F1 on the same hold-out predictions.
fpr, tpr, thresholds = metrics.roc_curve(yTest, yPredict2)
print("AUC = ", metrics.auc(fpr, tpr))
for label, scorer in (
    ("recal = ", metrics.recall_score),
    ("precision = ", metrics.precision_score),
):
    print(label, scorer(yTest, yPredict2))
print("f1Score = ", f1_score(yTest, yPredict2, average='binary'))
print()

# The same figures as reported by the project's crossValidiation helper
# (called with k = 10) on the full encoded data set.
results = crossValidiation(XX, Y, k = 10, classifier = clf,continious = False)
for label, key in (
    ("cros validation = ", 'cross'),
    ("AUC = ", 'auc'),
    ("recal = ", 'recall'),
    ("precision = ", 'precision'),
    ("f1 = ", 'f1'),
):
    print(label, results[key])

# SVM (linear kernel)

In [None]:

# Linear-kernel support-vector classifier, fitted on the training split and
# evaluated on the held-out split.
clf = SVC(kernel='linear').fit(xTraining1, yTraining1)
yPredict2 = clf.predict(xTest1)

# Hold-out accuracy, confusion matrix, MSE and its square root.
print(metrics.accuracy_score(yTest, yPredict2))
print(metrics.confusion_matrix(yTest, yPredict2))
error2 = mean_squared_error(yTest, yPredict2)
print("error 2 = ", error2)
RMSE2 = error2 ** 0.5
print("RMSE2 = ", RMSE2)

# ROC/AUC, recall, precision and F1 on the same hold-out predictions.
fpr, tpr, thresholds = metrics.roc_curve(yTest, yPredict2)
print("AUC = ", metrics.auc(fpr, tpr))
for label, scorer in (
    ("recal = ", metrics.recall_score),
    ("precision = ", metrics.precision_score),
):
    print(label, scorer(yTest, yPredict2))
print("f1Score = ", f1_score(yTest, yPredict2, average='binary'))
print()

# The same figures as reported by the project's crossValidiation helper
# (called with k = 10) on the full encoded data set.
results = crossValidiation(XX, Y, k = 10, classifier = clf, continious = False)
for label, key in (
    ("cros validation = ", 'cross'),
    ("AUC = ", 'auc'),
    ("recal = ", 'recall'),
    ("precision = ", 'precision'),
    ("f1 = ", 'f1'),
):
    print(label, results[key])

In [None]:
# Average of the per-fold scores from scikit-learn's own cross_val_score
# (cv=10) for whichever `clf` is currently bound.
re = cross_val_score(clf, XX, Y, cv=10)
print(sum(re) / len(re))

# LINEAR LOGISTIC REGRESSION 

In [None]:
# Logistic regression with default settings, fitted on the training split and
# evaluated on the held-out split.
lr_clf = linear_model.LogisticRegression().fit(xTraining1, yTraining1)
yPredict4 = lr_clf.predict(xTest1)

# Hold-out accuracy, confusion matrix, MSE and its square root.
print(metrics.accuracy_score(yTest, yPredict4))
print(metrics.confusion_matrix(yTest, yPredict4))
error4 = mean_squared_error(yTest, yPredict4)
print("error 4 = ", error4)
RMSE4 = error4 ** 0.5
print("RMSE4 = ", RMSE4)

# ROC/AUC, recall, precision and F1 on the same hold-out predictions.
fpr, tpr, thresholds = metrics.roc_curve(yTest, yPredict4)
print("AUC = ", metrics.auc(fpr, tpr))
for label, scorer in (
    ("recal = ", metrics.recall_score),
    ("precision = ", metrics.precision_score),
):
    print(label, scorer(yTest, yPredict4))
print("f1Score = ", f1_score(yTest, yPredict4, average='binary'))
print()

# The same figures as reported by the project's crossValidiation helper
# (called with k = 10) on the full encoded data set.
results = crossValidiation(XX, Y, k = 10, classifier = lr_clf, continious = False)
for label, key in (
    ("cros validation = ", 'cross'),
    ("AUC = ", 'auc'),
    ("recal = ", 'recall'),
    ("precision = ", 'precision'),
    ("f1 = ", 'f1'),
):
    print(label, results[key])

In [None]:
# Average of the per-fold scores from scikit-learn's own cross_val_score
# (cv=10) for the logistic-regression model.
re = cross_val_score(lr_clf, XX, Y, cv=10)
print(sum(re) / len(re))

# Linear Perceptron

In [None]:
# L1-penalised perceptron with a very small learning rate and penalty weight,
# fitted on the training split and evaluated on the held-out split.
# NOTE(review): warm_start=True makes a later fit() on this same object start
# from the previously learned coefficients; if crossValidiation refits
# `perceptron` in place for each fold, earlier folds would influence later
# ones - confirm against the helper.
perceptron = linear_model.Perceptron(
    penalty='l1',
    alpha=0.00000001,
    fit_intercept=True,
    n_iter=100,
    shuffle=True,
    verbose=2016,
    eta0=0.00000001,
    n_jobs=-1,
    random_state=2016,
    warm_start=True,
)
perceptron.fit(xTraining1, yTraining1)
yPredict4 = perceptron.predict(xTest1)

# Hold-out accuracy, confusion matrix, MSE and its square root.
print(metrics.accuracy_score(yTest, yPredict4))
print(metrics.confusion_matrix(yTest, yPredict4))
error4 = mean_squared_error(yTest, yPredict4)
print("error 4 = ", error4)
RMSE4 = error4 ** 0.5
print("RMSE4 = ", RMSE4)

# ROC/AUC, recall, precision and F1 on the same hold-out predictions.
fpr, tpr, thresholds = metrics.roc_curve(yTest, yPredict4)
print("AUC = ", metrics.auc(fpr, tpr))
for label, scorer in (
    ("recal = ", metrics.recall_score),
    ("precision = ", metrics.precision_score),
):
    print(label, scorer(yTest, yPredict4))
print("f1Score = ", f1_score(yTest, yPredict4, average='binary'))
print()

# The same figures as reported by the project's crossValidiation helper
# (called with k = 10) on the full encoded data set.
results = crossValidiation(XX, Y, k = 10, classifier = perceptron, continious = False)
for label, key in (
    ("cros validation = ", 'cross'),
    ("AUC = ", 'auc'),
    ("recal = ", 'recall'),
    ("precision = ", 'precision'),
    ("f1 = ", 'f1'),
):
    print(label, results[key])

In [None]:
# Average of the per-fold scores from scikit-learn's own cross_val_score
# (cv=10) for the perceptron.
re = cross_val_score(perceptron, XX, Y, cv=10)
print(sum(re) / len(re))

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings, fitted on the training split and
# evaluated on the held-out split.
gnb = GaussianNB().fit(xTraining1, yTraining1)
yPredict6 = gnb.predict(xTest1)

# Hold-out accuracy, confusion matrix, MSE and its square root.
print(metrics.accuracy_score(yTest, yPredict6))
print(metrics.confusion_matrix(yTest, yPredict6))
error6 = mean_squared_error(yTest, yPredict6)
print("error 6 = ", error6)
RMSE6 = error6 ** 0.5
print("RMSE6 = ", RMSE6)

# ROC/AUC, recall, precision and F1 on the same hold-out predictions.
fpr, tpr, thresholds = metrics.roc_curve(yTest, yPredict6)
print("AUC = ", metrics.auc(fpr, tpr))
for label, scorer in (
    ("recal = ", metrics.recall_score),
    ("precision = ", metrics.precision_score),
):
    print(label, scorer(yTest, yPredict6))
print("f1Score = ", f1_score(yTest, yPredict6, average='binary'))
print()

# The same figures as reported by the project's crossValidiation helper
# (called with k = 10) on the full encoded data set.
results = crossValidiation(XX, Y, k = 10,classifier = gnb, continious = False)
for label, key in (
    ("cros validation = ", 'cross'),
    ("AUC = ", 'auc'),
    ("recal = ", 'recall'),
    ("precision = ", 'precision'),
    ("f1 = ", 'f1'),
):
    print(label, results[key])

In [None]:
# Average of the per-fold scores from scikit-learn's own cross_val_score
# (cv=10) for the Gaussian naive Bayes model.
re = cross_val_score(gnb, XX, Y, cv=10)
print(sum(re) / len(re))

# Decision Tree

In [None]:

# Decision-tree classifier with default settings, fitted on the training split
# and evaluated on the held-out split.
dt = tree.DecisionTreeClassifier().fit(xTraining1, yTraining1)
yPredict7 = dt.predict(xTest1)

# Hold-out accuracy, confusion matrix, MSE and its square root. The variable
# names and labels reuse the "6" suffix that the naive Bayes cell also uses.
print(metrics.accuracy_score(yTest, yPredict7))
print(metrics.confusion_matrix(yTest, yPredict7))
error6 = mean_squared_error(yTest, yPredict7)
print("error 6 = ", error6)
RMSE6 = error6 ** 0.5
print("RMSE6 = ", RMSE6)

# Figures reported by the project's crossValidiation helper (called with
# k = 10) on the full encoded data set.
results = crossValidiation(XX,Y, k = 10, classifier = dt, continious = False)
for label, key in (
    ("cros validation = ", 'cross'),
    ("AUC = ", 'auc'),
    ("recal = ", 'recall'),
    ("precision = ", 'precision'),
    ("f1 = ", 'f1'),
):
    print(label, results[key])

In [None]:
# Average of the per-fold scores from scikit-learn's own cross_val_score
# (cv=10) for the decision tree.
re = cross_val_score(dt, XX, Y, cv=10)
print(sum(re) / len(re))