In [None]:
import csv
import os.path

import numpy as np
import pandas as pd

class PatientPhenotype:
    """Phenotype record of one patient plus the SNP genotypes attached to it.

    The constructor arguments are stored exactly as given (no conversion).
    Genotypes are kept in ``self.snps``, a dict mapping a SNP id to its
    ``Snp`` object.
    """

    def __init__(self, eid, case, sex, yearBirth):
        """Store the four phenotype fields and start with no SNPs attached."""
        self.eid, self.case = eid, case
        self.sex, self.yearBirth = sex, yearBirth
        # snpId -> Snp; populated through addSnps().
        self.snps = dict()

    def getEid(self):
        """Return the ``eid`` given at construction."""
        return self.eid

    def getCase(self):
        """Return the ``case`` value given at construction."""
        return self.case

    def getSex(self):
        """Return the ``sex`` value given at construction."""
        return self.sex

    def getYearBirth(self):
        """Return the ``yearBirth`` value given at construction."""
        return self.yearBirth

    def addSnps(self, snpId, allele1, allele2):
        """Attach a genotype for ``snpId``, replacing any earlier entry."""
        genotype = Snp(snpId, allele1, allele2)
        self.snps[snpId] = genotype

    def snpCode(self, snpId, allele1, allele2):
        """Compute and store the code of ``snpId`` against the given alleles.

        Raises ``KeyError`` when ``snpId`` was never added with addSnps().
        """
        stored = self.snps[snpId]
        stored.setSnpCode(allele1, allele2)

    def getSnpCode(self, snpId):
        """Return the code stored for ``snpId``.

        Raises ``KeyError`` when ``snpId`` was never added with addSnps().
        """
        stored = self.snps[snpId]
        return stored.getSnpCode()

    def getSize(self):
        """Return the number of SNPs attached to this patient."""
        return len(self.snps)
        
        
class Snp:
    """One genotype (two alleles) at a single SNP, plus a derived integer code."""

    def __init__(self, snpId, allele1, allele2):
        """Remember the SNP id and both alleles; the code starts at -1."""
        self.snpId = snpId
        self.allele1, self.allele2 = allele1, allele2
        # -1 means "not coded yet"; setSnpCode() replaces it with 0, 1 or 2.
        self.snpCode = -1

    def getId(self):
        """Return the SNP id."""
        return self.snpId

    def getAllele1(self):
        """Return the first stored allele."""
        return self.allele1

    def getAllele2(self):
        """Return the second stored allele."""
        return self.allele2

    def setSnpCode(self, allele1, allele2):
        """Code the stored genotype against the pair ``(allele1, allele2)``.

        The result is written to ``self.snpCode``:

        * 2 -- the stored pair equals the given pair, in either order;
        * 1 -- otherwise, when at least one stored allele equals one of the
          given alleles;
        * 0 -- when neither stored allele equals either given allele.
        """
        # NOTE(review): with these rules a genotype carrying both given alleles
        # (e.g. A/G against A/G) scores 2 while a homozygous one (A/A against
        # A/G) scores 1 -- confirm this is the intended encoding and not meant
        # to be an allele count.
        first, second = self.allele1, self.allele2

        straight = first == allele1 and second == allele2
        swapped = first == allele2 and second == allele1
        anyMatch = (first == allele1 or first == allele2
                    or second == allele1 or second == allele2)

        if straight or swapped:
            self.snpCode = 2
        elif anyMatch:
            self.snpCode = 1
        else:
            self.snpCode = 0

    def getSnpCode(self):
        """Return the current code (-1 until setSnpCode() has been called)."""
        return self.snpCode
    
    
class SaveDataSet():
    """Write a data set to disk: a log file, two id-mapping files and a CSV.

    Constructing the object performs all the writes. Files are named after
    ``name`` (used as a path prefix):

    * ``<name>.log``          -- number of patients and SNPs
    * ``<name>_patients.txt`` -- ``patient<TAB>index`` per line, after a title line
    * ``<name>_snps.txt``     -- ``snp<TAB>index`` per line, after a title line
    * ``<name>.csv``          -- row 1: SNP names, row 2: SNP indices, then one
      row of values per patient

    Parameters
    ----------
    name : str
        Path prefix of every output file.
    ids : dict
        ``ids['patients']`` and ``ids['snps']`` must each hold a
        ``'nameToId'`` and an ``'idToName'`` dict (the structure returned by
        ``setIdToName``). Names must be strings.
    data
        Object indexable as ``data[i, j]`` (patient index, SNP index),
        e.g. a 2-D numpy array.
    """

    def __init__(self,name,ids,data):

        self.name = name
        self.ids = ids
        self.data = data
        self.__writeLogFile()
        self.__writeVariables('patients')
        self.__writeVariables('snps')
        self.__writeCSV()

    def __size(self, kind):
        """Return how many entries of ``kind`` ('patients' or 'snps') exist."""
        return len(self.ids[kind]['nameToId'])

    def __writeLogFile(self):
        """Write ``<name>.log`` with the patient and SNP counts."""
        # `with` guarantees the handle is closed even if a write fails.
        with open(self.name+'.log','w') as log:
            log.write("Log file for the Test Data Set "+self.name+'\n')
            log.write(self.name + " has "+ str(self.__size('patients')) + " patients \n")
            log.write(self.name + " has "+ str(self.__size('snps')) + " snps \n")

    def __writeVariables(self,kind):
        """Write ``<name>_<kind>.txt``: a title line, then ``name<TAB>index`` rows."""
        idToName = self.ids[kind]['idToName']
        nameToId = self.ids[kind]['nameToId']

        with open(self.name+'_'+kind+'.txt','w') as out:
            out.write(kind + '\n')
            for i in range(self.__size(kind)):
                name = idToName[i]
                out.write(name + '\t'+str(nameToId[name]) + '\n')

    def __writeCSV(self):
        """Write ``<name>.csv`` with one CSV field per SNP.

        Each row is handed to ``csv.writer`` as a list of fields. Passing a
        single pre-joined string instead makes the writer quote the whole row
        as one field, which produces a one-column file.
        """
        snpCount = self.__size('snps')
        snpNames = [self.ids['snps']['idToName'][j] for j in range(snpCount)]
        snpIds = [str(self.ids['snps']['nameToId'][name]) for name in snpNames]

        # newline='' is what the csv module requires to avoid blank lines
        # between rows on Windows.
        with open(self.name+'.csv', 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(snpNames)
            writer.writerow(snpIds)

            for i in range(self.__size('patients')):
                writer.writerow([str(self.data[i,j]) for j in range(snpCount)])
        

def setIdToName(aList):
    """Number the items of ``aList`` from 0, in iteration order.

    Returns a dict with two mappings:

    * ``'nameToId'`` -- item -> index
    * ``'idToName'`` -- index -> item

    If an item occurs more than once, ``'nameToId'`` keeps its last index
    while ``'idToName'`` still has an entry for every position.
    """
    forward, backward = {}, {}

    for position, item in enumerate(aList):
        forward[item] = position
        backward[position] = item

    return {'nameToId': forward, 'idToName': backward}

In [2]:
# Read, for each of the 14 chromosome files, the SNP id (column 1) and two
# allele columns (3 and 6), then write one SNP-id list per chromosome.
# NOTE(review): columns 3 and 6 are presumably A1/A2 of a PLINK
# .assoc.fisher table -- confirm against the file header.
minorSnps = []     # SNP ids in file order, across all chromosomes
chromosomes = {}   # 'chrN' -> {snpId: [column 3, column 6]}

for i in range(14):

    chro = 'chr'+str(i+1)
    path = 'C:\\Users\\ANTONIS\\Desktop\\test1\\'+chro+'.assoc.fisher'
    snps = {}

    # `with` closes the input file; the original handle was never closed.
    with open(path,'r') as f:
        f.readline()  # discard the first line (assumed to be the column header)

        for line in f:
            fields = line.split()  # split once per line instead of four times
            if not fields:
                continue  # a blank line would otherwise raise IndexError
            minorSnps.append(fields[1])
            snps[fields[1]] = [fields[3], fields[6]]

    chromosomes[chro] = snps

for i in range(14):

    chro = 'chr'+str(i+1)
    path = 'C:\\Users\\ANTONIS\\Desktop\\test1\\'+chro+'snpList.txt'
    with open(path,'w') as write:
        for snp in chromosomes[chro].keys():
            write.write(snp + '\n')
    


# TRAIN SET CREATION

In [3]:
########   TRAIN ##########

# Build one PatientPhenotype per line of the training phenotype file, keyed by
# the first column, then write the patient ids to trainPatient.txt.
patientsPhen = {}

# `with` closes the phenotype file; the original handle was never closed.
with open('C:\\Users\\Antonis\\Documents\\GitHub\\diplwmatikh\\data\\phenotype_euro_train.txt','r') as f:
    f.readline()  # discard the first line (assumed to be the column header)

    for line in f:
        fields = line.split()  # split once per line instead of four times
        if not fields:
            continue  # a blank line would otherwise raise IndexError
        # Constructor order is (eid, case, sex, yearBirth) = columns 0, 3, 1, 2.
        patientsPhen[fields[0]] = PatientPhenotype(fields[0],fields[3],fields[1],fields[2])


path = 'C:\\Users\\ANTONIS\\Desktop\\test1\\'+'trainPatient.txt'
with open(path,'w') as write:
    for patient in patientsPhen.keys():
        write.write(patient + '\n')
    





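# Run the train_lgen .bat script before the next cell — that cell reads the chr*train.lgen files from the test1 folder (presumably produced by this script)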

In [4]:
ids = {}

# 1) Attach to each training patient the genotypes listed in the
#    per-chromosome .lgen files (columns 0, 2, 3, 4 = patient, SNP, two alleles).
for i in range(14):

    chro = 'chr'+str(i+1)
    path = 'C:\\Users\\ANTONIS\\Desktop\\test1\\'+chro+'train.lgen'

    # Chromosomes without an .lgen file are silently skipped.
    if os.path.exists(path):
        # `with` closes each file; the original handles were never closed.
        with open(path,'r') as f:
            for line in f:
                fields = line.split()  # split once per line instead of four times
                if not fields:
                    continue  # a blank line would otherwise raise IndexError
                patientsPhen[fields[0]].addSnps(fields[2], fields[3], fields[4])


# 2) Code every patient's genotype against the two alleles recorded for the SNP.
#    snpCode() raises KeyError if a patient has no genotype for a SNP.
for i in range(14):

    chro = 'chr'+str(i+1)

    for snp in chromosomes[chro].keys():

        allele1 = chromosomes[chro][snp][0]
        allele2 = chromosomes[chro][snp][1]

        for patient in patientsPhen.keys():

            patientsPhen[patient].snpCode(snp,allele1,allele2)

# 3) Sanity check: how many patients carry exactly as many SNPs as were listed.
count = sum(1 for record in patientsPhen.values()
            if record.getSize() == len(minorSnps))

print("count is ",count)
print("snps is ",len(minorSnps))


# 4) Build the patients x snps design matrix and the label list.
ids['patients'] = setIdToName(list(patientsPhen.keys()))
ids['snps'] = setIdToName(minorSnps)

xTraining = np.zeros((len(ids['patients']['nameToId']), len(ids['snps']['nameToId'])),dtype = int)

print("shape id ",xTraining.shape)
print("xtraining = ",len(xTraining))
print("xtraining.T = ",len(xTraining.T))

for i in range(len(xTraining)):
    # The patient lookup does not depend on j, so do it once per row.
    patient = ids['patients']['idToName'][i]
    record = patientsPhen[patient]

    for j in range(len(xTraining.T)):

        snp = ids['snps']['idToName'][j]
        xTraining[i,j] = record.getSnpCode(snp)


# Labels in the same row order as xTraining.
yTraining = []

for i in range(len(ids['patients']['nameToId'])):

    patient = ids['patients']['idToName'][i]
    yTraining.append(patientsPhen[patient].getCase())

count is  4482
snps is  334
shape id  (4482, 334)
xtraining =  4482
xtraining.T =  334


In [56]:
# Persist the training set: constructing SaveDataSet writes dok.log,
# dok_patients.txt, dok_snps.txt and dok.csv (paths relative to the current
# working directory, since the name carries no directory part).
dok = SaveDataSet("dok",ids,xTraining)

KeyboardInterrupt: 

In [57]:
# Reload the CSV written by SaveDataSet as a sanity check.
# pandas is imported once in the notebook's top import cell, not here.
df = pd.read_csv('dok.csv')
# Show only the first rows rather than the whole frame.
df.head()

CParserError: Error tokenizing data. C error: out of memory

# TEST SET CREATION

In [None]:
# Build one PatientPhenotype per line of the test phenotype file, keyed by the
# first column, then write the patient ids to testPatient.txt.
patientsPhenTest = {}

# `with` closes the phenotype file; the original handle was never closed.
with open('C:\\Users\\Antonis\\Documents\\GitHub\\diplwmatikh\\data\\phenotype_euro_test.txt','r') as f:
    f.readline()  # discard the first line (assumed to be the column header)

    for line in f:
        fields = line.split()  # split once per line instead of four times
        if not fields:
            continue  # a blank line would otherwise raise IndexError
        # Constructor order is (eid, case, sex, yearBirth) = columns 0, 3, 1, 2.
        patientsPhenTest[fields[0]] = PatientPhenotype(fields[0],fields[3],fields[1],fields[2])


path = 'C:\\Users\\ANTONIS\\Desktop\\test1\\'+'testPatient.txt'
# The original `write = open(path,'w'):` followed by an indented block was a
# SyntaxError; a `with` block is the valid form and also closes the file.
with open(path,'w') as write:
    for patient in patientsPhenTest.keys():
        write.write(patient + '\n')

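# Run the test_lgen .bat script before the next cell — that cell reads the chr*test.lgen files from the test1 folder (presumably produced by this script)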

In [None]:
idsTest = {}

# 1) Attach to each test patient the genotypes listed in the per-chromosome
#    .lgen files (columns 0, 2, 3, 4 = patient, SNP, two alleles).
for i in range(14):

    chro = 'chr'+str(i+1)
    path = 'C:\\Users\\ANTONIS\\Desktop\\test1\\'+chro+'test.lgen'

    # Chromosomes without an .lgen file are silently skipped.
    if os.path.exists(path):
        # `with` closes each file; the original handles were never closed.
        with open(path,'r') as f:
            for line in f:
                fields = line.split()  # split once per line instead of four times
                if not fields:
                    continue  # a blank line would otherwise raise IndexError
                # (a stray trailing ':' on this call was a SyntaxError)
                patientsPhenTest[fields[0]].addSnps(fields[2], fields[3], fields[4])


# 2) Code every patient's genotype against the two alleles recorded for the SNP.
#    snpCode() raises KeyError if a patient has no genotype for a SNP.
for i in range(14):

    chro = 'chr'+str(i+1)

    for snp in chromosomes[chro].keys():

        allele1 = chromosomes[chro][snp][0]
        allele2 = chromosomes[chro][snp][1]

        for patient in patientsPhenTest.keys():

            patientsPhenTest[patient].snpCode(snp,allele1,allele2)


# 3) Sanity check: how many patients carry exactly as many SNPs as were listed.
count = sum(1 for record in patientsPhenTest.values()
            if record.getSize() == len(minorSnps))

print("count is Test",count)
print("snps is Test",len(minorSnps))  # was `minorsSnps` (NameError)


# 4) Build the patients x snps design matrix and the label array.
idsTest['patients'] = setIdToName(list(patientsPhenTest.keys()))
idsTest['snps'] = setIdToName(minorSnps)

# The second dimension was garbled in the original; it is the SNP count,
# mirroring how xTraining is built.
xTest = np.zeros((len(idsTest['patients']['nameToId']), len(idsTest['snps']['nameToId'])),dtype = int)

print("shape id Test",xTest.shape)
print("xtest = ",len(xTest))  # label previously said "xtraining" (copy-paste)
print("xtest.T = ",len(xTest.T))

# range() needs the lengths, not the arrays themselves; the id tables live
# under the plural keys 'patients' / 'snps'.
for i in range(len(xTest)):
    # The patient lookup does not depend on j, so do it once per row.
    patient = idsTest['patients']['idToName'][i]
    record = patientsPhenTest[patient]

    for j in range(len(xTest.T)):

        snp = idsTest['snps']['idToName'][j]
        xTest[i,j] = record.getSnpCode(snp)


# Labels in the same row order as xTest.
yTest = []

for i in range(len(idsTest['patients']['nameToId'])):

    patient = idsTest['patients']['idToName'][i]
    yTest.append(patientsPhenTest[patient].getCase())

yTest = np.array(yTest)