In [None]:
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import numpy as np
import os.path
import csv
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn import grid_search
import sklearn.linear_model as linear_model
import sklearn.metrics as metrics
from sklearn import preprocessing
import statsmodels.api as sm
import statsmodels.formula.api as smf

class PatientPhenotype:
    """One patient's phenotype fields plus the SNP genotypes attached to them.

    The constructor receives raw text fields; every field is stripped and
    ``case`` is additionally converted to ``int``.  Genotypes are stored in
    ``self.snps`` as ``{snpId: Snp}`` and are added through :meth:`addSnps`.
    """

    def __init__(self, eid, case, sex, yearBirth):
        self.eid = eid.strip()
        self.case = int(case.strip())
        self.sex = sex.strip()
        self.yearBirth = yearBirth.strip()
        self.snps = {}

    def getEid(self):
        """Return the stripped patient identifier."""
        return self.eid

    def getCase(self):
        """Return the case field as an ``int``."""
        return self.case

    def getSex(self):
        """Return the stripped sex field (kept as text)."""
        return self.sex

    def getYearBirth(self):
        """Return the stripped year-of-birth field (kept as text)."""
        return self.yearBirth

    def addSnps(self, snpId, allele1, allele2):
        """Store (or overwrite) the genotype of ``snpId`` for this patient."""
        self.snps[snpId] = Snp(snpId, allele1, allele2)

    def snpCode(self, chromosomes=None, snp='', code=-1):
        """Assign numeric codes to this patient's SNPs.

        Two modes, selected by ``chromosomes``:

        * non-empty ``chromosomes`` (``{'chr1': {snpId: [allele1, allele2]}, ...}``
          with keys ``'chr1'`` .. ``'chrN'``): every listed SNP is coded via
          ``Snp.setSnpCode`` using the two alleles from the mapping.  Raises
          ``KeyError`` if the patient has no entry for a listed SNP.
        * empty / omitted ``chromosomes``: the single SNP ``snp`` is given the
          explicit value ``code``.

        The default used to be a shared mutable ``{}``; ``None`` avoids the
        shared-default pitfall while keeping every existing call working.
        """
        if chromosomes:
            # Keys are addressed as 'chr1'..'chrN', N being the number of entries.
            for index in range(len(chromosomes)):
                chro = 'chr' + str(index + 1)
                for snpId, alleles in chromosomes[chro].items():
                    allele1 = alleles[0].strip()
                    allele2 = alleles[1].strip()
                    self.snps[snpId.strip()].setSnpCode(allele1, allele2)
        else:
            self.snps[snp.strip()].setCode(code)

    def getSnpCode(self, snpId):
        """Return the numeric code stored for ``snpId``."""
        return self.snps[snpId].getSnpCode()

    def getAllele1(self, snpId):
        """Return the first allele stored for ``snpId``."""
        return self.snps[snpId].getAllele1()

    def getAllele2(self, snpId):
        """Return the second allele stored for ``snpId``."""
        return self.snps[snpId].getAllele2()

    def getSize(self):
        """Return how many SNPs are stored for this patient."""
        return len(self.snps)
        
        
class Snp:
    """Genotype of a single SNP: an id, two alleles and a numeric code.

    The code starts at -1 and is changed by :meth:`setSnpCode` or
    :meth:`setCode`.
    """

    def __init__(self, snpId, allele1, allele2):
        self.snpId, self.allele1, self.allele2 = snpId, allele1, allele2
        self.snpCode = -1

    def getId(self):
        """Return the SNP identifier."""
        return self.snpId

    def getAllele1(self):
        """Return the first allele exactly as stored."""
        return self.allele1

    def getAllele2(self):
        """Return the second allele exactly as stored."""
        return self.allele2

    def setSnpCode(self, allele1, allele2):
        """Set the code to the number of stored alleles equal to ``allele1``.

        Both stored alleles are compared (after stripping) against the
        stripped ``allele1`` argument only, giving 0, 1 or 2.  ``allele2`` is
        accepted for interface compatibility but is not consulted.
        """
        target = allele1.strip()
        hits = (self.allele1.strip() == target, self.allele2.strip() == target)
        self.snpCode = sum(hits)

    def setCode(self, aCode):
        """Store ``aCode`` directly as the SNP code."""
        self.snpCode = aCode

    def getSnpCode(self):
        """Return the current SNP code."""
        return self.snpCode
    
class Read:
    """Reads the input and cache files that live under one path prefix.

    ``path`` is used as a plain string prefix (file names are appended with
    ``+``), so it must already end with a path separator.  Per-chromosome files
    are addressed as ``'chr1'`` .. ``'chr<numberOfChromosomes>'``.

    All files are opened with ``with`` so handles are released even when a
    malformed line raises, and whitespace-only lines are skipped instead of
    crashing the parser.
    """

    def __init__(self, path, numberOfChromosomes):
        self.chromosomes = {}
        self.numberOfSnps = 0
        self.path = path
        self.numberOfChromosomes = numberOfChromosomes

    def readPatients(self, kind):
        """Read a whitespace-separated phenotype file ``path + kind``.

        The first line is skipped as a header.  Columns 0, 1, 2 and 3 are
        passed to ``PatientPhenotype`` as eid, sex, yearBirth and case.

        Returns:
            dict mapping the eid (column 0) to its ``PatientPhenotype``.
        """
        patients = {}
        with open(self.path + kind, 'r') as f:
            f.readline()  # header
            for line in f:
                fields = line.split()
                if not fields:
                    continue
                patients[fields[0]] = PatientPhenotype(fields[0], fields[3], fields[1], fields[2])

        return patients

    def readSnps(self, fileKind):
        """Read ``path + 'chr<i>' + fileKind`` for every chromosome.

        Returns:
            ``self.chromosomes``: ``{'chr<i>': {snpId: [allele1, allele2]}}``.

        The SNP counter is reset first so that calling this method again does
        not double-count the SNPs that are being replaced.
        """
        self.numberOfSnps = 0

        for i in range(self.numberOfChromosomes):
            chro = 'chr' + str(i + 1)
            with open(self.path + chro + fileKind, 'r') as f:
                f.readline()  # header
                self.chromosomes[chro] = self.__readSnpsOfChromosome(f)

        return self.chromosomes

    def __readSnpsOfChromosome(self, file):
        """Parse the data lines of one open per-chromosome file.

        Column 1 is the SNP id, columns 3 and 6 are the two alleles
        (presumably a PLINK association report - confirm against the data).
        Every parsed line increments ``self.numberOfSnps``.
        """
        snps = {}

        for line in file:
            fields = line.split()
            if not fields:
                continue
            snps[fields[1]] = [fields[3], fields[6]]
            self.numberOfSnps += 1

        return snps

    def readLgen(self, patients, kind=''):
        """Attach genotypes from ``path + 'chr<i>' + kind + '.lgen'`` files.

        Missing files are silently skipped.  For each line whose column 0 is a
        key of ``patients``, ``addSnps(column 2, column 3, column 4)`` is
        called on that patient.

        Returns:
            the same ``patients`` mapping, mutated in place.
        """
        for i in range(self.numberOfChromosomes):
            path = self.path + 'chr' + str(i + 1) + kind + '.lgen'
            if not os.path.exists(path):
                continue

            with open(path, 'r') as f:
                for line in f:
                    fields = line.split()
                    if not fields:
                        continue
                    if fields[0] in patients:
                        patients[fields[0]].addSnps(fields[2], fields[3], fields[4])

        return patients

    def getListOfSnps(self):
        """Return all SNP ids of ``self.chromosomes`` in chromosome order."""
        snps = []
        for i in range(self.numberOfChromosomes):
            snps.extend(self.chromosomes['chr' + str(i + 1)])

        return snps

    def getNumberOfSnps(self):
        """Return the number of SNP lines read by the last ``readSnps`` call."""
        return self.numberOfSnps

    def readIds(self, kind):
        """Read the cached id tables for patients (``kind``) and SNPs.

        Returns:
            ``{'patients': {...}, 'snps': {...}}`` where each value holds the
            ``'nameToId'`` and ``'idToName'`` dictionaries.
        """
        ids = {}
        ids['patients'] = self.__readPatientsId(kind)
        ids['snps'] = self.__readSnpsId()

        return ids

    def __readIdFile(self, fileName):
        """Parse one tab-separated ``name<TAB>id`` file, skipping its first line."""
        nameToId = {}
        idToName = {}

        with open(self.path + fileName, 'r') as read:
            read.readline()  # first line holds the entry count
            for line in read:
                if not line.strip():
                    continue
                fields = line.split('\t')
                name = fields[0].strip()
                idi = int(fields[1].strip())

                nameToId[name] = idi
                idToName[idi] = name

        return {'nameToId': nameToId, 'idToName': idToName}

    def __readPatientsId(self, kind):
        """Read ``path + kind + 'Ids.txt'``."""
        return self.__readIdFile(kind + 'Ids.txt')

    def __readSnpsId(self):
        """Read ``path + 'SnpsIds.txt'``."""
        return self.__readIdFile('SnpsIds.txt')

    def readSnpsCode(self, patients, kind=''):
        """Restore genotypes and codes from ``path + kind + 'SnpCode.txt'``.

        The first two lines are skipped.  Remaining lines are tab-separated:
        patient, snp, integer code, allele1, allele2.  Lines for patients not
        present in ``patients`` are ignored.

        Returns:
            the same ``patients`` mapping, mutated in place.
        """
        with open(self.path + kind + 'SnpCode.txt', 'r') as read:
            read.readline()
            read.readline()

            for line in read:
                if not line.strip():
                    continue
                fields = line.split('\t')
                patient = fields[0].strip()
                snp = fields[1].strip()
                code = int(fields[2].strip())
                allele1 = fields[3].strip()
                allele2 = fields[4].strip()

                if patient in patients:
                    patients[patient].addSnps(snp, allele1, allele2)
                    patients[patient].snpCode(snp=snp, code=code)

        return patients
            
        
    
class Write:
    """Writes list and cache files under one path prefix.

    ``path`` is used as a plain string prefix (file names are appended with
    ``+``), so it must already end with a path separator.  Every file is
    written inside a ``with`` block so the handle is closed even if a lookup
    fails half-way through.
    """

    def __init__(self, path, numberOfChromosomes):
        self.path = path
        self.numberOfChromosomes = numberOfChromosomes

    def writePatientsList(self, patients, kind):
        """Write the keys of ``patients``, one per line, to ``path + kind``."""
        with open(self.path + kind, 'w') as write:
            write.writelines(patient.strip() + '\n' for patient in patients)

    def writeSnpsList(self, chromosomes):
        """Write ``path + 'chr<i>snpList.txt'`` with the SNP ids of each chromosome."""
        for i in range(self.numberOfChromosomes):
            chro = 'chr' + str(i + 1)
            with open(self.path + chro + 'snpList.txt', 'w') as write:
                write.writelines(snp.strip() + '\n' for snp in chromosomes[chro])

    def saveData(self, ids, patients, patientKind, data, chroms=None):
        """Persist id tables and the coded genotype table.

        Args:
            ids: ``{'patients': {...}, 'snps': {...}}``, each holding
                ``'nameToId'`` and ``'idToName'``.
            patients: mapping patient name -> object exposing
                ``getAllele1(snp)`` / ``getAllele2(snp)``.
            patientKind: file-name prefix (e.g. the callers use ``'Train'``).
            data: 2-D table indexed as ``data[i, j]`` (row = patient id,
                column = SNP id).
            chroms: chromosome mapping; the SNP id file is only written when
                this is non-empty.  Defaults to ``None`` (treated as empty)
                instead of a shared mutable ``{}``.
        """
        if chroms is None:
            chroms = {}

        self.__patientsLogFile(ids['patients'], patientKind)
        self.__snpsLogFile(ids['snps'], chroms)
        self.__snpCodeLog(ids['patients']['idToName'], ids['snps']['idToName'], patients, data, patientKind)

    def __patientsLogFile(self, ids, patientKind):
        """Write ``path + patientKind + 'Ids.txt'``: a count line, then ``name<TAB>id``."""
        nameToId = ids['nameToId']

        with open(self.path + patientKind + 'Ids.txt', 'w') as write:
            write.write(str(len(nameToId)) + '\n')
            write.writelines(
                patient.strip() + '\t' + str(idi).strip() + '\n'
                for patient, idi in nameToId.items()
            )

    def __snpsLogFile(self, ids, chroms):
        """Write ``path + 'SnpsIds.txt'`` unless ``chroms`` is empty.

        SNPs are emitted in the iteration order of ``chroms`` and looked up in
        ``ids['nameToId']`` by their stripped name.
        """
        if not chroms:
            return

        nameToId = ids['nameToId']

        with open(self.path + 'SnpsIds.txt', 'w') as write:
            write.write(str(len(nameToId)) + '\n')
            for chro in chroms:
                write.writelines(
                    snp.strip() + '\t' + str(nameToId[snp.strip()]).strip() + '\n'
                    for snp in chroms[chro]
                )

    def __snpCodeLog(self, patientsIds, snpsIds, patients, data, patientKind):
        """Write ``path + patientKind + 'SnpCode.txt'``.

        Two count lines (patients, SNPs) are followed by one tab-separated
        line per table cell: patient, snp, code, allele1, allele2.
        """
        with open(self.path + patientKind + 'SnpCode.txt', 'w') as write:
            write.write(str(len(patientsIds)) + '\n')
            write.write(str(len(snpsIds)) + '\n')

            for i in range(len(data)):
                name = patientsIds[i] if len(data.T) else None
                for j in range(len(data.T)):
                    snp = snpsIds[j]
                    allele1 = patients[name].getAllele1(snp)
                    allele2 = patients[name].getAllele2(snp)
                    write.write('\t'.join((
                        name.strip(),
                        snp.strip(),
                        str(data[i, j]).strip(),
                        allele1.strip(),
                        allele2.strip(),
                    )) + '\n')
        
        
            
class DataSet:
    """Dense feature table and label vector built from coded patients.

    ``xTable`` has one row per patient id and one column per SNP id and holds
    each patient's SNP code; ``yTable`` holds each patient's case value.  Row
    and column positions come from the ``'idToName'`` tables in ``ids``.
    """

    def __init__(self, patients, ids):
        self.n = len(ids['patients']['nameToId'].keys())
        self.m = len(ids['snps']['nameToId'].keys())
        self.patients = patients
        self.ids = ids

        # Every cell starts at -1 and is then overwritten by the fill step.
        self.xTable = np.full((self.n, self.m), -1, dtype=int)
        self.yTable = np.zeros((self.n,), dtype=int)

        self.__fillXTable()
        self.__fillYTable()

    def __fillXTable(self):
        """Copy each patient's SNP code into its (patient id, SNP id) cell."""
        for row, col in np.ndindex(self.n, self.m):
            who = self.ids['patients']['idToName'][row]
            which = self.ids['snps']['idToName'][col]
            self.xTable[row, col] = self.patients[who].getSnpCode(which)

    def __fillYTable(self):
        """Copy each patient's case value into its row of the label vector."""
        for row in range(self.n):
            who = self.ids['patients']['idToName'][row]
            self.yTable[row] = self.patients[who].getCase()

    def getXTable(self):
        """Return the (patients x SNPs) integer code table."""
        return self.xTable

    def getYTable(self):
        """Return the integer label vector."""
        return self.yTable
    

def setIdToName(aList):
    """Number the items of ``aList`` from 0 and return both lookup directions.

    Returns a dict with ``'nameToId'`` (item -> position) and ``'idToName'``
    (position -> item).  If an item occurs more than once, ``'nameToId'`` keeps
    its last position while ``'idToName'`` keeps every position.
    """
    forward = {}
    backward = {}

    for position, name in enumerate(aList):
        forward[name] = position
        backward[position] = name

    return {'nameToId': forward, 'idToName': backward}


def setSnpsCode(patients,chromosomes):
    """Call ``snpCode(chromosomes)`` on every patient and return ``patients``.

    The mapping itself is returned unchanged; only the patient objects are
    asked to (re)compute their codes.
    """
    for record in patients.values():
        record.snpCode(chromosomes)

    return patients
    
    
def mergeXTrainXTestTable(test,train):
    """Stack two 2-D tables into one integer table: ``train`` rows, then ``test`` rows.

    Args:
        test: 2-D table; its column count defines the width of the result.
        train: 2-D table placed first.  If it has fewer columns than ``test``
            the missing cells stay 0 (as before); more columns is an error.

    Returns:
        ``numpy`` array of shape ``(len(train) + len(test), columns of test)``
        with ``dtype=int``.

    Raises:
        IndexError: if ``train`` has rows and is wider than ``test``, or an
            input with rows is 1-D.

    Note the parameter order is ``(test, train)`` although the rows of
    ``train`` come first in the result.

    The copy is done with two block assignments instead of a Python loop over
    every cell, which matters for tables with many SNP columns.
    """
    test = np.asarray(test)
    train = np.asarray(train)

    nTrain = len(train)
    nTest = len(test)
    m = test.shape[1]

    mergeTable = np.zeros((nTrain + nTest, m), dtype=int)

    if nTrain:
        width = train.shape[1]
        if width > m:
            raise IndexError("train has %d columns but test has only %d" % (width, m))
        mergeTable[:nTrain, :width] = train

    if nTest:
        mergeTable[nTrain:] = test

    return mergeTable


def mergeYTrainYTestTable(test,train):
    """Stack two label vectors into one integer column: ``train`` first, then ``test``.

    Args:
        test: label vector (1-D, or 2-D with a single column) placed last.
        train: label vector placed first.

    Returns:
        ``numpy`` array of shape ``(len(train) + len(test), 1)`` with
        ``dtype=int``.  The column shape is kept for existing callers.

    Note the parameter order is ``(test, train)`` although the values of
    ``train`` come first in the result.

    The values are copied with block assignments instead of one Python-level
    assignment per element.
    """
    test = np.asarray(test)
    train = np.asarray(train)

    nTrain = len(train)
    nTest = len(test)

    mergeTable = np.zeros((nTrain + nTest, 1), dtype=int)

    # reshape(n, -1) turns a flat vector into a column; it is undefined for
    # n == 0, hence the guards.
    if nTrain:
        mergeTable[:nTrain] = train.reshape(nTrain, -1)
    if nTest:
        mergeTable[nTrain:] = test.reshape(nTest, -1)

    return mergeTable

def createAllPatientsStructure(patients,patientsTest,patientsTrain,snps):
    """Copy all test and then all train entries into ``patients`` and return it.

    ``patients`` is mutated in place.  When a key exists in both sources the
    train entry wins because it is applied last.  ``snps`` is accepted but not
    used.
    """
    for source in (patientsTest, patientsTrain):
        patients.update(source)

    return patients

def seperatePatients(patients,allPatients):
    """Replace every value of ``patients`` with the same-key value of ``allPatients``.

    ``patients`` is updated in place and returned; its key set is unchanged.
    A key missing from ``allPatients`` raises ``KeyError``.
    """
    # Snapshot the keys first; values are then swapped one by one.
    patients.update((key, allPatients[key]) for key in tuple(patients))

    return patients

In [None]:
# Working folder: set this to the folder you want to process.  It is used as a
# plain string prefix by Read/Write, so it must end with a path separator.
path = 'C:\\Users\\ANTONIS\\Desktop\\test1\\'
numberOfChromosomes = 14  # number of chromosomes (files are addressed as chr1..chr14)
patientsTrain = {}
patientsTest = {}
allPatients = {}

chromosomes = {}

read = Read(path,numberOfChromosomes)
write = Write(path,numberOfChromosomes)

# Phenotype tables for the training and the test cohort.
patientsTrain = read.readPatients('phenotype_euro_train_balanced.txt')
patientsTest = read.readPatients('phenotype_euro_test_balanced.txt')

# Per-chromosome SNP tables: files named chr<i>.assoc.fisher under `path`.
chromosomes = read.readSnps(".assoc.fisher")

# Export the patient ids and the per-chromosome SNP ids as plain lists.
write.writePatientsList(patientsTrain,'trainPatient.txt')
write.writePatientsList(patientsTest,'testPatient.txt')
write.writeSnpsList(chromosomes)


# Manual step: run the train_lgen and test_leg batch (.bat) scripts before executing the next cell.

In [None]:
snps = read.getListOfSnps()
ids = {} 
idsTest = {}

allPatients = createAllPatientsStructure(allPatients,patientsTest,patientsTrain,snps)

# Prefer the cached code table; otherwise rebuild the codes from the .lgen files.
# The file name checked here is exactly the one Read.readSnpsCode opens for
# kind = '' (it used to differ in letter case, which only works on
# case-insensitive file systems).
# NOTE(review): Write.saveData only produces 'TrainSnpCode.txt' and
# 'TestSnpCode.txt', so an un-prefixed 'SnpCode.txt' must come from elsewhere -
# confirm which cache file is intended here.
if os.path.exists(path + 'SnpCode.txt'):
    
    allPatients = read.readSnpsCode(allPatients)
    
    patientsTrain = seperatePatients(patientsTrain,allPatients)
    patientsTest = seperatePatients(patientsTest,allPatients)
    
else:
    allPatients = read.readLgen(allPatients)
    
    patientsTrain = seperatePatients(patientsTrain,allPatients)
    patientsTest = seperatePatients(patientsTest,allPatients)
    
    # Codes are derived from the alleles listed in `chromosomes`.
    patientsTrain = setSnpsCode(patientsTrain,chromosomes)
    patientsTest = setSnpsCode(patientsTest,chromosomes)
    

# Row/column numbering: reuse the cached id tables when both files exist,
# otherwise number patients and SNPs in their current iteration order.
if os.path.exists(path + 'TrainIds.txt') and os.path.exists(path + 'SnpsIds.txt'):
    ids = read.readIds('Train')
else:
    ids['patients'] = setIdToName(list(patientsTrain.keys()))
    ids['snps'] = setIdToName(snps)

# Fixed: this used to test for 'Test.txt', but read.readIds('Test') opens
# 'TestIds.txt' (the file Write.saveData writes), so the cache was never
# detected by its real name.
if os.path.exists(path + 'TestIds.txt') and os.path.exists(path + 'SnpsIds.txt'):
    idsTest = read.readIds('Test')
else:    
    idsTest['patients'] = setIdToName(list(patientsTest.keys()))
    idsTest['snps'] = ids['snps']




In [None]:
# Build the code table (X) and the label vector (y) for each cohort.
trainSet = DataSet(patientsTrain, ids)
testSet = DataSet(patientsTest, idsTest)

xTraining, yTraining = trainSet.getXTable(), trainSet.getYTable()
xTest, yTest = testSet.getXTable(), testSet.getYTable()

# Combined tables for cross-validation.
# NOTE(review): the merge helpers are declared as (test, train) and put the
# `train` argument first, so the training data is passed in the `test` slot
# here and the merged tables start with the test rows.  X and y are swapped
# the same way, so rows stay aligned - confirm the order is intended.
mergeXtable = mergeXTrainXTestTable(test=xTraining, train=xTest)
mergeYtable = mergeYTrainYTestTable(test=yTraining, train=yTest)

# Cache ids and codes on disk; the SNP id file is only written with the
# training data because only that call passes the chromosome mapping.
write.saveData(ids, patientsTrain, 'Train', xTraining, chromosomes)
write.saveData(idsTest, patientsTest, 'Test', xTest)

In [None]:
def _countCompleteGenotypes(patients, expected):
    """Return how many patients store exactly `expected` SNPs."""
    return sum(1 for patient in patients.values() if patient.getSize() == expected)


def _printCodeCounts(label, table):
    """Print how often the codes 0, 1 and 2 (and anything else) occur in `table`."""
    count0 = int(np.count_nonzero(table == 0))
    count1 = int(np.count_nonzero(table == 1))
    count2 = int(np.count_nonzero(table == 2))
    # Everything that is not 0/1/2, e.g. the -1 placeholder of unset cells.
    count = int(table.size) - count0 - count1 - count2

    print(label)
    print("count = ",count)
    print("count0 = ",count0)
    print("count1 = ",count1)
    print("coun2 = ",count2)


# --- training cohort -------------------------------------------------------
count = _countCompleteGenotypes(patientsTrain, read.getNumberOfSnps())

print("countTrain is ",count)
print("patientsTrain is ",len(patientsTrain.keys())) 
print("snps is Train",read.getNumberOfSnps())
print("shape id ",xTraining.shape)
print("xTraining = ",len(xTraining))
print("xTraining.T = ",len(xTraining.T))
print("yTraining = ",len(yTraining))

print()

# --- test cohort -----------------------------------------------------------
count = _countCompleteGenotypes(patientsTest, read.getNumberOfSnps())

print("count is Test",count)
# Fixed: this line used to report len(patientsTrain) under the test label.
print("patientsTest is ",len(patientsTest.keys()))
print("snps is Test",read.getNumberOfSnps())
print("shape id Test",xTest.shape)
print("xtest = ",len(xTest))
print("xtest.T = ",len(xTest.T))
print("ytest = ",len(yTest))

print()
print("mergeXtable = ",len(mergeXtable))
print("mergeXtable.T = ",len(mergeXtable.T))
print("mergeYtable = ",len(mergeYtable))
print("shape id ",mergeXtable.shape)

print()


print("id to name patients",len(ids['patients']['idToName'].keys()))
print("name to id patients",len(ids['patients']['nameToId'].keys()))
print()
print("id to name snps",len(ids['snps']['idToName'].keys()))
print("name to id snps",len(ids['snps']['nameToId'].keys()))

# --- distribution of genotype codes per table --------------------------------
# Counted with vectorised comparisons instead of three copies of a nested
# Python loop over every cell.
print()
_printCodeCounts("xTraining", xTraining)

print()
_printCodeCounts("xTest", xTest)

print()
_printCodeCounts("mergeXtable", mergeXtable)

# Random Forest

In [None]:

# Local import: SelectFromModel is the supported replacement for calling
# `transform` on an estimator.
from sklearn.feature_selection import SelectFromModel

rfr = RandomForestRegressor(n_estimators = 500, random_state = 2016, verbose = 20,max_depth = None,n_jobs=-1)

# Fixed: `rfr.transform(...)` was called on an estimator that had never been
# fitted, and estimator-level `transform` no longer exists in scikit-learn.
# SelectFromModel fits its own clone of `rfr` on the training data and keeps
# the features whose importance reaches the default threshold, so `rfr`
# itself is still unfitted afterwards, as before.
selector = SelectFromModel(rfr).fit(xTraining, yTraining)

xtraining = selector.transform(xTraining)
xtest = selector.transform(xTest)

print(xTraining.shape)
print(xtraining.shape)
print(xtest.shape)


In [None]:
# Train on the reduced feature set and turn the regression output into hard
# labels: predictions up to 0.5 become 0, everything else 1.
rfr.fit(xtraining, yTraining)
yPredict3 = np.where(rfr.predict(xtest) <= 0.5, 0.0, 1.0)

print(metrics.accuracy_score(yTest,yPredict3))
print(metrics.confusion_matrix(yTest,yPredict3))

# Mean squared error of the hard labels and its square root.
error3 = mean_squared_error(yTest, yPredict3)
print("error 3 = ",error3)
RMSE3 = error3**0.5
print("RMSE3 = ",RMSE3)

# 10-fold cross-validation on the merged (unreduced) table.
print("cros validation = ",cross_val_score(rfr, mergeXtable, mergeYtable, cv=10))

# Linear Regression

In [None]:


# Ordinary least-squares fit on the full training table; `fit` returns the
# estimator itself, so it can be chained.
regr = linear_model.LinearRegression().fit(xTraining, yTraining)

# Continuous scores for the test cohort (thresholded in the next cell).
yPredict1 = regr.predict(xTest)



In [None]:
# Threshold = mean of the predicted scores (summed left to right).
count = sum(yPredict1)
mo = count / len(yPredict1)

# Scores up to the mean become 0, everything above becomes 1.
yPredict1 = np.where(yPredict1 <= mo, 0.0, 1.0)

error1 = mean_squared_error(yTest, yPredict1)
print("error 1 = ",error1)
print(metrics.accuracy_score(yTest,yPredict1))
print(metrics.confusion_matrix(yTest,yPredict1))
RMSE1 = error1**0.5
print("RMSE1 = ",RMSE1)

# 10-fold cross-validation on the merged table.
print("cros validation = ",cross_val_score(regr, mergeXtable, mergeYtable, cv=10))

# SVM

In [None]:

# Support-vector classifier with default settings; it predicts class labels
# directly, so no thresholding is needed.
clf = SVC()
yPredict2 = clf.fit(xTraining, yTraining).predict(xTest)

print(metrics.accuracy_score(yTest,yPredict2))
print(metrics.confusion_matrix(yTest,yPredict2))

error2 = mean_squared_error(yTest, yPredict2)
print("error 2 = ",error2)
RMSE2 = error2**0.5
print("RMSE2 = ",RMSE2)

# Cross-validation is disabled for this model:
#print("cros validation = ",cross_val_score(clf, mergeXtable, mergeYtable, cv=5))

# ELASTIC NET REGRESSION (ElasticNetCV)

In [None]:
# Elastic-net regression with built-in cross-validated regularisation.
lr_clf = linear_model.ElasticNetCV().fit(xTraining, yTraining)
yPredict4 = lr_clf.predict(xTest)

# Threshold = mean of the predicted scores (summed left to right);
# scores up to the mean become 0, everything above becomes 1.
count = sum(yPredict4)
mo = count / len(yPredict4)
yPredict4 = np.where(yPredict4 <= mo, 0.0, 1.0)

print(metrics.accuracy_score(yTest,yPredict4))
print(metrics.confusion_matrix(yTest,yPredict4))

error4 = mean_squared_error(yTest, yPredict4)
print("error 4 = ",error4)
RMSE4 = error4**0.5
print("RMSE4 = ",RMSE4)

# Cross-validation is disabled for this model:
#print("cros validation = ",cross_val_score(lr_clf, mergeXtable, mergeYtable, cv=5))

# OLS

In [None]:
# statsmodels OLS on the raw code table.
# NOTE(review): no constant column is added to xTraining/xTest, so the model
# is fitted without an intercept - confirm that this is intended.
ols = sm.OLS(yTraining, xTraining)
yPredict5 = ols.fit().predict(xTest)

# Mean predicted score, used as the decision threshold in the next cell.
s5 = sum(yPredict5)
mo5 = s5 / len(yPredict5)

In [None]:
# Scores up to the mean prediction (mo5) become 0, everything above becomes 1.
yPredict5 = np.where(yPredict5 <= mo5, 0.0, 1.0)

print(metrics.accuracy_score(yTest,yPredict5))
print(metrics.confusion_matrix(yTest,yPredict5))

error5 = mean_squared_error(yTest, yPredict5)
print("error 5 = ",error5)
RMSE5 = error5**0.5
print("RMSE5 = ",RMSE5)

# Cross-validation is disabled for this model:
#print("cros validation = ",cross_val_score(ols, mergeXtable, mergeYtable, cv=5))


In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes classifier; it predicts class labels directly.
gnb = GaussianNB()
y_pred = gnb.fit(xTraining, yTraining).predict(xTest)
print(metrics.accuracy_score(yTest,y_pred))
print(metrics.confusion_matrix(yTest,y_pred))

# Fixed: the error figures below used to be recomputed from yPredict5 (the OLS
# predictions) and stored over error5/RMSE5, so this cell reported the OLS
# error again.  They are now computed from this model's own predictions and
# kept under their own names and labels.
error6 = mean_squared_error(yTest, y_pred)
print("error 6 = ",error6)
RMSE6 = error6**0.5
print("RMSE6 = ",RMSE6)