In [1]:
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
#import matplotlib.pyplot as plt
import numpy as np
import os.path
import csv
from sklearn.svm import SVC
from sklearn.svm import NuSVC
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn import grid_search
import sklearn.linear_model as linear_model
import sklearn.metrics as metrics
from sklearn import preprocessing
#import statsmodels.api as sm
#import statsmodels.formula.api as smf
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from random import randint
from sklearn import tree
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import math
import time
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

from metrics.Correlation import Correlation
from IO.Output import Output
from IO.Input import Input
from metrics.RSquare import RSquare
from DataSet.Dataset import DataSet


def setIdToName(aList):
    """Build two-way lookup tables between items and their positions.

    Args:
        aList: iterable of hashable items (names).

    Returns:
        dict with two entries: 'nameToId' maps each item to its position in
        aList, 'idToName' maps each position back to the item. When an item
        occurs more than once, 'nameToId' keeps its last position.
    """
    forward = {}
    backward = {}

    for position, label in enumerate(aList):
        forward[label] = position
        backward[position] = label

    return {'nameToId': forward, 'idToName': backward}





def createNewIds(oldsnps,snps):
    """Re-number a subset of SNPs from 0 and return the new lookup tables.

    Args:
        oldsnps: dict whose ['snps']['idToName'] entry maps the old ids to
            names.
        snps: sequence of old ids to keep; the position of an id in this
            sequence becomes its new id.

    Returns:
        dict with 'nameToId' (name -> new id) and 'idToName' (new id -> name).
    """
    previousNames = oldsnps['snps']['idToName']
    keptNames = [previousNames[snps[pos]] for pos in range(len(snps))]

    return {
        'nameToId': {label: pos for pos, label in enumerate(keptNames)},
        'idToName': dict(enumerate(keptNames)),
    }

def setSnpsCode(patients,chromosomes):
    """Call snpCode(chromosomes) on every patient and return the same dict.

    Args:
        patients: dict of patient objects exposing a snpCode method.
        chromosomes: value handed unchanged to each snpCode call.

    Returns:
        The patients dict that was passed in (not a copy).
    """
    for patient in patients.values():
        patient.snpCode(chromosomes)

    return patients
    

def tables(sampleX,sampleY,k):
    """Turn k data folds into k train/test splits.

    For each run in 1..k the fold stored under that key is the test set and
    all the remaining folds, in the dict's iteration order, are stacked into
    the training set.

    Args:
        sampleX: dict fold number -> 2-D array of features.
        sampleY: dict fold number -> 1-D sequence of labels.
        k: number of runs; folds are expected under the keys 1..k.

    Returns:
        dict run -> {'trainX', 'trainY', 'testX', 'testY'}. The training
        arrays are freshly allocated with dtype int; the test entries are the
        fold objects themselves.
    """
    folds = {}

    for held in range(1, k + 1):

        heldX = sampleX[held]
        heldY = sampleY[held]

        others = [key for key in sampleX.keys() if key != held]
        total = sum(len(sampleX[key]) for key in others)

        # Column count is always taken from fold 1.
        trainX = np.zeros((total, len(sampleX[1].T)), dtype=int)
        trainY = np.zeros((total,), dtype=int)

        cursor = 0
        for key in others:
            part = sampleX[key]
            rows = len(part)
            stop = cursor + rows
            # Slice assignment casts into the int buffers just like the
            # element-by-element copy would.
            trainX[cursor:stop, :part.shape[1]] = part
            trainY[cursor:stop] = sampleY[key][:rows]
            cursor = stop

        folds[held] = {
            'trainX': trainX,
            'trainY': trainY,
            'testX': heldX,
            'testY': heldY,
        }

    return folds
    
def kSampleData(k,X,Y):
    """Randomly partition (X, Y) into k folds and build the train/test splits.

    Every row index is drawn exactly once (rejection sampling with randint),
    so the folds are disjoint. The first k-1 folds hold int(len(X) / k) rows
    and the last fold takes whatever remains.

    Args:
        k: number of folds.
        X: 2-D numpy array of features; values are cast to int when copied.
        Y: labels, indexable by row number; values are cast to int.

    Returns:
        Whatever tables(sampleX, sampleY, k) returns for the generated folds.
    """
    total = len(X)
    foldSize = int(total / k)
    taken = np.zeros((total,), dtype=int)   # 1 once a row has been assigned

    foldsX = {}
    foldsY = {}

    for fold in range(1, k + 1):

        if fold == k:
            # The last fold absorbs the remainder of the integer division.
            foldSize = total - ((k - 1) * foldSize)

        blockX = np.zeros((foldSize, len(X.T)), dtype=int)
        blockY = np.zeros((foldSize,), dtype=int)

        picked = []
        for _ in range(foldSize):
            candidate = randint(0, total - 1)
            while taken[candidate] == 1:
                candidate = randint(0, total - 1)
            taken[candidate] = 1
            picked.append(candidate)

        for row, source in enumerate(picked):
            blockX[row, :] = X[source, :]
            blockY[row] = Y[source]

        foldsX[fold] = blockX
        foldsY[fold] = blockY

    return tables(foldsX, foldsY, k)




def createNewTable(snps,X):
    """Return the columns of X listed in snps as a new int32 table.

    Column i of the result is column snps[i] of X. The previous version
    first filled the result with -1 one cell at a time and then overwrote
    every cell; that pass is dropped and the columns are picked in a single
    indexing operation.

    Args:
        snps: sequence of integer column indices into X.
        X: 2-D array-like, one row per sample.

    Returns:
        numpy array of shape (len(X), len(snps)) and dtype int32.
    """
    # An explicit integer dtype keeps an empty snps list usable as an index.
    columns = np.asarray(snps, dtype=np.intp)
    newX = np.asarray(X)[:, columns].astype(np.int32)

    print("new shape = ",newX.shape)

    return newX



def featuresIds(oldSnps,snps):
    """Map each position in snps to the value stored there.

    Args:
        oldSnps: not used; kept so existing calls keep working.
        snps: sequence of SNP ids.

    Returns:
        dict position -> snps[position].
    """
    return dict(enumerate(snps))



def crossValidiation(X, Y, k = 1, continious = True, classifier = None,OLS = False,Logistic = False):
    """Run k-fold cross validation and return the averaged binary metrics.

    The folds come from kSampleData(k, X, Y). For every fold the model is
    fitted on the training part and scored on the held-out part.

    Args:
        X: feature table handed to kSampleData.
        Y: binary labels (0/1) handed to kSampleData.
        k: number of folds.
        continious: when True, each prediction is snapped to whichever of
            0 and 1 it is closer to before scoring.
        classifier: estimator with fit/predict (and predict_proba when
            Logistic is True). Required unless OLS is True.
        OLS: when True the classifier argument is ignored and a fresh
            sm.OLS model is fitted per fold.
        Logistic: when True, predictions are replaced by thresholding the
            class-1 probability at 0.8.

    Returns:
        dict with 'accuracy', 'auc', 'recall', 'precision' and 'f1', each
        the mean over the k folds; None (after printing a message) when no
        classifier is available.
    """
    # Compare with None instead of truth-testing the estimator: `not obj`
    # falls back to obj.__len__() when the class defines one, which can
    # report a perfectly good estimator as "missing" or even raise.
    if classifier is None and not OLS:
        print("wrong!!!!!!! you have to choise a classifier")
        return

    perFold = {'accuracy': [], 'auc': [], 'recall': [], 'precision': [], 'f1': []}

    samples = kSampleData(k,X,Y)

    for run in range(1, k + 1):

        trainX = samples[run]['trainX']
        trainY = samples[run]['trainY']

        testX = samples[run]['testX']
        testY = samples[run]['testY']

        if OLS:
            # NOTE(review): `sm` is statsmodels.api, whose import is commented
            # out at the top of this file; this branch raises NameError until
            # that import is restored.
            classifier = sm.OLS(trainY,trainX)
            yPredict = classifier.fit().predict(testX)
        else:
            classifier.fit(trainX, trainY)
            yPredict = classifier.predict(testX)

        if continious:
            # Snap to the nearer of the two labels; a tie goes to 0.
            for i in range(len(yPredict)):
                if (abs(0 - yPredict[i]) - abs(1 - yPredict[i])) <= 1e-10 :
                    yPredict[i] = 0
                else:
                    yPredict[i] = 1

        if Logistic:
            probabilities = classifier.predict_proba(testX)

            for i in range(len(probabilities)):
                if probabilities[i][1] >= 0.8:
                    yPredict[i] = 1
                else:
                    yPredict[i] = 0

        fpr, tpr, thresholds = metrics.roc_curve(testY,yPredict)

        perFold['accuracy'].append(metrics.accuracy_score(testY,yPredict))
        perFold['auc'].append(metrics.auc(fpr,tpr))
        perFold['recall'].append(metrics.recall_score(testY,yPredict))
        perFold['precision'].append(metrics.precision_score(testY,yPredict))
        perFold['f1'].append(f1_score(testY, yPredict, average='binary'))

    return {label: sum(scores, 0.0) / k for label, scores in perFold.items()}

def writeResultConf(path,name,results):
    """Write per-category result dicts to a new, dated text file.

    The file is named
    ``<path><name>Confu ( <dd-mm-YYYY> ) _<name>Confu.txt``. If that file
    already exists a running number is inserted (``..._1_<name>Confu.txt``,
    ``..._2_...``) so earlier results are never overwritten. The previous
    version tested only the name prefix, which is never created on disk, so
    the check never fired and same-day runs replaced each other.

    Args:
        path: directory prefix, including the trailing separator.
        name: label used in the file name and written as a header line.
        results: dict category -> dict of metric name -> value.
    """
    timee = time.strftime("%d-%m-%Y")

    prefix = path + name + 'Confu' + " ( " + timee + " ) " + '_'
    suffix = name + 'Confu' + '.txt'

    file = prefix + suffix
    attempt = 1
    while os.path.exists(file):
        file = prefix + str(attempt) + '_' + suffix
        attempt += 1

    # The context manager closes the handle even if a write fails.
    with open(file,'w') as write:

        write.write(timee + '\n'+ '\n')
        write.write(name + '\n')

        for category in results:

            write.write("Category = " + category + '\n')

            for metric, value in results[category].items():
                write.write(metric + " = " + str(value) + '\n')

            write.write('\n')
            write.write('\n')

    

def writeResult(path,name,results,category):
    """Write the metrics of one category to a new, dated text file.

    The file is named ``<path><name> ( <dd-mm-YYYY> ) _<name>.txt``. If that
    file already exists a running number is inserted (``..._1_<name>.txt``,
    ``..._2_...``) so earlier results are never overwritten. The previous
    version tested only the name prefix, which is never created on disk, so
    same-day runs replaced each other.

    Args:
        path: directory prefix, including the trailing separator.
        name: label used in the file name and written as a header line.
        results: dict category -> dict of metric name -> value.
        category: key of results to write.
    """
    timee = time.strftime("%d-%m-%Y")

    prefix = path + name  + " ( " + timee + " ) " + '_'
    suffix = name + '.txt'

    file = prefix + suffix
    attempt = 1
    while os.path.exists(file):
        file = prefix + str(attempt) + '_' + suffix
        attempt += 1

    # The context manager closes the handle even if a write fails.
    with open(file,'w') as write:

        write.write(timee + '\n'+ '\n')
        write.write(name + '\n')
        write.write("Category = " + category + '\n')

        for metric, value in results[category].items():
            write.write(metric + " = " + str(value) + '\n')
    
def writeCoef(path,snpsIds,sc,idToName,name = None):
    """Write one "<snp name><TAB><score>" line per selected SNP to a new file.

    The file is named ``<path><name> ( <dd-mm-YYYY> ).txt``; when that exists
    ``<path><name> ( <dd-mm-YYYY> ) _<n>.txt`` is used with the first free n.
    The first candidate used to end in ".txt " (trailing blank), unlike all
    numbered names; the stray blank is removed.

    Args:
        path: directory prefix, including the trailing separator.
        snpsIds: sequence of SNP ids, in output order.
        sc: sequence of scores, aligned with snpsIds.
        idToName: mapping SNP id -> SNP name.
        name: base file name; when missing a message is printed and nothing
            is written.
    """
    if not name:
        print("give a name to file")
        return

    stamp = time.strftime("%d-%m-%Y")
    p = path + name  + " ( " + stamp + " ).txt"

    attempt = 1
    while os.path.exists(p):
        p = path + name  + " ( " + stamp + " ) " + '_' + str(attempt)+".txt"
        attempt += 1

    snps = [idToName[snpId] for snpId in snpsIds]

    print("snpsIds = ",len(snpsIds))
    print("idToName = ",len(idToName))

    # The context manager closes the handle even if a write fails.
    with open(p,'w') as write:
        for position, snpName in enumerate(snps):
            write.write(str(snpName)+'\t'+str(sc[position])+'\n')
           
        

def balancedData(X,Y):
    """Down-sample the non-case rows so cases and controls are balanced.

    All rows with Y == 1 (cases) are kept in their original order, followed
    by an equally large random selection of the remaining rows (controls),
    drawn without replacement.

    The previous version marked used rows with ``selected[i] = i`` while
    testing ``selected[...] == 1``, so nothing was ever excluded: cases could
    be drawn again as "controls" and the same control could be drawn twice.

    Args:
        X: 2-D numpy array of features, one row per sample.
        Y: labels aligned with the rows of X; 1 marks a case.

    Returns:
        (Xbalanced, Ybalanced) as float arrays. They hold 2 * cases rows, or
        fewer when there are not enough controls, in which case every
        control is used once.
    """
    import random  # local import: the file only pulls `randint` from this module

    caseRows = [row for row in range(len(Y)) if Y[row] == 1]
    controlRows = [row for row in range(len(Y)) if Y[row] != 1]

    wanted = min(len(caseRows), len(controlRows))
    chosen = caseRows + random.sample(controlRows, wanted)

    Xbalanced = np.zeros((len(chosen),len(X.T)))
    Ybalanced = np.zeros(len(chosen))

    for position, source in enumerate(chosen):
        Xbalanced[position,:] = X[source,:]
        Ybalanced[position] = Y[source]

    print("len x = ", Xbalanced.shape)
    print("len y = ", Ybalanced.shape)

    return Xbalanced, Ybalanced


def casePatient(X,Y):
    """Return the rows of X whose label in Y equals 1, as a float table.

    Args:
        X: 2-D numpy array, one row per sample.
        Y: labels aligned with the rows of X.

    Returns:
        New float array holding the selected rows in their original order.
    """
    caseRows = [row for row in range(len(Y)) if Y[row] == 1]
    cases = len(caseRows)

    xCase = np.zeros((cases,len(X.T)))
    for slot, source in enumerate(caseRows):
        xCase[slot,:] = X[source,:]

    print('cases = ', cases)
    print('xCase shape = ', xCase.shape)

    return xCase


def writeTable(X,Y,ids):
    """Dump the feature table as comma-separated text to <path>snp2.txt.

    The header line is ``patients,<snp name>,...,TARGET``; every following
    line holds the patient name, the values of that patient's row of X and
    the matching entry of Y. Uses the module-level ``path`` as the
    directory prefix.

    Args:
        X: 2-D array, one row per patient and one column per SNP.
        Y: target values aligned with the rows of X.
        ids: dict whose ['snps']['idToName'] maps a column index to a SNP
            name and whose ['patients']['idToName'] maps a row index to a
            patient name.
    """
    columnCount = len(X.T)

    columns = ['patients']
    columns.extend(ids['snps']['idToName'][col] for col in range(columnCount))
    columns.append('TARGET')

    # The context manager closes the handle even if a write fails.
    with open(path+'snp2.txt','w') as file:

        file.write(','.join(columns) + '\n')

        for row in range(len(X)):
            cells = [ids['patients']['idToName'][row]]
            cells.extend(str(X[row,col]) for col in range(columnCount))
            cells.append(str(Y[row]))
            file.write(','.join(cells) + '\n')
    
    
def writelibsvm(X):
    """Write a table in libsvm text format to <path>libsvm1.txt.

    Every line reads ``<target> 1:<v1> 2:<v2> ... n:<vn>``; feature numbers
    start at 1 and follow the column order left after 'TARGET' is dropped.
    Uses the module-level ``path`` as the directory prefix.

    The previous version went through ``DataFrame.ix``, which newer pandas
    releases no longer provide, and relied on a leftover loop variable that
    is undefined when there is only one feature column. Rows are now read
    positionally from the underlying arrays.

    Args:
        X: table with a 'TARGET' column and one column per feature;
            presumably a pandas DataFrame (it is used through X['TARGET']
            and X.drop(..., axis=1)).
    """
    target = X['TARGET'].values
    features = X.drop(['TARGET'], axis=1).values

    # The context manager closes the handle even if a write fails.
    with open(path+'libsvm1.txt','w') as file:

        for label, row in zip(target, features):
            fields = [str(label)]
            fields.extend(str(col + 1) + ':' + str(value) for col, value in enumerate(row))
            file.write(' '.join(fields) + '\n')
            



In [8]:

# Root folder of the data set in use. The commented-out lines are alternative
# locations (other machines / other p-value cut-offs) kept for switching.
#path = 'C:\\Users\\ANTONIS\\Desktop\\newSet\\maf\\maf = 0.05\\assoc\\pvalue = 0.001\\'
#path = 'C:\\Users\\ANTONIS\\Desktop\\newSet\\maf\\maf = 0.05\\assoc\\pvalue = 0.01\\'
path = 'E:\\newSet\\pvalue = 0.001\\'
#path = 'D:\\newdata\\maf = 0.05\\pvalue = 0.001\\'

#path = '/media/antonis/red/newSet/maf/maf = 0.05/assoc/pvalue = 0.001/'

#path = '/media/antonis/red/newdata/maf = 0.05/pvalue = 0.001/'
#path = '/media/antonis/red/newSet/maf/maf = 0.05/assoc/pvalue = 0.01/'

numberOfChromosomes = 22  # number of chromosomes
chromosomes = {}

# Project reader / writer helpers (IO.Input, IO.Output), both bound to the
# same folder and chromosome count.
read = Input(path,numberOfChromosomes)
write = Output(path,numberOfChromosomes)

# Load the patients and the SNPs, then write both lists back out through the
# writer. NOTE(review): file formats and exact outputs are defined in the IO
# package, not visible here.
patients = read.readPatients('Patients.txt')
chromosomes = read.readSnps(".assoc")
write.writePatientsList(patients,'patient.txt')
write.writeSnpsList(chromosomes)


In [9]:
# Tally the loaded patients by the value returned from getCase().
sum1 = 0     # patients with getCase() == 1
sum2 = 0     # patients with getCase() == 0
a = 0        # patients with any other value
sumall = 0   # every patient seen

for patientKey in patients:
    status = patients[patientKey].getCase()
    sumall += 1

    if status == 1:
        sum1 += 1
    elif status == 0:
        sum2 += 1
    else:
        a += 1

print("sum1 = ",sum1)
print('sum2 = ', sum2)
print('sumall = ',sumall)
print('all = ',sum1+sum2)
print(a)

sum1 =  1018
sum2 =  3962
sumall =  4980
all =  4980
0


In [10]:

# Build the name <-> index tables for patients and SNPs, then load the coded
# SNP table. The code file is generated first when it is not on disk yet.
snps = read.getListOfSnps()
ids = {
    'patients': setIdToName(list(patients.keys())),
    'snps': setIdToName(snps),
}

if not os.path.exists(path + 'snpCode.txt'):
    write.writeSnpLog(read.getNumberOfPatients(),read.getNumberOfSnps(),chromosomes,read.getSnpsList(),patients)
else:
    print("mphka")

X, Y = read.readSnpsCode(patients,ids)




mphka
mphka2


In [11]:

# Hold out 10% of the rows as a test set; the split seed is itself random, so
# the partition differs from run to run.
xTraining, xTest, yTraining1, yTest = train_test_split(
    X, Y, test_size=0.1, random_state=randint(0,2018))

# Report the shape of the full table and of each part.
for caption, part in (
        ("mergex = ", X),
        ("xTrain = ", xTraining),
        ("xTest = ", xTest),
        ("yTrain = ", yTraining1),
        ("yTest = ", yTest)):
    print(caption, part.shape)

mergex =  (4980, 7799)
xTrain =  (4482, 7799)
xTest =  (498, 7799)
yTrain =  (4482,)
yTest =  (498,)


In [12]:
# Build the correlation helper (project class metrics.Correlation) over the
# full table X. The commented-out lines are alternatives that were tried:
# case patients only, the RSquare helper, or the training split only.
#xC = casePatient(X,Y)
#cor = Correlation(xC)

cor = Correlation(X)


#cor = RSquare(X)

#cor = Correlation(xTraining)
#cor = RSquare(xTraining)
# Selected SNP column ids, keyed by category name.
snpReduc = {}

In [13]:
# Parameters handed to Correlation.getLowCorrelationSnps.
# NOTE(review): the meaning of down / up / c is defined in metrics.Correlation,
# which is not visible here; confirm there before changing them.
down = 100
up = 100 
threshold = 0.7
#snpReduc['low'] = cor.getLowCorrelationSnps(threshold, down=down,up=up)
snpReduc['low100'] = cor.getLowCorrelationSnps(threshold, down=down,up=up,c=-2)
# Record which SNPs were kept, labelled '100%'.
write.writeSnpsUsed(snpReduc['low100'],ids['snps']['idToName'],chromosomes,'100%')
# The bare string below is disabled code (reading the SNP selection from a
# '0.01 cut of.txt' file instead). It is never executed; as the last
# expression of the cell the notebook only echoes it.
'''

file = open(path+'0.01 cut of.txt','r')
snpReduc = {}
down = 100
up = 100 
threshold = 0.7
snpReduc['low100'] = []
for line in file:
    
    snpReduc['low100'].append(ids['snps']['nameToId'][line.split()[0].strip()])
'''   

count =  883
len snpsRed =  883
snpsIds =  883
idToName =  7799


"\n\nfile = open(path+'0.01 cut of.txt','r')\nsnpReduc = {}\ndown = 100\nup = 100 \nthreshold = 0.7\nsnpReduc['low100'] = []\nfor line in file:\n    \n    snpReduc['low100'].append(ids['snps']['nameToId'][line.split()[0].strip()])\n"

In [14]:
# Category names to evaluate; each must be a key of snpReduc. The initial
# empty list is overwritten below, and the commented lines are earlier choices.
categories = []
#categories = ['low','high']
#categories = ['low']
#categories = ['low97','low100']
categories = ['low100']
# Empty containers for results to be written out.
writeResults2={}
writeResults={}

In [15]:
from sklearn.svm import NuSVC

# For every category, cut the full table and both splits down to the SNP
# columns selected for that category.
for i in categories:

    writeResults = {}
    r1 = {'down': down, 'up': up, 'thres': threshold}
    print("Category = ",i)

    XX, xTraining1, xTest1 = (
        createNewTable(snpReduc[i], table) for table in (X, xTraining, xTest)
    )
    
    
   
    
    
   

Category =  low100
new shape =  (4980, 883)
new shape =  (4482, 883)
new shape =  (498, 883)


In [16]:
# Thread-backed pool with two workers (multiprocessing.dummy wraps threads,
# not processes). NOTE(review): nothing in the visible code submits work to
# `pool` or closes it.
from multiprocessing.dummy import Pool as ThreadPool
pool = ThreadPool(2)

In [19]:
# Candidate models: a linear-kernel SVM and a cross-validated logistic
# regression, each stored under the 'c' key of its own entry.
clf = SVC(kernel='linear')
lr_clf = linear_model.LogisticRegressionCV() 
items = {0: {'c': clf}, 1: {'c': lr_clf}}

# 10-fold cross validation of every candidate on the reduced table.
results = [
    crossValidiation(XX, Y, k = 10, classifier = entry['c'], continious = False)
    for entry in items.values()
]



'''results = crossValidiation(XX, Y, k = 10, classifier = clf, continious = False)
results['down'] = down
results['up'] = up
results['thres'] = threshold
print("accuracy = ",results['accuracy'])
print("AUC = ", results['auc'])
print("recal = ",results['recall'])
print("precision = ",results['precision'])
print("f1 = ",results['f1'])
results['len_snps'] = len(snpReduc[i])'''



'results = crossValidiation(XX, Y, k = 10, classifier = clf, continious = False)\nresults[\'down\'] = down\nresults[\'up\'] = up\nresults[\'thres\'] = threshold\nprint("accuracy = ",results[\'accuracy\'])\nprint("AUC = ", results[\'auc\'])\nprint("recal = ",results[\'recall\'])\nprint("precision = ",results[\'precision\'])\nprint("f1 = ",results[\'f1\'])\nresults[\'len_snps\'] = len(snpReduc[i])'

0.9118473895582329

In [None]:
def showMyMetric1(results,v1 = 1,v2 = 0):
    """Count how often column v2 agrees with column v1 on the values 0 and 1.

    Args:
        results: object whose collect() returns the rows to inspect; each
            row is indexable by v1 and v2.
        v1: index of the reference column.
        v2: index of the column compared against it.

    Returns:
        dict with 'sum0' / 'sum1' (rows whose v1 value is 0 / 1) and
        'same0' / 'same1' (those of them whose v2 value is 0 / 1 as well).
        The counts are also printed. Equal v1 and v2 only print a warning.
    """
    if v1 == v2:
        print("wrong values!!!v1 is the same with v2!!!!")

    rows = results.collect()

    zeroTotal = 0
    zeroAgree = 0
    oneTotal = 0
    oneAgree = 0
    seen = 0

    for row in rows:
        seen += 1
        if row[v1] == 0:
            zeroTotal += 1
            if row[v2] == 0:
                zeroAgree += 1
        elif row[v1] == 1:
            oneTotal += 1
            if row[v2] == 1:
                oneAgree += 1

    report = (
        ('sum0 = ', zeroTotal),
        ('sum1 = ', oneTotal),
        ('same0 = ', zeroAgree),
        ('same1 = ', oneAgree),
        ('all = ', seen),
        ('all2 = ', zeroTotal + oneTotal),
    )
    for caption, value in report:
        print(caption, value)

    return {
        'sum0': zeroTotal,
        'sum1': oneTotal,
        'same1': oneAgree,
        'same0': zeroAgree,
    }


def showMyMetric2(results,v1 = 1,v2 = 0):
    """Count how often column v2 agrees with column v1 on the values 0 and 1.

    Args:
        results: object whose collect() returns the rows to inspect; each
            row is indexable by v1 and v2.
        v1: index of the reference column.
        v2: index of the column compared against it.

    Returns:
        dict with 'sum0' / 'sum1' (rows whose v1 value is 0 / 1) and
        'same0' / 'same1' (those of them whose v2 value is 0 / 1 as well).
        The counts are also printed. Equal v1 and v2 only print a warning.
    """
    if v1 == v2:
        print("wrong values!!!v1 is the same with v2!!!!")

    rows = results.collect()

    zeroTotal = 0
    zeroAgree = 0
    oneTotal = 0
    oneAgree = 0
    seen = 0

    for row in rows:
        seen += 1
        if row[v1] == 0:
            zeroTotal += 1
            if row[v2] == 0:
                zeroAgree += 1
        elif row[v1] == 1:
            oneTotal += 1
            if row[v2] == 1:
                oneAgree += 1

    report = (
        ('sum0 = ', zeroTotal),
        ('sum1 = ', oneTotal),
        ('same0 = ', zeroAgree),
        ('same1 = ', oneAgree),
        ('all = ', seen),
        ('all2 = ', zeroTotal + oneTotal),
    )
    for caption, value in report:
        print(caption, value)

    return {
        'sum0': zeroTotal,
        'sum1': oneTotal,
        'same1': oneAgree,
        'same0': zeroAgree,
    }


def showMyMetric3(results,v1 = 1,v2 = 0):
    """Count how often column v2 agrees with column v1 on the values 0 and 1.

    Args:
        results: object whose collect() returns the rows to inspect; each
            row is indexable by v1 and v2.
        v1: index of the reference column.
        v2: index of the column compared against it.

    Returns:
        dict with 'sum0' / 'sum1' (rows whose v1 value is 0 / 1) and
        'same0' / 'same1' (those of them whose v2 value is 0 / 1 as well).
        The counts are also printed. Equal v1 and v2 only print a warning.
    """
    if v1 == v2:
        print("wrong values!!!v1 is the same with v2!!!!")

    rows = results.collect()

    zeroTotal = 0
    zeroAgree = 0
    oneTotal = 0
    oneAgree = 0
    seen = 0

    for row in rows:
        seen += 1
        if row[v1] == 0:
            zeroTotal += 1
            if row[v2] == 0:
                zeroAgree += 1
        elif row[v1] == 1:
            oneTotal += 1
            if row[v2] == 1:
                oneAgree += 1

    report = (
        ('sum0 = ', zeroTotal),
        ('sum1 = ', oneTotal),
        ('same0 = ', zeroAgree),
        ('same1 = ', oneAgree),
        ('all = ', seen),
        ('all2 = ', zeroTotal + oneTotal),
    )
    for caption, value in report:
        print(caption, value)

    return {
        'sum0': zeroTotal,
        'sum1': oneTotal,
        'same1': oneAgree,
        'same0': zeroAgree,
    }
    
    
    
def showMyMetric4(results,v1 = 1,v2 = 0):
    """Count how often column v2 agrees with column v1 on the values 0 and 1.

    Args:
        results: object whose collect() returns the rows to inspect; each
            row is indexable by v1 and v2.
        v1: index of the reference column.
        v2: index of the column compared against it.

    Returns:
        dict with 'sum0' / 'sum1' (rows whose v1 value is 0 / 1) and
        'same0' / 'same1' (those of them whose v2 value is 0 / 1 as well).
        The counts are also printed. Equal v1 and v2 only print a warning.
    """
    if v1 == v2:
        print("wrong values!!!v1 is the same with v2!!!!")

    rows = results.collect()

    zeroTotal = 0
    zeroAgree = 0
    oneTotal = 0
    oneAgree = 0
    seen = 0

    for row in rows:
        seen += 1
        if row[v1] == 0:
            zeroTotal += 1
            if row[v2] == 0:
                zeroAgree += 1
        elif row[v1] == 1:
            oneTotal += 1
            if row[v2] == 1:
                oneAgree += 1

    report = (
        ('sum0 = ', zeroTotal),
        ('sum1 = ', oneTotal),
        ('same0 = ', zeroAgree),
        ('same1 = ', oneAgree),
        ('all = ', seen),
        ('all2 = ', zeroTotal + oneTotal),
    )
    for caption, value in report:
        print(caption, value)

    return {
        'sum0': zeroTotal,
        'sum1': oneTotal,
        'same1': oneAgree,
        'same0': zeroAgree,
    }

def showMyMetric5(results,v1 = 1,v2 = 0):
    """Count how often column v2 agrees with column v1 on the values 0 and 1.

    Args:
        results: object whose collect() returns the rows to inspect; each
            row is indexable by v1 and v2.
        v1: index of the reference column.
        v2: index of the column compared against it.

    Returns:
        dict with 'sum0' / 'sum1' (rows whose v1 value is 0 / 1) and
        'same0' / 'same1' (those of them whose v2 value is 0 / 1 as well).
        The counts are also printed. Equal v1 and v2 only print a warning.
    """
    if v1 == v2:
        print("wrong values!!!v1 is the same with v2!!!!")

    rows = results.collect()

    zeroTotal = 0
    zeroAgree = 0
    oneTotal = 0
    oneAgree = 0
    seen = 0

    for row in rows:
        seen += 1
        if row[v1] == 0:
            zeroTotal += 1
            if row[v2] == 0:
                zeroAgree += 1
        elif row[v1] == 1:
            oneTotal += 1
            if row[v2] == 1:
                oneAgree += 1

    report = (
        ('sum0 = ', zeroTotal),
        ('sum1 = ', oneTotal),
        ('same0 = ', zeroAgree),
        ('same1 = ', oneAgree),
        ('all = ', seen),
        ('all2 = ', zeroTotal + oneTotal),
    )
    for caption, value in report:
        print(caption, value)

    return {
        'sum0': zeroTotal,
        'sum1': oneTotal,
        'same1': oneAgree,
        'same0': zeroAgree,
    }
    
def calculateAvgMetrics1(results,classLabel=1):
    """Derive accuracy, recall, precision and F1 from showMyMetric1's counts.

    showMyMetric1 is called with its defaults, so column 1 of each row is
    the reference and column 0 is compared against it. With rows selected
    as (prediction, label), as crossVal does, that gives:
    same1 = true positives, same0 = true negatives, sum1 = actual positives
    and sum0 = actual negatives.

    Fixes over the previous version:
      * precision is TP / (TP + FP); it used to be computed as
        same1 / sum1 + (sum0 - same0), i.e. recall plus the raw
        false-positive count;
      * F1 is built from the values just computed; it used to look up
        'recal' / 'precision' in the count dict and raise KeyError;
      * a zero denominator yields 0.0 instead of ZeroDivisionError.

    Args:
        results: object with a collect() method, passed to showMyMetric1.
        classLabel: not used; kept so existing calls keep working.

    Returns:
        dict with the keys 'accuracy', 'recal', 'precision' and 'f1'.
    """
    counts = showMyMetric1(results)

    truePositives = counts['same1']
    trueNegatives = counts['same0']
    actualPositives = counts['sum1']
    actualNegatives = counts['sum0']
    falsePositives = actualNegatives - trueNegatives

    def ratio(numerator, denominator):
        # An undefined ratio (empty denominator) is reported as 0.0.
        return numerator / denominator if denominator else 0.0

    recall = ratio(truePositives, actualPositives)
    precision = ratio(truePositives, truePositives + falsePositives)

    metricss = {}
    metricss["accuracy"] = ratio(truePositives + trueNegatives, actualPositives + actualNegatives)
    metricss["recal"] = recall
    metricss["precision"] = precision
    metricss["f1"] = ratio(2 * recall * precision, recall + precision)

    return metricss
    

def calculateAvgMetrics2(results,classLabel=1):
    """Derive accuracy, recall, precision and F1 from showMyMetric2's counts.

    showMyMetric2 is called with its defaults, so column 1 of each row is
    the reference and column 0 is compared against it. With rows selected
    as (prediction, label), as crossVal does, that gives:
    same1 = true positives, same0 = true negatives, sum1 = actual positives
    and sum0 = actual negatives.

    Fixes over the previous version:
      * precision is TP / (TP + FP); it used to be computed as
        same1 / sum1 + (sum0 - same0), i.e. recall plus the raw
        false-positive count;
      * F1 is built from the values just computed; it used to look up
        'recal' / 'precision' in the count dict and raise KeyError;
      * a zero denominator yields 0.0 instead of ZeroDivisionError.

    Args:
        results: object with a collect() method, passed to showMyMetric2.
        classLabel: not used; kept so existing calls keep working.

    Returns:
        dict with the keys 'accuracy', 'recal', 'precision' and 'f1'.
    """
    counts = showMyMetric2(results)

    truePositives = counts['same1']
    trueNegatives = counts['same0']
    actualPositives = counts['sum1']
    actualNegatives = counts['sum0']
    falsePositives = actualNegatives - trueNegatives

    def ratio(numerator, denominator):
        # An undefined ratio (empty denominator) is reported as 0.0.
        return numerator / denominator if denominator else 0.0

    recall = ratio(truePositives, actualPositives)
    precision = ratio(truePositives, truePositives + falsePositives)

    metricss = {}
    metricss["accuracy"] = ratio(truePositives + trueNegatives, actualPositives + actualNegatives)
    metricss["recal"] = recall
    metricss["precision"] = precision
    metricss["f1"] = ratio(2 * recall * precision, recall + precision)

    return metricss
                                                  

                                

    

def crossVal(numFold = 0,samples = None, labels =None,classifier = None):
    """Fit and score a classifier on pre-built folds, one metric dict per fold.

    For every fold i in 1..numFold the classifier is fitted on
    samples['train'][i], applied to samples['test'][i] via
    model.transform, and scored.

    Fixes over the previous version:
      * the metrics come from calculateAvgMetrics1; the name that used to
        be called, calculateAvgMetrics, is not defined anywhere visible;
      * the AUC used to be stored and then overwritten on the next line;
        it is now kept under the 'auc' key of the fold's metric dict;
      * None checks use `is None`.

    Args:
        numFold: number of folds to evaluate.
        samples: dict with 'train' and 'test' entries, each indexable by
            fold number.
        labels: not used.
        classifier: estimator whose fit(train) returns a model exposing
            transform(test_data).

    Returns:
        dict fold number -> metric dict ('accuracy', 'recal', 'precision',
        'f1', 'auc'); None when classifier or samples is missing.
    """
    if classifier is None:

        print("classifier not given")
        return

    if samples is None:
        return

    avgMetrics = {}

    for i in range(1,numFold + 1):

        print("cross = ",i)

        test_data = samples['test'][i]
        train = samples['train'][i]

        model = classifier.fit(train)
        results = model.transform(test_data)

        # NOTE(review): BinaryClassificationEvaluator is not imported in the
        # visible part of this file; it must be in scope before this runs.
        evaluate = BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='label')
        AUC = evaluate.evaluate(results)

        # Column order matters: the metric helper reads column 1 (label) as
        # the reference and column 0 (prediction) as the value under test.
        results1 = results.select('prediction','label')

        foldMetrics = calculateAvgMetrics1(results1.rdd,classLabel=1)
        foldMetrics['auc'] = AUC
        avgMetrics[i] = foldMetrics

    return avgMetrics


def writeResult(path,name,results=None,category='',thresh=-1, down=-1, up=-1):
    """Write one result dict plus its run parameters to a new, dated file.

    The file is named ``<path><name> ( <dd-mm-YYYY> ) _<name>.txt``. If that
    file already exists a running number is inserted (``..._1_<name>.txt``,
    ``..._2_...``) so earlier results are never overwritten.

    Fixes over the previous version:
      * values were read from the undefined name `result`, so any non-empty
        results raised NameError;
      * the default results=None was iterated directly; it is now treated
        as "no metrics";
      * the uniqueness check tested only the name prefix, which is never
        created on disk, so same-day runs replaced each other;
      * the file handle is closed even when a write fails.

    Args:
        path: directory prefix, including the trailing separator.
        name: label used in the file name and written as a header line.
        results: dict metric name -> value, or None.
        category: category label written to the header.
        thresh: threshold value recorded in the header.
        down: 'down' parameter recorded in the header.
        up: 'up' parameter recorded in the header.
    """
    timee = time.strftime("%d-%m-%Y")

    prefix = path + name  + " ( " + timee + " ) " + '_'
    suffix = name + '.txt'

    file = prefix + suffix
    attempt = 1
    while os.path.exists(file):
        file = prefix + str(attempt) + '_' + suffix
        attempt += 1

    entries = results if results is not None else {}

    with open(file,'w') as write:

        write.write(timee + '\n'+ '\n')
        write.write(name + '\n')
        write.write('threshold = ' + str(thresh) + '\n')
        write.write('down = ' + str(down) +'\n')
        write.write('up = '+ str(up) + '\n')
        write.write("Category = " + category + '\n')

        for metric in entries:
            write.write(metric + " = " + str(entries[metric]) + '\n')
    
    
        
