In [1]:
import pandas as pd
import os
import shutil
import getFeatures as gf
import numpy as np
from sklearn import metrics
import scipy
from matplotlib import pyplot as plt

In [2]:
def get_all_files(directories):
    filenames = []
    for directory in directories:
        for filename in os.listdir(directory):
            if filename.endswith('.xlsx') or filename.endswith('.xls'):
                filenames.append(directory+filename)
    return filenames

def get_stats(featureFile):
    df = pd.read_excel(featureFile)
    return df.mean(axis = 0), df.std(axis = 0)

def file_2_rank(excelName):
    file2rank = dict()
    try:
        df = pd.read_excel(excelName)
    except:
        df = pd.read_csv(excelName, sep='\t')
    
    for i in range(df.shape[0]):
        file2rank[df.loc[i, 'Filename']] = df.loc[i, 'Rank']
    
    return file2rank

def file_2_feature_train(filenames, file2rank):
    file2feature = dict()
    for filename in filenames:
        assert filename in file2rank.keys(), filename + ' not in ranking excel file'
        
        l = list(gf.classify(filename, ''))
        l.append(file2rank[filename])
        file2feature[filename] = l
        
    return file2feature

def file_2_feature_test(filenames):
    file2feature = dict()
    for filename in filenames:
        l = list(gf.classify(filename, ''))
        file2feature[filename] = l
        
    return file2feature

def distance(sample, cluster):
    dist = 0
    for i in range(len(sample)):
        dist += (cluster[i] - sample[i])**2
    return dist**(1/2)

def find_closest_cluster(df, clusters):
    file2cluster = dict()
    
    def find_min(vals):
        minIndex = None
        minVal = np.inf
        for i in range(len(vals)):
            #print(val[i])
            if vals[i] < minVal:
                minIndex = i
                minVal = vals[i]
        return minIndex+1
    
    for i in range(df.shape[0]):
        allDist = []
        for j in range(clusters.shape[0]):
            dist = distance(df.iloc[i, :], clusters[j, :])
            allDist.append(dist)
        
        file2cluster[list(df.index)[i]] = find_min(allDist)
    return file2cluster

def rank(directories, clusterFilename, cluster2group):
    allFilenames = get_all_files(directories)
    means, stds = get_stats('AllFeatures_Nate.xls')
    file2features = file_2_feature_test(allFilenames)
    #df = pd.read_excel(featureFile)
    clusters = np.loadtxt(clusterFilename)
    
    columns = ['Ratio of Peaks Found', 'Ratio of Peaks to Ideal', 'Ratio of Range', 'Inverse Standard Deviation', 'Area Under the Curve', 'Normed Area Under the Curve', 'Smoothing Error']

    df = pd.DataFrame(list(file2features.values()), columns = columns, index = file2features.keys())
    for column in columns:
        df.loc[:, column] = (df.loc[:, column] - means[column])/stds[column]
    #print(df)
    file2cluster = find_closest_cluster(df, clusters)
    file2group = dict()
    for filename in file2cluster.keys():
        for key in cluster2group.keys():
            if file2cluster[filename] in key:
                file2group[filename] = cluster2group[key]
                break
    
    return file2group

def get_wilcoxon(testRanks, realRanks):
    def remove_directory(filename):
        for i in range(len(filename)):
            if filename[i] == '/':
                return filename[i+1:]
            
    df = pd.read_excel('ranked_curves.xlsx')
    rankedDict = dict()
    for i in range(df.shape[0]):
        rankedDict[df.loc[i, 'name']] = df.loc[i, 'rank']
    
    for key in rankedDict.keys():
        for pair in realRanks.keys():
            if rankedDict[key] >= pair[0] and rankedDict[key] <= pair[1]:
                rankedDict[key] = realRanks[pair]
                break
    
    testList = []
    realList = []
    for f1 in testRanks.keys():
        testList.append(testRanks[f1])
        realList.append(rankedDict[remove_directory(f1)])
    
    print(testList)
    print(realList)
    return scipy.stats.wilcoxon(testList, realList)

In [3]:
cluster2group = dict()
cluster2group[(6, 7)] = 0
cluster2group[(4, 1)] = 1
cluster2group[(2, 8)] = 1
cluster2group[(3, 5)] = 0
test = rank(['train/', 'test/'], 'ClusterValues.txt', cluster2group)

train/AnGam_Mos55_cells.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/Dmel_HI-N31_sRNA.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAeg_Aag2_eGFPIP2B_AK.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAeg_CCL-125cells_CHIKV.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAlbo_VietFcar_AEFV_MERV_SHTV_YS.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAeg_Aag2_SBV2_AK.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/Dmel_fGSOSS_cellline.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/Dmel_Sg4_cellline.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAeg_Female_Soma.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/Dmel_OSCSiomi2015_sRNA.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAeg_Aag2_Zika.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAeg_Female_Ovary_BetaE.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAlbo_C636_2dpi_WNV1.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAlbo_C636_6dpi_WNV1.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/Dmel_3r

train/AeAlbo_CHIKV_9dpi.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAeg_Fcarc_GH_rep3.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAlbo_C636_cells.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAeg_Aag2_BTV2_AK.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/CuQuin_Fcarc_NL.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/Dmel_OSSHann2016_sRNA.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/Dmel_Ovary_Ago2_414.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAlbo_Ovary_OA.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAeg_Aag2_Piwi4IP2A_AK.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAlbo_C6-36_Mock_GP.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/Dmel_OSScells_BetaE.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/CuQuin_Hsu_6dpi_WNV3.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAeg_Female_Carcass_BF72h.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/Dmel_HI-N10_sRNA.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAlbo_CHIKV_3dpi.24_35.trim.

train/AeAeg_Aag2SINV_dsPiwi5.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/Dmel_ML-DmD20-c5_cellline.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/Dmel_sRNA_w1XHar_21day.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAlbo_C636_SINV_GFP.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/CuQuin_Testes_NL.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAeg_Larvae.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AnGam_Ovaries_rep3_KM.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAlbo_C636_2dpi_WNV2.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/CuQuin_Hsu_17dpi_WNV3.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAlbo_U4.4_Mock_GP.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAlbo_VietMcar_AEFV_MERV_SHTV_YS.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAeg_Aag2_Ago3IP2B_AK.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/Dmel_sRNA_HarXw1_21day.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xls
train/AeAlbo_C636_CHIKV_AZT_3dpi.24_35.trim.fastq.uq.polyn.5to5_SPECIES.xl

In [4]:
counters = [0 for i in range(4)]
for i in list(test.values()):
    counters[i-1] += 1
print(counters)

newCounters = []
val = 0
for i in counters:
    val += i
    newCounters.append(val)
print(newCounters)

[183, 0, 0, 144]
[183, 183, 183, 327]


In [5]:
clusters = np.loadtxt('ClusterValues.txt')
print(clusters.shape)

(8, 7)


In [6]:
maxes, stds = get_stats('AllFeatures_Nate.xls')
print(maxes)
print(stds)
print(maxes['Ratio of Peaks Found'])

Ratio of Peaks Found            0.467456
Ratio of Peaks to Ideal         0.931818
Ratio of Range                  0.821902
Inverse Standard Deviation     11.261711
Smoothing Error                27.650496
Area Under the Curve           54.079553
Normed Area Under the Curve     1.533342
dtype: float64
Ratio of Peaks Found            0.192427
Ratio of Peaks to Ideal         0.125912
Ratio of Range                  0.184889
Inverse Standard Deviation     11.840161
Smoothing Error                21.778950
Area Under the Curve           25.917562
Normed Area Under the Curve     0.500339
dtype: float64
0.46745567519958825


In [7]:
realRanks = dict()
realRanks[(1, 45)] = 0
realRanks[(46, 138)] = 0
realRanks[(139, 216)] = 1
realRanks[(216, 350)] = 1

print(get_wilcoxon(test, realRanks))

[0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1]
[1, 1, 1, 0, 1, 1,