In [6]:
import numpy as np
import os
import pandas as pd #data handling library
import sklearn as sk

def get_RPKM_by_file(file_name, data_fields):
    """Load one tab-delimited data file as a single wide row of values.

    Only the columns named in ``data_fields`` are read. The first of those
    columns (in file order) supplies the column labels of the result; every
    remaining column becomes one row, labelled with that column's name.

    Parameters
    ----------
    file_name : str
        Path of the tab-delimited file to read.
    data_fields : sequence of str
        Column names to load, passed straight to ``usecols``.

    Returns
    -------
    pandas.DataFrame
        One row per remaining column (e.g. a single ``RPKM`` row), one
        column per value found in the label column.
    """
    df = pd.read_table(file_name, usecols=data_fields)
    # The first loaded column holds the labels that become the output header.
    label_column = df.columns[0]
    # Move the labels into the index *before* transposing. Transposing a frame
    # that still contains the string label column would coerce every value to
    # ``object`` dtype; this way numeric columns stay numeric.
    wide = df.set_index(label_column).T
    # Drop the axis name inherited from the label column so the header is a
    # plain, unnamed list of labels.
    wide.columns.name = None
    return wide


def get_RPKM_by_filelist(list_fname):
    """Load every data file named in a list file.

    Parameters
    ----------
    list_fname : str
        Path of a text file containing one data-file path per line.

    Returns
    -------
    list
        One result of ``get_RPKM_by_file`` per listed path, in file order.
    """
    data_fields = ('gene','RPKM')
    with open(list_fname, 'r') as f:
        # Lines read from a file keep their trailing newline; strip it so the
        # path passed on is the real file name rather than "name\n".
        paths = [line.strip() for line in f]
    # Blank lines (e.g. a trailing empty line) are not paths; skip them.
    return [get_RPKM_by_file(path, data_fields) for path in paths if path]

def get_RPKM_by_dir(data_dir):
    """Load every file found under a directory tree.

    Parameters
    ----------
    data_dir : str
        Root directory; it and all of its sub-directories are searched.

    Returns
    -------
    list
        One result of ``get_RPKM_by_file`` per file found. Files are visited
        in sorted order so the list order is the same on every machine.
    """
    RPKM_list = []
    data_fields = ('gene','RPKM')
    for dirName, subdirList, fileList in os.walk(data_dir):
        # os.walk yields entries in arbitrary filesystem order. Sorting the
        # sub-directory list in place fixes the traversal order, and sorting
        # the file names fixes the order within each directory.
        subdirList.sort()
        for f in sorted(fileList):
            RPKM_list.append(get_RPKM_by_file(os.path.join(dirName, f), data_fields))
    return RPKM_list

# Load every data file found under the data directory (one DataFrame per file).
data_dir = './Data'
RPKM_list = get_RPKM_by_dir(data_dir)
# Call form of print: same output on Python 2, and valid syntax on Python 3.
print(RPKM_list)

[     ?|100130426 ?|100133144 ?|100134869   ?|10357   ?|10431 ?|136542  \
RPKM  0.06694772   0.9270708   0.4969419  5.518766  22.36725        0   

      ?|155060    ?|26823 ?|280660 ?|317712   ...     ZXDA|7789 ZXDB|158586  \
RPKM  2.880182  0.2761594        0        0   ...     0.4448335   0.9159671   

     ZXDC|79364 ZYG11A|440590 ZYG11B|79699  ZYX|7791 ZZEF1|23140 ZZZ3|26009  \
RPKM   10.07533     0.5495135     5.733637  60.77349    14.72033   9.193995   

     psiTPTE22|387590 tAKR|389932  
RPKM           6.5754   0.1973196  

[1 rows x 20532 columns],      ?|100130426 ?|100133144 ?|100134869   ?|10357   ?|10431 ?|136542  \
RPKM           0    1.180236    0.450388  4.992169  24.97131        0   

      ?|155060    ?|26823 ?|280660 ?|317712   ...    ZXDA|7789 ZXDB|158586  \
RPKM  8.243926  0.2788105        0        0   ...      1.96905    3.550039   

     ZXDC|79364 ZYG11A|440590 ZYG11B|79699  ZYX|7791 ZZEF1|23140 ZZZ3|26009  \
RPKM   16.56948     0.8344213     4.417617  90.04401

In [7]:

#def get_RPKM_by_file(files):
#    for file_name in files:
#        df = pd.read_table(file_name, usecols=data_fields)
#        df = df.T # transpose
#        new_header = df.iloc[0] #grab the first row for the header
#        df = df[1:] #take the data less the header row
#        df = df.rename(columns = new_header.T) #set the header row as the df header
#        RPKM_list.append(df)
#    return RPKM_list



#flattened_data = get_RPKM_by_file(file_list)




In [8]:
def merge_data(data_list):
    """Stack a list of DataFrames row-wise, keeping only their shared columns.

    Parameters
    ----------
    data_list : list of pandas.DataFrame
        Frames to stack; each contributes its rows unchanged.

    Returns
    -------
    pandas.DataFrame
        All rows of all inputs, restricted to the columns present in every
        input (``join='inner'``). Original row labels are kept.
    """
    # Only the non-default options are passed. The old explicit
    # ``join_axes=None`` is gone because pandas 1.0 removed that keyword and
    # raises TypeError when it is supplied.
    return pd.concat(data_list, axis=0, join='inner')

# One row per loaded file, restricted to the columns shared by all files.
X = merge_data(RPKM_list)
# Placeholder labels: one random 0.0/1.0 value per row of X.
# Rounding is vectorised so it does not rely on Python 2's list-returning
# map(). np.random is not seeded here, so the labels differ between runs.
Y = np.round(np.random.rand(len(X), 1)).ravel()

# Call form of print: same output on Python 2, and valid syntax on Python 3.
print(X)

     ?|100130426 ?|100133144 ?|100134869   ?|10357   ?|10431    ?|136542  \
RPKM  0.06694772   0.9270708   0.4969419  5.518766  22.36725           0   
RPKM           0    1.180236    0.450388  4.992169  24.97131           0   
RPKM  0.04956242   0.9206793   0.3488647  3.163578  14.72296           0   
RPKM           0   0.6563802   0.3193393  3.247601  42.65823  0.02245179   
RPKM           0     1.19084   0.4598564  4.235559  32.02006           0   
RPKM           0    1.363317    0.541452  4.209579   25.9165           0   
RPKM           0   0.7934492   0.4086566  3.628636  37.58069           0   
RPKM           0   0.4957196   0.2666096  5.330419  44.28414           0   
RPKM           0    1.180167   0.5171587  3.949443  18.97144           0   
RPKM           0   0.9261403   0.4104143  2.862239  39.80732           0   
RPKM           0   0.9073837   0.3937844  6.820181   45.4014           0   
RPKM           0    1.433736   0.5303374  2.528682  39.12532           0   
RPKM  0.0251

In [9]:
# Impute missing data, assuming X is all numeric values. Y can be strings.
# NOTE(review): `Imputer` was deprecated in scikit-learn 0.20 and removed in
# 0.22 (its successor is `sklearn.impute.SimpleImputer`), so this cell needs an
# older scikit-learn release to run as written.
from sklearn.preprocessing import Imputer
# Features: replace each missing value with the mean of its column (axis=0).
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
X_full = imp.fit_transform(X)

# Labels: `imp` is re-bound to a second imputer that fills with the most
# frequent value instead.
# NOTE(review): Y is built as a 1-D array, and split_data later reads this
# result as Y[0], so the transform presumably returns a 2-D array with a
# single row -- TODO confirm.
imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
Y_full = imp.fit_transform(Y)

In [10]:
# Encode classes as numbers
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Keys of the non-numeric entries to encode. Both lists are empty, so the
# loops below currently do nothing.
nonnumeric_columns_X = []
nonnumeric_columns_Y = []
# NOTE(review): before adding entries, confirm that indexing X_full / Y_full
# by `feature` selects the intended column of the imputed data.
for feature in nonnumeric_columns_X:
    X_full[feature] = le.fit_transform(X_full[feature])
for feature in nonnumeric_columns_Y:
    Y_full[feature] = le.fit_transform(Y_full[feature])

In [11]:
# Split data into train and test sets.
def split_data(X, Y, p_train, verbose=True):
    """Randomly split samples and their labels into train and test sets.

    Each row of ``X`` is assigned to the training set independently with
    probability ``p_train``, so the split sizes vary from call to call.

    Parameters
    ----------
    X : array-like
        Samples, one per row; must support boolean-mask row selection.
    Y : array-like
        Labels. For input with more than one dimension the first row
        (``Y[0]``) is used; 1-D input is used as is.
    p_train : float
        Probability that a sample lands in the training set.
    verbose : bool, optional
        When true (the default), print the labels and the boolean mask.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)``.
    """
    # Keep the historical Y[0] behaviour for multi-dimensional labels, and
    # also accept plain 1-D labels.
    labels = Y[0] if np.ndim(Y) > 1 else np.asarray(Y)
    # Size the mask from the X argument, not from a notebook global, so the
    # function works on whatever data it is handed.
    msk = np.random.rand(len(X)) < p_train
    if verbose:
        print(labels)
        print(msk)
    return X[msk], X[~msk], labels[msk], labels[~msk]

# Each sample goes to the training set with probability 0.6; the rest form
# the test set.
train_X, test_X, train_Y, test_Y = split_data(X_full, Y_full, 0.6)
# Call form of print: same output on Python 2, and valid syntax on Python 3.
print(train_Y)
print(test_Y)

[ 1.  0.  0.  1.  1.  0.  1.  1.  0.  0.  1.  0.  0.  0.  1.  0.  0.]
[ True  True False False  True False  True False  True  True  True  True
 False False False  True  True]
[ 1.  0.  1.  1.  0.  0.  1.  0.  0.  0.]
[ 0.  1.  0.  1.  0.  0.  1.]


In [16]:
from utility import statsTest
# Wrap the training arrays in DataFrames before handing them to the
# project's statistics helper.
df_X_train = pd.DataFrame(train_X)
df_Y_train = pd.DataFrame(train_Y)
stats = statsTest.statistics_test(df_X_train, df_Y_train)
# Label each result row with the matching column name of X.
# NOTE(review): this needs the helper to return exactly one row per column of
# X, in the same order -- confirm against statsTest.statistics_test.
stats.index = X.columns.values
stats.columns = ['pvalue']# adjusted
# DataFrame.sort() was removed in pandas 0.20; sort_values() is the
# replacement and also sorts ascending by default.
stats = stats.sort_values(by=['pvalue']) #adjusted
# Call form of print: same output on Python 2, and valid syntax on Python 3.
print(stats)

                         pvalue
KIAA1377|57562        215.89903
ZBTB7B|51043          215.89903
GPR108|56927          215.89903
ANAPC2|29882          215.89903
CARD10|29775          215.89903
SNAPC1|6617           215.89903
MADD|8567             215.89903
JAKMIP2|9832          215.89903
MAFF|23764            215.89903
SMPD3|55512           215.89903
NEK1|4750             215.89903
TMEM18|129787         215.89903
GPIHBP1|338328        215.89903
TCEAL3|85012          215.89903
GBX2|2637             215.89903
MT1G|4495             215.89903
TCEAL1|9338           215.89903
CASKIN2|57513         215.89903
CTSF|8722             215.89903
PDCD2|5134            215.89903
CTSG|1511             215.89903
SMARCC2|6601          215.89903
MYCNOS|10408          215.89903
INMT|11185            215.89903
CASQ2|845             215.89903
MRPL18|29074          215.89903
SMAGP|57228           215.89903
SMAD4|4089            215.89903
GCGR|2642             215.89903
METTL7A|25840         215.89903
...     

In [17]:
import xgboost as xgb
# Configure the boosted-tree classifier: max_depth=3, n_estimators=300,
# learning_rate=0.05.
gbm = xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05)
# Train on the training split; `gbm` is bound to whatever fit() returns, as
# before.
gbm = gbm.fit(train_X, train_Y)
# Predict class labels for the held-out samples.
predictions = gbm.predict(test_X)


[ 0.  0.  0.  0.  1.  0.  0.]
[ 0.  1.  0.  1. -1.  0.  1.]
