In [32]:
import numpy as np
import os
import pandas as pd #data handling library
import sklearn as sk

def get_RPKM_by_file(file_name, data_fields):
    """Read selected columns of one data file and return them transposed.

    Only the columns named in ``data_fields`` are loaded. The table is then
    transposed, the first row of the transposed table is used as the column
    labels, and that row is removed from the data.

    Parameters
    ----------
    file_name : str
        Path of the delimited text file handed to ``pd.read_table``.
    data_fields : sequence of str
        Column names forwarded as ``usecols``.

    Returns
    -------
    pandas.DataFrame
        The transposed table without its first row, with columns relabelled
        from that row.
    """
    transposed = pd.read_table(file_name, usecols=data_fields).T
    # Row 0 of the transposed table carries the labels; everything after it
    # is the actual data.
    labels = transposed.iloc[0]
    body = transposed.iloc[1:]
    # The label Series maps each positional column name to its new label.
    return body.rename(columns=labels)


def get_RPKM_by_filelist(list_fname):
    """Load every data file named in a list file.

    Parameters
    ----------
    list_fname : str
        Path of a text file holding one data-file path per line. Blank
        (whitespace-only) lines are ignored.

    Returns
    -------
    list of pandas.DataFrame
        One frame per listed file, in the order the paths appear, each
        produced by ``get_RPKM_by_file`` with the ('gene', 'RPKM') columns.
    """
    data_fields = ('gene', 'RPKM')
    with open(list_fname, 'r') as f:
        # Lines read from a file keep their trailing newline; strip it so the
        # path handed to the reader is the real file name.
        paths = [line.strip() for line in f]
    # Skip empty entries (e.g. a trailing blank line) instead of trying to
    # open a file with an empty name.
    return [get_RPKM_by_file(path, data_fields) for path in paths if path]

def get_RPKM_by_dir(data_dir):
    """Load every file found under a directory tree.

    Parameters
    ----------
    data_dir : str
        Root directory; it is walked recursively with ``os.walk``.

    Returns
    -------
    list of pandas.DataFrame
        One frame per file encountered, in walk order, each produced by
        ``get_RPKM_by_file`` with the ('gene', 'RPKM') columns.
    """
    data_fields = ('gene', 'RPKM')
    frames = []
    for dirName, _subdirs, fileList in os.walk(data_dir):
        for fname in fileList:
            full_path = os.path.join(dirName, fname)
            frames.append(get_RPKM_by_file(full_path, data_fields))
    return frames

# Load one frame per file found under the data directory.
data_dir = './Data'
RPKM_list = get_RPKM_by_dir(data_dir)
# Show the first loaded sample as a sanity check. The parenthesised
# single-argument form prints the same text on Python 2 and also runs on
# Python 3, where the bare print statement is a syntax error.
print(RPKM_list[0])

     ?|100130426 ?|100133144 ?|100134869   ?|10357   ?|10431 ?|136542  \
RPKM  0.06694772   0.9270708   0.4969419  5.518766  22.36725        0   

      ?|155060    ?|26823 ?|280660 ?|317712   ...     ZXDA|7789 ZXDB|158586  \
RPKM  2.880182  0.2761594        0        0   ...     0.4448335   0.9159671   

     ZXDC|79364 ZYG11A|440590 ZYG11B|79699  ZYX|7791 ZZEF1|23140 ZZZ3|26009  \
RPKM   10.07533     0.5495135     5.733637  60.77349    14.72033   9.193995   

     psiTPTE22|387590 tAKR|389932  
RPKM           6.5754   0.1973196  

[1 rows x 20532 columns]


In [33]:
def merge_data(data_list):
    """Stack frames row-wise, keeping only the columns shared by all of them.

    Parameters
    ----------
    data_list : list of pandas.DataFrame
        Frames to stack (one or more rows each).

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation (``axis=0``) restricted to the intersection of
        the input columns (``join='inner'``). Original row labels are kept.
    """
    # Only the non-default arguments are passed. The previous call also
    # spelled out every default, including `join_axes`, which newer pandas
    # releases no longer accept (TypeError). Results are unchanged on
    # versions that still had it.
    return pd.concat(data_list, axis=0, join='inner')

# Feature matrix: one row per loaded sample, restricted to shared columns.
X = merge_data(RPKM_list)
# Placeholder labels: one random 0.0/1.0 value per row of X.
# np.round works on the whole array at once. The earlier
# np.ravel(map(round, ...)) form relied on Python 2's list-returning map; on
# Python 3 map is lazy and np.ravel would wrap the iterator in a 0-d object
# array instead of producing the labels.
# NOTE(review): no random seed is set, so the labels differ on every run.
Y = np.round(np.random.rand(len(X)))

print(X)

     ?|100130426 ?|100133144 ?|100134869   ?|10357   ?|10431    ?|136542  \
RPKM  0.06694772   0.9270708   0.4969419  5.518766  22.36725           0   
RPKM           0    1.180236    0.450388  4.992169  24.97131           0   
RPKM  0.04956242   0.9206793   0.3488647  3.163578  14.72296           0   
RPKM           0   0.6563802   0.3193393  3.247601  42.65823  0.02245179   
RPKM           0     1.19084   0.4598564  4.235559  32.02006           0   
RPKM           0    1.363317    0.541452  4.209579   25.9165           0   
RPKM           0   0.7934492   0.4086566  3.628636  37.58069           0   
RPKM           0   0.4957196   0.2666096  5.330419  44.28414           0   
RPKM           0    1.180167   0.5171587  3.949443  18.97144           0   
RPKM           0   0.9261403   0.4104143  2.862239  39.80732           0   
RPKM           0   0.9073837   0.3937844  6.820181   45.4014           0   
RPKM           0    1.433736   0.5303374  2.528682  39.12532           0   
RPKM  0.0251

In [5]:
# Impute missing data. X is assumed to be entirely numeric; NaN entries are
# replaced by the mean along axis 0.
# NOTE(review): the earlier note here said "Y can be strings", but Imputer
# appears to work on numeric arrays only -- confirm before passing string labels.
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
X_full = imp.fit_transform(X)

# Labels: NaN entries are replaced by the most frequent value along axis 0.
# `imp` is deliberately rebound to a second imputer here.
# NOTE(review): Y is built as a 1-D array while split_data reads Y[0];
# presumably fit_transform returns a 2-D array with the labels in row 0 -- confirm.
imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
Y_full = imp.fit_transform(Y)

In [6]:
# Encode non-numeric columns as integer class codes.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Columns that need encoding. Both lists are empty here, so the loops below
# do nothing until names are added.
nonnumeric_columns_X = []
nonnumeric_columns_Y = []
# Iterating an empty list is already a no-op, so no emptiness check is needed.
for feature in nonnumeric_columns_X:
    X_full[feature] = le.fit_transform(X_full[feature])
for feature in nonnumeric_columns_Y:
    Y_full[feature] = le.fit_transform(Y_full[feature])

In [7]:
# Split data into train and test sets.
def split_data(X, Y, p_train):
    """Randomly partition samples into a training and a test subset.

    Each sample is assigned to the training subset independently with
    probability ``p_train``, so the subset sizes vary between calls.

    Parameters
    ----------
    X : array
        Samples along the first axis; must support boolean-mask indexing.
    Y : 2-D array
        Labels, one per sample, stored in the first row ``Y[0]``.
    p_train : float
        Probability that a sample goes to the training subset.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)``.
    """
    # Size the mask from the X argument itself. It used to be taken from the
    # notebook-level RPKM_list, which tied the function to a global and broke
    # it for any X of a different length.
    msk = np.random.rand(len(X)) < p_train
    labels = Y[0]
    # Diagnostic output: the label row and the train/test mask that was drawn.
    print(labels)
    print(msk)
    return X[msk], X[~msk], labels[msk], labels[~msk]

# Random train/test partition of the imputed data: each sample lands in the
# training subset with probability 0.6, so the split is not an exact 60/40.
train_X, test_X, train_Y, test_Y = split_data(X_full, Y_full, 0.6)

[ 1.  1.  0.  0.  0.  1.  0.  0.  1.  0.  1.  1.  1.  1.  0.  1.  1.]
[False  True  True  True  True False  True  True  True  True  True False
  True  True False False  True]


In [8]:
from utility import statsTest
# Wrap the training arrays in DataFrames before passing them to the
# project's statistics helper.
df_X_train = pd.DataFrame(train_X)
df_Y_train = pd.DataFrame(train_Y)
stats = statsTest.statistics_test(df_X_train, df_Y_train)
# Label each result row with the corresponding feature name from X.
stats.index = X.columns.values
stats.columns = ['pvalue']  # adjusted
# Ascending sort on the single result column. sort_values replaces the
# older DataFrame.sort(columns=...), which newer pandas no longer provides.
# NOTE(review): the values printed for this column are far above 1, so it
# may hold a test statistic rather than a p-value -- confirm against
# statsTest.statistics_test.
stats = stats.sort_values(by=['pvalue'])
print(stats[0:10])

                   pvalue
SMG1|23049      81.055241
HCN1|348980     81.055241
ELK4|2005       81.055241
MSLNL|401827    81.055241
EIF2C2|27161    81.055241
WDR37|22884     81.055241
TUBB3|10381     81.055241
PLAC8L1|153770  81.055241
C6orf1|221491   81.055241
C5orf35|133383  81.055241


In [24]:
import xgboost as xgb
# Alternative configuration, kept for reference:
#gbm = xgb.XGBClassifier(max_depth=10, n_estimators=300, learning_rate=0.05).fit(train_X, train_Y)
# Build the classifier, then train it on the training split.
gbm = xgb.XGBClassifier(max_depth=20, n_estimators=300)
gbm.fit(train_X, train_Y)
# Predicted class labels for the held-out samples.
predictions = gbm.predict(test_X)


In [25]:
# Element-wise difference between true and predicted labels: 0 marks a
# correct prediction, any non-zero entry a misclassified test sample.
# Parenthesised so the cell also runs on Python 3; output is unchanged on Python 2.
print(test_Y - predictions)

[ 1.  1.  1.  0.  1.]


In [26]:
# Display the parameter settings of the classifier, including the values
# that were not set explicitly when it was constructed.
gbm.get_params()

{'base_score': 0.5,
 'colsample_bylevel': 1,
 'colsample_bytree': 1,
 'gamma': 0,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 20,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 300,
 'nthread': -1,
 'objective': 'binary:logistic',
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': 0,
 'silent': True,
 'subsample': 1}

In [35]:
# Build a Graphviz drawing from the trained model (presumably of its first
# tree -- confirm against the xgboost version in use).
tree = xgb.to_graphviz(gbm)
# Render the drawing to disk under the 'xgboost_tree.gv' base name; view=True
# additionally asks for the rendered file to be opened in a viewer.
tree.render('xgboost_tree.gv', view=True)

'xgboost_tree.gv.pdf'

In [31]:
import matplotlib.pyplot as plt
# Draw the feature-importance chart; the returned object is printed, which
# shows the Axes repr. Parenthesised so the cell also runs on Python 3;
# output is unchanged on Python 2.
print(xgb.plot_importance(gbm))
# Save the current figure (the chart drawn above) to disk.
plt.savefig('xgboost_importance.png')

Axes(0.125,0.1;0.775x0.8)
