In [12]:
import pandas as pd
from rfpimp import *
import numpy as np
from pandas.api.types import is_bool_dtype
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_auc_score
import sys
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_confusion_matrix
import matplotlib.pyplot as plt
import pickle
from sklearn.metrics import f1_score
from sklearn.metrics import r2_score

In [2]:
input_df = '../gene_info.full_model.rice_blast.txt'
majority_fraction =  0.05 ## just to get trues and falses balanced well
approach = 'RF'
n_estimators = 200
min_samples_split =  2
min_samples_leaf = 1
max_features = 'None'
max_depth = 'None'
bootstrap = 'True'

In [3]:
def none_or_str(value):
    if value == 'None':
        return None
    return value

max_features = none_or_str(max_features)


def none_or_int(value):
    if value == 'None':
        return None
    return int(value)

max_depth = none_or_int(max_depth)

In [4]:
args_dict = {
    "n_estimators": n_estimators,
    "min_samples_split": min_samples_split,
    "min_samples_leaf": min_samples_leaf,
    "max_features": max_features,
    "max_depth": max_depth,
    "bootstrap": bootstrap
}

In [5]:
if approach == "SMOTE":
    oversample = SMOTE()
    over_X_train, over_y_train = oversample.fit_resample(X_train, y_train)
    X_train = over_X_train
    y_train = over_y_train
if approach == "BRFC":
    model = BalancedRandomForestClassifier(**args_dict)
elif approach == "RF_balanced":
    model = RandomForestClassifier(class_weight="balanced", **args_dict)
elif approach == "RF_balanced_subsample":
    model = RandomForestClassifier(class_weight="balanced_subsample", **args_dict)
elif approach == "RF":
    model = RandomForestClassifier(**args_dict)
elif approach == "SMOTE":
    model = RandomForestClassifier(**args_dict)

In [6]:
def train_test_split_mine_downsample(majority_fraction):
    df_genes = pd.read_csv(input_df)
    df_genes = df_genes[df_genes['lineage']!=4]
    ## pick 4 genomes per lineage as testing data
    genome_test_subset = []
    for lineage in np.unique(df_genes.lineage):
        for genome in np.random.choice(df_genes[df_genes.lineage == lineage].genome, size=4,replace=False):
            genome_test_subset.append(genome)
    df_genes_test_subset = df_genes[df_genes.genome.isin(genome_test_subset)]
    df_genes = df_genes[~df_genes.genome.isin(genome_test_subset)]
    if majority_fraction != 1.0:
        pav_true_subset = df_genes[df_genes['lineage_pav']==True].id
        pav_false_subset_downsampled = np.random.choice(df_genes[df_genes['lineage_pav'] == False].id, size=int(len(df_genes.index)*majority_fraction),replace=False)
        df_genes_downsampled = df_genes[(df_genes.id.isin(pav_false_subset_downsampled)) | (df_genes.id.isin(pav_true_subset))]
    else:
        df_genes_downsampled = df_genes
    # drop columns
    df_genes_downsampled = df_genes_downsampled.drop(['id', 'scaffold', 'start', 'end', 'orientation', 'orthogroups', 'enough_space_te', 'enough_space_gene',
                            'genome', 'lineage', 'lineage_conserved', 'proportion'], axis=1)
    df_genes_test_subset = df_genes_test_subset.drop(['id', 'scaffold', 'start', 'end', 'orientation', 'orthogroups', 'enough_space_te', 'enough_space_gene',
                            'genome', 'lineage', 'lineage_conserved', 'proportion'], axis=1)
    y_train = df_genes_downsampled['lineage_pav']
    X_train = df_genes_downsampled.drop('lineage_pav', axis=1)
    y_test = df_genes_test_subset['lineage_pav']
    X_test = df_genes_test_subset.drop('lineage_pav', axis=1)
    return(y_train,X_train,y_test,X_test)

In [19]:
y_train,X_train,y_test,X_test = train_test_split_mine_downsample(majority_fraction)

In [20]:
X_train

Unnamed: 0,any_te,gene_nearby,gene_gc,flanking_1kb_gc,lengths,tm,signalp,effectorp,H3K27ac,H3K27me3,H3K36me3,cm_expression,ip_expression,eccdna_cov,methylation,go,pfam
gene_0_FJ2000-69A_3_NA,False,True,0.453237,0.445479,5002,False,False,False,51.4109,39.2929,87.1494,5.628200,26.385855,74.77895,0.427901,True,True
gene_0_FJ81-JY_2_NA,False,True,0.458753,0.267000,254,False,False,False,65.2760,37.8795,87.1011,9.176440,21.211200,60.57185,0.384700,False,False
gene_0_FJ81221ZB11-14_2_NA,False,True,0.453067,0.496495,3872,True,False,False,86.7433,34.3371,77.4797,10.315900,42.458200,45.69220,0.452081,True,True
gene_0_FJ86061ZE3-39_2_NA,False,False,0.429121,0.408678,754,False,False,False,34.2481,221.7650,67.6130,0.043725,0.000000,133.57500,0.486362,False,False
gene_0_HN10-1604_3_NA,False,True,0.411036,0.500410,1715,False,True,False,91.6172,35.9223,90.5586,98.656500,840.078000,132.29100,0.415035,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
gene_9999_ML33_3_NA,True,False,0.375918,0.500000,1923,False,False,False,65.2760,37.8795,87.1011,9.176440,21.211200,60.57185,0.384700,False,False
gene_9999_TW-12HL-YL2-1_2_NA,True,False,0.341811,0.476836,2778,False,False,False,65.2760,37.8795,87.1011,9.176440,21.211200,60.57185,0.384700,False,True
gene_9999_TW-12YL-DL3-2_2_NA,False,True,0.380651,0.519000,1232,True,False,False,28.1625,386.9880,117.5050,0.239078,18.825700,51.63340,0.321447,True,False
gene_9999_TW-PT3-1_2_NA,False,True,0.382353,0.482000,1291,False,False,False,45.0907,70.6030,66.9313,8.807950,23.150100,36.76930,0.325584,True,True


In [22]:
model = RandomForestRegressor(**args_dict)

In [10]:
y = X_train['lengths']
x = X_train['H3K36me3']

In [11]:
model.fit(x.array.reshape(-1,1),y)

In [13]:
x_test = X_test['H3K36me3']

In [16]:
y_pred = model.predict(x_test.array.reshape(-1,1))

In [17]:
baseline = r2_score(y_test, y_pred)

In [18]:
baseline

-148255083.076096

In [36]:
y_train,X_train,y_test,X_test = train_test_split_mine_downsample(majority_fraction)
y_train = X_train['lengths']
X_train = X_train.drop(['lengths','H3K36me3'] , axis=1)
y_test = X_test['lengths']
X_test = X_test.drop(['lengths','H3K36me3'], axis=1)
model.fit(X_train,y_train)

In [37]:
y_pred = model.predict(X_test)
baseline = r2_score(y_test, y_pred)
print(baseline)

0.9082071545100181


In [33]:
X_test['H3K36me3'] = np.random.permutation(X_test['H3K36me3'])

In [35]:
y_pred = model.predict(X_test)
permuted = r2_score(y_test, y_pred)
print(permuted)

0.1245959806514968


In [40]:
for perm_feature in X_test:
    print(perm_feature)
    save = X_test[perm_feature].copy()
    X_test[perm_feature] = np.random.permutation(X_test[perm_feature])
    y_pred = model.predict(X_test)
    permuted = r2_score(y_test, y_pred)
    print(permuted)
    X_test[perm_feature] = save

any_te
0.9008476699496493
gene_nearby
0.8678282488640383
gene_gc
0.9012482578323857
flanking_1kb_gc
0.8275032566860848
tm
0.8796745771665061
signalp
0.8522849191625963
effectorp
0.8968231801853757
H3K27ac
0.7195442336827143
H3K27me3
0.349783332131941
cm_expression
0.6574240457967937
ip_expression
0.398206710469323
eccdna_cov
0.7043695125664327
methylation
0.5797379340471323
go
0.7428258145433568
pfam
0.5623219076669612


In [None]:
for perm_feature in X_train.columns:
    if perm_feature == dep_feature:
        row.append('x')
        continue
    save = X_dep_test[perm_feature].copy()
    X_dep_test[perm_feature] = np.random.permutation(X_dep_test[perm_feature])
    y_dep_pred_permuted = model_dep.predict(X_dep_test)
    if dep_feature in boolcols:
        permuted_score = f1_score(y_dep_test, y_dep_pred_permuted)
    else:
        permuted_score = r2_score(y_dep_test, y_dep_pred_permuted)
    diff = baseline-permuted_score
    row.append(diff)
    X_dep_test[perm_feature] = save