In [66]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from imblearn.over_sampling import SMOTE
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import r2_score
import sys
from rfpimp import importances
from rfpimp import dropcol_importances
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_confusion_matrix
import matplotlib.pyplot as plt
from pandas.api.types import is_bool_dtype

In [2]:
input_df = '../gene_info.full_model.rice_blast.txt'
majority_fraction =  0.5 ## just to get trues and falses balanced well
approach = 'RF'
n_estimators = 500
min_samples_split =  2
min_samples_leaf = 1
max_features = 'None'
max_depth = 'None'
bootstrap = 'True'

In [3]:
def none_or_str(value):
    if value == 'None':
        return None
    return value

max_features = none_or_str(max_features)


def none_or_int(value):
    if value == 'None':
        return None
    return int(value)

max_depth = none_or_int(max_depth)

In [4]:
args_dict = {
    "n_estimators": n_estimators,
    "min_samples_split": min_samples_split,
    "min_samples_leaf": min_samples_leaf,
    "max_features": max_features,
    "max_depth": max_depth,
    "bootstrap": bootstrap
}

In [5]:
def reports(model, X_test, y_test):
    y_pred = model.predict(X_test)
    TP = len(y_pred[(y_pred == 1) & (y_test == 1)])
    FN = len(y_pred[(y_pred == 0) & (y_test == 1)])
    FP = len(y_pred[(y_pred == 1) & (y_test == 0)])
    # sensitivity, how sensitive is the test? TP/TP+FN aka recall
    recall = TP/(TP+FN)
    ## PPV, how powerful is a positive? TP/TP+FP aka precision
    precision = TP/(TP+FP)
    ap = average_precision_score(y_test, model.predict_proba(X_test)[:,1])
    auc = roc_auc_score(y_test, model.predict_proba(X_test)[:,1])
    return([recall, precision, ap, auc])

def train_test_split_mine_downsample(majority_fraction):
    df_genes = pd.read_csv(input_df)
    df_genes = df_genes[df_genes['lineage']!=4]
    ## pick 4 genomes per lineage as testing data
    genome_test_subset = []
    for lineage in np.unique(df_genes.lineage):
        for genome in np.random.choice(df_genes[df_genes.lineage == lineage].genome, size=4,replace=False):
            genome_test_subset.append(genome)
    df_genes_test_subset = df_genes[df_genes.genome.isin(genome_test_subset)]
    df_genes = df_genes[~df_genes.genome.isin(genome_test_subset)]
    if majority_fraction != 1.0:
        pav_true_subset = df_genes[df_genes['lineage_pav']==True].id
        pav_false_subset_downsampled = np.random.choice(df_genes[df_genes['lineage_pav'] == False].id, size=int(len(df_genes.index)*majority_fraction),replace=False)
        df_genes_downsampled = df_genes[(df_genes.id.isin(pav_false_subset_downsampled)) | (df_genes.id.isin(pav_true_subset))]
    else:
        df_genes_downsampled = df_genes
    # drop columns
    df_genes_downsampled = df_genes_downsampled.drop(['id', 'scaffold', 'start', 'end', 'orientation', 'orthogroups', 'enough_space_te', 'enough_space_gene',
                            'genome', 'lineage', 'lineage_conserved', 'proportion'], axis=1)
    df_genes_test_subset = df_genes_test_subset.drop(['id', 'scaffold', 'start', 'end', 'orientation', 'orthogroups', 'enough_space_te', 'enough_space_gene',
                            'genome', 'lineage', 'lineage_conserved', 'proportion'], axis=1)
    y_train = df_genes_downsampled['lineage_pav']
    X_train = df_genes_downsampled.drop('lineage_pav', axis=1)
    y_test = df_genes_test_subset['lineage_pav']
    X_test = df_genes_test_subset.drop('lineage_pav', axis=1)
    return(y_train,X_train,y_test,X_test)

In [6]:
y_train,X_train,y_test,X_test = train_test_split_mine_downsample(majority_fraction)

In [7]:
if approach == "SMOTE":
    oversample = SMOTE()
    over_X_train, over_y_train = oversample.fit_resample(X_train, y_train)
    X_train = over_X_train
    y_train = over_y_train
if approach == "BRFC":
    model = BalancedRandomForestClassifier(**args_dict)
elif approach == "RF_balanced":
    model = RandomForestClassifier(class_weight="balanced", **args_dict)
elif approach == "RF_balanced_subsample":
    model = RandomForestClassifier(class_weight="balanced_subsample", **args_dict)
elif approach == "RF":
    model = RandomForestClassifier(**args_dict)
elif approach == "SMOTE":
    model = RandomForestClassifier(**args_dict)

In [8]:
## dont need to train just load the pkl
model.fit(X_train, y_train)

In [14]:
def f1_calc_mine(model, X_valid, y_valid):
    y_pred = model.predict(X_valid)
    f1 = f1_score(y_valid,y_pred)
    return(f1)

0.8599146294140473


In [32]:
y_pred = model.predict(X_test)
baseline = f1_score(y_test, y_pred)
permuted_f1s = []
column = 'any_te'
save = X_test[column].copy()
X_test[column] = np.random.permutation(X_test[column])
y_pred = model.predict(X_test)
permuted_f1 = f1_score(y_test, y_pred)
permuted_f1s.append(permuted_f1)
X_test[column] = save

In [33]:
permuted_f1s

[0.6505066560699383]

In [40]:
y_pred = model.predict(X_test)
baseline = f1_score(y_test, y_pred)
permuted_diffs = []
for column in X_test.columns:
    save = X_test[column].copy()
    X_test[column] = np.random.permutation(X_test[column])
    y_pred = model.predict(X_test)
    permuted_f1 = f1_score(y_test, y_pred)
    diff = baseline-permuted_f1
    permuted_diffs.append(diff)
    X_test[column] = save

In [42]:
X_test.columns

Index(['any_te', 'gene_nearby', 'gene_gc', 'flanking_1kb_gc', 'lengths', 'tm',
       'signalp', 'effectorp', 'H3K27ac', 'H3K27me3', 'H3K36me3',
       'cm_expression', 'ip_expression', 'eccdna_cov', 'methylation', 'go',
       'pfam'],
      dtype='object')

In [63]:
print('\t'.join(X_test.columns.to_list()))

any_te	gene_nearby	gene_gc	flanking_1kb_gc	lengths	tm	signalp	effectorp	H3K27ac	H3K27me3	H3K36me3	cm_expression	ip_expression	eccdna_cov	methylation	go	pfam


In [41]:
for i in range(len(X_test.columns)):
    print(X_test.columns[i])
    print(permuted_diffs[i])

any_te
0.20089024937150612
gene_nearby
0.04455532369490389
gene_gc
0.004137505238245076
flanking_1kb_gc
0.20245512537807886
lengths
0.29491980850163546
tm
0.034092563855527946
signalp
0.012083373785501395
effectorp
0.0056992487967253425
H3K27ac
0.08593627577060947
H3K27me3
0.6718168558189677
H3K36me3
0.17850379302572805
cm_expression
0.0643617502429823
ip_expression
0.06016204232702016
eccdna_cov
0.06618593454910782
methylation
0.08531626651499824
go
0.0769397406804756
pfam
0.17421185352302804


In [48]:
df_test_dict = dict(zip(X_test.columns.to_list(), permuted_diffs))

In [49]:
df_test_dict

{'any_te': 0.20089024937150612,
 'gene_nearby': 0.04455532369490389,
 'gene_gc': 0.004137505238245076,
 'flanking_1kb_gc': 0.20245512537807886,
 'lengths': 0.29491980850163546,
 'tm': 0.034092563855527946,
 'signalp': 0.012083373785501395,
 'effectorp': 0.0056992487967253425,
 'H3K27ac': 0.08593627577060947,
 'H3K27me3': 0.6718168558189677,
 'H3K36me3': 0.17850379302572805,
 'cm_expression': 0.0643617502429823,
 'ip_expression': 0.06016204232702016,
 'eccdna_cov': 0.06618593454910782,
 'methylation': 0.08531626651499824,
 'go': 0.0769397406804756,
 'pfam': 0.17421185352302804}

In [54]:
df_test = pd.DataFrame(df_test_dict, index=['1'])

In [57]:
df_test.loc[len(df_test.index)] = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]

In [60]:
df_means_dict = dict(zip(X_test.columns.to_list(), df_test.mean()))
df_means_test = pd.DataFrame(df_means_dict, index=['1'])

In [62]:
df_test

Unnamed: 0,any_te,gene_nearby,gene_gc,flanking_1kb_gc,lengths,tm,signalp,effectorp,H3K27ac,H3K27me3,H3K36me3,cm_expression,ip_expression,eccdna_cov,methylation,go,pfam
1,0.20089,0.044555,0.004138,0.202455,0.29492,0.034093,0.012083,0.005699,0.085936,0.671817,0.178504,0.064362,0.060162,0.066186,0.085316,0.07694,0.174212
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [61]:
df_means_test

Unnamed: 0,any_te,gene_nearby,gene_gc,flanking_1kb_gc,lengths,tm,signalp,effectorp,H3K27ac,H3K27me3,H3K36me3,cm_expression,ip_expression,eccdna_cov,methylation,go,pfam
1,0.600445,0.522278,0.502069,0.601228,0.64746,0.517046,0.506042,0.50285,0.542968,0.835908,0.589252,0.532181,0.530081,0.533093,0.542658,0.53847,0.587106


In [None]:
## okay now to make the dependence matrix

In [67]:
all_cols = [col for col in X_train]
boolcols = [col for col in X_train if is_bool_dtype(X_train[col])]
df_dep = pd.DataFrame(index=X_train.columns, columns=['Dependence']+X_train.columns.tolist())

In [None]:
for i, dep_feature in enumerate(all_cols):
    row = []
    X_dep_train, y_dep_train = X_train.drop(dep_feature, axis=1), X_train[dep_feature]
    X_dep_test, y_dep_test = X_test.drop(dep_feature, axis=1), X_test[dep_feature]
    if dep_feature in boolcols:
        model_dep = RandomForestClassifier(**args_dict)
    else:
        model_dep = RandomForestRegressor(**args_dict)
    model_dep.fit(X_dep_train,y_dep_train)
    y_dep_pred = model.predict(X_dep_test)
    if dep_feature in boolcols:
        baseline = f1_score(y_dep_test, y_dep_pred)
    else:
        baseline = r2_score(y_dep_test, y_dep_pred)
    row.append(baseline)
    for perm_feature in X_dep_test.columns:
        save = X_dep_test[perm_feature].copy()
        X_dep_test[perm_feature] = np.random.permutation(X_dep_test[perm_feature])
        y_dep_pred_permuted = model.predict(X_dep_test)
        if dep_feature in boolcols:
            permuted_score = f1_score(y_dep_test, y_dep_pred_permuted)
        else:
            permuted_score = f1_score(y_dep_test, y_dep_pred_permuted)
        diff = baseline-permuted_score
        row.append(diff)
        X_dep_test[perm_feature] = save
    df_dep.iloc[i] = row

In [73]:
len(X_dep_train.columns)

16

In [76]:
i = 0
dep_feature = all_cols[0]
row = []
X_dep_train, y_dep_train = X_train.drop(dep_feature, axis=1), X_train[dep_feature]
X_dep_test, y_dep_test = X_test.drop(dep_feature, axis=1), X_test[dep_feature]
if dep_feature in boolcols:
    model_dep = RandomForestClassifier(**args_dict)
else:
    model_dep = RandomForestRegressor(**args_dict)
model_dep.fit(X_dep_train,y_dep_train)
y_dep_pred = model_dep.predict(X_dep_test)
if dep_feature in boolcols:
    baseline = f1_score(y_dep_test, y_dep_pred)
else:
    baseline = r2_score(y_dep_test, y_dep_pred)
row.append(baseline)
for perm_feature in X_dep_test.columns:
    if perm_feature == dep_feature:
        row.append('x')
        continue
    save = X_dep_test[perm_feature].copy()
    X_dep_test[perm_feature] = np.random.permutation(X_dep_test[perm_feature])
    y_dep_pred_permuted = model_dep.predict(X_dep_test)
    if dep_feature in boolcols:
        permuted_score = f1_score(y_dep_test, y_dep_pred_permuted)
    else:
        permuted_score = f1_score(y_dep_test, y_dep_pred_permuted)
    diff = baseline-permuted_score
    row.append(diff)
    X_dep_test[perm_feature] = save
df_dep.iloc[i] = row

KeyboardInterrupt: 

In [75]:
print(row)

[0.8592623550308223, 0.07858404053955814, 0.00010815990123991259, 0.11320917680066023, 0.09759387929815044, 0.007995370087744758, 0.012703181250655615, 0.0006252544607683319, 0.06596315645557749, 0.6308228453292887, 0.0960935882470999, 0.031573165120950764, 0.03799331676592799, 0.031934625738109346, 0.06344632009255746, 0.021870691803418163, 0.018138933232316745]
