In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Apply seaborn's "whitegrid" theme to every matplotlib figure drawn below.
sns.set(style="whitegrid")

In [None]:
import plotly
import cufflinks as cf
# Switch cufflinks to offline mode; DataFrame/Series.iplot() is used in the
# LIME aggregate-importance cells further down.
cf.go_offline()
# NOTE(review): this stores offline=False / world_readable=True in the
# cufflinks configuration right after go_offline() — confirm this is intended.
cf.set_config_file(offline=False, world_readable=True)

In [None]:
# Directory holding the two GSE147515 CSV files read by this notebook.
data_dir = "../input/aml-fab-with-names/"

In [None]:
# Expression matrix as stored on disk.
df = pd.read_csv(data_dir + "GSE147515_FAB_mat.csv")

In [None]:
# Transpose so the CSV's columns become rows (presumably one row per sample —
# TODO confirm). NOTE: rebinding `df` makes this cell non-idempotent; running it
# twice transposes the frame back.
df = df.T
df.head()

In [None]:
# Use the last row as column labels, then remove the two non-data rows
# ("Unnamed: 0" and "name") so only sample rows remain.
df.columns = df.iloc[-1]
df.drop(["Unnamed: 0", "name"], axis=0, inplace=True)
df.head()

In [None]:
df.shape

In [None]:
lbl = pd.read_csv(data_dir + "GSE147515_FAB_lbl.csv")
lbl.head()

In [None]:
lbl.FAB.value_counts()

In [None]:
lbl_filtered = lbl.loc[lbl.FAB.isin(["CTRL", "M3"])]

In [None]:
lbl_filtered.FAB.value_counts()

In [None]:
# Keep the rows of df whose index value appears among the retained sample ids.
# The result follows df's row order, not lbl_filtered's.
df_filtered = df.loc[df.index.isin(lbl_filtered.X)]
df_filtered.head()

In [None]:
df_filtered.shape

In [None]:
lbl_filtered.head()

In [None]:
# Index the labels by sample id and drop the two helper columns.
# NOTE: both calls mutate lbl_filtered in place, so this cell cannot be re-run
# (the "X" column no longer exists after the first run).
lbl_filtered.set_index(lbl_filtered.X, inplace=True)
lbl_filtered.drop(["Unnamed: 0", "X"], axis=1, inplace=True)
lbl_filtered.head()

In [None]:
# Encode the classes numerically: CTRL -> 0, M3 -> 1.
# NOTE: re-running this cell maps the already-encoded 0/1 values to NaN.
lbl_filtered.FAB = lbl_filtered.FAB.map({"CTRL": 0, "M3": 1})
lbl_filtered.head()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix: one row per retained sample, in df_filtered's row order.
X = df_filtered.values
# Labels are looked up by sample id so that y[i] always belongs to X[i]
# (lbl_filtered keeps the label file's own row order), and are returned as a
# 1-D vector, the shape scikit-learn estimators expect for a single target.
y = lbl_filtered.loc[df_filtered.index, "FAB"].values

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scaler = StandardScaler()
# Standardize every feature column.
# NOTE(review): the scaler is fitted on all samples before the hold-out split
# below, so test-set statistics leak into the scaled training data.
X = scaler.fit_transform(X)

In [None]:
# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)

## LGBM

In [None]:
import lightgbm as lgb

In [None]:
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold, KFold, StratifiedKFold
from sklearn.feature_selection import SelectFromModel

In [None]:
# Hyper-parameters carried over from an earlier grid search.
lgbm_params = dict(
    drop_rate=0.1,
    max_depth=15,
    n_estimators=500,
    learning_rate=0.1,
    num_leaves=15,
    reg_alpha=0.001,
    reg_lambda=0.01,
)

# Ratio of negative (0) to positive (1) samples, passed to LightGBM to
# re-weight the positive class.
scale_pos_weight = np.sum(y == 0) / np.sum(y == 1)

# Base LightGBM classifier used for feature selection.
lgbm = lgb.LGBMClassifier(
    **lgbm_params,
    metric="auc",
    scale_pos_weight=scale_pos_weight,
    min_gain_to_split=0.001,
    n_jobs=4,
)

In [None]:
# Fit LightGBM on all samples and keep at most 200 features ranked by the
# model's importances.
lgbm_selector = SelectFromModel(estimator=lgbm, max_features=200).fit(X, y)

In [None]:
# Column positions of the selected features, mapped back to df's column labels.
ind = lgbm_selector.get_support(indices=True)
lgbm_features = df.columns[ind]
lgbm_features

In [None]:
len(lgbm_features)

# RandomForest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# The search grid below is kept for reference as an inert string literal; it is
# never executed.
"""
rf_params = {
    "max_depth": [10,15,25],
    "min_samples_split": [2,3,10],
    "min_samples_leaf": [3,10,25],
    "n_estimators": [100,300,500],
    "criterion": ["gini"],
}
"""
# Fixed random-forest settings (presumably the best result of the grid above —
# TODO confirm).
rf_params = {'criterion': 'gini',
 'max_depth': 15,
 'min_samples_leaf': 10,
 'min_samples_split': 2,
 'n_estimators': 500}

# Class-balanced random forest used for feature selection.
rf = RandomForestClassifier(class_weight="balanced", random_state=0, **rf_params)
#gsRf = GridSearchCV(rf, rf_params, cv=10, n_jobs=4, verbose=5)

In [None]:
#gsRf.fit(X, y)

In [None]:
#gsRf.best_params_

In [None]:
#rf_best = gsRf.best_estimator_
#gsRf.best_score_

In [None]:
# Fit the forest on all samples and keep at most 200 features.
rf_selector = SelectFromModel(rf, max_features=200).fit(X, y)

In [None]:
# Column positions of the selected features, mapped back to df's column labels.
ind = rf_selector.get_support(indices=True)
rf_features = df.columns[ind]
rf_features

In [None]:
len(rf_features)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# The search grid below is kept for reference as an inert string literal; it is
# never executed.
"""
lr_params = {
    "C": [0.01, 0.1, 1, 10]
}
"""
# L2-regularized logistic regression with C=0.01, used for feature selection.
lr = LogisticRegression(penalty="l2", C=0.01)
#gsLR = GridSearchCV(lr, lr_params, cv=10, verbose=1, n_jobs=4)

In [None]:
#gsLR.fit(X, y)

In [None]:
#lr_best = gsLR.best_estimator_
#gsLR.best_params_

In [None]:
# Fit on all samples and keep at most 200 features.
lr_selector = SelectFromModel(lr, max_features=200).fit(X, y)

In [None]:
# Column positions of the selected features, mapped back to df's column labels.
ind = lr_selector.get_support(indices=True)
lr_features = df.columns[ind]
lr_features

# Chi Squared

In [None]:
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Rescale the standardized matrix with MinMaxScaler before the chi-squared
# test, which does not accept negative feature values.
norm_scaler = MinMaxScaler()
X_norm = norm_scaler.fit_transform(X)

In [None]:
# Keep the 200 features with the highest chi-squared score against the labels.
chi2_selector = SelectKBest(score_func=chi2, k=200).fit(X_norm, y)

In [None]:
# Column positions of the selected features, mapped back to df's column labels.
ind = chi2_selector.get_support(indices=True)
chi2_features = df.columns[ind]
chi2_features

# LASSO

In [None]:
from sklearn.linear_model import Lasso

In [None]:
# Lasso regression on the 0/1 labels, used here only as a feature selector.
lasso = Lasso(random_state=0, alpha=0.001, tol=0.001)
# The search grid below is kept for reference as an inert string literal; it is
# never executed.
"""
lasso_params = {
    "alpha": [0.0001, 0.001, 0.01, 0.1, 1],
    "tol": [0.001, 0.01],
}
"""
#gsLASSO = GridSearchCV(lasso, lasso_params, cv=10, verbose=1, scoring="neg_mean_absolute_error", n_jobs=4)

In [None]:
#gsLASSO.fit(X, y)
# Fit on all samples and keep at most 200 features.
lasso_selector = SelectFromModel(lasso, max_features=200).fit(X, y)

In [None]:
#lasso_best = gsLASSO.best_estimator_
#gsLASSO.best_params_

In [None]:
#gsLASSO.best_score_

In [None]:
# Column positions of the selected features, mapped back to df's column labels.
ind = lasso_selector.get_support(indices=True)
lasso_features = df.columns[ind]
lasso_features

# Combine Features

In [None]:
import collections

In [None]:
def get_freq(feature_lists):
    """Tally how often each feature occurs across several feature lists.

    Parameters
    ----------
    feature_lists : iterable of iterables
        One collection of feature names per selection method.

    Returns
    -------
    pandas.DataFrame
        Indexed by feature name in order of first appearance, with a single
        column labelled ``0`` that holds the number of occurrences.
    """
    tally = collections.Counter(
        name for names in feature_lists for name in names
    )
    return pd.DataFrame.from_dict(tally, orient="index")

In [None]:
# The five per-method feature lists that vote on each feature.
data = [
    lgbm_features,
    rf_features,
    lr_features,
    lasso_features,
    chi2_features,
]

# One row per feature with the number of lists it appears in.
feature_freqs = get_freq(data)

In [None]:
# How many features received each number of votes.
feature_freqs.value_counts()

In [None]:
# Written before the count column is renamed to "freq" further down, so the
# CSV still carries the default column label.
feature_freqs.to_csv("m3_features.csv")

# Training with all features

In [None]:
from sklearn.metrics import accuracy_score, log_loss, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

In [None]:
# 10-fold stratified CV repeated 5 times with a fixed seed; read as a global by
# the train_model* helpers.
kf = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=0)

# NOTE(review): these module-level lists are never filled by the visible code;
# the helpers below create local lists of the same names.
predicts = []
accuracy_scores = []
log_loss_scores = []

In [None]:
def train_model_lgbm(model, X):
    """Cross-validate a LightGBM classifier with early stopping on every fold.

    Uses the notebook-level splitter ``kf`` and label vector ``y``. Each fold
    is scored on its own validation part and the per-fold scores are returned.

    Parameters
    ----------
    model : lightgbm.LGBMClassifier
        Estimator to fit. It is refitted in place on every fold, so on return
        it holds the fit of the last fold.
    X : numpy.ndarray
        Feature matrix whose rows line up with the global ``y``.

    Returns
    -------
    pandas.DataFrame
        One row per fold with the columns ``"accuracy"`` and ``"log_loss"``.
    """
    # Accept y both as (n,) and as (n, 1).
    labels = np.ravel(y)
    accuracy_scores = []
    log_loss_scores = []

    for train_index, val_index in kf.split(X, labels):
        X_fit, X_val = X[train_index], X[val_index]
        y_fit, y_val = labels[train_index], labels[val_index]
        # NOTE(review): the validation fold doubles as the early-stopping set,
        # so the scores below are optimistic. Also, `early_stopping_rounds` and
        # `verbose` as fit() keywords were removed in LightGBM 4.0 in favour of
        # callbacks — confirm the installed version.
        model.fit(X_fit, y_fit, eval_set=[(X_val, y_val)], early_stopping_rounds=100, verbose=0)

        # Score on this fold's validation part (not on the global hold-out).
        y_pred = model.predict(X_val)
        accuracy_scores.append(accuracy_score(y_val, y_pred))
        log_loss_scores.append(
            log_loss(y_val, model.predict_proba(X_val), labels=model.classes_)
        )

    return pd.DataFrame({"accuracy": accuracy_scores, "log_loss": log_loss_scores})

In [None]:
def train_model(model, X):
    """Cross-validate a scikit-learn style classifier and return fold scores.

    Uses the notebook-level splitter ``kf`` and label vector ``y``. The model
    is refitted with ``fit`` on every fold: incremental ``partial_fit`` would
    carry what was learnt from earlier folds into later ones, so validation
    samples would already have been trained on.

    Parameters
    ----------
    model : classifier
        Estimator exposing ``fit``, ``predict`` and ``classes_``. It is
        refitted in place, so on return it holds the fit of the last fold.
    X : numpy.ndarray
        Feature matrix whose rows line up with the global ``y``.

    Returns
    -------
    pandas.DataFrame
        One row per fold with the columns ``"accuracy"`` and ``"log_loss"``.
    """
    # Accept y both as (n,) and as (n, 1).
    labels = np.ravel(y)
    accuracy_scores = []
    log_loss_scores = []

    for train_index, val_index in kf.split(X, labels):
        X_fit, X_val = X[train_index], X[val_index]
        y_fit, y_val = labels[train_index], labels[val_index]
        model.fit(X_fit, y_fit)

        y_pred = model.predict(X_val)
        accuracy_scores.append(accuracy_score(y_val, y_pred))
        # Prefer class probabilities for the log loss; fall back to the hard
        # predictions for models that cannot produce probabilities.
        if hasattr(model, "predict_proba"):
            scores = model.predict_proba(X_val)
        else:
            scores = y_pred
        log_loss_scores.append(log_loss(y_val, scores, labels=model.classes_))

    return pd.DataFrame({"accuracy": accuracy_scores, "log_loss": log_loss_scores})

In [None]:
def print_results(model, X_test, y_true=None):
    """Print accuracy, sensitivity, specificity and the confusion matrix.

    Parameters
    ----------
    model : fitted classifier
        Must expose ``predict``.
    X_test : array-like
        Samples to predict.
    y_true : array-like, optional
        True 0/1 labels for ``X_test``. Defaults to the notebook-level
        ``y_test``.

    Notes
    -----
    Class 1 is treated as the positive class. If a class is absent from
    ``y_true`` the corresponding rate is a 0/0 division.
    """
    if y_true is None:
        y_true = y_test

    y_pred = model.predict(X_test)
    # Rows are true classes, columns predicted classes, both ordered [0, 1].
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    accuracy = (tp + tn) / (tp + tn + fn + fp)
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    print("Accuracy: ", accuracy)
    print("Sensitivity: ", sensitivity)
    print("Specificity: ", specificity)
    print("Confusion Matrix: \n", cm)

In [None]:
def graph_importances(model, model_name, feature_names=None, top_n=20):
    """Plot and save the most important features of a fitted model.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``feature_importances_``.
    model_name : str
        Used in the plot title and in the output file name
        ``"<model_name> feature importance.png"``.
    feature_names : sequence of str, optional
        Name of each column the model was fitted on. When omitted, the names
        are taken from whichever of ``df.columns`` or the notebook-level
        ``features`` (the selected-feature columns, once defined) has exactly
        one entry per importance value.
    top_n : int, default 20
        Number of bars to draw.

    Raises
    ------
    ValueError
        If no set of names matches the number of importances.
    """
    importances = np.asarray(model.feature_importances_)

    if feature_names is None:
        # A model fitted on the selected-feature matrix has fewer columns than
        # df; indexing df.columns with its importance ranks would label the
        # bars with the wrong genes, so pick the name set by length.
        candidates = [df.columns, globals().get("features")]
        feature_names = next(
            (c for c in candidates if c is not None and len(c) == len(importances)),
            None,
        )
    if feature_names is None or len(feature_names) != len(importances):
        raise ValueError(
            "feature_names must provide one name per feature importance"
        )

    # Positions of the top_n largest importances, largest first.
    order = np.argsort(importances)[::-1][:top_n]

    # Draw on a dedicated figure so successive calls never share axes.
    fig, ax = plt.subplots()
    sns.barplot(
        y=np.asarray(feature_names)[order],
        x=importances[order],
        orient="h",
        ax=ax,
    )
    ax.set_xlabel("Importance")
    ax.set_ylabel("Symbol")
    ax.set_title(model_name + " feature importance")
    fig.savefig(model_name + " feature importance.png", bbox_inches='tight')

In [None]:
clf = lgb.LGBMClassifier(
    scale_pos_weight=scale_pos_weight,
    min_gain_to_split=0.001,
    n_jobs=4,
    **lgbm_params,
)
train_model_lgbm(clf, X)

In [None]:
print_results(clf, X_test)

In [None]:
graph_importances(clf, "LightGBM")

In [None]:
# Class-balanced random forest fitted on the training split of the full
# feature matrix. Rebinds the global `clf`.
clf = RandomForestClassifier(
    class_weight="balanced", 
    random_state=0, 
    **rf_params,
)
clf.fit(X_train, y_train)

In [None]:
# Hold-out metrics on the full-feature test split.
print_results(clf, X_test)

In [None]:
# Saves "RandomForest feature importance.png".
graph_importances(clf, "RandomForest")

In [None]:
# Single class-balanced decision tree fitted on the training split of the full
# feature matrix. Rebinds the global `clf`.
clf = DecisionTreeClassifier(
    class_weight="balanced", 
    random_state=0, 
    max_depth = 25,
    min_samples_split=3,
    criterion="gini",
)
clf.fit(X_train, y_train)

In [None]:
# Hold-out metrics on the full-feature test split.
print_results(clf, X_test)

In [None]:
# Saves "DecisionTree feature importance.png".
graph_importances(clf, "DecisionTree")

# Selected features

In [None]:
feature_freqs.head()

In [None]:
# Give the count column a readable name. The sorted view is only displayed;
# feature_freqs itself keeps its original row order.
feature_freqs.columns = ["freq"]
feature_freqs.sort_values(by=["freq"], ascending=False)

In [None]:
# Features chosen by at least 3 of the 5 selection methods.
top = feature_freqs.loc[feature_freqs.freq >=3]
top.shape

In [None]:
top.to_csv("m3_top.csv")

In [None]:
# new_df only has top selected features
# (columns of df_filtered whose label is in `top`, in df_filtered's column order)
new_df = df_filtered.loc[:, df_filtered.columns.isin(top.index)]
new_df.shape

In [None]:
new_df.head()

In [None]:
# Column labels of the selected-feature matrix, in the same order as X_sel.
features = new_df.columns

In [None]:
X_sel = new_df.values

In [None]:
# Re-uses (and refits) the same StandardScaler object that scaled X, so after
# this cell `scaler` no longer describes the full feature matrix.
# NOTE(review): fitted on all samples before the split below.
X_sel = scaler.fit_transform(X_sel)

In [None]:
# Same test_size, stratification and random_state as the first split.
# Rebinds the global y_train / y_test.
X_sel_train, X_sel_test, y_train, y_test = train_test_split(X_sel, y, test_size=0.2, stratify=y, random_state=0)

In [None]:
clf = lgb.LGBMClassifier(
    scale_pos_weight=scale_pos_weight,
    min_gain_to_split=0.001,
    n_jobs=4,
    **lgbm_params,
)
train_model_lgbm(clf, X_sel)

In [None]:
print_results(clf, X_sel_test)

In [None]:
graph_importances(clf, "LightGBM")

In [None]:
clf = RandomForestClassifier(
    class_weight="balanced", 
    random_state=0, 
    **rf_params,
)
clf.fit(X_sel_train, y_train)

In [None]:
print_results(clf, X_sel_test)

In [None]:
graph_importances(clf, "RandomForest")

In [None]:
clf = DecisionTreeClassifier(
    class_weight="balanced", 
    random_state=0, 
    max_depth = 25,
    min_samples_split=3,
    criterion="gini",
)
clf.fit(X_sel_train, y_train)

In [None]:
print_results(clf, X_sel_test)

In [None]:
graph_importances(clf, "DecisionTree")

# LIME

In [None]:
import lime
import lime.lime_tabular

In [None]:
# The explanations are meant for the LightGBM model, but by this point the
# global `clf` has been rebound to the DecisionTreeClassifier fitted above.
# Fit a dedicated LightGBM model on the selected-feature training split and
# bind the prediction function to it explicitly.
lgbm_sel = lgb.LGBMClassifier(
    scale_pos_weight=scale_pos_weight,
    min_gain_to_split=0.001,
    n_jobs=4,
    **lgbm_params,
).fit(X_sel_train, np.ravel(y_train))


def predict_lgbm(x):
    """Return the LightGBM class probabilities for ``x`` as a float array."""
    return lgbm_sel.predict_proba(x).astype(float)

In [None]:
# Tabular LIME explainer built from the selected-feature training split, with
# the selected column labels as feature names and CTRL/M3 as class names.
lime_explainer = lime.lime_tabular.LimeTabularExplainer(
    X_sel_train,
    training_labels=y_train,
    feature_names=features,
    feature_selection="lasso_path",
    class_names=["CTRL", "M3"],
)

In [None]:
# Explain the first hold-out sample using the 5 most influential features.
exp = lime_explainer.explain_instance(X_sel_test[0], predict_lgbm, num_features=5)

In [None]:
exp.show_in_notebook(show_table=True)

In [None]:
from lime import submodular_pick

In [None]:
# Submodular pick over the training split: up to 5 explanations of 20 features
# each. NOTE(review): method="full" presumably explains every training sample
# rather than a random subset — confirm; this can be slow.
sp = submodular_pick.SubmodularPick(lime_explainer, X_sel_train, predict_lgbm, method="full", num_features=20, num_exps_desired=5)

In [None]:
# Save each picked explanation as a figure named after its position (no file
# extension is given). The loop rebinds the global `exp` from the cell above.
for i, exp in enumerate(sp.sp_explanations):
    fig = exp.as_pyplot_figure(label=exp.available_labels()[0])
    fig.savefig(str(i), bbox_inches='tight')

In [None]:
def _explanations_to_frame(explanations):
    """Build a frame of LIME weights, one row per explanation.

    Each row holds the (feature description -> weight) pairs of the
    explanation's first available label; features absent from an explanation
    get weight 0. The label itself is stored in the 'prediction' column.
    """
    weight_rows = []
    first_labels = []
    for explanation in explanations:
        label = explanation.available_labels()[0]
        weight_rows.append(dict(explanation.as_list(label)))
        first_labels.append(label)

    frame = pd.DataFrame(weight_rows).fillna(0)
    frame['prediction'] = first_labels
    return frame


# Weights of the explanations chosen by the submodular pick.
W_pick = _explanations_to_frame(sp.sp_explanations)

# Weights of all explanations computed for the sampled points.
W = _explanations_to_frame(sp.explanations)


In [None]:
# Aggregate importances: mean absolute LIME weight per feature over all
# explanations, 25 largest, drawn smallest-to-largest.
weights_only = W.drop(columns="prediction")
mean_abs_weight = weights_only.abs().mean(axis=0)
top_overall = mean_abs_weight.sort_values(ascending=False).head(25)
fig = top_overall.sort_values(ascending=True).iplot(kind="barh")

# Aggregate importances split by class: mean weight per feature for each
# explained label, ranked by the absolute mean weight of the first label column.
grped_coeff = W.groupby("prediction").mean().T
grped_coeff["abs"] = grped_coeff.iloc[:, 0].abs()
grped_coeff = grped_coeff.sort_values("abs", ascending=False)

top_by_class = grped_coeff.head(25).sort_values("abs", ascending=True)
top_by_class.drop(columns="abs").iplot(kind="barh", bargap=0.5)


# Learning curves

In [None]:
from sklearn.model_selection import learning_curve

In [None]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot, save and return a learning curve for ``estimator``.

    Parameters
    ----------
    estimator : estimator
        Unfitted scikit-learn compatible estimator; cloned and fitted by
        ``learning_curve``.
    title : str
        Figure title; also the output file name (``title + ".png"``).
    X, y : array-like
        Feature matrix and labels.
    ylim : tuple (ymin, ymax), optional
        Limits of the score axis.
    cv : int, CV splitter or iterable, optional
        Cross-validation strategy forwarded to ``learning_curve``.
    n_jobs : int, default 1
        Parallel jobs forwarded to ``learning_curve``.
    train_sizes : array-like
        Training-set sizes (fractions or absolute counts) to evaluate.

    Returns
    -------
    module
        ``matplotlib.pyplot``, with the new figure as the current figure.
    """
    fig, ax = plt.subplots()
    ax.set_title(title)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_xlabel("Training examples")
    ax.set_ylabel("Score")

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # Mean and spread over the CV folds for every training-set size.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    # Force the grid on: a bare grid() call toggles visibility and would hide
    # the grid that the seaborn "whitegrid" theme already enables.
    ax.grid(True)

    # Shaded bands show +/- one standard deviation around each mean curve.
    ax.fill_between(train_sizes, train_scores_mean - train_scores_std,
                    train_scores_mean + train_scores_std, alpha=0.1,
                    color="r")
    ax.fill_between(train_sizes, test_scores_mean - test_scores_std,
                    test_scores_mean + test_scores_std, alpha=0.1, color="g")
    ax.plot(train_sizes, train_scores_mean, 'o-', color="r",
            label="Training score")
    ax.plot(train_sizes, test_scores_mean, 'o-', color="g",
            label="Cross-validation score")

    ax.legend(loc="best")
    fig.savefig(title + ".png", bbox_inches='tight')
    return plt


In [None]:
# 10-fold stratified CV (default settings) shared by all learning curves.
skf = StratifiedKFold(n_splits=10)

In [None]:
# LightGBM on the full standardized feature matrix.
g = plot_learning_curve(
    lgb.LGBMClassifier(
    scale_pos_weight=scale_pos_weight,
    min_gain_to_split=0.001,
    n_jobs=4,
    **lgbm_params,
    ),
    "LightGBM learning curve (all features)",
    X, y, cv=skf,
)

In [None]:
# LightGBM on the selected-feature matrix.
g = plot_learning_curve(
    lgb.LGBMClassifier(
    scale_pos_weight=scale_pos_weight,
    min_gain_to_split=0.001,
    n_jobs=4,
    **lgbm_params,
    ),
    "LightGBM learning curve (selected features)",
    X_sel, y, cv=skf,
)

In [None]:
# Random forest on the full standardized feature matrix.
g = plot_learning_curve(
    RandomForestClassifier(
    class_weight="balanced", 
    random_state=0, 
    **rf_params,
    ),
    "RandomForest learning curve (all features)",
    X, y, cv=skf,
)

In [None]:
# Random forest on the selected-feature matrix.
g = plot_learning_curve(
    RandomForestClassifier(
    class_weight="balanced", 
    random_state=0, 
    **rf_params,
    ),
    "RandomForest learning curve (selected features)",
    X_sel, y, cv=skf,
)