In [1]:
# note

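# 1. Each model outputs a probability for the positive class (label 1).
# 2. Test samples are ranked by that probability: the higher the probability, the higher the rank.
# 3. F1 and AUC are then calculated from this ranking.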

# load data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

%matplotlib inline

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.grid_search import GridSearchCV

from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

In [3]:
# Previously saved AUC and F1 result tables.
res_auc, res_f1 = (
    pd.read_csv(path) for path in ("../result/auc.csv", "../result/f1.csv")
)

In [4]:
# The unnamed first column of auc.csv lists the datasets to evaluate
# (its values are used to build the per-dataset CSV file names).
data_name = res_auc.loc[:, "Unnamed: 0"]

# main 

In [8]:
%%time

rf_res_auc = []
rf_res_f1 = []

logistic_res_auc = []
logistic_res_f1 = []

for i in data_name:
    print(i)
    
    df = pd.read_csv("{}_training.csv".format(i))
    testing = pd.read_csv("{}_testing.csv".format(i))
    
    auc, f1 = rf(df, testing)
    rf_res_auc.append(auc)
    rf_res_f1.append(f1)
    
    auc, f1 = logistic(df, testing)
    logistic_res_auc.append(auc)
    logistic_res_f1.append(f1)
    
    

Celegans
facebook
NS
PB
Power
Router
USAir
Yeast
CPU times: user 1min 12s, sys: 3.25 s, total: 1min 15s
Wall time: 8min 33s


<matplotlib.figure.Figure at 0x7f34905337b8>

<matplotlib.figure.Figure at 0x7f345c714208>

<matplotlib.figure.Figure at 0x7f345f0cc1d0>

<matplotlib.figure.Figure at 0x7f3497f72e10>

<matplotlib.figure.Figure at 0x7f345c26c400>

<matplotlib.figure.Figure at 0x7f345c267b00>

<matplotlib.figure.Figure at 0x7f345c273c18>

<matplotlib.figure.Figure at 0x7f345c275cc0>

In [9]:
# Attach one column per model to each result table ...
for column, auc_values, f1_values in (
    ("rf", rf_res_auc, rf_res_f1),
    ("logistic", logistic_res_auc, logistic_res_f1),
):
    res_auc[column] = auc_values
    res_f1[column] = f1_values

# ... and write both tables back to the files they were loaded from.
for table, path in ((res_auc, "../result/auc.csv"), (res_f1, "../result/f1.csv")):
    table.to_csv(path, index=False)

# random forest

In [6]:
def rf(df, testing, name=None, random_state=None):
    """Grid-search a random forest, score it on the test split and save a
    feature-importance bar chart.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data. Every column except the last is used as a feature;
        the target is read from the "label" column.
    testing : pandas.DataFrame
        Test data with the same layout as ``df``.
    name : str, optional
        Prefix of the saved plot, "<name>_feature_importance.png". When
        omitted, the notebook-global loop variable ``i`` is used, which is
        what this function has always done (NameError if ``i`` is undefined).
    random_state : int or None, optional
        Seed forwarded to ``RandomForestClassifier``. The default ``None``
        keeps the previous unseeded behaviour.

    Returns
    -------
    (auc, f1) : tuple of float
        ``auc`` is the ROC AUC of the predicted probability of class 1.
        ``f1`` is the fraction of the top-k ranked test rows that are true
        positives with a non-zero predicted probability, where k is the
        number of rows labelled 1 in ``testing``.
    """
    param_grid = {
        'n_estimators': [100, 200, 300, 500],
        'max_features': ['sqrt', 'log2'],
        'max_depth': [4, 5, 6, 7, 8],
        'criterion': ['gini', 'entropy'],
    }
    clf = GridSearchCV(
        estimator=RandomForestClassifier(random_state=random_state),
        param_grid=param_grid,
        cv=3,
        n_jobs=-1,
    )
    clf.fit(df.iloc[:, :-1], df["label"])

    # Features are all columns but the last one, mirroring the training slice.
    num_of_features = len(testing.columns) - 1
    test_features = testing.iloc[:, :num_of_features]
    test_labels = testing["label"]

    # Probability of the positive class, wherever class 1 sits in classes_.
    positive_col = np.where(clf.classes_ == 1)[0][0]
    y_pred = clf.predict_proba(test_features)[:, positive_col]

    # auc
    auc = roc_auc_score(test_labels, y_pred)

    # f1: precision within the top-k predictions, k = number of positives.
    at = test_labels.value_counts()[1]
    # NOTE(review): sorting (probability, label) tuples in descending order
    # places label 1 before label 0 among equal probabilities, so ties are
    # resolved in the model's favour. Kept as-is to leave scores unchanged.
    ranked = sorted(zip(y_pred, test_labels), reverse=True)
    correct = sum(1 for prob, label in ranked[:at] if label == 1 and prob != 0)
    f1 = correct / at

    # feature_importance_figure_output
    importances = clf.best_estimator_.feature_importances_
    order = np.argsort(importances)[::-1]  # most important first
    names = [test_features.columns[idx] for idx in order]

    if name is None:
        # Backward compatibility: fall back to the notebook-global loop
        # variable `i` that the original implementation relied on.
        name = i

    # Create exactly one figure and close that same figure. The previous
    # plt.figure() + plt.figure(figsize=...) pair opened two figures and
    # plt.close() only closed the second, leaking an empty figure per call.
    positions = np.arange(num_of_features)
    fig, ax = plt.subplots(figsize=(15, 9))
    ax.set_title("Feature Importance")
    ax.bar(positions, importances[order])
    ax.set_xticks(positions)
    ax.set_xticklabels(names, rotation=30)
    fig.savefig("{}_feature_importance.png".format(name))
    plt.close(fig)  # not showing the plot

    return auc, f1

# logistic regression 

In [7]:
def logistic(df, testing):
    """Grid-search a logistic regression and score it on the test split.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data. Every column except the last is used as a feature;
        the target is read from the "label" column.
    testing : pandas.DataFrame
        Test data with the same layout as ``df``.

    Returns
    -------
    (auc, f1) : tuple of float
        ``auc`` is the ROC AUC of the predicted probability of class 1.
        ``f1`` is the fraction of the top-k ranked test rows that are true
        positives with a non-zero predicted probability, where k is the
        number of rows labelled 1 in ``testing``.
    """
    param_grid = {
        'penalty': ['l1', 'l2'],
        'C': np.logspace(0, 4, 15),
    }
    # The grid searches both l1 and l2 penalties, so pin a solver that
    # supports both. Relying on the library default breaks on scikit-learn
    # releases whose default solver is lbfgs: every 'l1' candidate would
    # fail to fit. On releases where liblinear was already the default this
    # is a no-op.
    model = LogisticRegression(solver="liblinear")
    clf = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1)
    clf.fit(df.iloc[:, :-1], df["label"])

    # Features are all columns but the last one, mirroring the training slice.
    num_of_features = len(testing.columns) - 1
    test_labels = testing["label"]

    # Probability of the positive class, wherever class 1 sits in classes_.
    positive_col = np.where(clf.classes_ == 1)[0][0]
    y_pred = clf.predict_proba(testing.iloc[:, :num_of_features])[:, positive_col]

    # auc
    auc = roc_auc_score(test_labels, y_pred)

    # f1: precision within the top-k predictions, k = number of positives.
    at = test_labels.value_counts()[1]
    # NOTE(review): sorting (probability, label) tuples in descending order
    # places label 1 before label 0 among equal probabilities, so ties are
    # resolved in the model's favour. Kept as-is to leave scores unchanged.
    ranked = sorted(zip(y_pred, test_labels), reverse=True)
    correct = sum(1 for prob, label in ranked[:at] if label == 1 and prob != 0)
    f1 = correct / at

    return auc, f1

# svm

In [None]:
# Discarded: SVC is very time-consuming when predict_proba (probability=True) is required; see:
# https://stackoverflow.com/questions/15111408/how-does-sklearn-svm-svcs-function-predict-proba-work-internally

In [29]:
def svm(df, testing):
    """Grid-search a linear-kernel SVC with probability estimates enabled and
    score it on the test split.

    ``df`` and ``testing`` are DataFrames whose last column is left out of
    the features and whose "label" column holds the target.

    Returns ``(auc, f1)``: the ROC AUC of the class-1 probability, and the
    share of the top-k ranked test rows that are labelled 1 with a non-zero
    probability, where k is the number of rows labelled 1 in ``testing``.
    """
    # Only the linear kernel is searched; an rbf grid
    # ({'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']})
    # was tried and left disabled.
    search_space = {
        'C': [1, 10, 100, 1000],
        'kernel': ['linear'],
        "probability": [True],
    }
    clf = GridSearchCV(estimator=SVC(), param_grid=search_space, cv=3, n_jobs=-1)
    clf.fit(df.iloc[:, :-1], df["label"])

    # Every column but the last is a feature.
    num_of_features = len(testing.columns) - 1
    held_out_X = testing.iloc[:, :num_of_features]
    held_out_y = testing["label"]

    # Column of predict_proba that corresponds to class 1.
    positive_col = np.where(clf.classes_ == 1)[0][0]
    y_pred = clf.predict_proba(held_out_X)[:, positive_col]

    # auc
    auc = roc_auc_score(held_out_y, y_pred)

    # f1: hits among the `at` highest-ranked rows, `at` = number of positives.
    at = held_out_y.value_counts()[1]
    ranking = sorted(zip(y_pred, held_out_y), reverse=True)
    hits = [1 for score, truth in ranking[:at] if truth == 1 and score != 0]
    f1 = len(hits) / at

    return auc, f1