In [1]:
# Do all necessary preprocessing, calling prepro.py
from prepro import *

Using TensorFlow backend.


In [2]:
# Load data
X, X_test_original, y = load_data() 
y = y.ravel()
scores = np.array([])

In [5]:
# Make a SVM pipeline
kf = KFold(n_splits=3, shuffle=True)

estimators = [('standardization', preprocessing.StandardScaler()), 
              ('feature_selection', SelectFromModel(RandomForestClassifier(n_estimators=300, random_state=42))),
              ('classifier', svm.SVC(class_weight='balanced'))
             ]
pipe = Pipeline(estimators)

score = cross_val_score(pipe, X, y, cv=kf, scoring=make_scorer(balanced_accuracy_score))
score

array([0.69330845, 0.67228027, 0.67334642])

In [None]:
# Make an XGBoost pipeline


estimators2 = [('standardization', preprocessing.StandardScaler()), 
              ('feature_selection', SelectFromModel(RandomForestClassifier(n_estimators=300, random_state=42))),
              ('classifier', xgb.XGBClassifier(random_state=42, learning_rate=0.5, n_estimators=300, max_depth=10))
             ]
pipe2 = Pipeline(estimators)
score2 = cross_val_score(pipe, X, y, cv=kf, scoring=make_scorer(balanced_accuracy_score))
score

In [5]:
'''
BEST APPROACH SO FAR
    
Approach 2: 

model assessment via 5 fold CV 
class imbalance is taken care of by undersampling from class 1 
'''

#X and y are training x and y data 
#X_test_original corresponds to X_test.csv as given in the task 

X, X_test_original, y = load_data() 
y = y.ravel()
scores = np.array([])

kf = KFold(n_splits=2)
BMAC_scores = np.array([])

for train_index, test_index in kf.split(X):
    #define X_train and y_train as data in training folds (model is fitted here)
    #similarly, X_test, y_test as data in test fold (model is evaluated here)
    X_train, X_test = X[train_index], X[test_index]
    X_train = pd.DataFrame(X_train)
    X_test = pd.DataFrame(X_test)

    y_train, y_test = y[train_index], y[test_index]
    y_train = pd.DataFrame(y_train)
    y_test = pd.DataFrame(y_test)
    
    y_train.columns = ['y']
    y_test.columns = ['y']
    
    #1. Zero Mean, Unit Variance
    print("Standardize data")
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train))
    X_test = scaler.transform(X_test)
    
    #2. Feature selection 
    print("Feature Selection")
    select = SelectFromModel(RandomForestClassifier(n_estimators=300, random_state=42))
    select.fit(X_train, y_train)
    X_train = pd.DataFrame(select.transform(X_train))
   
    '''
    #3. Outlier detection
    print("Outlier Detection")
    isf = IsolationForest(n_estimators=300, contamination=0.38)
    outliers = isf.fit_predict(X_train)
    unique, counts = np.unique(outliers, return_counts=True)
    count_dict = dict(zip(unique, counts))
    X_train = X_train[outliers == 1]
    y_train = y_train[outliers == 1]
    #DBScan = DBSCAN(eps = .5, metric=”euclidean”,min_samples = 30, n_jobs = -1)    
    #outliers = DBScan.fit_predict(X_train)
    '''
    
    '''
    #4. Undersampling from class 1 to offset class imbalance
    print('Undersampling')
    X_concat = pd.concat([X_train, y_train], axis=1)
    # separate minority and majority classes
    class_0 = X_concat[X_concat.y==0]
    class_1 = X_concat[X_concat.y==1]
    class_2 = X_concat[X_concat.y==2]

    #undersample majority class (1)
    class_1_under = resample(class_1,
                          replace=False, # sample with replacement
                          n_samples=len(class_0), # match number in minority classes
                          random_state=27) 
    undersampled = pd.concat([class_1_under, class_0, class_2])
    y_train = undersampled.y
    X_train = undersampled.drop('y', axis=1)
    '''
    #5. fitting model
    print("Fitting the model")
    #clf = xgb.XGBClassifier(random_state=42, learning_rate=0.5, n_estimators=300, max_depth=10)
    #clf = RandomForestClassifier(n_estimators=300, max_depth=10)
    class_weight = y_train.shape[0] / (3 * np.bincount((y_train.iloc[:,0]).astype(int)))
    class_weights0 = { 
    0 : class_weight[0],
    1 : class_weight[1],
    2 : class_weight[2]
    }
    
    ########## BO
    def classifier(c0_weight=class_weight[0], c1_weight=class_weight[1], c2_weight=class_weight[2], 
                   xtrain=X_train, ytrain=y_train, xtest=X_test, ytest=y_test):
        class_weights1 = { 
        0 : c0_weight,
        1 : c1_weight,
        2 : c2_weight
        }
        clf = svm.SVC(class_weight=class_weights1)

        clf.fit(xtrain, ytrain)

        #6. prediction 
        #print("Predicting")
        #selecting features based on training results
        #_test_selected = pd.DataFrame(select.transform(xtest))  #note: transform was previosuly fitted on training folds
        pred = clf.predict(xtrain)

        #scoring
        score = balanced_accuracy_score(ytrain, pred)
        #print(score)
        #scores = np.append(scores,score)
        return score


    # specify parameters and distributions to sample from
    param_dist = {"c0_weight": (0, 15), "c1_weight": (0, 12), "c2_weight": (0, 15)}

    optimizer = BayesianOptimization(
        f=classifier,
        pbounds=param_dist,
        verbose=2,
        random_state=5,
    )

    probe_params = {"c0_weight": class_weight[0], "c1_weight": class_weight[1], "c2_weight": class_weight[2]}
    optimizer.probe(
        params=probe_params,
        lazy=True
    )

    optimizer.maximize(
        init_points=3,
        n_iter=350,
    )

    print(optimizer.max)
    

    ########## BO
    class_weights_test = { 
    0 : optimizer.max['params']['c0_weight'],
    1 : optimizer.max['params']['c1_weight'],
    2 : optimizer.max['params']['c2_weight']
    }
    clf2 = svm.SVC(class_weight=class_weights_test)
    clf2.fit(X_train, y_train)
    print("Predicting")
    #selecting features based on training results
    X_test_selected = pd.DataFrame(select.transform(X_test))  #note: transform was previosuly fitted on training folds
    pred2 = clf2.predict(X_test_selected)
    #scoring
    score2 = balanced_accuracy_score(y_test, pred2)
    print('Test score:', score2)    
    scores = np.append(scores,score2)
    
    
##########################################################

truth = np.mean(scores)
std = np.std(scores)
print("mean expected error: ", truth, "std: ", std)

Standardize data
Feature Selection
Fitting the model
|   iter    |  target   | c0_weight | c1_weight | c2_weight |
-------------------------------------------------------------
| [0m 1       [0m | [0m 0.8346  [0m | [0m 2.634   [0m | [0m 0.4472  [0m | [0m 2.602   [0m |
| [0m 2       [0m | [0m 0.6256  [0m | [0m 1.776   [0m | [0m 4.354   [0m | [0m 1.654   [0m |
| [95m 3       [0m | [95m 0.9062  [0m | [95m 7.349   [0m | [95m 2.442   [0m | [95m 4.894   [0m |
| [0m 4       [0m | [0m 0.8347  [0m | [0m 6.127   [0m | [0m 2.592   [0m | [0m 2.374   [0m |
| [0m 5       [0m | [0m 0.6345  [0m | [0m 8.0     [0m | [0m 0.0     [0m | [0m 8.0     [0m |
| [0m 6       [0m | [0m 0.6374  [0m | [0m 0.0     [0m | [0m 5.0     [0m | [0m 8.0     [0m |
| [95m 7       [0m | [95m 0.9194  [0m | [95m 8.0     [0m | [95m 5.0     [0m | [95m 8.0     [0m |
| [0m 8       [0m | [0m 0.3333  [0m | [0m 8.0     [0m | [0m 0.0     [0m | [0m 0.0     [0m

In [49]:
'''
Approach 4: 

model assessment via 5 fold CV 
class imbalance is taken care of by undersampling from class 1 
'''

#X and y are training x and y data 
#X_test_original corresponds to X_test.csv as given in the task 

X, X_test_original, y = load_data() 
y = y.ravel()
scores = np.array([])

kf = KFold(n_splits=5)
BMAC_scores = np.array([])

for train_index, test_index in kf.split(X):
    #define X_train and y_train as data in training folds (model is fitted here)
    #similarly, X_test, y_test as data in test fold (model is evaluated here)
    X_train, X_test = X[train_index], X[test_index]
    X_train = pd.DataFrame(X_train)
    X_test = pd.DataFrame(X_test)

    y_train, y_test = y[train_index], y[test_index]
    y_train = pd.DataFrame(y_train)
    y_test = pd.DataFrame(y_test)
    
    y_train.columns = ['y']
    y_test.columns = ['y']
    
    smote = SMOTE('minority')
    X_train, y_train = smote.fit_sample(X_train, y_train)

    '''
    #undersampling from class 1 to offset class imbalance
    X_concat = pd.concat([X_train, y_train], axis=1)
    
    # separate minority and majority classes
    class_0 = X_concat[X_concat.y==0]
    class_1 = X_concat[X_concat.y==1]
    class_2 = X_concat[X_concat.y==2]

    #upsample minority -- classes 0 and 2
    class_1_under = resample(class_1,
                          replace=False, # sample with replacement
                          n_samples=len(class_0), # match number in minority classes
                          random_state=27) 

    undersampled = pd.concat([class_1_under, class_0, class_2])
   
    y_train = undersampled.y
    X_train = undersampled.drop('y', axis=1)
    '''
    
    
    #1. Zero Mean, Unit Variance
    print("Standardize data")
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    
#################################################################
#begin fitting model to training folds -- X_train 

    #2. Feature selection 
    print("Feature Selection")
    select = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42))
    select.fit(X_train, y_train)
    X_train = select.transform(X_train)

    #3. Outlier detection
    print("Outlier Detection")
    isf = IsolationForest(n_estimators=100, contamination=0.35)
    outliers = isf.fit_predict(X_train)
    
    #DBScan = DBSCAN(eps = .5, metric=”euclidean”,min_samples = 30, n_jobs = -1)    
    #outliers = DBScan.fit_predict(X_train)

    unique, counts = np.unique(outliers, return_counts=True)
    count_dict = dict(zip(unique, counts))
    X_train = X_train[outliers == 1]
    y_train = y_train[outliers == 1]
    
    print("Fitting the model")
    clf = xgb.XGBClassifier(random_state=42, learning_rate=0.5, n_estimators=100, max_depth=10)
    #clf = RandomForestClassifier(n_estimators=300, max_depth=10)
    clf.fit(X_train, y_train)
    
#end model fitting on X_train
############################################################
        
    #prediction 
    print("Predicting")
    #selecting features based on training results
    X_test = select.transform(X_test)
    pred = clf.predict(X_test)
    
    #scoring
    score = balanced_accuracy_score(y_test, pred)
    print(score)
    scores = np.append(scores,score)

##########################################################

truth = np.mean(scores)
std = np.std(scores)
print("mean expected error: ", truth, "std: ", std)

Standardize data
Feature Selection
Outlier Detection
Fitting the model
Predicting
0.5357672053022566
Standardize data
Feature Selection


KeyboardInterrupt: 