In [87]:
#import libraries
from sklearn.feature_selection import SelectPercentile
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn import preprocessing
from sklearn import neighbors
from sklearn import feature_selection
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn import ensemble
from sklearn import neural_network
from sklearn import svm
from sklearn import kernel_ridge
from sklearn.decomposition import KernelPCA
from sklearn.feature_selection import RFECV
import csv
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import KFold
from sklearn.metrics import balanced_accuracy_score
import xgboost as xgb
from sklearn.utils import resample

In [88]:
#define functions for loading data and producing final CSV 

def to_be_eliminated(df, threshold=0.95):
    """Return the columns of ``df`` that are highly correlated with an earlier column.

    The absolute pairwise correlation matrix of ``df`` is computed and only
    its strict upper triangle is inspected, so each pair of columns is looked
    at once and, of two correlated columns, the later one is reported.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are compared with ``DataFrame.corr()``.
    threshold : float, default 0.95
        A column is reported when its absolute correlation with any
        preceding column is strictly greater than this value.

    Returns
    -------
    list
        Column labels to drop, in column order.
    """
    # Create correlation matrix (absolute values: strong negative
    # correlation is treated the same as strong positive correlation).
    corr_matrix = df.corr().abs()
    # Select the strict upper triangle (k=1 excludes the diagonal, which is
    # always 1.0). The builtin ``bool`` is used because the ``np.bool`` alias
    # was removed from NumPy.
    upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
    upper = corr_matrix.where(upper_mask)
    # Entries outside the upper triangle are NaN and never compare greater
    # than the threshold, so they are ignored here.
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    return to_drop

def load_data():
    """Load the training and test datasets and return them as NumPy arrays.

    Reads ``X_train.csv``, ``X_test.csv`` and ``y_train.csv`` from the
    current working directory, removes the ``id`` column from each, shuffles
    the training rows (features and labels together), and drops from both
    feature sets the columns that ``to_be_eliminated`` reports for the
    training features.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train)`` as NumPy arrays.
    """
    features_train = pd.read_csv("X_train.csv")
    features_test = pd.read_csv("X_test.csv")
    labels_train = pd.read_csv("y_train.csv")

    # dropping id column
    features_train = features_train.drop(columns='id')
    features_test = features_test.drop(columns='id')
    labels_train = labels_train.drop(columns='id')

    # reshuffling data: attach the labels as a temporary 'y' column so the
    # features and labels are permuted together, then split them again.
    features_train['y'] = labels_train
    shuffled = features_train.sample(frac=1).reset_index(drop=True)
    labels_train = shuffled.pop('y')
    features_train = shuffled

    # The redundant columns are chosen on the training features only and the
    # same columns are removed from the test features.
    redundant = to_be_eliminated(features_train)
    features_train = features_train.drop(columns=redundant)
    features_test = features_test.drop(columns=redundant)

    return np.array(features_train), np.array(features_test), np.array(labels_train)


def produce_solution(y):
    """Write the predictions ``y`` to ``out.csv`` as the submission file.

    The file starts with the header row ``id,y``; each following row holds
    the position of the prediction (written as a float) and the prediction
    itself. ``y`` must expose ``shape`` and support integer indexing.
    """
    n_rows = y.shape[0]
    with open('out.csv', 'w', newline='') as out_file:
        out = csv.writer(out_file, delimiter=',', lineterminator="\n")
        out.writerow(['id', 'y'])
        # One row per prediction, id written as float (0.0, 1.0, ...).
        out.writerows([float(idx), y[idx]] for idx in range(n_rows))


In [None]:
'''
model assessment via 10 fold CV
'''

#X and y are training x and y data 
#X_test_original corresponds to X_test.csv as given in the task 

X, X_test_original, y = load_data()
y = y.ravel()
scores = np.array([])

kf = KFold(n_splits=10)
# Not used in this cell; kept so that any other cell referring to it still works.
BMAC_scores = np.array([])

for train_index, test_index in kf.split(X):
    #define X_train and y_train as data in training folds (model is fitted here)
    #similarly, X_test, y_test as data in test fold (model is evaluated here)
    X_train, X_test = X[train_index], X[test_index]
    X_train = pd.DataFrame(X_train)
    X_test = pd.DataFrame(X_test)

    y_train, y_test = y[train_index], y[test_index]
    y_train = pd.DataFrame(y_train)
    y_test = pd.DataFrame(y_test)

    y_train.columns = ['y']
    y_test.columns = ['y']

    #oversampling to offset class imbalance
    # NOTE: this frame must NOT be called X. X is the full feature matrix
    # that is indexed with train_index/test_index at the top of every
    # iteration; rebinding it here breaks every fold after the first one.
    train_fold = pd.concat([X_train, y_train], axis=1)

    # separate minority and majority classes
    class_0 = train_fold[train_fold.y == 0]
    class_1 = train_fold[train_fold.y == 1]
    class_2 = train_fold[train_fold.y == 2]

    #upsample minority -- classes 0 and 2
    # Only the training folds are resampled; the held-out fold keeps its
    # original class distribution.
    class_0_upsampled = resample(class_0,
                          replace=True, # sample with replacement
                          n_samples=len(class_1), # match number in majority class
                          random_state=27)
    class_2_upsampled = resample(class_2,
                          replace=True, # sample with replacement
                          n_samples=len(class_1), # match number in majority class
                          random_state=32)

    upsampled = pd.concat([class_1, class_0_upsampled, class_2_upsampled])

    #check new class counts
    #print(upsampled.y.value_counts())

    y_train = upsampled.y
    X_train = upsampled.drop('y', axis=1)

    #1. Zero Mean, Unit Variance
    # The scaler is fitted on the training folds only and then applied to
    # the held-out fold.
    print("Standardize data")
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    #################################################################
    #begin fitting model to training folds -- X_train

    #2. Outlier detection
    print("Outlier Detection")
    # random_state is fixed, like for the other estimators below, so that
    # the set of removed rows is the same on every run.
    isf = IsolationForest(n_estimators=200, contamination=0.005, random_state=42)

    # fit_predict labels inliers with 1 and outliers with -1.
    outliers = isf.fit_predict(X_train)

    # Inlier/outlier counts, kept for interactive inspection.
    unique, counts = np.unique(outliers, return_counts=True)
    count_dict = dict(zip(unique, counts))
    X_train = X_train[outliers == 1]
    y_train = y_train[outliers == 1]

    #3. Feature selection
    print("Feature Selection")
    select = SelectFromModel(RandomForestClassifier(n_estimators=300, random_state=42))
    select.fit(X_train, y_train)
    X_train = select.transform(X_train)

    print("Fitting the model")
    clf = xgb.XGBClassifier(random_state=42, learning_rate=0.6, n_estimators=300, max_depth=10)
    #clf = RandomForestClassifier(n_estimators=300, max_depth=10)
    clf.fit(X_train, y_train)

    #end model fitting on X_train
    ############################################################

    #prediction
    print("Predicting")
    #selecting features based on training results
    X_test = select.transform(X_test)
    pred = clf.predict(X_test)

    #scoring
    score = balanced_accuracy_score(y_test, pred)
    print(score)
    scores = np.append(scores, score)


Standardize data
Outlier Detection




In [None]:
# `scores` holds one balanced accuracy value per CV fold (higher is better),
# so the summary is labelled as an accuracy rather than as an error.
truth = np.mean(scores)
std = np.std(scores)
print("mean balanced accuracy: ", truth, "std: ", std)

In [None]:
#prediction
# `scaler`, `select` and `clf` are the objects left over from the last CV
# fold of the model-assessment cell. The submission features must go through
# the same steps, in the same order, as the data the classifier was fitted
# on: standardize first, then select features.
# A separate name is used so that X_test_original stays untouched and this
# cell can be re-run safely.
X_submission = scaler.transform(X_test_original)
X_submission = select.transform(X_submission)
# The fitted classifier is `clf`; no object named `reg` exists in this notebook.
pred = clf.predict(X_submission)
produce_solution(pred)