In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel, SelectKBest
from sklearn.feature_selection import VarianceThreshold
from sklearn.cluster import DBSCAN
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge
from sklearn.svm import SVC
from sklearn.metrics import balanced_accuracy_score
from sklearn.ensemble import BaggingClassifier
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.neighbors import LocalOutlierFactor

In [13]:
xtrain = pd.read_csv("X_train.csv")
xtrain = xtrain.iloc[:, 1:]
xtest = pd.read_csv("X_test.csv")
xtest = xtest.iloc[:, 1:]
ytrain = pd.read_csv("Y_train.csv")

# Remove id from ytrain
ytrain = ytrain.drop('id', axis = 1)
ytrain.head()

Unnamed: 0,y
0,1
1,0
2,1
3,1
4,1


In [None]:
kf = KFold(n_splits=5, shuffle = True)

k_values = np.array([600, 550, 500, 450, 400, 350, 300, 250, 200, 150, 100])
BMAC_means_test = np.array([])
BMAC_stds_test = np.array([])
BMAC_means_train = np.array([])
BMAC_stds_train = np.array([])
for kbest in k_values:
    BMAC_scores_test = np.array([])
    BMAC_scores_train = np.array([])
    for train_index, test_index in kf.split(xtrain.values):

        x_train = xtrain.loc[train_index, :]
        x_test = xtrain.loc[test_index, :]

        y_train = ytrain.loc[train_index, :]
        y_test = ytrain.loc[test_index, :]

        # Handle class imbalance with smote function
        sm = SMOTE(random_state=42)
        x_tmp, y_tmp = sm.fit_resample(x_train.values, y_train.values.ravel()) #x_train and y_train are now arrays
        x_train = pd.DataFrame(x_tmp, columns = xtrain.columns)
        y_train = pd.DataFrame(y_tmp, columns = ytrain.columns)
        
        # Scale the data (should this be done for each bootstrap sample? in that case how)
        scaler = preprocessing.StandardScaler()
        xtrain_scaled = scaler.fit_transform(x_train)
        x_train = pd.DataFrame(xtrain_scaled, columns = x_train.columns)
        xtest_scaled = scaler.fit_transform(x_test)
        x_test = pd.DataFrame(xtest_scaled, columns = x_test.columns)
        
        '''
        # Run random forest to select features
        sel = SelectFromModel(RandomForestClassifier(n_estimators = 100))
        sel.fit(x_train, y_train.values.ravel())
        selected_feat = x_train.columns[(sel.get_support())]
        x_train = x_train.loc[:,selected_feat]
        x_test = x_test.loc[:,selected_feat]
        '''
        
        # Try K best
        sel = SelectKBest(k=kbest)
        sel.fit(x_train, y_train.values.ravel())
        selected_feat = x_train.columns[(sel.get_support())]
        x_train = x_train.loc[:,selected_feat]
        x_test = x_test.loc[:,selected_feat]
        
        # Model to fit
        estimator = SVC()
        #estimator = xgb.XGBClassifier(objective="multi:softprob", random_state=40)
        '''
        bagging = BaggingClassifier(base_estimator = estimator,
                                   n_estimators = 20)
        bagging.fit(x_train, y_train)
        bagging_prediction = bagging.predict(x_test)
        BMAC = balanced_accuracy_score(y_test, bagging_prediction)
        '''
        estimator.fit(x_train, y_train.values.ravel())
        pred = estimator.predict(x_test)
        BMAC_test = balanced_accuracy_score(y_test.values.ravel(), pred)
        BMAC_scores_test = np.append(BMAC_scores_test, BMAC_test)
        
        pred = estimator.predict(x_train)
        BMAC_train = balanced_accuracy_score(y_train.values.ravel(), pred)
        BMAC_scores_train = np.append(BMAC_scores_train, BMAC_train)

    BMAC_means_test = np.append(BMAC_means_test, np.mean(BMAC_scores_test))
    BMAC_stds_test = np.append(BMAC_stds_test, np.std(BMAC_scores_test))
    BMAC_means_train = np.append(BMAC_means_train, np.mean(BMAC_scores_train))
    BMAC_stds_train = np.append(BMAC_stds_train, np.std(BMAC_scores_train))

print(BMAC_means_test)
print(BMAC_stds_test)
#print(BMAC_means_train)
#print(BMAC_stds_train)

In [11]:
# Try logistic regression

kf = KFold(n_splits=5, shuffle = True)

c_values = np.array([0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30])
BMAC_means_test = np.array([])
BMAC_stds_test = np.array([])
BMAC_means_train = np.array([])
BMAC_stds_train = np.array([])
for c in c_values:
    BMAC_scores_test = np.array([])
    BMAC_scores_train = np.array([])
    for train_index, test_index in kf.split(xtrain.values):

        x_train = xtrain.loc[train_index, :]
        x_test = xtrain.loc[test_index, :]

        y_train = ytrain.loc[train_index, :]
        y_test = ytrain.loc[test_index, :]

        # Handle class imbalance with smote function
        sm = SMOTE(random_state=42)
        x_tmp, y_tmp = sm.fit_resample(x_train.values, y_train.values.ravel()) #x_train and y_train are now arrays
        x_train = pd.DataFrame(x_tmp, columns = xtrain.columns)
        y_train = pd.DataFrame(y_tmp, columns = ytrain.columns)
        
        # Scale the data (should this be done for each bootstrap sample? in that case how)
        scaler = preprocessing.StandardScaler()
        xtrain_scaled = scaler.fit_transform(x_train)
        x_train = pd.DataFrame(xtrain_scaled, columns = x_train.columns)
        xtest_scaled = scaler.fit_transform(x_test)
        x_test = pd.DataFrame(xtest_scaled, columns = x_test.columns)
        
        '''
        # Run random forest to select features
        sel = SelectFromModel(RandomForestClassifier(n_estimators = 100))
        sel.fit(x_train, y_train.values.ravel())
        selected_feat = x_train.columns[(sel.get_support())]
        x_train = x_train.loc[:,selected_feat]
        x_test = x_test.loc[:,selected_feat]
        
        
        # Try K best
        sel = SelectKBest(k=kbest)
        sel.fit(x_train, y_train.values.ravel())
        selected_feat = x_train.columns[(sel.get_support())]
        x_train = x_train.loc[:,selected_feat]
        x_test = x_test.loc[:,selected_feat]
        '''
        
        # Model to fit
        estimator = LogisticRegression(C=c, multi_class ='multinomial', solver = 'lbfgs', max_iter = 300)
        #estimator = xgb.XGBClassifier(objective="multi:softprob", random_state=40)
        '''
        bagging = BaggingClassifier(base_estimator = estimator,
                                   n_estimators = 20)
        bagging.fit(x_train, y_train)
        bagging_prediction = bagging.predict(x_test)
        BMAC = balanced_accuracy_score(y_test, bagging_prediction)
        '''
        estimator.fit(x_train, y_train.values.ravel())
        pred = estimator.predict(x_test)
        BMAC_test = balanced_accuracy_score(y_test.values.ravel(), pred)
        BMAC_scores_test = np.append(BMAC_scores_test, BMAC_test)
        
        pred = estimator.predict(x_train)
        BMAC_train = balanced_accuracy_score(y_train.values.ravel(), pred)
        BMAC_scores_train = np.append(BMAC_scores_train, BMAC_train)

    BMAC_means_test = np.append(BMAC_means_test, np.mean(BMAC_scores_test))
    BMAC_stds_test = np.append(BMAC_stds_test, np.std(BMAC_scores_test))
    BMAC_means_train = np.append(BMAC_means_train, np.mean(BMAC_scores_train))
    BMAC_stds_train = np.append(BMAC_stds_train, np.std(BMAC_scores_train))

print(BMAC_means_test)
print(BMAC_stds_test)
#print(BMAC_means_train)
#print(BMAC_stds_train)



[0.65475038 0.65466016 0.64087563 0.62456544 0.62852498 0.61756295
 0.62720573 0.60467486]
[0.00751363 0.01648666 0.01288261 0.02034335 0.01719967 0.02319487
 0.01133015 0.01613176]




In [29]:
# Simple pipeline with grid search

steps = [("scaler", preprocessing.StandardScaler()), ("classifier", SVC())]

pipeline = Pipeline(steps = steps)

parameters = {"classifier__kernel": ["rbf"],
              "classifier__gamma": ["auto"],
              "classifier__C": [0.1, 0.5, 1, 5, 10, 50, 100],
              "classifier__degree": [1, 2, 3, 4, 5, 6],
              "classifier__class_weight": ["balanced"]
             }

grid = GridSearchCV(pipeline, parameters, cv = 5, scoring = 'balanced_accuracy')

grid.fit(xtrain.values, ytrain.values.ravel())

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('classifier',
                                        SVC(C=1.0, cache_size=200,
                                            class_weight=None, coef0=0.0,
                                            decision_function_shape='ovr',
                                            degree=3, gamma='auto_deprecated',
                                            kernel='rbf', max_iter=-1,
                                            probability=False,
                                            random_state=None, shrin...,
                                            tol=0.001, verbose=False))],
               

In [31]:
print(grid.best_score_)
print(grid.best_params_)

0.7015740740740741
{'classifier__C': 0.5, 'classifier__class_weight': 'balanced', 'classifier__degree': 1, 'classifier__gamma': 'auto', 'classifier__kernel': 'rbf'}


In [32]:
# Submit model


xtrain = pd.read_csv("X_train.csv")
xtrain = xtrain.iloc[:, 1:]
xtest = pd.read_csv("X_test.csv")
xtest = xtest.iloc[:, 1:]
ytrain = pd.read_csv("Y_train.csv")

# Remove id from ytrain
ytrain = ytrain.drop('id', axis = 1)
ytrain.head()

'''
# Handle class imbalance with smote function
sm = SMOTE(random_state=42)
x_tmp, y_tmp = sm.fit_resample(xtrain.values, ytrain.values.ravel()) #x_train and y_train are now arrays
xtrain = pd.DataFrame(x_tmp, columns = xtrain.columns)
ytrain = pd.DataFrame(y_tmp, columns = ytrain.columns)
'''

# Scale the data (should this be done for each bootstrap sample? in that case how)
scaler = preprocessing.StandardScaler()
xtrain_scaled = scaler.fit_transform(xtrain)
xtrain = pd.DataFrame(xtrain_scaled, columns = xtrain.columns)
xtest_scaled = scaler.fit_transform(xtest)
xtest = pd.DataFrame(xtest_scaled, columns = xtest.columns)

'''
# Try K best
sel = SelectKBest(k=700)
sel.fit(xtrain, ytrain.values.ravel())
selected_feat = xtrain.columns[(sel.get_support())]
xtrain = xtrain.loc[:,selected_feat]
xtest = xtest.loc[:,selected_feat]
'''

# Model to fit
estimator = SVC(C = 0.5, gamma = 'auto', class_weight = 'balanced', degree = 1, kernel = 'rbf')
#estimator = xgb.XGBClassifier(objective="multi:softprob", random_state=40)

estimator.fit(xtrain, ytrain.values.ravel())
pred = estimator.predict(xtest)

index = pd.read_csv("sample.csv")
index['y'] = pred

index.to_csv("predictionsSimpleGridCVSVC.csv")