## Setup

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif,mutual_info_classif
import matplotlib.pyplot as plt
import matplotlib
from sklearn.decomposition import PCA
import matplotlib.cm as cm
from sklearn import preprocessing
from sklearn.metrics import balanced_accuracy_score
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
import math
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
y = pd.read_csv("y_train.csv")
y = y.drop(['id'],axis=1)
X = pd.read_csv("X_train.csv")
X = X.drop(['id'],axis=1)

X_normal = X
y_normal = y

df_train = pd.concat([X,y],axis=1)

## Downsampling

In [4]:
df_class0 = df_train[df_train['y']==0]
df_class1 = df_train[df_train['y']==1]
df_class2 = df_train[df_train['y']==2]

num0 = df_class0.shape[0]
num1 = df_class1.shape[0]
num2 = df_class2.shape[0]

df0_down = df_class0.sample(num0)
df1_down = df_class1.sample(num0)
df2_down = df_class2.sample(num0)

df_down = pd.concat([df0_down, df1_down, df2_down]).sample(frac=1)
X_down = df_down.iloc[:,0:1000]
y_down = df_down.iloc[:,1000:1001]

## Upsampling

In [5]:
df_class0 = df_train[df_train['y']==0]
df_class1 = df_train[df_train['y']==1]
df_class2 = df_train[df_train['y']==2]

num0 = df_class0.shape[0]
num1 = df_class1.shape[0]
num2 = df_class2.shape[0]

df0_up = df_class0.sample(num1, replace=True)
df1_up = df_class1.sample(num1)
df2_up = df_class2.sample(num1, replace=True)

df_up = pd.concat([df0_up, df1_up, df2_up]).sample(frac=1)
X_up = df_up.iloc[:,0:1000]
y_up = df_up.iloc[:,1000:1001]

## Loop

In [6]:
def myscoring(clf,X,y):
    pred = clf.predict(X)
    return balanced_accuracy_score(y, pred)

In [7]:
def calculate(ndu, stan, f):
    '''
    ndu = normal or downsampling or undersampling: Default/down/up
    stand = standartize True/False
    f = features selection (how many): Int
    '''
    
    # Which Dataset
    X = X_normal
    y = y_normal
        
    # Standardize 
    if (stan):
        scaler = preprocessing.StandardScaler().fit(X)
        X = scaler.transform(X)
        
    # Feature Selection
    X = SelectKBest(f_classif,k=f).fit_transform(X,np.ravel(y))
    
    X = pd.DataFrame(X).reset_index(drop=True)
    y = pd.DataFrame(y).reset_index(drop=True)
    
    # SVM
    for c in [0.1, 0.5, 0.7, 0.8, 0.9, 1, 1.1, 1.2, 1.3, 5, 10]:
        svc = svm.SVC(kernel='rbf',C=c, random_state=42, class_weight='balanced', decision_function_shape='ovo')
        scores = crossval(svc,X,y,scoring=myscoring,f=f)
        avg = np.average(np.array(scores))
        print(ndu,', stan=',stan,'f=',f,' => SVM: rbf, ovo, c=',c,' crossvalscore: ',avg)
    
    # SVM2
    for c in [0.1, 0.5, 0.7, 0.8, 0.9, 1, 1.1, 1.2, 1.3, 5, 10]:
        svc = svm.SVC(kernel='rbf',C=c, random_state=42, class_weight='balanced', decision_function_shape='ovr')
        scores = crossval(svc,X,y,scoring=myscoring,f=f)
        avg = np.average(np.array(scores))
        print(ndu,', stan=',stan,'f=',f,' => SVM: rbf, ovr, c=',c,' crossvalscore: ',avg)
    
    # Logistic Regression
    for solver in ['liblinear','sag','lbfgs']:
        lr = LogisticRegression(class_weight='balanced', solver=solver)
        scores = crossval(lr,X,y,scoring=myscoring,f=f)
        avg = np.average(np.array(scores))
        print(ndu,', stan=',stan,'f=',f,' => LogReg: solver=',solver,' crossvalscore: ',avg)
    
    # LDA
    for shrinkage in [0.5,0.88,0.75,0.99]:
        lda = LinearDiscriminantAnalysis(solver="lsqr", store_covariance=True,shrinkage=shrinkage)
        scores = crossval(lda,X,y,scoring=myscoring,f=f)
        avg = np.average(np.array(scores))
        print(ndu,', stan=',stan,'f=',f,' => LDA: shrinkage=',shrinkage,' crossvalscore: ',avg)
    
    # RF
    rf = RandomForestClassifier(random_state=0,class_weight="balanced")
    scores = crossval(rf,X,y,scoring=myscoring,f=f)
    avg = np.average(np.array(scores))
    print(ndu,', stan=',stan,'f=',f,' => RF: crossvalscore: ',avg)
    print('-----------------------------------------------------------------------------')

In [8]:
def crossval(clf, X, y, scoring,f):
    scores = []
    for i in range(0,5):
        
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=i)
        X_train = pd.DataFrame(X_train)
        X_val = pd.DataFrame(X_val)
        y_train = pd.DataFrame(y_train)
        y_val = pd.DataFrame(y_val)
        
        df_train = pd.concat([X_train,y_train],axis=1)
        df_class0 = df_train[df_train['y']==0]
        df_class1 = df_train[df_train['y']==1]
        df_class2 = df_train[df_train['y']==2]

        num1 = df_class1.shape[0]

        df0_resampled = df_class0.sample(num1, replace=True)
        df1_resampled = df_class1.sample(num1)
        df2_resampled = df_class2.sample(num1, replace=True)

        df_resampled = pd.concat([df0_resampled, df1_resampled, df2_resampled]).sample(frac=1)
        X_resampled = df_resampled.iloc[:,0:f]
        y_resampled = df_resampled.iloc[:,f:f+1]
        
        
        clf.fit(X_resampled,y_resampled)
        
        score_i = scoring(clf,X_val.to_numpy(),y_val.to_numpy())
        scores.append(score_i)
    
    return scores

In [9]:
for ndu in ['up']:
    for stan in [True,False]:
        for f in [10,50,75,100,200,500,1000]:
            calculate(ndu,stan,f)

up , stan= True f= 10  => SVM: rbf, ovo, c= 0.1  crossvalscore:  0.6335803960416406
up , stan= True f= 10  => SVM: rbf, ovo, c= 0.5  crossvalscore:  0.631126216251563
up , stan= True f= 10  => SVM: rbf, ovo, c= 0.7  crossvalscore:  0.6354173154899895
up , stan= True f= 10  => SVM: rbf, ovo, c= 0.8  crossvalscore:  0.6341351677327183
up , stan= True f= 10  => SVM: rbf, ovo, c= 0.9  crossvalscore:  0.6362332943253138
up , stan= True f= 10  => SVM: rbf, ovo, c= 1  crossvalscore:  0.6348839003254907
up , stan= True f= 10  => SVM: rbf, ovo, c= 1.1  crossvalscore:  0.6341666424945235
up , stan= True f= 10  => SVM: rbf, ovo, c= 1.2  crossvalscore:  0.627155218148048
up , stan= True f= 10  => SVM: rbf, ovo, c= 1.3  crossvalscore:  0.6327306076911893
up , stan= True f= 10  => SVM: rbf, ovo, c= 5  crossvalscore:  0.6351245162618582
up , stan= True f= 10  => SVM: rbf, ovo, c= 10  crossvalscore:  0.6184150375676536
up , stan= True f= 10  => SVM: rbf, ovr, c= 0.1  crossvalscore:  0.6272232681388564