In [45]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
import pandas as pd
import time

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest, f_classif

In [46]:
def get_x(df: pd.DataFrame) -> pd.DataFrame:
    """Return the feature columns of ``df``.

    The target column (``status``) and the identifier columns listed in
    ``excluded`` are removed; every other column is kept in its original
    order. Names in ``excluded`` that are not present in ``df`` are ignored.

    Args:
        df: Frame holding features together with the target/identifier columns.

    Returns:
        A DataFrame containing only the columns that are not excluded.
    """
    excluded = (
        'status', 'account_id', 'loan_id', 'client_id', 'district_id',
        'disp_id', 'district_id_y', 'district_id_x', 'id',
    )
    # One vectorised membership test instead of nine chained `!=` comparisons.
    return df.loc[:, ~df.columns.isin(excluded)]

def get_y(df: pd.DataFrame) -> pd.Series:
    """Return the target column (``status``) of ``df``.

    Args:
        df: Frame that contains a ``status`` column.

    Returns:
        The ``status`` column as a Series.

    Raises:
        KeyError: If ``df`` has no ``status`` column.
    """
    # Bracket access cannot be shadowed by a DataFrame attribute or method
    # of the same name, unlike `df.status`.
    return df["status"]

In [47]:
# Read the two prepared CSV files. `df_dev` feeds the train/test split below;
# `df_comp` is loaded here but not used in the cells visible in this section
# (presumably the competition/submission set -- confirm).
df_dev, df_comp = (
    pd.read_csv(csv_path)
    for csv_path in ("../dados/ready/dev.csv", "../dados/ready/comp.csv")
)

In [48]:
start = time.time()

# Fixed seed so the split, SMOTE's synthetic samples and the shuffling SVC
# does for its probability estimates are repeatable on Restart & Run All.
RANDOM_STATE = 42

x = get_x(df_dev)
y = get_y(df_dev)

# Hold out 25% of the rows, stratified on the target.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, stratify=y, random_state=RANDOM_STATE
)

# Oversample the training split only; the held-out rows stay untouched.
smp = SMOTE(random_state=RANDOM_STATE)
x_res, y_res = smp.fit_resample(x_train, y_train)

# Keep the 7 best features by ANOVA F-score, fitted on the resampled
# training data.
k_best = SelectKBest(f_classif, k=7)
x_res = k_best.fit_transform(x_res, y_res)

# Standardise the training features exactly once. Transforming them a second
# time with the same scaler would shift/scale already-standardised data.
scaler = StandardScaler()
x_res = scaler.fit_transform(x_res)

# NOTE: `gamma` has no effect with kernel="linear"; it is kept only to leave
# the estimator's configuration as it was.
clf = SVC(gamma='auto', probability=True, kernel="linear", random_state=RANDOM_STATE)
clf.fit(x_res, y_res)

# Selecting columns
cols = k_best.get_support()
x_test = x_test.iloc[:, cols]

# The classifier was fitted on standardised features, so the test features
# must go through the same fitted scaler before prediction. `.to_numpy()`
# matches the array input the scaler was fitted on.
x_test_scaled = scaler.transform(x_test.to_numpy())

# Column 1 holds the probability of the second entry of `clf.classes_`.
predicted = clf.predict_proba(x_test_scaled)[:, 1]
expected = y_test

print(f"score {roc_auc_score(expected.values, predicted)}")