### **Library Imports**

In [1]:
import os
import pickle
import numpy as np
import pandas as pd
import random as r
import seaborn as sns
import matplotlib.pyplot as plt

from time import time
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import KFold, GridSearchCV

from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, classification_report

### **Configuration**

In [2]:
class CFG(object):
    def __init__(self,
                 seed: int = 42,
                 n_splits: int = 5,
                 show_info: bool = False,
                 ):

        self.seed = seed
        self.n_splits = n_splits
        self.show_info = show_info
        self.train_data_read_path = "../input/predict-potential-spammers-on-fiverr/train.csv"
        self.test_data_read_path = "../input/predict-potential-spammers-on-fiverr/test.csv"
        self.ss_data_read_path = "../input/predict-potential-spammers-on-fiverr/sample_submission.csv"
        self.model_save_path = "models"
        if not os.path.exists(self.model_save_path): os.makedirs(self.model_save_path)

cfg = CFG(seed=42, show_info=True)

### **Helpers**

In [3]:
def breaker(num: int=50, char: str="*") -> None:
    print("\n" + num*char + "\n")

    
def get_object_columns(df) -> list:
    object_columns: list = []
    for col in df.columns:
        if df[col].dtype == "object":
            object_columns.append(col)

    return object_columns
    

def print_scores(accuracy: float, auc: float, precision: np.ndarray, recall: np.ndarray, f_score: np.ndarray) -> None:
    print(f"Accuracy  : {accuracy:.5f}")
    print(f"ROC-AUC   : {auc:.5f}")
    print(f"Precision : {precision}")
    print(f"Recall    : {recall}")
    print(f"F-Score   : {f_score}")
    

def get_scores(y_true: np.ndarray, y_pred: np.ndarray) -> tuple:
    accuracy = accuracy_score(y_pred, y_true)
    try:
        auc = roc_auc_score(y_pred, y_true)
    except: 
        auc = 0.0
    precision, recall, f_score, _ = precision_recall_fscore_support(y_pred, y_true)

    return accuracy, auc, precision, recall, f_score

### **Model**

In [4]:
class Model(object):
    def __init__(self, model_name: str, preprocessor, seed: int):
        self.model_name = model_name

        if self.model_name == "lgr":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", LogisticRegression(random_state=seed)),
                ]
            )
        
        elif self.model_name == "knc":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", KNeighborsClassifier()),
                ]
            )

        
        elif self.model_name == "dtc":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", DecisionTreeClassifier(random_state=seed)),
                ]
            )

        elif self.model_name == "etc":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", ExtraTreeClassifier(random_state=seed)),
                ]
            )
        
        elif self.model_name == "rfc":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", RandomForestClassifier(random_state=seed)),
                ]
            )
        
        elif self.model_name == "gbc":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", GradientBoostingClassifier(random_state=seed)),
                ]
            )
        
        elif self.model_name == "abc":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", AdaBoostClassifier(random_state=seed)),
                ]
            )
        
        elif self.model_name == "etcs":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", ExtraTreesClassifier(random_state=seed)),
                ]
            )
        
        elif self.model_name == "gnb":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", GaussianNB()),
                ]
            )
        
        elif self.model_name == "xgbc":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", XGBClassifier(random_state=seed)),
                ]
            )

### **Train**

In [5]:
df = pd.read_csv(cfg.train_data_read_path)
df = df.drop(columns=["user_id"])
object_columns = get_object_columns(df)
df = df.drop(columns=object_columns)

if cfg.show_info:
    breaker()
    for val in set(df.label):
        print(f"Class {val} count : {df[df.label == val].shape[0]}")

X = df.iloc[:, :-1].copy().values
y = df.iloc[:, -1].copy().values

features = [i for i in range(X.shape[1])]


**************************************************

Class 0 count : 446477
Class 1 count : 12321


In [6]:
names: list = ["lgr", "knc", "gnb", "dtc", "etc", "abc", "gbc", "etcs", "rfc", "xgbc"]
# names: list = ["lgr"]

feature_transformer = Pipeline(
    steps=[
        ("Simple_Imputer", SimpleImputer(missing_values=np.nan, strategy="mean")),
        ("Standard_Scaler", StandardScaler())
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("features", feature_transformer, features),
    ]
)

best_acc: float = 0.0
for name in names:
    fold = 1
    if cfg.show_info: breaker()
    for tr_idx, va_idx in KFold(n_splits=cfg.n_splits, random_state=cfg.seed, shuffle=True).split(X):
        X_train, X_valid, y_train, y_valid = X[tr_idx], X[va_idx], y[tr_idx], y[va_idx]
        my_pipeline = Model(name, preprocessor, cfg.seed)
        my_pipeline.model.fit(X_train, y_train)

        y_pred = my_pipeline.model.predict(X_valid)
        if cfg.show_info:
            acc, auc, pre, rec, f1 = get_scores(y_valid, y_pred)
            print(f"{my_pipeline.model_name}, {fold}\n")
            print_scores(acc, auc, pre, rec, f1)
            print("")
        else:
            _, auc, _, _, _ = get_scores(y_valid, y_pred)

        if acc > best_acc:
            best_acc = acc
            model_fold_name = f"{name}_{fold}"
            
            with open(os.path.join(cfg.model_save_path, f"best_model.pkl"), "wb") as fp:
                pickle.dump(my_pipeline.model, fp)
        fold += 1
    

if cfg.show_info: 
    breaker()
    print(f"Best Model : {model_fold_name.split('_')[0]}, Best Fold : {model_fold_name.split('_')[1]}")

breaker()


**************************************************



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))


lgr, 1

Accuracy  : 0.99990
ROC-AUC   : 0.00000
Precision : [1. 0.]
Recall    : [0.99990192 0.        ]
F-Score   : [0.99995096 0.        ]



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))


lgr, 2

Accuracy  : 0.99991
ROC-AUC   : 0.00000
Precision : [1. 0.]
Recall    : [0.99991282 0.        ]
F-Score   : [0.99995641 0.        ]



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))


lgr, 3

Accuracy  : 0.99992
ROC-AUC   : 0.00000
Precision : [1. 0.]
Recall    : [0.99992371 0.        ]
F-Score   : [0.99996186 0.        ]



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))


lgr, 4

Accuracy  : 0.99995
ROC-AUC   : 0.00000
Precision : [1. 0.]
Recall    : [0.99994551 0.        ]
F-Score   : [0.99997275 0.        ]



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))


lgr, 5

Accuracy  : 0.99996
ROC-AUC   : 0.00000
Precision : [1. 0.]
Recall    : [0.99995641 0.        ]
F-Score   : [0.9999782 0.       ]


**************************************************



  _warn_prf(average, modifier, msg_start, len(result))


knc, 1

Accuracy  : 0.99990
ROC-AUC   : 0.00000
Precision : [1. 0.]
Recall    : [0.99990192 0.        ]
F-Score   : [0.99995096 0.        ]



  _warn_prf(average, modifier, msg_start, len(result))


knc, 2

Accuracy  : 0.99991
ROC-AUC   : 0.00000
Precision : [1. 0.]
Recall    : [0.99991282 0.        ]
F-Score   : [0.99995641 0.        ]



  _warn_prf(average, modifier, msg_start, len(result))


knc, 3

Accuracy  : 0.99992
ROC-AUC   : 0.00000
Precision : [1. 0.]
Recall    : [0.99992371 0.        ]
F-Score   : [0.99996186 0.        ]



  _warn_prf(average, modifier, msg_start, len(result))


knc, 4

Accuracy  : 0.99995
ROC-AUC   : 0.00000
Precision : [1. 0.]
Recall    : [0.99994551 0.        ]
F-Score   : [0.99997275 0.        ]



  _warn_prf(average, modifier, msg_start, len(result))


knc, 5

Accuracy  : 0.99996
ROC-AUC   : 0.00000
Precision : [1. 0.]
Recall    : [0.99995641 0.        ]
F-Score   : [0.9999782 0.       ]


**************************************************

gnb, 1

Accuracy  : 0.52245
ROC-AUC   : 0.50010
Precision : [0.52240303 1.        ]
Recall    : [1.00000000e+00 2.05343494e-04]
F-Score   : [6.86287424e-01 4.10602673e-04]

gnb, 2

Accuracy  : 0.51372
ROC-AUC   : 0.50009
Precision : [0.51367818 1.        ]
Recall    : [1.00000000e+00 1.79255641e-04]
F-Score   : [6.78715178e-01 3.58447028e-04]

gnb, 3

Accuracy  : 0.51311
ROC-AUC   : 0.50008
Precision : [0.51307314 1.        ]
Recall    : [1.00000000e+00 1.56655626e-04]
F-Score   : [6.78186834e-01 3.13262178e-04]

gnb, 4

Accuracy  : 0.53875
ROC-AUC   : 0.50004
Precision : [0.53873401 0.8       ]
Recall    : [9.99979770e-01 9.45023271e-05]
F-Score   : [7.00225235e-01 1.88982330e-04]

gnb, 5

Accuracy  : 0.52747
ROC-AUC   : 0.50002
Precision : [0.52745899 0.75      ]
Recall    : [9.99979338e-01 6.91

  _warn_prf(average, modifier, msg_start, len(result))


abc, 1

Accuracy  : 0.99990
ROC-AUC   : 0.00000
Precision : [1. 0.]
Recall    : [0.99990192 0.        ]
F-Score   : [0.99995096 0.        ]



  _warn_prf(average, modifier, msg_start, len(result))


abc, 2

Accuracy  : 0.99991
ROC-AUC   : 0.00000
Precision : [1. 0.]
Recall    : [0.99991282 0.        ]
F-Score   : [0.99995641 0.        ]

abc, 3

Accuracy  : 0.99990
ROC-AUC   : 0.49996
Precision : [0.9999782 0.       ]
Recall    : [0.99992371 0.        ]
F-Score   : [0.99995096 0.        ]

abc, 4

Accuracy  : 0.99993
ROC-AUC   : 0.49997
Precision : [0.9999891 0.       ]
Recall    : [0.99994551 0.        ]
F-Score   : [0.9999673 0.       ]

abc, 5

Accuracy  : 0.99995
ROC-AUC   : 0.49998
Precision : [0.9999891 0.       ]
Recall    : [0.99995641 0.        ]
F-Score   : [0.99997275 0.        ]


**************************************************

gbc, 1

Accuracy  : 0.99979
ROC-AUC   : 0.49995
Precision : [0.99989101 0.        ]
Recall    : [0.99990191 0.        ]
F-Score   : [0.99989646 0.        ]

gbc, 2

Accuracy  : 0.99990
ROC-AUC   : 0.49996
Precision : [0.9999891 0.       ]
Recall    : [0.99991282 0.        ]
F-Score   : [0.99995096 0.        ]

gbc, 3

Accuracy  : 0.99991
ROC

  _warn_prf(average, modifier, msg_start, len(result))


etcs, 1

Accuracy  : 0.99990
ROC-AUC   : 0.00000
Precision : [1. 0.]
Recall    : [0.99990192 0.        ]
F-Score   : [0.99995096 0.        ]



  _warn_prf(average, modifier, msg_start, len(result))


etcs, 2

Accuracy  : 0.99991
ROC-AUC   : 0.00000
Precision : [1. 0.]
Recall    : [0.99991282 0.        ]
F-Score   : [0.99995641 0.        ]



  _warn_prf(average, modifier, msg_start, len(result))


etcs, 3

Accuracy  : 0.99992
ROC-AUC   : 0.00000
Precision : [1. 0.]
Recall    : [0.99992371 0.        ]
F-Score   : [0.99996186 0.        ]



  _warn_prf(average, modifier, msg_start, len(result))


etcs, 4

Accuracy  : 0.99995
ROC-AUC   : 0.00000
Precision : [1. 0.]
Recall    : [0.99994551 0.        ]
F-Score   : [0.99997275 0.        ]



  _warn_prf(average, modifier, msg_start, len(result))


etcs, 5

Accuracy  : 0.99996
ROC-AUC   : 0.00000
Precision : [1. 0.]
Recall    : [0.99995641 0.        ]
F-Score   : [0.9999782 0.       ]


**************************************************



  _warn_prf(average, modifier, msg_start, len(result))


rfc, 1

Accuracy  : 0.99990
ROC-AUC   : 0.00000
Precision : [1. 0.]
Recall    : [0.99990192 0.        ]
F-Score   : [0.99995096 0.        ]



  _warn_prf(average, modifier, msg_start, len(result))


rfc, 2

Accuracy  : 0.99991
ROC-AUC   : 0.00000
Precision : [1. 0.]
Recall    : [0.99991282 0.        ]
F-Score   : [0.99995641 0.        ]



  _warn_prf(average, modifier, msg_start, len(result))


rfc, 3

Accuracy  : 0.99992
ROC-AUC   : 0.00000
Precision : [1. 0.]
Recall    : [0.99992371 0.        ]
F-Score   : [0.99996186 0.        ]



  _warn_prf(average, modifier, msg_start, len(result))


rfc, 4

Accuracy  : 0.99995
ROC-AUC   : 0.00000
Precision : [1. 0.]
Recall    : [0.99994551 0.        ]
F-Score   : [0.99997275 0.        ]



  _warn_prf(average, modifier, msg_start, len(result))


rfc, 5

Accuracy  : 0.99996
ROC-AUC   : 0.00000
Precision : [1. 0.]
Recall    : [0.99995641 0.        ]
F-Score   : [0.9999782 0.       ]


**************************************************



  _warn_prf(average, modifier, msg_start, len(result))


xgbc, 1

Accuracy  : 0.99990
ROC-AUC   : 0.00000
Precision : [1. 0.]
Recall    : [0.99990192 0.        ]
F-Score   : [0.99995096 0.        ]



  _warn_prf(average, modifier, msg_start, len(result))


xgbc, 2

Accuracy  : 0.99991
ROC-AUC   : 0.00000
Precision : [1. 0.]
Recall    : [0.99991282 0.        ]
F-Score   : [0.99995641 0.        ]



  _warn_prf(average, modifier, msg_start, len(result))


xgbc, 3

Accuracy  : 0.99992
ROC-AUC   : 0.00000
Precision : [1. 0.]
Recall    : [0.99992371 0.        ]
F-Score   : [0.99996186 0.        ]



  _warn_prf(average, modifier, msg_start, len(result))


xgbc, 4

Accuracy  : 0.99995
ROC-AUC   : 0.00000
Precision : [1. 0.]
Recall    : [0.99994551 0.        ]
F-Score   : [0.99997275 0.        ]

xgbc, 5

Accuracy  : 0.99996
ROC-AUC   : 0.00000
Precision : [1. 0.]
Recall    : [0.99995641 0.        ]
F-Score   : [0.9999782 0.       ]


**************************************************

Best Model : lgr, Best Fold : 5

**************************************************



  _warn_prf(average, modifier, msg_start, len(result))


### **Predict**

In [7]:
df = pd.read_csv(cfg.test_data_read_path)
df = df.drop(columns=["user_id"])

X = df.copy().values

model = pickle.load(open(os.path.join(cfg.model_save_path, f"best_model.pkl"), "rb"))
y_pred = model.predict_proba(X)[:, 1]

ss_df = pd.read_csv(cfg.ss_data_read_path)
ss_df["label"] = y_pred.astype("uint8")
ss_df.to_csv("submission.csv", index=False)