### Basic configuration

In [45]:
class Config:
    notebook = "Linear/Baseline"
    script = "linear/baseline"

    n_splits = 5
    seed = 42

    # Reka Env
    dir_path = "/home/abe/kaggle/signate-sc2022"

    def is_notebook():
        if 'get_ipython' not in globals():
            return False
        env_name = get_ipython().__class__.__name__  # type: ignore
        if env_name == 'TerminalInteractiveShell':
            return False
        return True

### Import basic libraries

In [46]:
from tqdm.auto import tqdm
import seaborn as sns
import os
import gc
import warnings
import random
from glob import glob
import subprocess
from subprocess import PIPE
import ntpath
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('seaborn-pastel')
sns.set_palette("winter_r")
warnings.filterwarnings('ignore')
tqdm.pandas()

### Seeding

In [47]:
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

seed_everything(Config.seed)

### Path configuration

In [48]:
def path_setup(cfg):
    cfg.INPUT = os.path.join(Config.dir_path, 'input')
    cfg.OUTPUT = os.path.join(Config.dir_path, 'output')
    cfg.SUBMISSION = os.path.join(Config.dir_path, 'submissions')
    cfg.OUTPUT_EXP = os.path.join(cfg.OUTPUT, Config.script)
    cfg.EXP_MODEL = os.path.join(cfg.OUTPUT_EXP, "model")
    cfg.EXP_PREDS = os.path.join(cfg.OUTPUT_EXP, "preds")
    cfg.EXP_FIG = os.path.join(cfg.OUTPUT_EXP, "fig")
    cfg.NOTEBOOK = os.path.join(Config.dir_path, "Notebooks")
    cfg.SCRIPT = os.path.join(Config.dir_path, "scripts")

    # make dir
    for dir in [
            cfg.INPUT,
            cfg.OUTPUT,
            cfg.SUBMISSION,
            cfg.OUTPUT_EXP,
            cfg.EXP_MODEL,
            cfg.EXP_PREDS,
            cfg.EXP_FIG,
            cfg.NOTEBOOK,
            cfg.SCRIPT]:
        os.makedirs(dir, exist_ok=True)
    return cfg

cfg = path_setup(Config)

# Load data

In [49]:
# load data
train = pd.read_csv(os.path.join(cfg.INPUT, 'train.csv'))
test = pd.read_csv(os.path.join(cfg.INPUT, 'test.csv'))
sub = pd.read_csv(os.path.join(cfg.INPUT, 'submit_sample.csv'), header=None)

# preprocess target
train['jobflag'] -= 1

In [50]:
train.head(5)

Unnamed: 0,id,description,jobflag
0,0,<li>Develop cutting-edge web applications that...,2
1,1,"<li> Designs and develops high quality, scalab...",2
2,2,<li>Functions as a point person for Network St...,3
3,3,"<li> Work on the technical design, development...",2
4,4,<li>Quantify the resources required for a task...,3


## TFIDF-Vectorizer

In [51]:
import texthero as hero

def vectorize(train : pd.DataFrame, test : pd.DataFrame):
    
    train['tfidf'] = train['description'].pipe(hero.clean).pipe(hero.tfidf)
    test['tfidf'] = test['description'].pipe(hero.clean).pipe(hero.tfidf)
    return train, test
    
train, test = vectorize(train, test)

In [52]:
assert "tfidf" in train.columns, "tfidf does not exist in train."
assert "tfidf" in test.columns, "tfidf does not exist in test."
# TODO : try GPLVM
train['pca'] = train['tfidf'].pipe(hero.pca)
test['pca'] = test['tfidf'].pipe(hero.pca)

hero.scatterplot(train, 'pca', color='jobflag', title="PCA Description")

In [53]:
train['labels'] = train['tfidf'].pipe(hero.kmeans, n_clusters=4).astype(str)
hero.scatterplot(train, 'pca', color='labels', title="K-means Description")

In [56]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline


def decompose(train: pd.DataFrame, test: pd.DataFrame):
    tfidf_svd = Pipeline(steps=[
        ("TfidfVectorizer", TfidfVectorizer()),
        ("TruncatedSVD", TruncatedSVD(n_components=50, random_state=42))
    ])
    train = tfidf_svd.fit_transform(train['description'].pipe(hero.clean))
    test = tfidf_svd.fit_transform(test['description'].pipe(hero.clean))
    return pd.DataFrame(train), pd.DataFrame(test)

train_feat, test_feat = decompose(train, test)

## LinearSVM

In [68]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

def fit_lsvb(X, y):
    models = []
    scores = []
    
    skf = StratifiedKFold(
        n_splits=cfg.n_splits,
        shuffle=True,
        random_state=cfg.seed)

    for fold, (trn_index, val_index) in enumerate(skf.split(X, y)):
        X_train, y_train = X.iloc[trn_index], y.iloc[trn_index]
        X_valid, y_valid = X.iloc[val_index], y.iloc[val_index]

        model = LinearSVC(
            penalty='l2',
            loss='squared_hinge',
            multi_class='ovr',
            random_state=cfg.seed,
            verbose=True)

        model.fit(X_train, y_train)
        # --------- prediction --------- 
        pred = model.predict(X_valid)
        score = f1_score(y_valid, y_pred=pred ,average='macro')  
        print(f"fold{fold} : {score}")
        
        # --------- save --------- 
        models.append(model)
        scores.append(score)
    
    print("oof score: {}".format(np.mean(scores)))
    return models

In [72]:
models = fit_lsvb(train_feat, train['jobflag'])

[LibLinear]fold0 : 0.6194221684539792
[LibLinear]fold1 : 0.6276027075001605
[LibLinear]fold2 : 0.6589581853809936
[LibLinear]fold3 : 0.600303728529535
[LibLinear]fold4 : 0.6219815052437176
oof score: 0.6256536590216772
