In [1]:
import pandas as pd
import numpy as np
import re
import gc

from tqdm import tqdm
from scipy.sparse import hstack, csr_matrix

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, r2_score

import lightgbm as lgb
from sentence_transformers import SentenceTransformer


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
SEED = 42
np.random.seed(SEED)

N_SPLITS = 5

PRIMARY_WEIGHT = 0.3
SECONDARY_WEIGHT = 0.4
SEVERITY_WEIGHT = 0.3


In [3]:
train_df = pd.read_csv("train_complaints.csv")
test_df = pd.read_csv("test_complaints.csv")

print(train_df.shape)
print(test_df.shape)


(2999, 5)
(499, 2)


In [4]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"<.*?>", " ", text)
    text = re.sub(r"http\S+|www\S+", " ", text)
    text = re.sub(r"\S+@\S+", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

train_df["clean_text"] = train_df["complaint_text"].apply(clean_text)
test_df["clean_text"] = test_df["complaint_text"].apply(clean_text)


In [5]:
train_df["text_len"] = train_df["clean_text"].apply(len)
test_df["text_len"] = test_df["clean_text"].apply(len)

train_df["num_count"] = train_df["clean_text"].str.count(r"\d")
test_df["num_count"] = test_df["clean_text"].str.count(r"\d")

train_df["money_flag"] = train_df["clean_text"].str.contains(r"\$|usd|rs", case=False).astype(int)
test_df["money_flag"] = test_df["clean_text"].str.contains(r"\$|usd|rs", case=False).astype(int)


In [6]:
primary_le = LabelEncoder()
secondary_le = LabelEncoder()

train_df["primary_label"] = primary_le.fit_transform(train_df["primary_category"])
train_df["secondary_label"] = secondary_le.fit_transform(train_df["secondary_category"])

severity = train_df["severity"].values


In [7]:
word_vectorizer = TfidfVectorizer(
    max_features=100000,
    ngram_range=(1,2),
    sublinear_tf=True
)

char_vectorizer = TfidfVectorizer(
    analyzer="char",
    ngram_range=(3,5),
    max_features=70000
)

word_train = word_vectorizer.fit_transform(train_df["clean_text"])
word_test = word_vectorizer.transform(test_df["clean_text"])

char_train = char_vectorizer.fit_transform(train_df["clean_text"])
char_test = char_vectorizer.transform(test_df["clean_text"])

tfidf_train = hstack([word_train, char_train])
tfidf_test = hstack([word_test, char_test])


In [8]:
bert_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def generate_embeddings(texts):
    embeddings = []
    for i in tqdm(range(0, len(texts), 64)):
        batch = texts[i:i+64]
        emb = bert_model.encode(batch, show_progress_bar=False)
        embeddings.append(emb)
    return np.vstack(embeddings)

bert_train = generate_embeddings(train_df["clean_text"].tolist())
bert_test = generate_embeddings(test_df["clean_text"].tolist())

bert_train_sparse = csr_matrix(bert_train)
bert_test_sparse = csr_matrix(bert_test)


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 995.20it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
100%|██████████| 47/47 [00:43<00:00,  1.08it/s]
100%|██████████| 8/8 [00:07<00:00,  1.04it/s]


In [9]:
meta_train = csr_matrix(train_df[["text_len","num_count","money_flag"]].values)
meta_test = csr_matrix(test_df[["text_len","num_count","money_flag"]].values)


In [10]:
X_train = hstack([tfidf_train, bert_train_sparse, meta_train])
X_test = hstack([tfidf_test, bert_test_sparse, meta_test])


In [11]:
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

primary_preds = np.zeros(len(train_df))
secondary_preds = np.zeros(len(train_df))
severity_preds = np.zeros(len(train_df))

primary_oof = np.zeros(len(train_df))
severity_oof = np.zeros(len(train_df))

primary_test_preds = np.zeros((len(test_df), N_SPLITS))
secondary_test_preds = np.zeros((len(test_df), len(secondary_le.classes_), N_SPLITS))
severity_test_preds = np.zeros((len(test_df), N_SPLITS))


In [None]:
for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, train_df["secondary_label"])):

    print(f"\n===== Fold {fold+1} =====")

    X_tr, X_val = X_train[train_idx], X_train[val_idx]

    y_primary_tr = train_df.loc[train_idx, "primary_label"]
    y_secondary_tr = train_df.loc[train_idx, "secondary_label"]
    y_severity_tr = severity[train_idx]

    # PRIMARY
    primary_model = LogisticRegression(max_iter=1000, class_weight="balanced")
    primary_model.fit(X_tr, y_primary_tr)

    primary_preds[val_idx] = primary_model.predict(X_val)
    primary_oof[val_idx] = primary_model.predict(X_val)
    primary_test_preds[:, fold] = primary_model.predict(X_test)

    # SECONDARY
    secondary_model = lgb.LGBMClassifier(
        objective="multiclass",
        n_estimators=800,
        learning_rate=0.04,
        num_leaves=48,
        subsample=0.85,
        colsample_bytree=0.85,
        class_weight="balanced",
        random_state=SEED
    )

    secondary_model.fit(X_tr, y_secondary_tr)
    secondary_preds[val_idx] = secondary_model.predict(X_val)
    secondary_test_preds[:, :, fold] = secondary_model.predict_proba(X_test)

    # SEVERITY
    severity_model = lgb.LGBMRegressor(
        n_estimators=800,
        learning_rate=0.04,
        num_leaves=48,
        subsample=0.85,
        colsample_bytree=0.85,
        random_state=SEED
    )

    severity_model.fit(X_tr, y_severity_tr)
    severity_preds[val_idx] = severity_model.predict(X_val)
    severity_oof[val_idx] = severity_model.predict(X_val)
    severity_test_preds[:, fold] = severity_model.predict(X_test)

    gc.collect()



===== Fold 1 =====


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.480461 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1401002
[LightGBM] [Info] Number of data points in the train set: 2399, number of used features: 31681
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.427069 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1401002
[LightGBM] [Info] Number of data points in the train set: 2399, number of used features: 31681
[LightGBM] [Info] Start training from score 1.962484





===== Fold 2 =====


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.409569 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1385865
[LightGBM] [Info] Number of data points in the train set: 2399, number of used features: 31282
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.403473 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1385865
[LightGBM] [Info] Number of data points in the train set: 2399, number of used features: 31282
[LightGBM] [Info] Start training from score 1.947478





===== Fold 3 =====


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.572783 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1388853
[LightGBM] [Info] Number of data points in the train set: 2399, number of used features: 31431
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.632754 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1388853
[LightGBM] [Info] Number of data points in the train set: 2399, number of used features: 31431
[LightGBM] [Info] Start training from score 1.959983





===== Fold 4 =====


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.589842 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1398400
[LightGBM] [Info] Number of data points in the train set: 2399, number of used features: 31521
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585


In [14]:
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import csr_matrix, hstack

ohe = OneHotEncoder(sparse_output=True, handle_unknown="ignore")

primary_oof_oh = ohe.fit_transform(primary_oof.reshape(-1,1))
severity_oof_feat = csr_matrix(severity_oof.reshape(-1,1))

X_secondary_stack = hstack([X_train, primary_oof_oh, severity_oof_feat])



In [15]:
# ----- TEST STACK FEATURES -----

primary_test_final = np.round(primary_test_preds.mean(axis=1)).astype(int)
severity_test_final = severity_test_preds.mean(axis=1)

primary_test_stack = ohe.transform(primary_test_final.reshape(-1,1))
severity_test_stack = csr_matrix(severity_test_final.reshape(-1,1))

X_secondary_test_stack = hstack([
    X_test,
    primary_test_stack,
    severity_test_stack
])


In [None]:
secondary_stack_model = lgb.LGBMClassifier(
    objective="multiclass",
    n_estimators=900,
    learning_rate=0.035,
    num_leaves=64,
    subsample=0.85,
    colsample_bytree=0.85,
    class_weight="balanced",
    random_state=SEED,
    verbosity=-1
)

secondary_stack_model.fit(
    X_secondary_stack,
    train_df["secondary_label"]
)



NameError: name 'lgb' is not defined

: 