In [None]:

# =============================
# Core Libraries
# =============================
import pandas as pd
import numpy as np
import re
import os
import gc

from tqdm import tqdm
from scipy.sparse import hstack, csr_matrix

# Sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, r2_score

# LightGBM
import lightgbm as lgb

# BERT Embeddings
from sentence_transformers import SentenceTransformer


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
SEED = 42
np.random.seed(SEED)
N_SPLITS = 5

PRIMARY_WEIGHT = 0.3
SECONDARY_WEIGHT = 0.4
SEVERITY_WEIGHT = 0.3


In [3]:
train_df = pd.read_csv("train_complaints.csv")
test_df = pd.read_csv("test_complaints.csv")

print(train_df.shape)
print(test_df.shape)

train_df.head()


(2999, 5)
(499, 2)


Unnamed: 0,complaint_id,complaint_text,primary_category,secondary_category,severity
0,1634299,Back into XXXX of 2010 during this mortgage cr...,Mortgage,"Loan modification,collection,foreclosure",2
1,5505088,I checked my credit report and I am upset on w...,"Credit reporting, credit repair services, or o...",Problem with a credit reporting company's inve...,1
2,10979675,I am writing to dispute the accuracy of the in...,Credit reporting or other personal consumer re...,Problem with a company's investigation into an...,1
3,7520351,A transaction from XXXX XXXX XXXX submitted a ...,Checking or savings account,Managing an account,1
4,5847870,I was recently alerted to an account in collec...,Debt collection,Attempts to collect debt not owed,5


In [6]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"<.*?>", " ", text)
    text = re.sub(r"http\S+|www\S+", " ", text)
    text = re.sub(r"\S+@\S+", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text


train_df["clean_text"] = train_df["complaint_text"].apply(clean_text)
test_df["clean_text"] = test_df["complaint_text"].apply(clean_text)


In [7]:
primary_le = LabelEncoder()
secondary_le = LabelEncoder()

train_df["primary_label"] = primary_le.fit_transform(train_df["primary_category"])
train_df["secondary_label"] = secondary_le.fit_transform(train_df["secondary_category"])

severity = train_df["severity"].values


In [9]:
word_vectorizer = TfidfVectorizer(
    max_features=100000,
    ngram_range=(1,2),
    sublinear_tf=True
)

char_vectorizer = TfidfVectorizer(
    analyzer="char",
    ngram_range=(3,5),
    max_features=50000
)

print("Fitting TF-IDF...")

word_train = word_vectorizer.fit_transform(train_df["clean_text"])
word_test = word_vectorizer.transform(test_df["clean_text"])

char_train = char_vectorizer.fit_transform(train_df["clean_text"])
char_test = char_vectorizer.transform(test_df["clean_text"])

tfidf_train = hstack([word_train, char_train])
tfidf_test = hstack([word_test, char_test])

print("TF-IDF shape:", tfidf_train.shape)


Fitting TF-IDF...
TF-IDF shape: (2999, 150000)


In [10]:
print("Loading MiniLM...")
bert_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def generate_embeddings(texts, batch_size=64):
    embeddings = []
    
    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i+batch_size]
        emb = bert_model.encode(batch, show_progress_bar=False)
        embeddings.append(emb)
        
    return np.vstack(embeddings)


bert_train = generate_embeddings(train_df["clean_text"].tolist())
bert_test = generate_embeddings(test_df["clean_text"].tolist())

print("BERT shape:", bert_train.shape)


Loading MiniLM...


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 425.28it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
100%|██████████| 47/47 [00:46<00:00,  1.00it/s]
100%|██████████| 8/8 [00:07<00:00,  1.05it/s]

BERT shape: (2999, 384)





In [11]:
bert_train_sparse = csr_matrix(bert_train)
bert_test_sparse = csr_matrix(bert_test)

X_train = hstack([tfidf_train, 1.3 * bert_train_sparse])
X_test = hstack([tfidf_test, 1.3 * bert_test_sparse])

print("Final Feature Shape:", X_train.shape)


Final Feature Shape: (2999, 150384)


In [12]:
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)


In [13]:
primary_preds = np.zeros(len(train_df))
secondary_preds = np.zeros(len(train_df))
severity_preds = np.zeros(len(train_df))

primary_test_preds = np.zeros((len(test_df), N_SPLITS))
num_classes = len(secondary_le.classes_)
secondary_test_preds = np.zeros((len(test_df), num_classes, N_SPLITS))
severity_test_preds = np.zeros((len(test_df), N_SPLITS))


In [14]:
for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, train_df["secondary_label"])):

    print(f"\n===== Fold {fold+1} =====")

    X_tr, X_val = X_train[train_idx], X_train[val_idx]

    y_primary_tr = train_df.loc[train_idx, "primary_label"]
    y_secondary_tr = train_df.loc[train_idx, "secondary_label"]
    y_severity_tr = severity[train_idx]

    # ---------------- PRIMARY ----------------
    primary_model = LogisticRegression(
        max_iter=1000,
        class_weight="balanced"
    )

    primary_model.fit(X_tr, y_primary_tr)

    primary_preds[val_idx] = primary_model.predict(X_val)
    primary_test_preds[:, fold] = primary_model.predict(X_test)

    # ---------------- SECONDARY ----------------
    secondary_model = lgb.LGBMClassifier(
        objective="multiclass",
        n_estimators=800,
        learning_rate=0.04,
        num_leaves=48,
        max_depth=-1,
        min_child_samples=20,
        subsample=0.85,
        colsample_bytree=0.85,
        class_weight="balanced",
        random_state=SEED,
        verbosity=-1
    )

    secondary_model.fit(X_tr, y_secondary_tr)

    secondary_preds[val_idx] = secondary_model.predict(X_val)
    secondary_test_preds[:, :, fold] = secondary_model.predict_proba(X_test)

    # ---------------- SEVERITY ----------------
    severity_model = lgb.LGBMRegressor(
        n_estimators=800,
        learning_rate=0.04,
        num_leaves=48,
        subsample=0.85,
        colsample_bytree=0.85,
        random_state=SEED,
        verbosity=-1
    )

    severity_model.fit(X_tr, y_severity_tr)

    severity_preds[val_idx] = severity_model.predict(X_val)
    severity_test_preds[:, fold] = severity_model.predict(X_test)



===== Fold 1 =====





===== Fold 2 =====





===== Fold 3 =====





===== Fold 4 =====





===== Fold 5 =====




In [20]:
primary_acc = accuracy_score(train_df["primary_label"], primary_preds)
secondary_acc = accuracy_score(train_df["secondary_label"], secondary_preds)
severity_r2 = r2_score(severity, severity_preds)

final_score = (
    PRIMARY_WEIGHT * primary_acc +
    SECONDARY_WEIGHT * secondary_acc +
    SEVERITY_WEIGHT * severity_r2
)

print("Primary Accuracy:", primary_acc)
print("Secondary Accuracy:", secondary_acc)
print("Severity R2:", severity_r2)
print("FINAL SCORE:", final_score)


Primary Accuracy: 0.7639213071023675
Secondary Accuracy: 0.6552184061353784
Severity R2: 0.9971679477549317
FINAL SCORE: 0.790414138911341


In [21]:
primary_test_final = np.round(primary_test_preds.mean(axis=1)).astype(int)

# ----- SECONDARY PROBABILITY BOOST -----

# Weighted fold averaging
weights = np.array([0.15, 0.2, 0.2, 0.22, 0.23])

secondary_test_final = np.tensordot(
    secondary_test_preds,
    weights,
    axes=(2, 0)
)

# Temperature smoothing
secondary_test_final = np.power(secondary_test_final, 1.1)

# Re-normalize probabilities
secondary_test_final = secondary_test_final / secondary_test_final.sum(axis=1, keepdims=True)


# ----- SEVERITY -----
severity_test_final = severity_test_preds.mean(axis=1)
severity_test_final = np.clip(severity_test_final, 1, 5)
severity_test_final = np.floor(severity_test_final + 0.55).astype(int)




In [22]:
primary_labels = primary_le.inverse_transform(primary_test_final)
secondary_labels = secondary_le.inverse_transform(
    np.argmax(secondary_test_final, axis=1)
)



In [23]:
submission = pd.DataFrame({
    "complaint_id": test_df["complaint_id"],
    "primary_category": primary_labels,
    "secondary_category": secondary_labels,
    "severity": severity_test_final
})

submission.to_csv("submission.csv", index=False)

submission.head()


Unnamed: 0,complaint_id,primary_category,secondary_category,severity
0,7799230,Credit reporting or other personal consumer re...,Incorrect information on your report,1
1,15754196,Debt collection,Written notification about debt,1
2,10989146,Credit reporting or other personal consumer re...,Problem with a company's investigation into an...,2
3,3617850,"Credit reporting, credit repair services, or o...",Problem with a credit reporting company's inve...,1
4,5253879,Credit reporting or other personal consumer re...,Improper use of your report,4
