In [None]:
!pip install transformers torch wandb tqdm lightgbm

In [None]:
import pandas as pd
import lightgbm as lgb
import numpy as np
import os
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils.class_weight import compute_sample_weight
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import OneCycleLR
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from tqdm.notebook import tqdm
import wandb

print(torch.__version__)

# Prefer the GPU and fall back to the CPU when CUDA is unavailable.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("All good")
    torch.cuda.empty_cache()
else:
    print("No GPU!!!")

In [None]:
# Authenticate with Weights & Biases and open a run for this experiment.
wandb.login()
wandb.init(
    project="FSPD",
    config={
        "architecture": "LightGBM",
        "epochs": 100,
        "batch_size": "NA",
        "learning_rate": 0.05,
        # Must be the string "NA" (like batch_size): a bare NA is an undefined
        # name and raised NameError before the run could start.
        "hidden_size": "NA",
    },
)

In [None]:
# Directory intended for model checkpoints (relative to the current working directory).
checkpoint_dir = "kaggle/ModelCheckpoints"
# exist_ok=True keeps this cell re-runnable when the directory already exists.
os.makedirs(checkpoint_dir, exist_ok=True)

In [None]:
# Create clean_fspd function. This function will take in the fspd dataframe and return a cleaned version of it.

def clean_fspd(fspd_f):
    """Return a cleaned version of the FSPD dataframe.

    Columns that are not used downstream are removed, and placeholder values
    in a handful of columns are swapped for numeric codes. The result is a new
    dataframe; the one passed in is not modified.
    """
    unused_columns = [
        "lever", "itype", "source1link", "framework", "iso", "region_wb",
        "income_group2", "defn", "initialdate", "inclusion", "envitarget",
        "diethealth",
    ]
    cleaned = fspd_f.drop(columns=unused_columns)

    # (column, value to replace, replacement) -- applied from top to bottom.
    replacements = [
        ("covid_mentioned", np.nan, 0),  # missing -> 0
        ("targeted", "", 0),             # blank   -> 0
        ("targeted", "N", 0),            # "N"     -> 0
        ("targeted", "Y", 1),            # "Y"     -> 1
        ("policy_code", np.nan, 0),      # missing -> 0
        ("y_end", np.nan, 0),            # missing -> 0
        ("y_start", "", 0),              # blank   -> 0
    ]
    for column, old_value, new_value in replacements:
        cleaned[column] = cleaned[column].replace(old_value, new_value)

    return cleaned



def encode_fspd(fspd_f):
    """One-hot encode the categorical columns of the FSPD dataframe.

    Each column listed below is replaced by one indicator column per distinct
    value; all other columns are passed through unchanged.
    """
    categorical_columns = [
        "country",
        "db",
        "policy_code",
        "y_start",
        "y_end",
        "income_group",
        "fsd_group",
    ]
    return pd.get_dummies(fspd_f, columns=categorical_columns)


def get_non_text_features(batch_data, non_text_features):
    """Look up the non-text feature rows that belong to one batch.

    Args:
        batch_data: Batch dict whose "index" entry is a CPU tensor holding the
            row labels to fetch from ``non_text_features``.
        non_text_features: DataFrame of numeric/boolean feature columns.

    Returns:
        A float32 tensor with one row per entry of ``batch_data["index"]``, in
        the same order.
    """
    batch_indices = batch_data["index"].numpy()
    batch_non_text_features = non_text_features.loc[batch_indices]
    # Convert explicitly to float32: `.values` on a frame that mixes boolean
    # dummy columns with numeric ones yields an object array, which
    # torch.tensor cannot convert.
    batch_array = batch_non_text_features.to_numpy(dtype=np.float32)
    return torch.tensor(batch_array, dtype=torch.float32)


def compute_class_weights(y):
    """Map every distinct label in ``y`` to its 'balanced' class weight."""
    classes = np.unique(y)
    weights = compute_class_weight('balanced', classes=classes, y=y)
    return {label: weight for label, weight in zip(classes, weights)}


def get_sample_weights(y, class_weights):
    """Expand per-class weights into one weight per sample of ``y``.

    Raises KeyError if a label in ``y`` has no entry in ``class_weights``.
    """
    per_sample = list(map(class_weights.__getitem__, y))
    return np.array(per_sample)

In [None]:
class FSPData(Dataset):
    """Dataset over the tokenized FSPD rows and their segment labels.

    ``data`` is a DataFrame holding the ``*_tokens`` columns and
    ``target_segment`` is a sequence of labels aligned with it by position.
    """

    def __init__(self, data, target_segment):
        self.data = data
        self.target_segment = target_segment

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample at position ``idx`` as a dict.

        "index" is the position inside this dataset (not the DataFrame's index
        label); the text entries are taken from the matching ``*_tokens``
        columns; "segment" is the label at the same position.
        """
        row = self.data.iloc[idx]
        # (key in the returned sample, DataFrame column it is read from)
        text_fields = (
            ("policydecision_details", "policydecision_details_tokens"),
            ("policy_description", "policy_description_tokens"),
            ("contextoradditionalinfo", "contextoradditionalinfo_tokens"),
            ("source1name", "source1name_tokens"),
        )

        sample = {"index": torch.tensor(idx, dtype=torch.long)}
        for key, column in text_fields:
            sample[key] = row[column]
        sample["segment"] = torch.tensor(self.target_segment[idx], dtype=torch.long)
        return sample


In [None]:
def extract_features(data_loader):
    """Turn every batch of ``data_loader`` into one feature matrix plus labels.

    For each sample, the embedding at sequence position 0 of
    ``last_hidden_state`` is taken for each of the four text fields, the four
    vectors are concatenated, and the sample's non-text features are appended.

    Relies on the notebook-level ``bert_model``, ``device`` and
    ``non_text_features``.

    Args:
        data_loader: DataLoader over an ``FSPData`` dataset. Its
            ``dataset.data`` DataFrame must contain every column of
            ``non_text_features``.

    Returns:
        ``(features, labels)``: a 2-D array with one row per sample and a 1-D
        array with the matching "segment" labels, both in loader order.
    """
    text_fields = (
        "policydecision_details",
        "policy_description",
        "contextoradditionalinfo",
        "source1name",
    )
    # Token lists are right-padded with id 0 before being turned into tensors;
    # those positions must be masked out so they do not influence the
    # embeddings of the real tokens.
    pad_token_id = 0

    # The dataset reports each sample's position *within its own split*, while
    # the notebook-level non_text_features table is numbered over the whole
    # dataset. Looking positions up there would fetch rows of unrelated
    # samples, so rebuild the table from this split's own rows instead.
    split_frame = data_loader.dataset.data
    split_non_text_features = split_frame[list(non_text_features.columns)].reset_index(drop=True)

    features = []
    labels = []

    with torch.no_grad():
        for batch_data in tqdm(data_loader, desc="Extracting features"):
            # One embedding per text field, taken at sequence position 0
            field_embeddings = []
            for field in text_fields:
                input_ids = batch_data[field].to(device)
                attention_mask = (input_ids != pad_token_id).long()
                outputs = bert_model(input_ids, attention_mask=attention_mask)
                field_embeddings.append(outputs.last_hidden_state[:, 0, :])
            combined_embeddings = torch.cat(field_embeddings, dim=1)

            # Append the non-text features of exactly these samples
            batch_non_text_features = get_non_text_features(batch_data, split_non_text_features)
            batch_non_text_features = batch_non_text_features.to(device)
            combined_features = torch.cat((combined_embeddings, batch_non_text_features), dim=1)

            features.append(combined_features.cpu().numpy())
            labels.append(batch_data["segment"].cpu().numpy())

    features = np.vstack(features)
    labels = np.hstack(labels)
    return features, labels

In [None]:
# Load the raw FSPD table from the Kaggle input mount; the "id" column becomes the row index.
fspd_f = pd.read_stata("/kaggle/input/fspdata/FSPD.dta", index_col="id")

In [None]:
# Drop unused columns and replace placeholder values, then one-hot encode the categorical columns.
fspd = clean_fspd(fspd_f)
encfspd = encode_fspd(fspd)

In [None]:
from transformers import DistilBertTokenizer, DistilBertModel 

# Load the pretrained DistilBERT tokenizer and model, and place the model on `device`
pretrained_model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(pretrained_model_name)
bert_model = DistilBertModel.from_pretrained(pretrained_model_name).to(device)

In [None]:
# Text column -> maximum number of token ids kept for it (longer texts are truncated).
token_limits = {
    "policydecision_details": 128,
    "policy_description": 128,
    "contextoradditionalinfo": 96,
    "source1name": 8,
}

# Each text column gets a sibling "<column>_tokens" column holding its token ids.
for text_column, token_limit in token_limits.items():
    encfspd[text_column + "_tokens"] = encfspd[text_column].apply(
        lambda text: tokenizer.encode(text, truncation=True, max_length=token_limit)
    )

In [None]:
# Right-pad every token list with id 0 up to the longest token list found in
# any of the four token columns, so each column has a single fixed length.
token_columns = [
    "policydecision_details_tokens",
    "policy_description_tokens",
    "contextoradditionalinfo_tokens",
    "source1name_tokens",
]

# Measure token lists only. The raw "source1name" strings must not be measured
# here: their length is a character count, not a token count, and would
# inflate the padded length.
max_length = int(max(encfspd[column].map(len).max() for column in token_columns))

for column in token_columns:
    encfspd[column] = encfspd[column].apply(lambda ids: ids + [0] * (max_length - len(ids)))

In [None]:
# Convert the padded token-id lists of every token column into tensors.
for token_column in (
    "policydecision_details_tokens",
    "policy_description_tokens",
    "contextoradditionalinfo_tokens",
    "source1name_tokens",
):
    encfspd[token_column] = encfspd[token_column].apply(lambda ids: torch.tensor(ids))

In [None]:
# Create the LabelEncoder for the "segment" target (the "lever" target is currently disabled)
#lever_encoder = LabelEncoder()
segment_encoder = LabelEncoder()

# Fit the encoder on the target column and overwrite that column with the integer codes.
# NOTE(review): this overwrites the column it reads from, so re-running the cell fits the
# encoder on the integer codes instead of the original values.
#encfspd["lever"] = lever_encoder.fit_transform(encfspd["lever"])
encfspd["segment"] = segment_encoder.fit_transform(encfspd["segment"])

# Extract the encoded segment labels from the encfspd DataFrame
#lever_labels = encfspd["lever"].values
segment_labels = encfspd["segment"].values

In [None]:
# Create the non-text-feature dataframe: column 3 plus every column from position 6 to the end.
# NOTE(review): the columns are picked by position, so this depends on the column order of encfspd.

slice1 = encfspd.iloc[:, 3]
slice2 = encfspd.iloc[:, 6:]
# Columns 10:411 are already contained in slice2 (6:). The name is kept for
# backward compatibility but it is no longer concatenated, which previously
# duplicated those feature columns.
slice3 = encfspd.iloc[:, 10:411]

non_text_features = pd.concat([slice1, slice2], axis=1)
# The encoded target must never be used as an input feature; this is a no-op
# when the positional slices did not pick it up.
non_text_features = non_text_features.drop(columns=["segment"], errors="ignore")
non_text_features = non_text_features.reset_index(drop=True)

In [None]:
# List the object-dtype columns still present in the non-text features.
print(non_text_features.select_dtypes(include=['object']).columns)

In [None]:
# Remove every object-dtype column, then print the remaining object columns to confirm none are left.
object_columns = non_text_features.select_dtypes(include=['object']).columns
non_text_features = non_text_features.drop(object_columns, axis=1)
print(non_text_features.select_dtypes(include=['object']).columns)

In [None]:
## Careful: no test!
# train_data, val_data, train_segment, val_segment = train_test_split(encfspd, segment_labels, test_size=0.2, random_state=42)

# With test: 70% train, then the held-out 30% is halved into validation and test
train_data, temp_data, train_segment, temp_segment = train_test_split(encfspd, segment_labels, test_size=0.3, random_state=42)
val_data, test_data, val_segment, test_segment = train_test_split(temp_data, temp_segment, test_size=0.5, random_state=42)

# Compute class weights from the training labels
class_weights = compute_class_weights(train_segment)

# Compute sample weights: one weight per training row, in train_segment order
train_sample_weights = get_sample_weights(train_segment, class_weights)

train_dataset = FSPData(train_data, train_segment)
val_dataset = FSPData(val_data, val_segment)
test_dataset = FSPData(test_data, test_segment)

## Final: train on whole dataset
# train_dataset = FSPData(encfspd, segment_labels)

batch_size = 16
# The loaders only feed extract_features (nothing is trained batch by batch),
# so the row order has to stay fixed: shuffling the training loader would put
# train_features/train_labels in a different order from train_sample_weights
# and make the extracted rows differ from run to run.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
#train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=weighted_sampler)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

train_features, train_labels = extract_features(train_loader)
val_features, val_labels = extract_features(val_loader)
test_features, test_labels = extract_features(test_loader)

In [None]:
# Derive the per-row weights from train_labels, i.e. from the labels in the
# exact order of the rows of train_features. Weights computed from
# train_segment are only valid if the loader preserved that order.
train_data_lgb = lgb.Dataset(
    train_features,
    label=train_labels,
    weight=get_sample_weights(train_labels, class_weights),
)
val_data_lgb = lgb.Dataset(val_features, label=val_labels)

# Multiclass LightGBM configuration; one class per distinct encoded segment label.
lgb_params = {
    "objective": "multiclass",
    "num_class": len(np.unique(segment_labels)),
    "metric": "multi_logloss",
    "max_depth": 6,
    "lambda_l1": 0.05,
    "lambda_l2": 0.05,
    "min_data_in_leaf": 5,
    "early_stopping_round":10,
    "boosting_type": "gbdt",
    "num_leaves": 15,
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "verbose": 0,
    "num_threads": -1,
    "seed": 42,
}

In [None]:
# from sklearn.model_selection import GridSearchCV

# # Define the initial parameters
# lgb_params = {
#     "boosting_type": "gbdt",
#     "objective": "multiclass",
#     "num_class": len(np.unique(segment_labels)),
#     "metric": "multi_logloss",
#     "verbose": 0,
#     "num_threads": -1,
#     "seed": 42,
# }

# # Create a LightGBM classifier
# clf = lgb.LGBMClassifier(**lgb_params)

# # Specify the parameters to search
# param_grid = {
#     "num_leaves": [31, 40, 50],
#     "learning_rate": [0.01, 0.05, 0.1],
#     "feature_fraction": [0.8, 0.9, 1.0],
#     "bagging_fraction": [0.7, 0.8, 0.9],
#     "bagging_freq": [3, 5, 7],
# }

# # Create the grid search object
# grid = GridSearchCV(
#     estimator=clf,
#     param_grid=param_grid,
#     scoring='neg_log_loss',
#     cv=3,  # number of cross-validation folds
#     verbose=1,
#     n_jobs=-1,  # use all available cores for parallel processing
# )

# # Perform the grid search using the train features and labels
# grid.fit(train_features, train_labels)

# # Print the best parameters found by the grid search
# print("Best parameters found by grid search:", grid.best_params_)

# # Get the best model found by the grid search
# best_model = grid.best_estimator_


In [None]:
# Train with the training and validation sets as evaluation sets.
# The `verbose_eval` argument no longer exists in LightGBM 4.x (passing it
# raises TypeError); a log_evaluation callback with a non-positive period
# keeps the per-iteration evaluation log silent, as verbose_eval=-100 did.
lgb_model = lgb.train(
    lgb_params,
    train_data_lgb,
    num_boost_round=50,
    valid_sets=[train_data_lgb, val_data_lgb],
    callbacks=[lgb.log_evaluation(period=0)],
)

In [None]:
# Predicted class = column with the highest predicted probability
val_preds = lgb_model.predict(val_features).argmax(axis=1)
test_preds = lgb_model.predict(test_features).argmax(axis=1)

# Accuracy = share of rows whose predicted class matches the label
val_accuracy = (val_preds == val_labels).sum() / len(val_labels)
test_accuracy = (test_preds == test_labels).sum() / len(test_labels)

print("Validation accuracy:", val_accuracy)
print("Test accuracy:", test_accuracy)

In [None]:
# Write the trained model to 'lightgbm_model.txt' (a bare filename, so it lands in the current working directory).
lgb_model.save_model('lightgbm_model.txt')