Text Classification Kaggle competition

In [1]:
import numpy as np
import pandas as pd

#### Load data

In [2]:
""" This is a NumPy array representing term vector counts for training, where each row corresponds to a document and each column represents a term in the vocabulary. 
    The values (mostly 0s) indicate the count of each term in the respective document, forming a sparse matrix.
"""
# The matrix is plain int64 data (see the displayed dtype), so it can be read
# without pickle support. Leaving allow_pickle at its default (False) means a
# tampered .npy file cannot trigger unpickling of arbitrary objects.
data_train = np.load("data_train.npy")

In [3]:
"""
A similar NumPy array for testing. You need to create labels for this test set and submit.
"""
# The matrix is plain int64 data (see the displayed dtype), so it can be read
# without pickle support. Leaving allow_pickle at its default (False) means a
# tampered .npy file cannot trigger unpickling of arbitrary objects.
data_test = np.load("data_test.npy")

In [4]:
"""
Contains a mapping between the terms (words) and their corresponding indices in the term vector matrix.
"""
# NOTE(review): the array is displayed with dtype=object, so it is presumably
# stored pickled and needs allow_pickle=True. Unpickling can execute arbitrary
# code, so only load this file from a trusted source.
vocab_map = np.load("vocab_map.npy", allow_pickle=True)

In [11]:
"""
Contains the labels or target values for the training dataset (0 or 1).
"""
# Load the labels from the CSV file.
# The file's leading unnamed column is kept as a regular column (it shows up as
# "Unnamed: 0" next to ID and label in the output below).
# NOTE(review): pass index_col=0 instead if that column should become the index.
label_train = pd.read_csv('label_train.csv', index_col=False)

#### Inspect Data

In [12]:
# Show all four loaded objects from one cell. NumPy and pandas abbreviate the
# large arrays/frame with "..." in their reprs, so only the corners are printed.
display(data_train, data_test, vocab_map, label_train)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

array(['00', '000', '0001', ..., 'zs', 'zsda', 'zsl'], dtype=object)

Unnamed: 0,ID,label
0,0,0
1,1,0
2,2,0
3,3,0
4,4,1
...,...,...
9417,9417,0
9418,9418,1
9419,9419,0
9420,9420,0


In [None]:
# First, create a DataFrame with all the features.
# Visualize the data: what type of distribution is this?

In [16]:
# Number of vocabulary terms; this matches the column count of data_train.
vocab_map.shape

(26354,)

In [18]:
# (number of training documents, number of vocabulary terms)
data_train.shape

(9422, 26354)

In [21]:
import numpy as np
import pandas as pd

# Step 1: Load the Data

# Term-count matrix for the training documents
X_train = np.load('data_train.npy')

# Read the label table and order it by 'ID', discarding the old index.
# This assumes the rows of X_train are stored in ID order, so that row i of
# the label table describes row i of X_train.
labels_df = (
    pd.read_csv('label_train.csv', index_col=False)
    .sort_values('ID')
    .reset_index(drop=True)
)

# Target vector as a plain NumPy array
y_train = labels_df['label'].values

# Term-count matrix for the test documents
X_test = np.load('data_test.npy')

# Step 2: Preprocess the Data

# Compute TF: every count divided by the total number of term occurrences in
# its document. Rows whose total is zero (empty documents) are left all-zero
# instead of evaluating 0/0, which avoids NaNs and RuntimeWarnings and makes a
# separate full-size nan_to_num copy unnecessary.
train_totals = np.sum(X_train, axis=1, keepdims=True)
test_totals = np.sum(X_test, axis=1, keepdims=True)
tf_train = np.divide(
    X_train, train_totals,
    out=np.zeros(X_train.shape, dtype=float),
    where=train_totals != 0,
)
tf_test = np.divide(
    X_test, test_totals,
    out=np.zeros(X_test.shape, dtype=float),
    where=test_totals != 0,
)

# Compute IDF from the training set only: df is the number of training
# documents containing each term; the +1 terms smooth the ratio so unseen
# terms do not divide by zero, and the trailing +1 keeps every weight positive.
df = np.count_nonzero(X_train, axis=0)
N = X_train.shape[0]
idf = np.log((N + 1) / (df + 1)) + 1

# Compute TF-IDF (the same training-derived idf is applied to the test set)
X_train_tfidf = tf_train * idf
X_test_tfidf = tf_test * idf

# Select Top K Features: keep the k columns with the largest total TF-IDF
# mass over the training set.
tfidf_sums = np.sum(X_train_tfidf, axis=0)
k = 5000  # Adjust based on memory constraints
top_k_indices = np.argsort(tfidf_sums)[-k:]
X_train_reduced = X_train_tfidf[:, top_k_indices]
X_test_reduced = X_test_tfidf[:, top_k_indices]

# Add Intercept Term: a leading column of ones, so theta[0] acts as the bias
X_train_aug = np.hstack((np.ones((X_train_reduced.shape[0], 1)), X_train_reduced))
X_test_aug = np.hstack((np.ones((X_test_reduced.shape[0], 1)), X_test_reduced))

# Step 3: Split the Data into Training and Validation Sets
# Shuffle the row indices with a fixed seed, then cut them 80/20.
np.random.seed(42)
indices = np.arange(X_train_aug.shape[0])
np.random.shuffle(indices)
train_size = int(0.8 * len(indices))
train_indices, val_indices = indices[:train_size], indices[train_size:]
X_train_fold, y_train_fold = X_train_aug[train_indices], y_train[train_indices]
X_val_fold, y_val_fold = X_train_aug[val_indices], y_train[val_indices]

# Step 4: Implement Logistic Regression Functions
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), evaluated without overflow.

    Args:
        z: A NumPy array or scalar of real values.

    Returns:
        The element-wise sigmoid of ``z``, with values in [0, 1].
    """
    # exp() is only ever applied to a non-positive number, so it stays within
    # [0, 1] and cannot overflow however large |z| is.
    decay = np.exp(-np.abs(z))
    # z >= 0:  1 / (1 + exp(-z))        (same expression as the naive form)
    # z <  0:  exp(z) / (1 + exp(z))    (algebraically identical, but stable)
    return np.where(z >= 0, 1.0, decay) / (1.0 + decay)

def compute_cost(X, y, theta, lambda_):
    """L2-regularised logistic-regression cost.

    Args:
        X: Feature matrix, one row per sample.
        y: Vector of 0/1 targets, one per row of X.
        theta: Parameter vector; theta[0] is excluded from the penalty (the
            callers in this notebook put the intercept column first).
        lambda_: Regularisation strength.

    Returns:
        Mean cross-entropy plus (lambda_ / 2m) * sum(theta[1:] ** 2).
    """
    n_samples = len(y)
    eps = 1e-5  # keeps log() away from log(0) when a probability saturates
    probs = sigmoid(X @ theta)
    log_likelihood = y @ np.log(probs + eps) + (1 - y) @ np.log(1 - probs + eps)
    data_term = (-1/n_samples) * log_likelihood
    penalty = (lambda_ / (2 * n_samples)) * np.sum(np.square(theta[1:]))
    return data_term + penalty

def compute_gradient(X, y, theta, lambda_):
    """Gradient of the L2-regularised logistic-regression cost.

    Args:
        X: Feature matrix, one row per sample.
        y: Vector of 0/1 targets, one per row of X.
        theta: Current parameter vector.
        lambda_: Regularisation strength.

    Returns:
        A vector shaped like theta. The penalty term is added to every
        component except index 0.
    """
    n_samples = len(y)
    residual = sigmoid(X @ theta) - y
    grad = (1/n_samples) * (X.T @ residual)
    # Index 0 is skipped so the first parameter is not regularised.
    grad[1:] += (lambda_ / n_samples) * theta[1:]
    return grad

def gradient_descent(X, y, theta, alpha, num_iters, lambda_):
    """Run batch gradient descent on the regularised logistic cost.

    Args:
        X: Feature matrix, one row per sample.
        y: Vector of 0/1 targets.
        theta: Initial parameter vector. It is not modified; a float copy is
            optimised instead.
        alpha: Learning rate (step size).
        num_iters: Number of update steps to take.
        lambda_: Regularisation strength forwarded to the cost and gradient.

    Returns:
        A tuple ``(theta, cost_history)``: the optimised parameters and the
        cost recorded after every step.
    """
    # Work on a private float copy: the in-place update below would otherwise
    # overwrite the caller's array, and would fail outright on an integer one.
    theta = np.array(theta, dtype=float)
    cost_history = []
    for i in range(num_iters):
        gradient = compute_gradient(X, y, theta, lambda_)
        theta -= alpha * gradient
        cost = compute_cost(X, y, theta, lambda_)
        cost_history.append(cost)
        # Progress line every 100 steps
        if i % 100 == 0:
            print(f'Iteration {i}: Cost {cost}')
    return theta, cost_history

# Step 5: Train the Model on Training Fold
# Hyperparameters: learning rate, number of descent steps, L2 strength
alpha = 0.1
num_iters = 1000
lambda_ = 0.1
# Start from an all-zero parameter vector (one entry per column, intercept included)
theta = np.zeros(X_train_fold.shape[1])
theta, cost_history = gradient_descent(
    X_train_fold, y_train_fold, theta, alpha, num_iters, lambda_
)

# Step 6: Validate the Model
# Predicted probability of class 1, thresholded at 0.5
probabilities_val = sigmoid(X_val_fold @ theta)
y_val_pred = (probabilities_val >= 0.5).astype(int)

def compute_macro_f1(y_true, y_pred):
    """Macro-averaged F1: the unweighted mean of the per-class F1 scores.

    Args:
        y_true: NumPy array of ground-truth labels.
        y_pred: NumPy array of predicted labels, same length as y_true.

    Returns:
        The mean F1 over the classes that occur in y_true. A 1e-7 term in each
        denominator guards against division by zero, so a perfect score comes
        out marginally below 1.0.
    """
    per_class_f1 = []
    for label in np.unique(y_true):
        predicted_here = y_pred == label
        actually_here = y_true == label
        tp = np.sum(predicted_here & actually_here)
        fp = np.sum(predicted_here & (y_true != label))
        fn = np.sum((y_pred != label) & actually_here)
        precision = tp / (tp + fp + 1e-7)
        recall = tp / (tp + fn + 1e-7)
        per_class_f1.append(2 * precision * recall / (precision + recall + 1e-7))
    return np.mean(per_class_f1)

# Score the held-out fold.
# NOTE(review): in the recorded run the cost plateaus near 0.55 and this score
# is about 0.43, so the model may need a larger step size or more iterations.
macro_f1 = compute_macro_f1(y_val_fold, y_val_pred)
print(f'Macro F1 Score on Validation Set: {macro_f1}')

# Step 7: Retrain on Full Training Data
# Restart from zeros and fit on every labelled row, with the same hyperparameters
theta = np.zeros(X_train_aug.shape[1])
theta, cost_history = gradient_descent(
    X_train_aug, y_train, theta, alpha, num_iters, lambda_
)

# Step 8: Make Predictions on Test Data
probabilities = sigmoid(X_test_aug @ theta)
y_pred = (probabilities >= 0.5).astype(int)

# Step 9: Prepare Submission File
# IDs are the test-row positions 0..n-1, paired with the predicted labels
submission = pd.DataFrame({'ID': np.arange(len(y_pred)), 'label': y_pred})
submission.to_csv('submission.csv', index=False)

Iteration 0: Cost 0.6865464351606382
Iteration 100: Cost 0.5567154342318743
Iteration 200: Cost 0.5546245399374068
Iteration 300: Cost 0.5542287125959212
Iteration 400: Cost 0.5538698230913401
Iteration 500: Cost 0.5535128134175942
Iteration 600: Cost 0.5531568778095659
Iteration 700: Cost 0.5528019932218066
Iteration 800: Cost 0.5524481555477394
Iteration 900: Cost 0.5520953612762602
Macro F1 Score on Validation Set: 0.43034146835737835
Iteration 0: Cost 0.6865548714503225
Iteration 100: Cost 0.5569029955651373
Iteration 200: Cost 0.5548225588677439
Iteration 300: Cost 0.5544332181284596
Iteration 400: Cost 0.5540806492700086
Iteration 500: Cost 0.5537299302272252
Iteration 600: Cost 0.5533802598146694
Iteration 700: Cost 0.553031615246713
Iteration 800: Cost 0.5526839925425006
Iteration 900: Cost 0.5523373883116114


In [22]:
import numpy as np
import pandas as pd

# Load the Data
X_train = np.load('data_train.npy')
# Labels are ordered by 'ID'; this assumes X_train rows are stored in ID order
labels_df = (
    pd.read_csv('label_train.csv', index_col=False)
    .sort_values('ID')
    .reset_index(drop=True)
)
y_train = labels_df['label'].values
X_test = np.load('data_test.npy')

# Implement Multinomial Naive Bayes

# Step 1: Calculate Class Priors
# Fraction of training documents that belong to each class
unique_classes, class_counts = np.unique(y_train, return_counts=True)
class_priors = class_counts / y_train.shape[0]

# Step 2: Calculate Conditional Probabilities with Laplace Smoothing
num_classes = len(unique_classes)
num_features = X_train.shape[1]
# Laplace smoothing parameter. Note that this rebinds the name `alpha`, which
# the logistic-regression cell uses for its learning rate.
alpha = 1

# Per class: how often each term occurs, and the total over all terms
class_term_counts = np.zeros((num_classes, num_features))
class_total_counts = np.zeros(num_classes)

for idx, cls in enumerate(unique_classes):
    X_cls = X_train[y_train == cls]
    class_term_counts[idx, :] = X_cls.sum(axis=0)
    class_total_counts[idx] = class_term_counts[idx, :].sum()

# Smoothed P(term | class): add alpha to every count and alpha * |vocabulary|
# to every class total, so no term has zero probability
conditional_probs = (class_term_counts + alpha) / (
    class_total_counts.reshape(-1, 1) + alpha * num_features
)

# Step 3: Convert to Log Probabilities
log_class_priors = np.log(class_priors)
log_conditional_probs = np.log(conditional_probs)

# Step 4: Define Predict Function
def predict_multinomial_nb(X):
    """Predict a class label for every row of a term-count matrix.

    Reads the module-level names ``num_classes``, ``log_class_priors``,
    ``log_conditional_probs`` and ``unique_classes`` as they are bound at call
    time, so the result reflects whichever fit was computed most recently.

    Args:
        X: Term-count matrix, one row per document, with one column per entry
            of a row of ``log_conditional_probs``.

    Returns:
        Array with one entry of ``unique_classes`` per row of X: the class
        with the highest log prior plus count-weighted log likelihood.
    """
    n_docs = X.shape[0]
    scores = np.zeros((n_docs, num_classes))
    for k in range(num_classes):
        # Sum over terms of count * log P(term | class k)
        doc_log_likelihood = X @ log_conditional_probs[k, :]
        scores[:, k] = log_class_priors[k] + doc_log_likelihood
    winners = np.argmax(scores, axis=1)
    return unique_classes[winners]

# Step 5: Evaluate on Validation Set
# Split data into training and validation sets (seeded 80/20 shuffle split)
np.random.seed(42)
indices = np.arange(X_train.shape[0])
np.random.shuffle(indices)
train_size = int(0.8 * len(indices))
train_indices = indices[:train_size]
val_indices = indices[train_size:]

X_train_fold = X_train[train_indices]
y_train_fold = y_train[train_indices]
X_val_fold = X_train[val_indices]
y_val_fold = y_train[val_indices]

# Recalculate counts on training fold
class_term_counts = np.zeros((num_classes, num_features))
class_total_counts = np.zeros(num_classes)
# Number of fold documents per class, used for the fold's own priors
fold_class_counts = np.zeros(num_classes)

for idx, cls in enumerate(unique_classes):
    X_cls = X_train_fold[y_train_fold == cls]
    class_term_counts[idx, :] = np.sum(X_cls, axis=0)
    class_total_counts[idx] = np.sum(class_term_counts[idx, :])
    fold_class_counts[idx] = X_cls.shape[0]

conditional_probs = (class_term_counts + alpha) / (class_total_counts[:, None] + alpha * num_features)
# Priors must come from the training fold alone. Dividing the full-dataset
# class counts by the fold size gives "probabilities" that sum to more than 1
# and lets the validation labels leak into the model.
log_class_priors = np.log(fold_class_counts / y_train_fold.shape[0])
log_conditional_probs = np.log(conditional_probs)

# Predict on validation set (predict_multinomial_nb reads the globals set above)
y_val_pred = predict_multinomial_nb(X_val_fold)

# Define Macro F1 Score Function
def compute_macro_f1(y_true, y_pred):
    """Macro-averaged F1: the unweighted mean of the per-class F1 scores.

    This repeats the definition from the logistic-regression cell, so the
    Naive Bayes cell can run on its own.

    Args:
        y_true: NumPy array of ground-truth labels.
        y_pred: NumPy array of predicted labels, same length as y_true.

    Returns:
        The mean F1 over the classes that occur in y_true. A 1e-7 term in each
        denominator guards against division by zero.
    """
    def _f1_for(label):
        # Confusion counts for a one-vs-rest view of this label
        tp = np.sum((y_pred == label) & (y_true == label))
        fp = np.sum((y_pred == label) & (y_true != label))
        fn = np.sum((y_pred != label) & (y_true == label))
        precision = tp / (tp + fp + 1e-7)
        recall = tp / (tp + fn + 1e-7)
        return 2 * precision * recall / (precision + recall + 1e-7)

    return np.mean([_f1_for(label) for label in np.unique(y_true)])

# Score the held-out fold
macro_f1 = compute_macro_f1(y_val_fold, y_val_pred)
print(f'Macro F1 Score on Validation Set (Naive Bayes): {macro_f1}')

# Step 6: Retrain on Full Training Data and Predict on Test Set
# Recalculate counts using full training data (the validation step above
# overwrote the fitted globals with fold-only statistics)
class_term_counts = np.zeros((num_classes, num_features))
class_total_counts = np.zeros(num_classes)

for idx, cls in enumerate(unique_classes):
    X_cls = X_train[y_train == cls]
    class_term_counts[idx, :] = X_cls.sum(axis=0)
    class_total_counts[idx] = class_term_counts[idx, :].sum()

conditional_probs = (class_term_counts + alpha) / (
    class_total_counts.reshape(-1, 1) + alpha * num_features
)
log_class_priors = np.log(class_priors)
log_conditional_probs = np.log(conditional_probs)

# Predict on test data
y_pred = predict_multinomial_nb(X_test)

# Step 7: Prepare Submission File
# IDs are the test-row positions 0..n-1, paired with the predicted labels
submission = pd.DataFrame({'ID': np.arange(len(y_pred)), 'label': y_pred})
submission.to_csv('submission_nb.csv', index=False)


Macro F1 Score on Validation Set (Naive Bayes): 0.7122338438977459


In [1]:
# import pdfplumber
from paddleocr import PaddleOCR
# import numpy as np

In [9]:
# Create an English PaddleOCR engine. No model directories are passed, so the
# library's defaults apply; the DEBUG log below shows them resolving under the
# user's .paddleocr/whl folder.
# The commented-out arguments are machine-specific absolute paths kept for
# reference only.
# NOTE(review): this cell's execution count (9) is higher than those of the
# cells that follow it (3, 4), so the notebook was run out of order.
ocr = PaddleOCR(lang="en",
                # det_model_dir = "C:/Users/ns99a/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer/",
                # rec_model_dir = "C:/Users/ns99a/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/",
                # cls_model_dir = "C:/Users/ns99a/.paddleocr/whl/cls/en/ch_ppocr_mobile_v2.0_cls_infer/")
)

[2024/10/21 01:14:49] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\ns99a/.paddleocr/whl\\det\\en\\en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\ns99a/.paddleocr/whl\\rec\\en\\en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=

In [3]:
# Rebind `ocr` to an English engine built with default settings only.
ocr = PaddleOCR(lang="en")

[2024/10/21 01:03:43] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\ns99a/.paddleocr/whl\\det\\en\\en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\ns99a/.paddleocr/whl\\rec\\en\\en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=

In [4]:
# Rebind `ocr` to a French engine. Per the output below, this run downloaded
# the latin_PP-OCRv3 recognition model, while the detection model directory in
# the log is still the English (en_PP-OCRv3_det) one.
ocr = PaddleOCR(lang="fr")

download https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/latin_PP-OCRv3_rec_infer.tar to C:\Users\ns99a/.paddleocr/whl\rec\latin\latin_PP-OCRv3_rec_infer\latin_PP-OCRv3_rec_infer.tar


100%|██████████| 9930/9930 [00:34<00:00, 288.89it/s] 

[2024/10/21 01:04:30] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='C:\\Users\\ns99a/.paddleocr/whl\\det\\en\\en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='C:\\Users\\ns99a/.paddleocr/whl\\rec\\latin\\latin_PP-OCRv3_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batc


