In [1]:
# Import the functions
import pandas as pd
from LabelGenius import classification_CLIP_0_shot, finetune_CLIP, classification_CLIP_finetuned, auto_verification

Using device: cpu


Demo 1: Single-Category Classification using N24News Dataset
-------------------------------------------------------------

This demo shows how to classify a single news article into one of 24 category
using the N24News dataset. Each article in the dataset includes both textual
and visual information.

Source: https://aclanthology.org/2022.lrec-1.729/


Each article contains the following fields:
- 'section': Ground truth label (one of 24 category)
- 'headline': Title of the article
- 'abstract': Short summary of the article
- 'article': Full text content
- 'article_url': Link to the original article
- 'image': Encoded image or metadata (optional)
- 'caption': Image caption
- 'image_id': Unique image identifier
- 'image_path': Path to the associated image (e.g., 'N24News/imgs_200_sample1/12345.jpg')
- 'article_id': Unique article identifier

Image file: Multimodal_image

Example category (See prompt_D1 for the complete category):
------------------------
1. Health
2. Science
3. Television
...
24. Global Business

Reference:
----------
Wang, Z., Shan, X., Zhang, X., & Yang, J. (2022).
N24News: A New Dataset for Multimodal News Classification.
In *Proceedings of the Thirteenth Language Resources and Evaluation Conference* (pp. 6768–6775). LREC.


### Demo 1a: Single-Category Text Classification

**Datasets:**
- `D1_1.csv`: Used for initial labeling and fine-tuning.
- `D1_2.csv`: Used for testing the fine-tuned model's performance.


In [2]:
# Define the descriptive prompt for each category
prompt_D1_CLIP = [
    "a news article about health, including medical news, public health issues, fitness, mental health, and wellness.",
    "a news article about science, covering scientific discoveries, research studies, space exploration, and innovations.",
    "a news article about television, featuring TV shows, series reviews, industry news, and streaming platforms.",
    "a news article about travel, focusing on tourism, destinations, travel guides, airlines, and vacation trends.",
    "a news article about movies, including film industry news, reviews, box office reports, and upcoming releases.",
    "a news article about dance, covering ballet, contemporary styles, street dance, performances, and dance competitions.",
    "a news article about real estate, highlighting housing market trends, property sales, architecture, and urban planning.",
    "a news article about the economy, featuring macroeconomics, inflation, stock markets, GDP growth, and financial policies.",
    "a news article about sports, covering professional sports, competitions, athlete news, and game Demo_results.",
    "a news article about theater, featuring plays, Broadway shows, live performances, and stage production reviews.",
    "a news article about opinion pieces, including editorials, analysis, and expert commentaries.",
    "a news article about music, covering albums, artists, concerts, festivals, and industry trends.",
    "a news article about books, featuring literature, bestsellers, author interviews, and book reviews.",
    "a news article about art and design, showcasing fine arts, visual arts, museums, exhibitions, and design trends.",
    "a news article about style, including fashion trends, beauty, personal style, and cultural aesthetics.",
    "a news article about media, covering journalism, publishing, digital media, and mass communication.",
    "a news article about food, featuring restaurants, cooking, recipes, culinary trends, and food culture.",
    "a news article about well-being, focusing on lifestyle, personal development, mental well-being, and self-care.",
    "a news article about fashion, covering clothing, designers, fashion weeks, and industry insights.",
    "a news article about technology, featuring AI, gadgets, software, cybersecurity, and tech innovations.",
    "a news article about personal finance, including investing, budgeting, and financial planning.",
    "a news article about education, featuring schools, universities, learning methods, and education policies.",
    "a news article about automobiles, covering car industry news, electric vehicles, reviews, and trends.",
    "a news article about global business, featuring international trade, corporations, mergers, and global markets."
]

# Define the list of 24 category labels
category_D1_CLIP = [
    "1", "2", "3", "4", "5", "6",
    "7", "8", "9", "10", "11", "12",
    "13", "14", "15", "16", "17", "18",
    "19", "20", "21", "22", "23", "24"
]

## CLIP: local labeling

In [3]:
D1a_CLIP_inital_labeling = classification_CLIP_0_shot(
    text_path="Demo_data/D1_1.csv",
    mode="text",
    prompt=prompt_D1_CLIP,
    text_column=["headline", "abstract"],
    predict_column="D1a_CLIP_inital_labeling",
)


D1a_CLIP_inital_labeling.to_csv("Demo_result/D1a_CLIP_inital_labeling.csv", index=False)

Loaded 200 records


Predicting: 100%|██████████| 200/200 [00:03<00:00, 51.50it/s]


In [4]:
## Check the accuracy: CLIP initial labeling
auto_verification(
    D1a_CLIP_inital_labeling,
    predicted_cols="D1a_CLIP_inital_labeling",
    true_cols="section_numeric",
    category=category_D1_CLIP
)


== Verification of 'D1a_CLIP_inital_labeling' vs. 'section_numeric' ==
Accuracy:   43.50%
Macro F1:   36.72%
Micro  F1:  43.50%

Full classification report:
              precision    recall  f1-score   support

           1       0.25      0.08      0.12        13
           2       0.33      0.09      0.14        11
           3       0.60      0.50      0.55        12
           4       0.75      0.33      0.46         9
           5       1.00      0.17      0.29         6
           6       0.80      0.44      0.57         9
           7       0.33      1.00      0.50         2
           8       0.50      0.40      0.44         5
           9       0.00      0.00      0.00         9
          10       0.39      0.82      0.53        11
          11       0.33      0.07      0.12        14
          12       0.00      0.00      0.00        10
          13       0.62      0.56      0.59         9
          14       0.93      0.72      0.81        18
          15       0.50      0.

### finetune: CLIP

In [5]:
# finetune CLIP
finetune_CLIP(
    mode="text",
    text_path="Demo_data/D1_1.csv",
    text_column=["headline", "abstract"],
    true_label="section_numeric",
    model_name="Demo_finetuned_CLIP/D1a_CLIP_model_finetuned.pth",
    num_epochs=20,
    batch_size=8,
    learning_rate=1e-5,
)


Loaded 200 records
Number of classes: 24
Use text: True, Use image: False
Text columns: ['headline', 'abstract']
Label column: section_numeric
Training set class distribution: section_numeric
14    17
1     12
3     11
10    10
19    10
6      9
11     9
12     9
9      8
13     8
23     8
4      7
5      6
2      6
16     6
17     6
15     4
20     4
8      3
22     3
7      2
18     1
21     1
Name: count, dtype: int64
Validation set class distribution: section_numeric
2     5
11    5
20    4
23    3
15    3
19    3
17    3
8     2
4     2
18    1
24    1
12    1
13    1
1     1
10    1
3     1
9     1
16    1
14    1
Name: count, dtype: int64


Epoch 1/20 [Train]:  50%|█████     | 10/20 [00:04<00:03,  2.73it/s]

Batch 10/20: Loss: 3.1757 | Acc: 5.00%


Epoch 1/20 [Train]: 100%|██████████| 20/20 [00:06<00:00,  2.89it/s]


Batch 20/20: Loss: 3.1316 | Acc: 9.38%


Epoch 1/20 [Val]: 100%|██████████| 5/5 [00:00<00:00, 12.58it/s]


Epoch 1/20: Train Loss: 3.1650 | Train Acc: 9.38% | Val Loss: 3.1641 | Val Acc: 7.50%
Model saved! Best validation accuracy: 7.50%


Epoch 2/20 [Train]:  50%|█████     | 10/20 [00:02<00:02,  3.33it/s]

Batch 10/20: Loss: 3.0662 | Acc: 28.75%


Epoch 2/20 [Train]: 100%|██████████| 20/20 [00:05<00:00,  3.39it/s]


Batch 20/20: Loss: 3.1208 | Acc: 32.50%


Epoch 2/20 [Val]: 100%|██████████| 5/5 [00:00<00:00, 11.19it/s]


Epoch 2/20: Train Loss: 3.0905 | Train Acc: 32.50% | Val Loss: 3.1480 | Val Acc: 7.50%


Epoch 3/20 [Train]:  50%|█████     | 10/20 [00:02<00:02,  3.76it/s]

Batch 10/20: Loss: 3.0136 | Acc: 68.75%


Epoch 3/20 [Train]: 100%|██████████| 20/20 [00:05<00:00,  3.58it/s]


Batch 20/20: Loss: 3.0151 | Acc: 68.12%


Epoch 3/20 [Val]: 100%|██████████| 5/5 [00:00<00:00,  9.88it/s]


Epoch 3/20: Train Loss: 3.0203 | Train Acc: 68.12% | Val Loss: 3.1208 | Val Acc: 20.00%
Model saved! Best validation accuracy: 20.00%


Epoch 4/20 [Train]:  50%|█████     | 10/20 [00:02<00:02,  3.41it/s]

Batch 10/20: Loss: 2.9384 | Acc: 91.25%


Epoch 4/20 [Train]: 100%|██████████| 20/20 [00:05<00:00,  3.54it/s]


Batch 20/20: Loss: 2.8977 | Acc: 92.50%


Epoch 4/20 [Val]: 100%|██████████| 5/5 [00:00<00:00, 14.10it/s]


Epoch 4/20: Train Loss: 2.9472 | Train Acc: 92.50% | Val Loss: 3.0903 | Val Acc: 25.00%
Model saved! Best validation accuracy: 25.00%


Epoch 5/20 [Train]:  50%|█████     | 10/20 [00:03<00:02,  3.39it/s]

Batch 10/20: Loss: 2.9298 | Acc: 96.25%


Epoch 5/20 [Train]: 100%|██████████| 20/20 [00:05<00:00,  3.41it/s]


Batch 20/20: Loss: 2.8663 | Acc: 94.38%


Epoch 5/20 [Val]: 100%|██████████| 5/5 [00:00<00:00, 12.02it/s]


Epoch 5/20: Train Loss: 2.8812 | Train Acc: 94.38% | Val Loss: 3.0633 | Val Acc: 45.00%
Model saved! Best validation accuracy: 45.00%


Epoch 6/20 [Train]:  50%|█████     | 10/20 [00:02<00:02,  3.58it/s]

Batch 10/20: Loss: 2.8628 | Acc: 97.50%


Epoch 6/20 [Train]: 100%|██████████| 20/20 [00:05<00:00,  3.65it/s]


Batch 20/20: Loss: 2.8271 | Acc: 98.12%


Epoch 6/20 [Val]: 100%|██████████| 5/5 [00:00<00:00, 12.87it/s]


Epoch 6/20: Train Loss: 2.8271 | Train Acc: 98.12% | Val Loss: 3.0390 | Val Acc: 50.00%
Model saved! Best validation accuracy: 50.00%


Epoch 7/20 [Train]:  50%|█████     | 10/20 [00:02<00:02,  3.72it/s]

Batch 10/20: Loss: 2.7593 | Acc: 100.00%


Epoch 7/20 [Train]: 100%|██████████| 20/20 [00:05<00:00,  3.71it/s]


Batch 20/20: Loss: 2.7629 | Acc: 100.00%


Epoch 7/20 [Val]: 100%|██████████| 5/5 [00:00<00:00, 12.14it/s]


Epoch 7/20: Train Loss: 2.7855 | Train Acc: 100.00% | Val Loss: 3.0188 | Val Acc: 55.00%
Model saved! Best validation accuracy: 55.00%


Epoch 8/20 [Train]:  50%|█████     | 10/20 [00:02<00:02,  3.58it/s]

Batch 10/20: Loss: 2.7594 | Acc: 100.00%


Epoch 8/20 [Train]: 100%|██████████| 20/20 [00:05<00:00,  3.53it/s]


Batch 20/20: Loss: 2.7518 | Acc: 100.00%


Epoch 8/20 [Val]: 100%|██████████| 5/5 [00:00<00:00, 12.83it/s]


Epoch 8/20: Train Loss: 2.7537 | Train Acc: 100.00% | Val Loss: 2.9994 | Val Acc: 60.00%
Model saved! Best validation accuracy: 60.00%


Epoch 9/20 [Train]:  50%|█████     | 10/20 [00:02<00:02,  3.72it/s]

Batch 10/20: Loss: 2.7018 | Acc: 100.00%


Epoch 9/20 [Train]: 100%|██████████| 20/20 [00:05<00:00,  3.71it/s]


Batch 20/20: Loss: 2.7549 | Acc: 100.00%


Epoch 9/20 [Val]: 100%|██████████| 5/5 [00:00<00:00, 13.34it/s]


Epoch 9/20: Train Loss: 2.7231 | Train Acc: 100.00% | Val Loss: 2.9826 | Val Acc: 57.50%


Epoch 10/20 [Train]:  50%|█████     | 10/20 [00:02<00:02,  3.71it/s]

Batch 10/20: Loss: 2.7003 | Acc: 100.00%


Epoch 10/20 [Train]: 100%|██████████| 20/20 [00:05<00:00,  3.64it/s]


Batch 20/20: Loss: 2.6945 | Acc: 100.00%


Epoch 10/20 [Val]: 100%|██████████| 5/5 [00:00<00:00,  9.75it/s]


Epoch 10/20: Train Loss: 2.7072 | Train Acc: 100.00% | Val Loss: 2.9723 | Val Acc: 62.50%
Model saved! Best validation accuracy: 62.50%


Epoch 11/20 [Train]:  50%|█████     | 10/20 [00:03<00:03,  3.19it/s]

Batch 10/20: Loss: 2.7063 | Acc: 100.00%


Epoch 11/20 [Train]: 100%|██████████| 20/20 [00:06<00:00,  3.08it/s]


Batch 20/20: Loss: 2.7097 | Acc: 100.00%


Epoch 11/20 [Val]: 100%|██████████| 5/5 [00:00<00:00, 10.91it/s]


Epoch 11/20: Train Loss: 2.6853 | Train Acc: 100.00% | Val Loss: 2.9580 | Val Acc: 62.50%


Epoch 12/20 [Train]:  50%|█████     | 10/20 [00:03<00:03,  2.84it/s]

Batch 10/20: Loss: 2.6544 | Acc: 100.00%


Epoch 12/20 [Train]: 100%|██████████| 20/20 [00:06<00:00,  3.02it/s]


Batch 20/20: Loss: 2.6948 | Acc: 100.00%


Epoch 12/20 [Val]: 100%|██████████| 5/5 [00:00<00:00, 12.58it/s]


Epoch 12/20: Train Loss: 2.6688 | Train Acc: 100.00% | Val Loss: 2.9466 | Val Acc: 70.00%
Model saved! Best validation accuracy: 70.00%


Epoch 13/20 [Train]:  50%|█████     | 10/20 [00:02<00:02,  3.48it/s]

Batch 10/20: Loss: 2.6695 | Acc: 100.00%


Epoch 13/20 [Train]: 100%|██████████| 20/20 [00:05<00:00,  3.53it/s]


Batch 20/20: Loss: 2.6951 | Acc: 100.00%


Epoch 13/20 [Val]: 100%|██████████| 5/5 [00:00<00:00, 13.23it/s]


Epoch 13/20: Train Loss: 2.6585 | Train Acc: 100.00% | Val Loss: 2.9389 | Val Acc: 67.50%


Epoch 14/20 [Train]:  50%|█████     | 10/20 [00:02<00:02,  3.59it/s]

Batch 10/20: Loss: 2.6638 | Acc: 100.00%


Epoch 14/20 [Train]: 100%|██████████| 20/20 [00:05<00:00,  3.57it/s]


Batch 20/20: Loss: 2.6487 | Acc: 100.00%


Epoch 14/20 [Val]: 100%|██████████| 5/5 [00:00<00:00, 13.70it/s]


Epoch 14/20: Train Loss: 2.6484 | Train Acc: 100.00% | Val Loss: 2.9323 | Val Acc: 67.50%


Epoch 15/20 [Train]:  50%|█████     | 10/20 [00:02<00:02,  3.56it/s]

Batch 10/20: Loss: 2.6445 | Acc: 100.00%


Epoch 15/20 [Train]: 100%|██████████| 20/20 [00:05<00:00,  3.65it/s]


Batch 20/20: Loss: 2.6228 | Acc: 100.00%


Epoch 15/20 [Val]: 100%|██████████| 5/5 [00:00<00:00, 12.88it/s]


Epoch 15/20: Train Loss: 2.6388 | Train Acc: 100.00% | Val Loss: 2.9261 | Val Acc: 67.50%


Epoch 16/20 [Train]:  50%|█████     | 10/20 [00:02<00:02,  3.58it/s]

Batch 10/20: Loss: 2.6165 | Acc: 100.00%


Epoch 16/20 [Train]: 100%|██████████| 20/20 [00:05<00:00,  3.62it/s]


Batch 20/20: Loss: 2.6407 | Acc: 100.00%


Epoch 16/20 [Val]: 100%|██████████| 5/5 [00:00<00:00, 13.23it/s]


Epoch 16/20: Train Loss: 2.6356 | Train Acc: 100.00% | Val Loss: 2.9201 | Val Acc: 65.00%


Epoch 17/20 [Train]:  50%|█████     | 10/20 [00:02<00:02,  3.64it/s]

Batch 10/20: Loss: 2.6261 | Acc: 100.00%


Epoch 17/20 [Train]: 100%|██████████| 20/20 [00:05<00:00,  3.54it/s]


Batch 20/20: Loss: 2.6356 | Acc: 100.00%


Epoch 17/20 [Val]: 100%|██████████| 5/5 [00:00<00:00, 11.65it/s]


Epoch 17/20: Train Loss: 2.6278 | Train Acc: 100.00% | Val Loss: 2.9135 | Val Acc: 67.50%


Epoch 18/20 [Train]:  50%|█████     | 10/20 [00:02<00:02,  3.49it/s]

Batch 10/20: Loss: 2.6450 | Acc: 100.00%


Epoch 18/20 [Train]: 100%|██████████| 20/20 [00:05<00:00,  3.53it/s]


Batch 20/20: Loss: 2.6126 | Acc: 100.00%


Epoch 18/20 [Val]: 100%|██████████| 5/5 [00:00<00:00, 10.83it/s]


Epoch 18/20: Train Loss: 2.6228 | Train Acc: 100.00% | Val Loss: 2.9101 | Val Acc: 67.50%


Epoch 19/20 [Train]:  50%|█████     | 10/20 [00:02<00:02,  3.56it/s]

Batch 10/20: Loss: 2.6202 | Acc: 100.00%


Epoch 19/20 [Train]: 100%|██████████| 20/20 [00:05<00:00,  3.57it/s]


Batch 20/20: Loss: 2.6166 | Acc: 100.00%


Epoch 19/20 [Val]: 100%|██████████| 5/5 [00:00<00:00, 11.82it/s]


Epoch 19/20: Train Loss: 2.6176 | Train Acc: 100.00% | Val Loss: 2.9033 | Val Acc: 65.00%


Epoch 20/20 [Train]:  50%|█████     | 10/20 [00:02<00:02,  3.52it/s]

Batch 10/20: Loss: 2.6245 | Acc: 100.00%


Epoch 20/20 [Train]: 100%|██████████| 20/20 [00:05<00:00,  3.45it/s]


Batch 20/20: Loss: 2.6406 | Acc: 100.00%


Epoch 20/20 [Val]: 100%|██████████| 5/5 [00:00<00:00, 12.96it/s]

Epoch 20/20: Train Loss: 2.6134 | Train Acc: 100.00% | Val Loss: 2.8928 | Val Acc: 67.50%
Fine-tuning complete! Best validation accuracy: 70.00%





70.0

In [6]:
# Classify with a fine‑tuned model
D1a_CLIP_finetuned = classification_CLIP_finetuned(
    mode="text",
    text_path="Demo_data/D1_2.csv",
    text_column=["headline", "abstract"],
    model_name="Demo_finetuned_CLIP/D1a_CLIP_model_finetuned.pth",
    predict_column="D1a_CLIP_finetuned",
)


D1a_CLIP_finetuned.to_csv("Demo_result/D1a_CLIP_finetuned.csv", index=False)

200 pieces of data loaded


predicting: 100%|██████████| 25/25 [00:02<00:00,  9.18it/s]


Labeling completed!


In [7]:
## Check the accuracy: CLIP after finetune
auto_verification(
    D1a_CLIP_finetuned,
    predicted_cols="D1a_CLIP_finetuned",
    true_cols="section_numeric",
    category=category_D1_CLIP
)


== Verification of 'D1a_CLIP_finetuned' vs. 'section_numeric' ==
Accuracy:   62.50%
Macro F1:   54.86%
Micro  F1:  62.50%

Full classification report:
              precision    recall  f1-score   support

           1       0.59      0.71      0.65        14
           2       0.80      0.57      0.67         7
           3       1.00      0.65      0.79        17
           4       0.29      0.80      0.42         5
           5       1.00      0.50      0.67         4
           6       1.00      1.00      1.00         8
           7       1.00      0.07      0.12        15
           8       1.00      0.14      0.25         7
           9       1.00      0.83      0.91         6
          10       0.57      0.92      0.71        13
          11       0.34      1.00      0.51        10
          12       0.57      0.73      0.64        11
          13       0.91      0.83      0.87        12
          14       0.59      0.91      0.71        11
          15       0.00      0.00    

# Demo 1b: Classify single-category image data

**Datasets:**

- `D1_1.csv`: Used for initial labeling and fine-tuning.
- `imgs_40_1`: Used for initial labeling and fine-tuning.

- `D1_2.csv`: Used for testing the fine-tuned model's performance.
- `imgs_40_2`: Used for testing the fine-tuned model's performance.


### CSV File Requirements

Each CSV file must include at minimum:

- `image_id`  
  – The base filename (without extension) of each image in the corresponding folder.

## CLIP: local labeling

In [8]:
D1b_CLIP_inital_labeling = classification_CLIP_0_shot(
    text_path="Demo_data/D1_1.csv",
    img_dir="Demo_data/D1_imgs_1",
    mode="image",
    prompt=prompt_D1_CLIP,
    predict_column="D1b_CLIP_inital_labeling",
)


D1b_CLIP_inital_labeling.to_csv("Demo_result/D1b_CLIP_inital_labeling.csv", index=False)

Loaded 200 records


Predicting: 100%|██████████| 200/200 [00:10<00:00, 18.90it/s]


In [9]:
D1b_CLIP_inital_labeling

Unnamed: 0,section,headline,article_url,article,abstract,article_id,image,caption,image_id,image_path,section_numeric,D1b_CLIP_inital_labeling
0,Fashion & Style,"On This Runway, Non-Models and Cool Kids",https://www.nytimes.com/2016/09/10/fashion/eck...,"Over Labor Day weekend, a steady stream of hop...",How the Eckhaus Latta designers select fashion...,e9cd6477-5eb6-58b2-8e33-fd2d881bf656,https://static01.nyt.com/images/2016/09/10/fas...,"Mike Eckhaus, center right, and Zoe Latta, cen...",e9cd6477-5eb6-58b2-8e33-fd2d881bf656,Demo_data/D1_imgs/1e9cd6477-5eb6-58b2-8e33-fd2...,19,11
1,Theater,"The New 42nd Street, a Theater Nonprofit, Name...",https://www.nytimes.com/2019/06/11/theater/new...,The New 42nd Street -- the nonprofit organizat...,Russell Granet will be the new president and c...,acdcd1ef-71a9-55f2-9993-7a231d57396f,https://static01.nyt.com/images/2019/06/11/art...,"The New 42nd Street board chairwoman, Fiona Ru...",acdcd1ef-71a9-55f2-9993-7a231d57396f,Demo_data/D1_imgs/1acdcd1ef-71a9-55f2-9993-7a2...,10,21
2,Economy,Tax Tactics Threaten Public Funds,https://www.nytimes.com/2014/10/02/business/ec...,When the European Commission charged this week...,If global corporations can continue to evade t...,1de5c156-9cea-5971-9fdf-b6a4ce9bf35c,https://static01.nyt.com/images/2012/02/29/bus...,Eduardo Porter,1de5c156-9cea-5971-9fdf-b6a4ce9bf35c,Demo_data/D1_imgs/11de5c156-9cea-5971-9fdf-b6a...,8,13
3,Television,Review: 'The Chi' Returns to the South Side of...,https://www.nytimes.com/2019/04/05/arts/televi...,"""The Chi"" was built around a series of shootin...",Lena Waithe's neighborhood drama on Showtime d...,7bb11a9c-2f6d-57f9-bb36-db7f00f59589,https://static01.nyt.com/images/2019/04/05/art...,"Alex Hibbert as Kevin in ""The Chi,"" a sprawlin...",7bb11a9c-2f6d-57f9-bb36-db7f00f59589,Demo_data/D1_imgs/17bb11a9c-2f6d-57f9-bb36-db7...,3,3
4,Opinion,Banks Should Face History and Pay Reparations,https://www.nytimes.com/2020/06/26/opinion/sun...,Ms. Blackwell is founder in residence at Polic...,The financial industry can close the wealth ga...,165426ae-1e7c-5193-8596-ef96357645a8,https://static01.nyt.com/images/2020/06/25/opi...,A Wells Fargo bank in Minneapolis was set on f...,165426ae-1e7c-5193-8596-ef96357645a8,Demo_data/D1_imgs/1165426ae-1e7c-5193-8596-ef9...,11,12
...,...,...,...,...,...,...,...,...,...,...,...,...
195,Automobiles,Wheelies: The Ram On Edition,https://www.nytimes.com/2014/02/05/automobiles...,A roundup of motoring news from the web:\n\n# ...,Chrysler executives consider opening a third f...,ad722160-9bde-5350-abf3-a636bc1b6c22,https://static01.nyt.com/images/2014/02/04/aut...,The Ram 1500.,ad722160-9bde-5350-abf3-a636bc1b6c22,Demo_data/D1_imgs/1ad722160-9bde-5350-abf3-a63...,23,23
196,Science,Geminids Meteor Shower 2020: Watch It Peak in ...,https://www.nytimes.com/2020/12/13/science/gem...,All year long as Earth revolves around the sun...,Meteor showers can light up night skies from d...,bd8aca0f-81f3-560c-a1ca-20fbefd4af67,https://static01.nyt.com/images/2017/07/19/sci...,A meteor from the Geminids streaking between t...,bd8aca0f-81f3-560c-a1ca-20fbefd4af67,Demo_data/D1_imgs/1bd8aca0f-81f3-560c-a1ca-20f...,2,4
197,Opinion,The Justice Department's Shameful Rush to Fede...,https://www.nytimes.com/2020/07/17/opinion/jus...,"Early on Tuesday morning, while much of the co...",The push to impose the federal death penalty n...,5da51505-585c-5c01-8f29-d78297ac0aa4,https://static01.nyt.com/images/2020/07/16/opi...,The execution chamber in the U.S. Penitentiary...,5da51505-585c-5c01-8f29-d78297ac0aa4,Demo_data/D1_imgs/15da51505-585c-5c01-8f29-d78...,11,1
198,Style,How Do You Dress a Russian Doll?,https://www.nytimes.com/2019/02/14/style/russi...,What would you wear if you could attend your o...,"Glumly, it seems, but that hasn't stopped styl...",8a5113f7-3a30-56c0-99b2-ab36e31c6c1f,https://static01.nyt.com/images/2019/02/14/fas...,Rebecca Henderson and Greta Lee in the Netflix...,8a5113f7-3a30-56c0-99b2-ab36e31c6c1f,Demo_data/D1_imgs/18a5113f7-3a30-56c0-99b2-ab3...,15,15


In [10]:
## Check the accuracy: CLIP initial labeling
auto_verification(
    D1b_CLIP_inital_labeling,
    predicted_cols="D1b_CLIP_inital_labeling",
    true_cols="section_numeric",
    category=category_D1_CLIP
);


== Verification of 'D1b_CLIP_inital_labeling' vs. 'section_numeric' ==
Accuracy:   38.00%
Macro F1:   33.23%
Micro  F1:  38.00%

Full classification report:
              precision    recall  f1-score   support

           1       0.57      0.31      0.40        13
           2       0.56      0.45      0.50        11
           3       0.67      0.50      0.57        12
           4       0.50      0.11      0.18         9
           5       0.00      0.00      0.00         6
           6       1.00      0.67      0.80         9
           7       0.50      0.50      0.50         2
           8       0.14      0.20      0.17         5
           9       0.71      0.56      0.62         9
          10       0.33      0.45      0.38        11
          11       0.11      0.50      0.18        14
          12       0.40      0.60      0.48        10
          13       0.30      0.33      0.32         9
          14       0.75      0.33      0.46        18
          15       0.25      0.

### finetune: CLIP

In [11]:
finetune_CLIP(
    mode="image",
    text_path="Demo_result/D1b_CLIP_inital_labeling.csv",
    img_dir="Demo_data/D1_imgs_1",
    true_label="section_numeric",
    model_name="Demo_finetuned_CLIP/D1a_CLIP_model_finetuned.pth",
    num_epochs=20,
    batch_size=8,
    learning_rate=1e-5,
)


Loaded 200 records
Number of classes: 24
Use text: False, Use image: True
Text columns: ['headline']
Label column: section_numeric
Training set class distribution: section_numeric
14    17
1     12
3     11
10    10
19    10
6      9
11     9
12     9
9      8
13     8
23     8
4      7
5      6
2      6
16     6
17     6
15     4
20     4
8      3
22     3
7      2
18     1
21     1
Name: count, dtype: int64
Validation set class distribution: section_numeric
2     5
11    5
20    4
23    3
15    3
19    3
17    3
8     2
4     2
18    1
24    1
12    1
13    1
1     1
10    1
3     1
9     1
16    1
14    1
Name: count, dtype: int64


Epoch 1/20 [Train]:  50%|█████     | 10/20 [00:04<00:03,  2.56it/s]

Batch 10/20: Loss: 3.1574 | Acc: 5.00%


Epoch 1/20 [Train]: 100%|██████████| 20/20 [00:08<00:00,  2.46it/s]


Batch 20/20: Loss: 3.1665 | Acc: 6.25%


Epoch 1/20 [Val]: 100%|██████████| 5/5 [00:00<00:00,  6.59it/s]


Epoch 1/20: Train Loss: 3.1679 | Train Acc: 6.25% | Val Loss: 3.1639 | Val Acc: 2.50%
Model saved! Best validation accuracy: 2.50%


Epoch 2/20 [Train]:  50%|█████     | 10/20 [00:03<00:04,  2.46it/s]

Batch 10/20: Loss: 3.0831 | Acc: 15.00%


Epoch 2/20 [Train]: 100%|██████████| 20/20 [00:07<00:00,  2.52it/s]


Batch 20/20: Loss: 3.0814 | Acc: 18.12%


Epoch 2/20 [Val]: 100%|██████████| 5/5 [00:00<00:00,  6.45it/s]


Epoch 2/20: Train Loss: 3.1053 | Train Acc: 18.12% | Val Loss: 3.1501 | Val Acc: 7.50%
Model saved! Best validation accuracy: 7.50%


Epoch 3/20 [Train]:  50%|█████     | 10/20 [00:03<00:04,  2.49it/s]

Batch 10/20: Loss: 3.0438 | Acc: 35.00%


Epoch 3/20 [Train]: 100%|██████████| 20/20 [00:07<00:00,  2.54it/s]


Batch 20/20: Loss: 3.0102 | Acc: 38.75%


Epoch 3/20 [Val]: 100%|██████████| 5/5 [00:00<00:00,  6.89it/s]


Epoch 3/20: Train Loss: 3.0483 | Train Acc: 38.75% | Val Loss: 3.1474 | Val Acc: 7.50%


Epoch 4/20 [Train]:  50%|█████     | 10/20 [00:03<00:03,  2.57it/s]

Batch 10/20: Loss: 3.0022 | Acc: 70.00%


Epoch 4/20 [Train]: 100%|██████████| 20/20 [00:07<00:00,  2.57it/s]


Batch 20/20: Loss: 2.9848 | Acc: 72.50%


Epoch 4/20 [Val]: 100%|██████████| 5/5 [00:00<00:00,  6.22it/s]


Epoch 4/20: Train Loss: 2.9902 | Train Acc: 72.50% | Val Loss: 3.1343 | Val Acc: 15.00%
Model saved! Best validation accuracy: 15.00%


Epoch 5/20 [Train]:  50%|█████     | 10/20 [00:03<00:03,  2.53it/s]

Batch 10/20: Loss: 2.9651 | Acc: 91.25%


Epoch 5/20 [Train]: 100%|██████████| 20/20 [00:07<00:00,  2.52it/s]


Batch 20/20: Loss: 2.9143 | Acc: 94.38%


Epoch 5/20 [Val]: 100%|██████████| 5/5 [00:00<00:00,  6.29it/s]


Epoch 5/20: Train Loss: 2.9376 | Train Acc: 94.38% | Val Loss: 3.1197 | Val Acc: 22.50%
Model saved! Best validation accuracy: 22.50%


Epoch 6/20 [Train]:  50%|█████     | 10/20 [00:03<00:04,  2.49it/s]

Batch 10/20: Loss: 2.8814 | Acc: 97.50%


Epoch 6/20 [Train]: 100%|██████████| 20/20 [00:08<00:00,  2.48it/s]


Batch 20/20: Loss: 2.8843 | Acc: 98.12%


Epoch 6/20 [Val]: 100%|██████████| 5/5 [00:00<00:00,  6.33it/s]


Epoch 6/20: Train Loss: 2.8817 | Train Acc: 98.12% | Val Loss: 3.1112 | Val Acc: 25.00%
Model saved! Best validation accuracy: 25.00%


Epoch 7/20 [Train]:  50%|█████     | 10/20 [00:03<00:04,  2.49it/s]

Batch 10/20: Loss: 2.8403 | Acc: 100.00%


Epoch 7/20 [Train]: 100%|██████████| 20/20 [00:07<00:00,  2.53it/s]


Batch 20/20: Loss: 2.7928 | Acc: 100.00%


Epoch 7/20 [Val]: 100%|██████████| 5/5 [00:00<00:00,  6.03it/s]


Epoch 7/20: Train Loss: 2.8294 | Train Acc: 100.00% | Val Loss: 3.0985 | Val Acc: 30.00%
Model saved! Best validation accuracy: 30.00%


Epoch 8/20 [Train]:  50%|█████     | 10/20 [00:03<00:03,  2.55it/s]

Batch 10/20: Loss: 2.7944 | Acc: 100.00%


Epoch 8/20 [Train]: 100%|██████████| 20/20 [00:08<00:00,  2.45it/s]


Batch 20/20: Loss: 2.7671 | Acc: 100.00%


Epoch 8/20 [Val]: 100%|██████████| 5/5 [00:00<00:00,  6.21it/s]


Epoch 8/20: Train Loss: 2.7910 | Train Acc: 100.00% | Val Loss: 3.0964 | Val Acc: 27.50%


Epoch 9/20 [Train]:  50%|█████     | 10/20 [00:04<00:03,  2.56it/s]

Batch 10/20: Loss: 2.7645 | Acc: 100.00%


Epoch 9/20 [Train]: 100%|██████████| 20/20 [00:08<00:00,  2.44it/s]


Batch 20/20: Loss: 2.7845 | Acc: 100.00%


Epoch 9/20 [Val]: 100%|██████████| 5/5 [00:00<00:00,  6.01it/s]


Epoch 9/20: Train Loss: 2.7556 | Train Acc: 100.00% | Val Loss: 3.0901 | Val Acc: 30.00%


Epoch 10/20 [Train]:  50%|█████     | 10/20 [00:04<00:05,  1.99it/s]

Batch 10/20: Loss: 2.7221 | Acc: 100.00%


Epoch 10/20 [Train]: 100%|██████████| 20/20 [00:08<00:00,  2.34it/s]


Batch 20/20: Loss: 2.7074 | Acc: 100.00%


Epoch 10/20 [Val]: 100%|██████████| 5/5 [00:00<00:00,  7.02it/s]


Epoch 10/20: Train Loss: 2.7316 | Train Acc: 100.00% | Val Loss: 3.0841 | Val Acc: 30.00%


Epoch 11/20 [Train]:  50%|█████     | 10/20 [00:03<00:03,  2.61it/s]

Batch 10/20: Loss: 2.7089 | Acc: 100.00%


Epoch 11/20 [Train]: 100%|██████████| 20/20 [00:07<00:00,  2.55it/s]


Batch 20/20: Loss: 2.6935 | Acc: 100.00%


Epoch 11/20 [Val]: 100%|██████████| 5/5 [00:00<00:00,  6.97it/s]


Epoch 11/20: Train Loss: 2.7053 | Train Acc: 100.00% | Val Loss: 3.0835 | Val Acc: 32.50%
Model saved! Best validation accuracy: 32.50%


Epoch 12/20 [Train]:  50%|█████     | 10/20 [00:04<00:04,  2.47it/s]

Batch 10/20: Loss: 2.6981 | Acc: 100.00%


Epoch 12/20 [Train]: 100%|██████████| 20/20 [00:08<00:00,  2.44it/s]


Batch 20/20: Loss: 2.6815 | Acc: 100.00%


Epoch 12/20 [Val]: 100%|██████████| 5/5 [00:00<00:00,  5.97it/s]


Epoch 12/20: Train Loss: 2.6881 | Train Acc: 100.00% | Val Loss: 3.0787 | Val Acc: 32.50%


Epoch 13/20 [Train]:  50%|█████     | 10/20 [00:03<00:04,  2.49it/s]

Batch 10/20: Loss: 2.6447 | Acc: 100.00%


Epoch 13/20 [Train]: 100%|██████████| 20/20 [00:07<00:00,  2.55it/s]


Batch 20/20: Loss: 2.6850 | Acc: 100.00%


Epoch 13/20 [Val]: 100%|██████████| 5/5 [00:00<00:00,  6.74it/s]


Epoch 13/20: Train Loss: 2.6719 | Train Acc: 100.00% | Val Loss: 3.0790 | Val Acc: 30.00%


Epoch 14/20 [Train]:  50%|█████     | 10/20 [00:03<00:03,  2.62it/s]

Batch 10/20: Loss: 2.6638 | Acc: 100.00%


Epoch 14/20 [Train]: 100%|██████████| 20/20 [00:07<00:00,  2.54it/s]


Batch 20/20: Loss: 2.6602 | Acc: 100.00%


Epoch 14/20 [Val]: 100%|██████████| 5/5 [00:00<00:00,  6.24it/s]


Epoch 14/20: Train Loss: 2.6606 | Train Acc: 100.00% | Val Loss: 3.0746 | Val Acc: 35.00%
Model saved! Best validation accuracy: 35.00%


Epoch 15/20 [Train]:  50%|█████     | 10/20 [00:03<00:03,  2.64it/s]

Batch 10/20: Loss: 2.6382 | Acc: 100.00%


Epoch 15/20 [Train]: 100%|██████████| 20/20 [00:07<00:00,  2.61it/s]


Batch 20/20: Loss: 2.6430 | Acc: 100.00%


Epoch 15/20 [Val]: 100%|██████████| 5/5 [00:00<00:00,  7.28it/s]


Epoch 15/20: Train Loss: 2.6483 | Train Acc: 100.00% | Val Loss: 3.0775 | Val Acc: 30.00%


Epoch 16/20 [Train]:  50%|█████     | 10/20 [00:04<00:04,  2.43it/s]

Batch 10/20: Loss: 2.6377 | Acc: 100.00%


Epoch 16/20 [Train]: 100%|██████████| 20/20 [00:08<00:00,  2.43it/s]


Batch 20/20: Loss: 2.6420 | Acc: 100.00%


Epoch 16/20 [Val]: 100%|██████████| 5/5 [00:00<00:00,  6.04it/s]


Epoch 16/20: Train Loss: 2.6442 | Train Acc: 100.00% | Val Loss: 3.0732 | Val Acc: 32.50%


Epoch 17/20 [Train]:  50%|█████     | 10/20 [00:03<00:04,  2.45it/s]

Batch 10/20: Loss: 2.6404 | Acc: 100.00%


Epoch 17/20 [Train]: 100%|██████████| 20/20 [00:08<00:00,  2.44it/s]


Batch 20/20: Loss: 2.6613 | Acc: 100.00%


Epoch 17/20 [Val]: 100%|██████████| 5/5 [00:00<00:00,  5.42it/s]


Epoch 17/20: Train Loss: 2.6391 | Train Acc: 100.00% | Val Loss: 3.0754 | Val Acc: 30.00%


Epoch 18/20 [Train]:  50%|█████     | 10/20 [00:04<00:04,  2.26it/s]

Batch 10/20: Loss: 2.6214 | Acc: 100.00%


Epoch 18/20 [Train]: 100%|██████████| 20/20 [00:08<00:00,  2.41it/s]


Batch 20/20: Loss: 2.6403 | Acc: 100.00%


Epoch 18/20 [Val]: 100%|██████████| 5/5 [00:00<00:00,  6.20it/s]


Epoch 18/20: Train Loss: 2.6338 | Train Acc: 100.00% | Val Loss: 3.0733 | Val Acc: 30.00%


Epoch 19/20 [Train]:  50%|█████     | 10/20 [00:03<00:04,  2.44it/s]

Batch 10/20: Loss: 2.6029 | Acc: 100.00%


Epoch 19/20 [Train]: 100%|██████████| 20/20 [00:07<00:00,  2.51it/s]


Batch 20/20: Loss: 2.6425 | Acc: 100.00%


Epoch 19/20 [Val]: 100%|██████████| 5/5 [00:00<00:00,  6.61it/s]


Epoch 19/20: Train Loss: 2.6287 | Train Acc: 100.00% | Val Loss: 3.0684 | Val Acc: 32.50%


Epoch 20/20 [Train]:  50%|█████     | 10/20 [00:03<00:04,  2.50it/s]

Batch 10/20: Loss: 2.6148 | Acc: 100.00%


Epoch 20/20 [Train]: 100%|██████████| 20/20 [00:07<00:00,  2.55it/s]


Batch 20/20: Loss: 2.6011 | Acc: 100.00%


Epoch 20/20 [Val]: 100%|██████████| 5/5 [00:00<00:00,  6.99it/s]

Epoch 20/20: Train Loss: 2.6234 | Train Acc: 100.00% | Val Loss: 3.0733 | Val Acc: 32.50%
Fine-tuning complete! Best validation accuracy: 35.00%





35.0

In [12]:
# Classify with a fine‑tuned model
D1b_CLIP_finetuned = classification_CLIP_finetuned(
    mode="image",
    text_path="Demo_data/D1_2.csv",
    img_dir="Demo_data/D1_imgs_2",
    model_name="Demo_finetuned_CLIP/D1a_CLIP_model_finetuned.pth",
    predict_column="D1b_CLIP_finetuned",
)

D1b_CLIP_finetuned.to_csv("Demo_result/D1b_CLIP_finetuned.csv", index=False)

200 pieces of data loaded


predicting: 100%|██████████| 25/25 [00:04<00:00,  5.81it/s]


Labeling completed!


In [13]:
## Check the accuracy: CLIP after finetune
# Merge the Demo_result of the image with the human label
auto_verification(
    D1b_CLIP_finetuned,
    predicted_cols="D1b_CLIP_finetuned",
    true_cols="section_numeric",
    category=category_D1_CLIP
)


== Verification of 'D1b_CLIP_finetuned' vs. 'section_numeric' ==
Accuracy:   28.50%
Macro F1:   22.57%
Micro  F1:  28.50%

Full classification report:
              precision    recall  f1-score   support

           1       0.26      0.79      0.39        14
           2       0.50      0.14      0.22         7
           3       0.35      0.47      0.40        17
           4       0.16      0.60      0.25         5
           5       0.00      0.00      0.00         4
           6       0.67      0.75      0.71         8
           7       0.00      0.00      0.00        15
           8       0.50      0.14      0.22         7
           9       0.50      0.17      0.25         6
          10       0.00      0.00      0.00        13
          11       0.20      0.30      0.24        10
          12       0.33      0.18      0.24        11
          13       0.56      0.42      0.48        12
          14       0.09      0.18      0.12        11
          15       0.00      0.00    

# Demo 1c: Classify single-category text + image data

**Datasets:**
- `D1_1.csv`: Used for initial labeling and fine-tuning.
- `D1_imgs_1`: Used for initial labeling and fine-tuning.

- `D1_2.csv`: Used for testing the fine-tuned model's performance.
- `D1_imgs_2`: Used for testing the fine-tuned model's performance.


The text dataset should contain a column `image_path` to map the images for each row.

## CLIP: local labeling

In [14]:
D1c_CLIP_inital_labeling = classification_CLIP_0_shot(
    text_path="Demo_data/D1_1.csv",
    img_dir="Demo_data/D1_imgs_1",
    mode="both",
    prompt=prompt_D1_CLIP,
    text_column=["headline", "abstract"],
    predict_column="D1c_CLIP_inital_labeling",
)

D1c_CLIP_inital_labeling.to_csv("Demo_result/D1c_CLIP_inital_labeling.csv", index=False)

Loaded 200 records


Predicting: 100%|██████████| 200/200 [00:12<00:00, 15.66it/s]


In [15]:
## Check the accuracy: CLIP initial labeling
auto_verification(
    D1c_CLIP_inital_labeling,
    predicted_cols="D1c_CLIP_inital_labeling",
    true_cols="section_numeric",
    category=category_D1_CLIP
);


== Verification of 'D1c_CLIP_inital_labeling' vs. 'section_numeric' ==
Accuracy:   57.00%
Macro F1:   48.48%
Micro  F1:  57.00%

Full classification report:
              precision    recall  f1-score   support

           1       0.50      0.38      0.43        13
           2       0.83      0.45      0.59        11
           3       0.75      0.75      0.75        12
           4       0.57      0.44      0.50         9
           5       1.00      0.17      0.29         6
           6       1.00      0.56      0.71         9
           7       0.33      1.00      0.50         2
           8       0.67      0.40      0.50         5
           9       0.83      0.56      0.67         9
          10       0.38      0.91      0.54        11
          11       0.31      0.36      0.33        14
          12       0.56      0.50      0.53        10
          13       0.64      0.78      0.70         9
          14       0.83      0.56      0.67        18
          15       0.50      0.

### finetune: CLIP

In [16]:
# finetune CLIP
finetune_CLIP(
    mode="both",
    text_path="Demo_data/D1_1.csv",
    text_column=["headline", "abstract"],
    img_dir="Demo_data/D1_imgs_1",
    true_label="section_numeric",
    model_name="Demo_finetuned_CLIP/D1c_CLIP_model_finetuned.pth",
    num_epochs=20,
    batch_size=8,
    learning_rate=1e-5,
)


Loaded 200 records
Number of classes: 24
Use text: True, Use image: True
Text columns: ['headline', 'abstract']
Label column: section_numeric
Training set class distribution: section_numeric
14    17
1     12
3     11
10    10
19    10
6      9
11     9
12     9
9      8
13     8
23     8
4      7
5      6
2      6
16     6
17     6
15     4
20     4
8      3
22     3
7      2
18     1
21     1
Name: count, dtype: int64
Validation set class distribution: section_numeric
2     5
11    5
20    4
23    3
15    3
19    3
17    3
8     2
4     2
18    1
24    1
12    1
13    1
1     1
10    1
3     1
9     1
16    1
14    1
Name: count, dtype: int64


Epoch 1/20 [Train]:  50%|█████     | 10/20 [00:06<00:06,  1.59it/s]

Batch 10/20: Loss: 3.1790 | Acc: 3.75%


Epoch 1/20 [Train]: 100%|██████████| 20/20 [00:13<00:00,  1.54it/s]


Batch 20/20: Loss: 3.1692 | Acc: 5.00%


Epoch 1/20 [Val]: 100%|██████████| 5/5 [00:01<00:00,  4.84it/s]


Epoch 1/20: Train Loss: 3.1738 | Train Acc: 5.00% | Val Loss: 3.1644 | Val Acc: 5.00%
Model saved! Best validation accuracy: 5.00%


Epoch 2/20 [Train]:  50%|█████     | 10/20 [00:06<00:06,  1.62it/s]

Batch 10/20: Loss: 3.0665 | Acc: 37.50%


Epoch 2/20 [Train]: 100%|██████████| 20/20 [00:12<00:00,  1.61it/s]


Batch 20/20: Loss: 3.1040 | Acc: 35.62%


Epoch 2/20 [Val]: 100%|██████████| 5/5 [00:01<00:00,  4.47it/s]


Epoch 2/20: Train Loss: 3.0850 | Train Acc: 35.62% | Val Loss: 3.1488 | Val Acc: 5.00%


Epoch 3/20 [Train]:  50%|█████     | 10/20 [00:07<00:06,  1.43it/s]

Batch 10/20: Loss: 2.9648 | Acc: 73.75%


Epoch 3/20 [Train]: 100%|██████████| 20/20 [00:13<00:00,  1.43it/s]


Batch 20/20: Loss: 2.9657 | Acc: 73.75%


Epoch 3/20 [Val]: 100%|██████████| 5/5 [00:01<00:00,  4.61it/s]


Epoch 3/20: Train Loss: 3.0011 | Train Acc: 73.75% | Val Loss: 3.1175 | Val Acc: 17.50%
Model saved! Best validation accuracy: 17.50%


Epoch 4/20 [Train]:  50%|█████     | 10/20 [00:06<00:06,  1.56it/s]

Batch 10/20: Loss: 2.8944 | Acc: 93.75%


Epoch 4/20 [Train]: 100%|██████████| 20/20 [00:12<00:00,  1.58it/s]


Batch 20/20: Loss: 2.9210 | Acc: 96.88%


Epoch 4/20 [Val]: 100%|██████████| 5/5 [00:01<00:00,  4.92it/s]


Epoch 4/20: Train Loss: 2.9147 | Train Acc: 96.88% | Val Loss: 3.0891 | Val Acc: 32.50%
Model saved! Best validation accuracy: 32.50%


Epoch 5/20 [Train]:  50%|█████     | 10/20 [00:06<00:06,  1.63it/s]

Batch 10/20: Loss: 2.8935 | Acc: 100.00%


Epoch 5/20 [Train]: 100%|██████████| 20/20 [00:12<00:00,  1.61it/s]


Batch 20/20: Loss: 2.8425 | Acc: 100.00%


Epoch 5/20 [Val]: 100%|██████████| 5/5 [00:01<00:00,  4.74it/s]


Epoch 5/20: Train Loss: 2.8392 | Train Acc: 100.00% | Val Loss: 3.0712 | Val Acc: 37.50%
Model saved! Best validation accuracy: 37.50%


Epoch 6/20 [Train]:  50%|█████     | 10/20 [00:06<00:06,  1.58it/s]

Batch 10/20: Loss: 2.7686 | Acc: 100.00%


Epoch 6/20 [Train]: 100%|██████████| 20/20 [00:12<00:00,  1.56it/s]


Batch 20/20: Loss: 2.7646 | Acc: 100.00%


Epoch 6/20 [Val]: 100%|██████████| 5/5 [00:01<00:00,  4.63it/s]


Epoch 6/20: Train Loss: 2.7851 | Train Acc: 100.00% | Val Loss: 3.0476 | Val Acc: 45.00%
Model saved! Best validation accuracy: 45.00%


Epoch 7/20 [Train]:  50%|█████     | 10/20 [00:06<00:06,  1.44it/s]

Batch 10/20: Loss: 2.7638 | Acc: 100.00%


Epoch 7/20 [Train]: 100%|██████████| 20/20 [00:13<00:00,  1.49it/s]


Batch 20/20: Loss: 2.7583 | Acc: 100.00%


Epoch 7/20 [Val]: 100%|██████████| 5/5 [00:01<00:00,  4.40it/s]


Epoch 7/20: Train Loss: 2.7429 | Train Acc: 100.00% | Val Loss: 3.0345 | Val Acc: 47.50%
Model saved! Best validation accuracy: 47.50%


Epoch 8/20 [Train]:  50%|█████     | 10/20 [00:06<00:06,  1.49it/s]

Batch 10/20: Loss: 2.6878 | Acc: 100.00%


Epoch 8/20 [Train]: 100%|██████████| 20/20 [00:13<00:00,  1.53it/s]


Batch 20/20: Loss: 2.7144 | Acc: 100.00%


Epoch 8/20 [Val]: 100%|██████████| 5/5 [00:01<00:00,  4.78it/s]


Epoch 8/20: Train Loss: 2.7130 | Train Acc: 100.00% | Val Loss: 3.0190 | Val Acc: 47.50%


Epoch 9/20 [Train]:  50%|█████     | 10/20 [00:06<00:06,  1.54it/s]

Batch 10/20: Loss: 2.7138 | Acc: 100.00%


Epoch 9/20 [Train]: 100%|██████████| 20/20 [00:12<00:00,  1.57it/s]


Batch 20/20: Loss: 2.6597 | Acc: 100.00%


Epoch 9/20 [Val]: 100%|██████████| 5/5 [00:01<00:00,  4.91it/s]


Epoch 9/20: Train Loss: 2.6942 | Train Acc: 100.00% | Val Loss: 3.0088 | Val Acc: 50.00%
Model saved! Best validation accuracy: 50.00%


Epoch 10/20 [Train]:  50%|█████     | 10/20 [00:06<00:06,  1.59it/s]

Batch 10/20: Loss: 2.6798 | Acc: 100.00%


Epoch 10/20 [Train]: 100%|██████████| 20/20 [00:12<00:00,  1.57it/s]


Batch 20/20: Loss: 2.6708 | Acc: 100.00%


Epoch 10/20 [Val]: 100%|██████████| 5/5 [00:01<00:00,  3.56it/s]


Epoch 10/20: Train Loss: 2.6755 | Train Acc: 100.00% | Val Loss: 3.0014 | Val Acc: 55.00%
Model saved! Best validation accuracy: 55.00%


Epoch 11/20 [Train]:  50%|█████     | 10/20 [00:06<00:06,  1.46it/s]

Batch 10/20: Loss: 2.6767 | Acc: 100.00%


Epoch 11/20 [Train]: 100%|██████████| 20/20 [00:13<00:00,  1.49it/s]


Batch 20/20: Loss: 2.6780 | Acc: 100.00%


Epoch 11/20 [Val]: 100%|██████████| 5/5 [00:01<00:00,  3.97it/s]


Epoch 11/20: Train Loss: 2.6622 | Train Acc: 100.00% | Val Loss: 2.9931 | Val Acc: 50.00%


Epoch 12/20 [Train]:  50%|█████     | 10/20 [00:07<00:06,  1.46it/s]

Batch 10/20: Loss: 2.6333 | Acc: 100.00%


Epoch 12/20 [Train]: 100%|██████████| 20/20 [00:14<00:00,  1.39it/s]


Batch 20/20: Loss: 2.6545 | Acc: 100.00%


Epoch 12/20 [Val]: 100%|██████████| 5/5 [00:01<00:00,  3.56it/s]


Epoch 12/20: Train Loss: 2.6517 | Train Acc: 100.00% | Val Loss: 2.9872 | Val Acc: 52.50%


Epoch 13/20 [Train]:  50%|█████     | 10/20 [00:07<00:07,  1.30it/s]

Batch 10/20: Loss: 2.6790 | Acc: 100.00%


Epoch 13/20 [Train]: 100%|██████████| 20/20 [00:14<00:00,  1.35it/s]


Batch 20/20: Loss: 2.6241 | Acc: 100.00%


Epoch 13/20 [Val]: 100%|██████████| 5/5 [00:01<00:00,  4.36it/s]


Epoch 13/20: Train Loss: 2.6442 | Train Acc: 100.00% | Val Loss: 2.9829 | Val Acc: 50.00%


Epoch 14/20 [Train]:  50%|█████     | 10/20 [00:07<00:08,  1.20it/s]

Batch 10/20: Loss: 2.6530 | Acc: 100.00%


Epoch 14/20 [Train]: 100%|██████████| 20/20 [00:14<00:00,  1.34it/s]


Batch 20/20: Loss: 2.6224 | Acc: 100.00%


Epoch 14/20 [Val]: 100%|██████████| 5/5 [00:01<00:00,  4.72it/s]


Epoch 14/20: Train Loss: 2.6367 | Train Acc: 100.00% | Val Loss: 2.9738 | Val Acc: 52.50%


Epoch 15/20 [Train]:  50%|█████     | 10/20 [00:06<00:07,  1.41it/s]

Batch 10/20: Loss: 2.6199 | Acc: 100.00%


Epoch 15/20 [Train]: 100%|██████████| 20/20 [00:14<00:00,  1.41it/s]


Batch 20/20: Loss: 2.6301 | Acc: 100.00%


Epoch 15/20 [Val]: 100%|██████████| 5/5 [00:01<00:00,  3.03it/s]


Epoch 15/20: Train Loss: 2.6292 | Train Acc: 100.00% | Val Loss: 2.9703 | Val Acc: 55.00%


Epoch 16/20 [Train]:  50%|█████     | 10/20 [00:08<00:08,  1.24it/s]

Batch 10/20: Loss: 2.6340 | Acc: 100.00%


Epoch 16/20 [Train]: 100%|██████████| 20/20 [00:15<00:00,  1.28it/s]


Batch 20/20: Loss: 2.6144 | Acc: 100.00%


Epoch 16/20 [Val]: 100%|██████████| 5/5 [00:01<00:00,  4.45it/s]


Epoch 16/20: Train Loss: 2.6260 | Train Acc: 100.00% | Val Loss: 2.9694 | Val Acc: 55.00%


Epoch 17/20 [Train]:  50%|█████     | 10/20 [00:07<00:07,  1.34it/s]

Batch 10/20: Loss: 2.6383 | Acc: 100.00%


Epoch 17/20 [Train]: 100%|██████████| 20/20 [00:15<00:00,  1.33it/s]


Batch 20/20: Loss: 2.6109 | Acc: 100.00%


Epoch 17/20 [Val]: 100%|██████████| 5/5 [00:01<00:00,  4.48it/s]


Epoch 17/20: Train Loss: 2.6220 | Train Acc: 100.00% | Val Loss: 2.9631 | Val Acc: 55.00%


Epoch 18/20 [Train]:  50%|█████     | 10/20 [00:07<00:07,  1.42it/s]

Batch 10/20: Loss: 2.6308 | Acc: 100.00%


Epoch 18/20 [Train]: 100%|██████████| 20/20 [00:14<00:00,  1.36it/s]


Batch 20/20: Loss: 2.6160 | Acc: 100.00%


Epoch 18/20 [Val]: 100%|██████████| 5/5 [00:01<00:00,  4.00it/s]


Epoch 18/20: Train Loss: 2.6179 | Train Acc: 100.00% | Val Loss: 2.9584 | Val Acc: 55.00%


Epoch 19/20 [Train]:  50%|█████     | 10/20 [00:06<00:07,  1.43it/s]

Batch 10/20: Loss: 2.6096 | Acc: 100.00%


Epoch 19/20 [Train]: 100%|██████████| 20/20 [00:13<00:00,  1.47it/s]


Batch 20/20: Loss: 2.6114 | Acc: 100.00%


Epoch 19/20 [Val]: 100%|██████████| 5/5 [00:01<00:00,  3.98it/s]


Epoch 19/20: Train Loss: 2.6168 | Train Acc: 100.00% | Val Loss: 2.9577 | Val Acc: 55.00%


Epoch 20/20 [Train]:  50%|█████     | 10/20 [00:06<00:06,  1.55it/s]

Batch 10/20: Loss: 2.6353 | Acc: 100.00%


Epoch 20/20 [Train]: 100%|██████████| 20/20 [00:13<00:00,  1.53it/s]


Batch 20/20: Loss: 2.6181 | Acc: 100.00%


Epoch 20/20 [Val]: 100%|██████████| 5/5 [00:01<00:00,  4.20it/s]

Epoch 20/20: Train Loss: 2.6104 | Train Acc: 100.00% | Val Loss: 2.9508 | Val Acc: 55.00%
Fine-tuning complete! Best validation accuracy: 55.00%





55.0

In [17]:
# Classify with a fine‑tuned model
D1c_CLIP_finetuned = classification_CLIP_finetuned(
    mode="both",
    text_path="Demo_data/D1_2.csv",
    text_column=["headline", "abstract"],
    img_dir="Demo_data/D1_imgs_2",
    model_name="Demo_finetuned_CLIP/D1c_CLIP_model_finetuned.pth",
    predict_column="D1c_CLIP_finetuned",
    
)
D1c_CLIP_finetuned.to_csv("Demo_result/D1c_CLIP_finetuned.csv", index=False)

200 pieces of data loaded


predicting: 100%|██████████| 25/25 [00:07<00:00,  3.17it/s]

Labeling completed!





In [18]:
D1c_CLIP_finetuned

Unnamed: 0,section,headline,article_url,article,abstract,article_id,image,caption,image_id,image_path,section_numeric,D1c_CLIP_finetuned
0,Technology,The Sun Sets. The Wind Dies. But Energy Data I...,https://www.nytimes.com/2018/09/26/technology/...,How do New York Times journalists use technolo...,Alternative power sources have upended the ele...,5631bfd7-67ff-5fa3-8efd-37d65f3e234e,https://static01.nyt.com/images/2018/09/27/bus...,"Ivan Penn, who reports on alternative energy, ...",5631bfd7-67ff-5fa3-8efd-37d65f3e234e,Demo_data/D1_img2/15631bfd7-67ff-5fa3-8efd-37d...,20,14
1,Automobiles,Monday Motorsports: Ed Carpenter Wins Another ...,https://www.nytimes.com/2014/05/20/automobiles...,Ed Carpenter hasn't run an IndyCar race since ...,"Carpenter collected a $100,000 prize for his s...",c55a5863-a87d-5470-8210-055e307b8956,https://static01.nyt.com/images/2014/05/20/aut...,Ed Carpenter won pole position Sunday for the ...,c55a5863-a87d-5470-8210-055e307b8956,Demo_data/D1_img2/1c55a5863-a87d-5470-8210-055...,23,23
2,Real Estate,Recent Commercial Real Estate Transactions,https://www.nytimes.com/2019/01/15/realestate/...,497 Atlantic Avenue (between Nevins Street and...,Recent commercial real estate transactions in ...,f3f0dc8e-3241-5e5e-a0d9-b9a3bfc7862f,https://static01.nyt.com/images/2019/01/16/rea...,A four-story walk-up is on the market for $2.5...,f3f0dc8e-3241-5e5e-a0d9-b9a3bfc7862f,Demo_data/D1_img2/1f3f0dc8e-3241-5e5e-a0d9-b9a...,7,7
3,Television,The Best Movies and TV Shows Coming to Netflix...,https://www.nytimes.com/2019/12/01/arts/televi...,Watching is The New York Times's TV and film r...,"Every month, subscription streaming services a...",f0ae30f1-09b9-5334-a32c-04cf793a2aef,https://static01.nyt.com/images/2019/12/01/art...,"Clockwise from top left: Scenes from ""Moonligh...",f0ae30f1-09b9-5334-a32c-04cf793a2aef,Demo_data/D1_img2/1f0ae30f1-09b9-5334-a32c-04c...,3,3
4,Theater,"Review: In 'Leopoldstadt,' Tom Stoppard Reckon...",https://www.nytimes.com/2020/02/12/theater/leo...,LONDON -- Do you remember? Don't you remember?...,In what he says will probably be his last work...,da8070be-edc1-5d6f-a78c-3962249598cc,https://static01.nyt.com/images/2020/02/12/art...,"Adrian Scarborough, left, and Luke Thallon in ...",da8070be-edc1-5d6f-a78c-3962249598cc,Demo_data/D1_img2/1da8070be-edc1-5d6f-a78c-396...,10,10
...,...,...,...,...,...,...,...,...,...,...,...,...
195,Opinion,The Postal Service Is the Most American Thing ...,https://www.nytimes.com/2020/05/15/opinion/pos...,It would be difficult to think of a time when ...,Trump calls it 'a joke.' Washington and Lincol...,30bc3f44-f79c-5b24-9c12-e4c9afc870b1,https://static01.nyt.com/images/2020/05/15/opi...,Delivering the mail in Manhattan. Regular deli...,30bc3f44-f79c-5b24-9c12-e4c9afc870b1,Demo_data/D1_img2/130bc3f44-f79c-5b24-9c12-e4c...,11,11
196,Real Estate,Priced Out of a Childhood Home,https://www.nytimes.com/2016/05/15/realestate/...,When Carolyn Burke landed a teaching job nearl...,When a New York neighborhood becomes gentrifie...,17fe5dcd-289d-542d-beb2-2578f44576dc,https://static01.nyt.com/images/2016/05/15/rea...,"Carolyn Burke, a teacher, in front of her old ...",17fe5dcd-289d-542d-beb2-2578f44576dc,Demo_data/D1_img2/117fe5dcd-289d-542d-beb2-257...,7,14
197,Technology,"Airbnb, a 'Sharing Economy' Pioneer, Files to ...",https://www.nytimes.com/2020/08/19/technology/...,The Airbnb headquarters in San Francisco. Its ...,"The home rental company, which was privately v...",7de22813-722c-5d83-8248-269e3fac0cad,https://static01.nyt.com/images/2020/08/18/bus...,Airbnb headquarters in San Francisco. public ...,7de22813-722c-5d83-8248-269e3fac0cad,Demo_data/D1_img2/17de22813-722c-5d83-8248-269...,20,1
198,Health,"Stay 6 Feet Apart, We're Told. But How Far Can...",https://www.nytimes.com/2020/04/14/health/coro...,"The rule of thumb, or rather feet, has been to...",Most of the big droplets travel a mere six fee...,6fc12abe-5d32-5c47-9372-cdf3c6de0ad5,https://static01.nyt.com/images/2020/04/19/sci...,A sign on the Coney Island boardwalk in Brookl...,6fc12abe-5d32-5c47-9372-cdf3c6de0ad5,Demo_data/D1_img2/16fc12abe-5d32-5c47-9372-cdf...,1,9


In [19]:
## Check the accuracy: CLIP after finetune
auto_verification(
    D1c_CLIP_finetuned,
    predicted_cols="D1c_CLIP_finetuned",
    true_cols="section_numeric",
    category=category_D1_CLIP
)


== Verification of 'D1c_CLIP_finetuned' vs. 'section_numeric' ==
Accuracy:   60.50%
Macro F1:   56.28%
Micro  F1:  60.50%

Full classification report:
              precision    recall  f1-score   support

           1       0.55      0.79      0.65        14
           2       1.00      0.29      0.44         7
           3       0.93      0.76      0.84        17
           4       0.33      0.60      0.43         5
           5       1.00      0.75      0.86         4
           6       1.00      1.00      1.00         8
           7       0.50      0.07      0.12        15
           8       0.50      0.29      0.36         7
           9       0.42      0.83      0.56         6
          10       0.80      0.92      0.86        13
          11       0.47      0.70      0.56        10
          12       0.88      0.64      0.74        11
          13       0.85      0.92      0.88        12
          14       0.27      1.00      0.42        11
          15       0.00      0.00    