In [1]:
import pandas as pd
from dataProcessor import process_metadata, clean_text, correct_misinterpreted_characters, pew_metadata_path, statista_metadata_path, llava_description_path
# Build the combined dataframe from the Pew metadata, the Statista metadata and
# the LLaVA descriptions (loader and paths are imported from dataProcessor).
combined_df = process_metadata(pew_metadata_path, statista_metadata_path, llava_description_path)
# Sanity check: display (rows, columns) of the combined frame.
combined_df.shape

(29354, 5)

In [2]:
# Run every free-text column through the same two-step pipeline:
# clean_text first, then correct_misinterpreted_characters.
for text_col in ('title', 'caption', 'llava_description'):
    combined_df[text_col] = (
        combined_df[text_col]
        .apply(clean_text)
        .apply(correct_misinterpreted_characters)
    )

In [3]:
# Preview the cleaned frame; the bare expression is rendered as the cell output.
combined_df

Unnamed: 0,id,title,caption,imgPath,llava_description
0,1,"Foreign-born population in the United States, ...",The foreign-born population residing in the U....,../dataset/pew_dataset/pew_imgs/1.png,The chart you've provided appears to be a line...
1,2,"English proficiency among U.S. immigrants, 198...","Since 1980, the share of immigrants who are pr...",../dataset/pew_dataset/pew_imgs/2.png,The chart you've provided shows the percentage...
2,3,"Languages spoken among U.S. immigrants, 2018","Among the nation’s immigrants, Spanish is by f...",../dataset/pew_dataset/pew_imgs/3.png,The chart presents the percentage of immigrant...
3,4,"Hispanic population in the U.S., 2000-2017",There were nearly 60 million Latinos in the Un...,../dataset/pew_dataset/pew_imgs/4.png,The chart you've provided appears to be a line...
4,5,Weekly broadcast audience for top 20 NPR-affil...,The top 20 NPR-affiliated public radio station...,../dataset/pew_dataset/pew_imgs/5.png,The chart you've provided is a line graph show...
...,...,...,...,...,...
29349,29350,Distribution of cyber stalking victims in 2013...,This statistic presents the distribution of cy...,../dataset/statista_dataset/statista_imgs/2786...,The chart you've provided appears to be a bar ...
29350,29351,Total number of dwellings in Great Britain fro...,This statistic displays the total number of dw...,../dataset/statista_dataset/statista_imgs/2786...,The chart you've provided appears to be a line...
29351,29352,Results in the European Parliament elections i...,This statistic shows the political parties and...,../dataset/statista_dataset/statista_imgs/2786...,The chart you've provided appears to be a bar ...
29352,29353,Average annual expenditure on curtains and dra...,This statistic shows the average annual expend...,../dataset/statista_dataset/statista_imgs/2786...,The chart you've provided appears to be a line...


In [7]:
from transformers import CLIPTokenizer, CLIPModel

# Load the CLIP tokenizer and model from the same checkpoint.
# NOTE(review): only `tokenizer` is used by the token-length code in this cell;
# `model` is not referenced in the visible cells — confirm it is needed before
# paying the cost of loading it.
model_name = "openai/clip-vit-large-patch14"
tokenizer = CLIPTokenizer.from_pretrained(model_name)
model = CLIPModel.from_pretrained(model_name)

def get_clip_token_length(text):
    """Return how many tokens the module-level `tokenizer` produces for `text`.

    No truncation is requested, so the count can exceed the tokenizer's
    maximum sequence length; that is what lets callers spot over-long texts.
    """
    encoded = tokenizer(text, return_tensors="pt")
    # input_ids is (batch, sequence); a single string yields a batch of one
    _, sequence_length = encoded["input_ids"].shape
    return sequence_length

def truncate_title(title, max_length=50):
    """Shorten `title` for display.

    Titles longer than `max_length` characters are cut to their first
    `max_length` characters and suffixed with "..."; anything shorter or of
    exactly that length is returned unchanged.
    """
    # Guard clause: nothing to do when the title already fits
    if len(title) <= max_length:
        return title
    return title[:max_length] + "..."

# The tokenizer warning emitted by this cell reports a maximum sequence length
# of 77 for this model; titles above it would cause indexing errors when run
# through the model, so those are the ones worth listing.
CLIP_MAX_TOKENS = 77

# Build the report in its own frame instead of overwriting `combined_df`:
# later cells still need the full frame, including `llava_description`, and
# assigning into a filtered slice would raise SettingWithCopyWarning.
long_titles_df = (
    combined_df[['id', 'title']]
    # Token length of every title under the CLIP tokenizer
    .assign(token_length=combined_df['title'].apply(get_clip_token_length))
    # Keep only titles that exceed the limit
    .loc[lambda frame: frame['token_length'] > CLIP_MAX_TOKENS]
    # Longest titles first
    .sort_values(by='token_length', ascending=False)
    # Truncate the title for display purposes only
    .assign(title=lambda frame: frame['title'].apply(truncate_title))
)

long_titles_df



Token indices sequence length is longer than the specified maximum sequence length for this model (141 > 77). Running this sequence through the model will result in indexing errors


Unnamed: 0,id,title,token_length
60,61,Confidence in public health organizations likely t...,141
695,696,"How Mixed-Race, Mestizo, 'uulatto' Hispanics Repor...",141
916,917,"Write- Ins for ""Some Other Race"" Among Hispanics i...",134
669,670,A Snapshot of What Americans Know About Science % ...,128
62,63,Roughly three-in-ten who say social media have a n...,109
799,800,"The Web IQ"" of American Internet Users % of finter...",96
93,94,Majority of parents say their child 11 or younger ...,89
660,661,Foreign- Born Hispanics More Likely to Be Catholic...,84
414,415,The largest number of single-mother households are...,82
335,336,No Muslim-majority nation was among top five U.S. ...,82


In [20]:
from torch import Tensor
from transformers import AutoTokenizer, AutoModel

# Load only the tokenizer for e5-mistral-7b-instruct (no model weights are
# loaded here). This rebinds the global `tokenizer`, replacing the CLIP one.
tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-mistral-7b-instruct')

def get_token_length(row):
    """Return the token count of a row's title joined with its LLaVA description.

    `row` must provide 'title' and 'llava_description'; the two are joined
    with a single space and tokenized with the module-level `tokenizer`.
    """
    title_and_description = row['title'] + " " + row['llava_description']
    encoded = tokenizer(title_and_description, return_tensors="pt")
    # input_ids is (batch, sequence); the sequence axis holds the token count
    return encoded['input_ids'].size(1)

# Token length of title + LLaVA description, computed row by row
combined_df['combined_token_length'] = combined_df.apply(get_token_length, axis=1)

# Keep the report columns, order longest first, then retain only the rows whose
# combined length exceeds 384 tokens
combined_df = (
    combined_df[['id', 'title', 'combined_token_length']]
    .sort_values(by='combined_token_length', ascending=False)
    .loc[lambda frame: frame['combined_token_length'] > 384]
)

combined_df

Unnamed: 0,id,title,combined_token_length
327,328,Number of STEM graduates under OPT grew substa...,3070
1606,1607,Incidence of coronavirus (COVID-19) cases in t...,3046
9999,10000,Number of people admitted to hospitals in Madr...,3045
21410,21411,Top ten PACs advocating for Hillary Clinton's ...,3045
29222,29223,Total number of season ticket holders for the ...,3045
...,...,...,...
7818,7819,Spain: The largest cities in 2015 ...,385
29026,29027,Most expensive summer vacation rental destinat...,385
15374,15375,"Population distribution of Nova Scotia, Canada...",385
17070,17071,Number of new Toyota cars registered in Poland...,385


In [21]:
# Narrow the report further: keep only rows whose combined title + LLaVA
# description length exceeds 1000 tokens
combined_df = combined_df.loc[combined_df['combined_token_length'].gt(1000)]

combined_df

Unnamed: 0,id,title,combined_token_length
327,328,Number of STEM graduates under OPT grew substa...,3070
1606,1607,Incidence of coronavirus (COVID-19) cases in t...,3046
9999,10000,Number of people admitted to hospitals in Madr...,3045
21410,21411,Top ten PACs advocating for Hillary Clinton's ...,3045
29222,29223,Total number of season ticket holders for the ...,3045
...,...,...,...
22041,22042,Nonprofit organization's expectations for chan...,1004
2092,2093,African countries with the largest population ...,1003
27418,27419,Producer Price Index (PPI): Annual average out...,1003
11366,11367,Distribution of hospitalizations due to the co...,1002


In [1]:
import torch
from transformers import AutoTokenizer, AutoModel, BitsAndBytesConfig

# Quantization configuration: load weights in 4-bit using the "nf4"
# quantization type, with float16 as the compute dtype.
# Passed to AutoModel.from_pretrained by load_model().
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

def load_model(model_name, trust_remote_code=False):
    """Load a tokenizer and the matching quantized model for a checkpoint.

    Args:
        model_name: Checkpoint identifier handed to `from_pretrained`.
        trust_remote_code: Forwarded unchanged to both loaders.

    Returns:
        A `(tokenizer, model)` tuple; the model is loaded with the
        module-level `quantization_config` and `device_map="auto"`.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(
        model_name, trust_remote_code=trust_remote_code
    )
    model_kwargs = dict(
        quantization_config=quantization_config,
        device_map="auto",
        trust_remote_code=trust_remote_code,
    )
    loaded_model = AutoModel.from_pretrained(model_name, **model_kwargs)
    return loaded_tokenizer, loaded_model

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load gte-Qwen2-7B-instruct through load_model(). trust_remote_code=True is
# forwarded to both loaders, which allows transformers to execute model code
# shipped in the checkpoint repository — only enable it for trusted sources.
model_name_2 = 'Alibaba-NLP/gte-Qwen2-7B-instruct'
tokenizer_2, model_2 = load_model(model_name_2, trust_remote_code=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 7/7 [00:17<00:00,  2.45s/it]
