In [2]:
from dataProcessor import process_metadata, pew_metadata_path, statista_metadata_path
# Build the combined metadata DataFrame from the Pew and Statista metadata paths.
# process_metadata and both path constants are imported from dataProcessor.
combined_df = process_metadata(pew_metadata_path, statista_metadata_path)
# Last expression of the cell: rendered as the cell output, a (rows, columns) tuple.
combined_df.shape


(29354, 4)

In [7]:
from transformers import CLIPTokenizer, CLIPModel

# Load the CLIP tokenizer and the CLIP model from the same checkpoint name.
# NOTE(review): only `tokenizer` is referenced by the code visible in this cell;
# `model` is loaded but not used here — confirm it is needed before paying the load cost.
model_name = "openai/clip-vit-large-patch14"
tokenizer = CLIPTokenizer.from_pretrained(model_name)
model = CLIPModel.from_pretrained(model_name)

# Function to tokenize the title using CLIP and get token length
def get_clip_token_length(text):
    """Return the number of token ids the module-level ``tokenizer`` emits for ``text``.

    The text is encoded with ``return_tensors="pt"`` and the size of dimension 1
    of the resulting ``input_ids`` is returned. No truncation is requested, so
    the result may exceed the model's maximum sequence length.
    """
    encoded = tokenizer(text, return_tensors="pt")
    token_ids = encoded.input_ids
    return token_ids.shape[1]

# Function to truncate the title
def truncate_title(title, max_length=50):
    """Shorten ``title`` for display.

    Titles of at most ``max_length`` characters are returned unchanged; longer
    ones are cut to their first ``max_length`` characters and suffixed with
    ``"..."`` (so the result can be up to ``max_length + 3`` characters long).
    """
    fits = len(title) <= max_length
    return title if fits else title[:max_length] + "..."

# Find the titles whose CLIP token count exceeds 77, the maximum sequence
# length the tokenizer warning reports for this model.
# Steps, in order: count tokens per title, keep id/title/token_length,
# keep rows above the limit, sort longest first, shorten titles for display.
combined_df = (
    combined_df
    .assign(token_length=combined_df['title'].apply(get_clip_token_length))
    .loc[:, ['id', 'title', 'token_length']]
    .loc[lambda frame: frame['token_length'] > 77]
    .sort_values(by='token_length', ascending=False)
    .assign(title=lambda frame: frame['title'].apply(truncate_title))
)

combined_df



Token indices sequence length is longer than the specified maximum sequence length for this model (141 > 77). Running this sequence through the model will result in indexing errors


Unnamed: 0,id,title,token_length
60,61,Confidence in public health organizations likely t...,141
695,696,"How Mixed-Race, Mestizo, 'uulatto' Hispanics Repor...",141
916,917,"Write- Ins for ""Some Other Race"" Among Hispanics i...",134
669,670,A Snapshot of What Americans Know About Science % ...,128
62,63,Roughly three-in-ten who say social media have a n...,109
799,800,"The Web IQ"" of American Internet Users % of finter...",96
93,94,Majority of parents say their child 11 or younger ...,89
660,661,Foreign- Born Hispanics More Likely to Be Catholic...,84
414,415,The largest number of single-mother households are...,82
335,336,No Muslim-majority nation was among top five U.S. ...,82


In [12]:
from torch import Tensor
from transformers import AutoTokenizer, AutoModel
from dataProcessor import process_metadata, pew_metadata_path, statista_metadata_path

# Reload the combined metadata so this cell starts from the full frame
# (an earlier cell rebinds combined_df to a filtered, title-truncated subset).
combined_df = process_metadata(pew_metadata_path, statista_metadata_path)

# Load only the tokenizer: counting tokens does not require the model weights.
tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-mistral-7b-instruct')

# Function to tokenize the caption and get token length
def get_token_length(row):
    """Count the tokens in a row's title and caption joined by one space.

    ``row`` must support ``row['title']`` and ``row['caption']`` and both values
    must be strings. The joined text is encoded by the module-level
    ``tokenizer`` with ``return_tensors="pt"``; the size of dimension 1 of the
    resulting ``input_ids`` is returned.
    """
    text = " ".join((row['title'], row['caption']))
    input_ids = tokenizer(text, return_tensors="pt")['input_ids']
    return input_ids.shape[1]

# Token length of "title + caption" for every row, keeping only the rows whose
# combined length is greater than 384 tokens.
# Steps, in order: count tokens per row, keep id/title/combined_token_length,
# sort longest first, apply the 384-token filter, shorten titles for display.
combined_df = (
    combined_df
    .assign(combined_token_length=combined_df.apply(get_token_length, axis=1))
    .loc[:, ['id', 'title', 'combined_token_length']]
    .sort_values(by='combined_token_length', ascending=False)
    .loc[lambda frame: frame['combined_token_length'] > 384]
    .assign(title=lambda frame: frame['title'].apply(truncate_title))
)

combined_df

Unnamed: 0,id,title,combined_token_length
11346,11347,\r\n Population of Poland fro...,1217
7487,7488,\r\n Population of Greece fro...,1069
8186,8187,\r\n Number of assassinations...,1046
6540,6541,\r\n Population of France fro...,1024
4069,4070,\r\n Reported number of slave...,910
...,...,...,...
9525,9526,\r\n Number of electric passe...,385
4395,4396,\r\n Crude oil imports to Can...,385
1586,1587,\r\n Total Nike retail stores...,385
1521,1522,\r\n Number of coronavirus (C...,385


In [1]:
import torch
from transformers import AutoTokenizer, AutoModel, BitsAndBytesConfig

# Quantization configuration; load_model passes it to AutoModel.from_pretrained.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,  # request 4-bit loading of the model weights
    bnb_4bit_quant_type="nf4",  # 4-bit quantization type
    bnb_4bit_compute_dtype=torch.float16,  # dtype used for computation
)

# Function to load model
def load_model(model_name, trust_remote_code=False):
    """Load the tokenizer and the quantized model for ``model_name``.

    Args:
        model_name: Checkpoint identifier passed to both ``from_pretrained`` calls.
        trust_remote_code: Forwarded unchanged to both ``from_pretrained`` calls.

    Returns:
        A ``(tokenizer, model)`` tuple. The model is loaded with the module-level
        ``quantization_config`` and ``device_map="auto"``.
    """
    shared_kwargs = {"trust_remote_code": trust_remote_code}
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_name, **shared_kwargs)
    loaded_model = AutoModel.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map="auto",
        **shared_kwargs,
    )
    return loaded_tokenizer, loaded_model

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the tokenizer and model for gte-Qwen2-7B-instruct through load_model,
# which applies the module-level quantization_config and device_map="auto".
# NOTE(review): trust_remote_code=True is forwarded to both from_pretrained calls;
# presumably this lets transformers run code shipped with the model repo — confirm
# the repo is trusted and consider pinning a revision.
model_name_2 = 'Alibaba-NLP/gte-Qwen2-7B-instruct'
tokenizer_2, model_2 = load_model(model_name_2, trust_remote_code=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 7/7 [00:17<00:00,  2.45s/it]


In [None]:
{
 "model": "gpt-4o",
 "messages": [
  {
   "role": "user",
   "content": [
    {
     "type": "text",
     "text": "Please analyze the title, content, and the provided image data to provide statistical insights and answer the query.\nTitle: What Is the Main Reason Gas\nContent: About three-in-ten (31%) offer a variation on this theme \u2013 greed, oil companies or speculation \u2013 when asked what they think is the main reason gasoline prices have gone up recently, according to a Pew Research Center/Washington Post survey conducted April 28-May 1 among 1,006 adults.\nRoughly two-in-ten (19%) cite the ongoing wars or unrest in Libya and elsewhere in the Middle East as the top reason for rising fuel prices. Another 14% attribute this to politics or national policies.\nQuery: Are gas prices too high?"
    },
    {
     "type": "image_url",
     "image_url": {
      "url": "data:image/jpeg;base64,iVBORw0KolssZgtTlIAAAAASUVORK5CYII="
     }
    }
   ]
  },
  {
   "role": "user",
   "content": [
    {
     "type": "text",
     "text": "Please analyze the title, content, and the provided image data to provide statistical insights and answer the query.\nTitle: Average annual U.S. gasoline prices during times of crisis between 1956 and 2011 \n                    \n                            (in U.S. cents per gallon)\nContent: This statistic shows average annual U.S. gasoline prices in selected times of crisis between 1956 and 2011. In 1956, the year of the Suez crisis, the average annual gasoline price in the U.S. stood at 249.9 U.S. cents per gallon.\nQuery: Are gas prices too high?"
    },
    {
     "type": "image_url",
     "image_url": {
      "url": "data:image/jpeg;base64,iVBORw0KGgWfahWENGywF4jEi1R3BbkX7xesAwAAAFgO+YXl/mr//t0am5u98LMe8bo2gg+XwDDG6P8P6vceKy2oWYQAAAAASUVORK5CYII="
     }
    }
   ]
  }
 ],
 "max_tokens": 300
}

In [None]:
{
 "model": "gpt-4o",
 "messages": [
  {
   "role": "user",
   "content": [
    {
     "type": "text",
     "text": "Please analyze the title, content, and the provided image data to provide statistical insights and answer the query.\nTitle: What Is the Main Reason Gas\nContent: About three-in-ten (31%) offer a variation on this theme \u2013 greed, oil companies or speculation \u2013 when asked what they think is the main reason gasoline prices have gone up recently, according to a Pew Research Center/Washington Post survey conducted April 28-May 1 among 1,006 adults.\nRoughly two-in-ten (19%) cite the ongoing wars or unrest in Libya and elsewhere in the Middle East as the top reason for rising fuel prices. Another 14% attribute this to politics or national policies.\nQuery: Are gas prices too high?"
    },
    {
     "type": "image_url",
     "image_url": {
      "url": "data:image/jpeg;base64,iVBORw0KtMTu9WeHY+XJYBaQH8HbpPAKvoPgGsIuUAq0g5wCpSDrCKlAOsIuUAq0g5wCpSDrCqfl5uNpv94ziA11b1nNB9AlhFYQlYRcoBVpFygFU/HolssZgtTlIAAAAASUVORK5CYII="
     }
    },
    {
     "type": "text",
     "text": "Please analyze the title, content, and the provided image data to provide statistical insights and answer the query.\nTitle: Average annual U.S. gasoline prices during times of crisis between 1956 and 2011 \n                    \n                            (in U.S. cents per gallon)\nContent: This statistic shows average annual U.S. gasoline prices in selected times of crisis between 1956 and 2011. In 1956, the year of the Suez crisis, the average annual gasoline price in the U.S. stood at 249.9 U.S. cents per gallon.\nQuery: Are gas prices too high?"
    },
    {
     "type": "image_url",
     "image_url": {
      "url": "data:image/jpeg;base64,iVBORw0KVliUDUoutl/EI/G1c1iN53Sj48OOK+KCSilFEE3WfahWENGywF4jEi1R3BbkX7xesAwAAAFgO+YXl/mr//t0am5u98LMe8bo2gg+XwDDG6P8P6vceKy2oWYQAAAAASUVORK5CYII="
     }
    }
   ]
  }
 ],
 "max_tokens": 300
}