In [2]:
# Load data
# Reads the Pew and Statista metadata tables, rewrites their image paths so they
# resolve relative to this notebook, and stacks both into one frame with a
# 1-based `id` column.
import pandas as pd

# Pew: swap the stored 'imgs' fragment for the local image folder.
# regex=False forces literal substring matching; the default for `regex`
# differs between pandas 1.x and 2.x, so it is spelled out here.
pew = pd.read_csv('../dataset/pew_dataset/metadata.csv')
pew['imgPath'] = pew['imgPath'].str.replace(
    'imgs', '../dataset/pew_dataset/pew_imgs', regex=False
)

# Statista: same rewrite, with the 'out/two_col/imgs' fragment.
statista = pd.read_csv('../dataset/statista_dataset/metadata.csv')
statista['imgPath'] = statista['imgPath'].str.replace(
    'out/two_col/imgs', '../dataset/statista_dataset/statista_imgs', regex=False
)

# Only these columns are carried into the combined frame.
columns = ['title', 'caption', 'imgPath']

pew_df = pew[columns]
statista_df = statista[columns]

# ignore_index=True renumbers the rows 0..n-1 across both sources (Pew first).
combined_df = pd.concat([pew_df, statista_df], ignore_index=True)

# 1-based running id as the first column.
combined_df.insert(0, 'id', range(1, len(combined_df) + 1))
combined_df

Unnamed: 0,id,title,caption,imgPath
0,1,"Foreign-born population in the United States, ...",The foreign-born population residing in the U....,../dataset/pew_dataset/pew_imgs/1.png
1,2,"English proficiency among U.S. immigrants, 198...","Since 1980, the share of immigrants who are pr...",../dataset/pew_dataset/pew_imgs/2.png
2,3,"Languages spoken among U.S. immigrants, 2018","Among the nation’s immigrants, Spanish is by f...",../dataset/pew_dataset/pew_imgs/3.png
3,4,"Hispanic population in the U.S., 2000-2017",There were nearly 60 million Latinos in the Un...,../dataset/pew_dataset/pew_imgs/4.png
4,5,Weekly broadcast audience for top 20 NPR-affil...,The top 20 NPR-affiliated public radio station...,../dataset/pew_dataset/pew_imgs/5.png
...,...,...,...,...
29349,29350,\r\n Distribution of cy...,This statistic presents the distribution of c...,../dataset/statista_dataset/statista_imgs/2786...
29350,29351,\r\n Total number of dw...,This statistic displays the total number of d...,../dataset/statista_dataset/statista_imgs/2786...
29351,29352,\r\n Results in the Eur...,This statistic shows the political parties an...,../dataset/statista_dataset/statista_imgs/2786...
29352,29353,\r\n Average annual exp...,This statistic shows the average annual expen...,../dataset/statista_dataset/statista_imgs/2786...


In [5]:
# Add title and caption lengths (character counts).
# .str.len() is the vectorized string accessor; unlike .apply(len) it yields
# NaN for a missing value instead of raising TypeError on a float NaN.
combined_df['title_length'] = combined_df['title'].str.len()
combined_df['caption_length'] = combined_df['caption'].str.len()

combined_df

Unnamed: 0,id,title,caption,imgPath,title_length,caption_length
0,1,"Foreign-born population in the United States, ...",The foreign-born population residing in the U....,../dataset/pew_dataset/pew_imgs/1.png,55,392
1,2,"English proficiency among U.S. immigrants, 198...","Since 1980, the share of immigrants who are pr...",../dataset/pew_dataset/pew_imgs/2.png,52,480
2,3,"Languages spoken among U.S. immigrants, 2018","Among the nation’s immigrants, Spanish is by f...",../dataset/pew_dataset/pew_imgs/3.png,44,404
3,4,"Hispanic population in the U.S., 2000-2017",There were nearly 60 million Latinos in the Un...,../dataset/pew_dataset/pew_imgs/4.png,42,434
4,5,Weekly broadcast audience for top 20 NPR-affil...,The top 20 NPR-affiliated public radio station...,../dataset/pew_dataset/pew_imgs/5.png,66,282
...,...,...,...,...,...,...
29349,29350,\r\n Distribution of cy...,This statistic presents the distribution of c...,../dataset/statista_dataset/statista_imgs/2786...,113,228
29350,29351,\r\n Total number of dw...,This statistic displays the total number of d...,../dataset/statista_dataset/statista_imgs/2786...,176,295
29351,29352,\r\n Results in the Eur...,This statistic shows the political parties an...,../dataset/statista_dataset/statista_imgs/2786...,146,189
29352,29353,\r\n Average annual exp...,This statistic shows the average annual expen...,../dataset/statista_dataset/statista_imgs/2786...,176,256


In [2]:
# Add caption lengths (character counts).
# .str.len() is the vectorized string accessor; unlike .apply(len) it yields
# NaN for a missing caption instead of raising TypeError on a float NaN.
combined_df['caption_length'] = combined_df['caption'].str.len()

# Find top 10 longest and 10 shortest captions.
# The "longest" table keeps the title for context; the "shortest" table keeps
# only the id and the length.
top_10_longest = combined_df.nlargest(10, 'caption_length')[['id', 'title', 'caption_length']]
top_10_shortest = combined_df.nsmallest(10, 'caption_length')[['id', 'caption_length']]

top_10_longest, top_10_shortest

(          id                                              title  \
 11346  11347  \r\n                        Population of Pola...   
 8186    8187  \r\n                        Number of assassin...   
 7487    7488  \r\n                        Population of Gree...   
 6540    6541  \r\n                        Population of Fran...   
 4069    4070  \r\n                        Reported number of...   
 24038  24039  \r\n                        Number of countrie...   
 5609    5610  \r\n                        Number of U.S. pre...   
 3862    3863  \r\n                        International box ...   
 14260  14261  \r\n                        Length of each maj...   
 745      746  FIGUREA Industries with High Shares of Unautho...   
 
        caption_length  
 11346            5110  
 8186             4509  
 7487             4488  
 6540             4327  
 4069             4071  
 24038            3725  
 5609             3621  
 3862             3587  
 14260            3579  


In [3]:
from sentence_transformers import SentenceTransformer

# Checkpoint to inspect; loaded only to read its input-length limit.
MPNET_MODEL_NAME = "all-mpnet-base-v2"

model = SentenceTransformer(MPNET_MODEL_NAME)

# Report the model's max_seq_length attribute so it can be compared with the
# title/caption lengths computed elsewhere in this notebook.
print("Max Sequence Length:", model.max_seq_length)

  from tqdm.autonotebook import tqdm, trange


Max Sequence Length: 384


In [8]:
from sentence_transformers import SentenceTransformer

# Checkpoint to inspect. The recorded run shows the shard download taking
# roughly 20 minutes, so expect this cell to be slow on a cold cache.
E5_MISTRAL_MODEL_NAME = "intfloat/e5-mistral-7b-instruct"

# Rebinds `model`: any previously loaded model under that name is replaced.
model = SentenceTransformer(E5_MISTRAL_MODEL_NAME)

# Report the model's max_seq_length attribute.
print("Max Sequence Length:", model.max_seq_length)

Downloading shards: 100%|██████████| 2/2 [20:45<00:00, 622.51s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:08<00:00,  4.12s/it]


Max Sequence Length: 32768


In [4]:
# Add title lengths (character counts).
# .str.len() is the vectorized string accessor; unlike .apply(len) it yields
# NaN for a missing title instead of raising TypeError on a float NaN.
# NOTE(review): the displayed Statista titles begin with '\r\n' plus padding
# spaces, and that whitespace is counted here. Consider .str.strip() first if
# only the visible text should be measured; confirm intent.
combined_df['title_length'] = combined_df['title'].str.len()

# Find top 10 longest and 10 shortest titles.
top_10_longest = combined_df.nlargest(10, 'title_length')[['id', 'title', 'title_length']]
top_10_shortest = combined_df.nsmallest(10, 'title_length')[['id', 'title', 'title_length']]

top_10_longest, top_10_shortest

(        id                                              title  title_length
 60      61  Confidence in public health organizations like...           757
 669    670  A Snapshot of What Americans Know About Scienc...           620
 695    696  How Mixed-Race, Mestizo, 'uulatto' Hispanics R...           567
 916    917  Write- Ins for "Some Other Race" Among Hispani...           520
 62      63  Roughly three-in-ten who say social media have...           507
 799    800  The Web IQ" of American Internet Users % of fi...           439
 121    122  About one-quarter of partnered Americans say t...           397
 638    639  Share of interviews via cell phone continues t...           392
 601    602  The biggest federal tax breaks Estimates for f...           385
 1047  1048  Strategies to be less visible online %ofadult ...           384,
         id     title  title_length
 1362  1363     Radio             5
 1466  1467    CHART              6
 1467  1468    AGURE              6
 1469  1