In [2]:
import pandas as pd
import nltk
from nltk.corpus import words
from utils import non_concrete_words
from nltk.corpus import wordnet as wn

In [3]:
nltk.download('words')
nltk.download('wordnet')
real_words = set(words.words())

[nltk_data] Downloading package words to /home/sa162103/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sa162103/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
len(real_words)

235892

In [5]:
llava_558 = pd.read_csv("data/object_frequencies_LLaVA-Pretrain.csv")
llava_665 = pd.read_csv("data/object_frequencies_LLaVA-mix665k.csv")

llava_558.rename(columns={"Frequency": "generation_count"}, inplace=True)
llava_665.rename(columns={"Frequency": "understanding_count"}, inplace=True)

In [6]:
llava_558.describe()

Unnamed: 0,generation_count
count,140583.0
mean,11.53456
std,122.099289
min,1.0
25%,1.0
50%,1.0
75%,2.0
max,10849.0


In [7]:
llava_665.describe()

Unnamed: 0,understanding_count
count,52806.0
mean,130.403041
std,1175.560131
min,1.0
25%,1.0
50%,2.0
75%,11.0
max,158657.0


In [8]:
llava_558

Unnamed: 0,Object,generation_count
0,topper,89
1,mattress,277
2,furniture,925
3,luxury,444
4,inch,632
...,...,...
140578,beach days,1
140579,pretendors,1
140580,the duke and count,1
140581,m4a1,1


In [9]:
llava_665

Unnamed: 0,Object,understanding_count
0,image,158657
1,bus,10016
2,advertisement,578
3,back,5371
4,people,54676
...,...,...
52801,snowskate,1
52802,airfrance,1
52803,waiste,1
52804,Piee of brocclie,1


In [10]:
merged_df = llava_558.merge(llava_665, on="Object", how="outer") 
merged_df.fillna(0, inplace=True)

# Convert counts to integers
merged_df[["generation_count", "understanding_count"]] = \
    merged_df[["generation_count", "understanding_count"]].astype(int)

merged_df["Object"] = merged_df["Object"].astype(str)

merged_df.head()

Unnamed: 0,Object,generation_count,understanding_count
0,!,0,3
1,"""",0,2
2,""" bertie bubble bus """,0,1
3,"""90"" minutes",0,1
4,"""Bedder Sleep",0,1


In [11]:
merged_df.head()

Unnamed: 0,Object,generation_count,understanding_count
0,!,0,3
1,"""",0,2
2,""" bertie bubble bus """,0,1
3,"""90"" minutes",0,1
4,"""Bedder Sleep",0,1


In [12]:
merged_df.shape

(177562, 3)

In [13]:
def all_words_real(obj):
    return all(word in real_words for word in obj.lower().split())

In [14]:
merged_df = merged_df[merged_df["Object"].astype(str).str.contains(r'\d', regex=True) == False]
merged_df = merged_df[merged_df["Object"].apply(all_words_real)]

In [15]:
merged_df

Unnamed: 0,Object,generation_count,understanding_count
11436,A,0,17648
11437,A Bloody Mary,0,1
11439,A Cathedral Church,0,1
11442,A Dairy Queen,0,1
11443,A Diet Coke,0,1
...,...,...,...
177318,zoom zoom,1,0
177329,zooxanthellae,0,1
177336,zorro,1,0
177366,zucchini,59,96


In [16]:
expanded_rows = []
for _, row in merged_df.iterrows():
    words = row["Object"].split()  # Split the sentence into words
    for word in words:
        new_row = row.copy()
        new_row["Object"] = word  # Assign the single word as the object
        expanded_rows.append(new_row)

# Create a new DataFrame
merged_df = pd.DataFrame(expanded_rows)

# Aggregate counts by summing over duplicate words
merged_df = merged_df.groupby("Object").sum().reset_index()

In [17]:
merged_df.head()

Unnamed: 0,Object,generation_count,understanding_count
0,A,0,17688
1,AD,0,3
2,AI,0,263
3,AID,0,1
4,AL,0,2


In [18]:
merged_df.shape

(24831, 3)

In [19]:
merged_df = merged_df[(merged_df["generation_count"]>1) & (merged_df["understanding_count"]>1)]

In [20]:
merged_df

Unnamed: 0,Object,generation_count,understanding_count
6802,a,1171,3240
6803,aa,14,3
6816,abbreviation,38,3
6817,abdomen,17,6
6821,ability,31,5768
...,...,...,...
24822,zoning,6,23
24823,zoo,104,2634
24825,zoology,3,4
24826,zoom,97,10


In [21]:
merged_df["difference"] = merged_df["generation_count"] - merged_df["understanding_count"]

In [22]:
merged_df = merged_df.sort_values(by="difference", ascending=True)

In [23]:
merged_df

Unnamed: 0,Object,generation_count,understanding_count,difference
14763,image,6941,158657,-151716
18308,people,3178,54677,-51499
16341,man,7235,44056,-36821
7533,area,1701,34617,-32916
20614,scene,946,32336,-31390
...,...,...,...,...
9270,card,4777,232,4545
18886,poster,4898,171,4727
22120,stock,5435,112,5323
19063,print,5937,247,5690


In [24]:
def is_concrete_noun(word):
    synsets = wn.synsets(word, pos=wn.NOUN)
    if not synsets:
        return False  # not a noun or not in WordNet

    # Heuristic: if it has hypernyms like 'substance', 'artifact', 'object', it's likely concrete
    concrete_keywords = {'substance', 'object', 'artifact', 'physical_entity'}
    
    for syn in synsets:
        for hypernym in syn.hypernyms():
            if any(kw in str(hypernym) for kw in concrete_keywords):
                return True
    return False

In [25]:
merged_df = merged_df[merged_df["Object"].apply(lambda x: is_concrete_noun(str(x).lower()))]

In [26]:
merged_df

Unnamed: 0,Object,generation_count,understanding_count,difference
13078,food,1393,21688,-20295
18106,part,606,12004,-11398
16050,location,278,10654,-10376
8547,body,1058,10633,-9575
24342,way,788,9864,-9076
...,...,...,...,...
12544,fabric,1133,546,587
20486,sample,708,22,686
12794,film,954,230,724
9060,button,1145,312,833


In [28]:
merged_df.to_csv("interesting_objects_v3.csv")
merged_df["Object"].to_csv("interesting_objects_v3.txt", index=False, header=False)


In [60]:
# merged_df = merged_df[~merged_df["Object"].str.lower().isin(non_concrete_words)]

In [61]:
merged_df

Unnamed: 0,Object,generation_count,understanding_count,difference
13078,food,1393,21688,-20295
18106,part,606,12004,-11398
16050,location,278,10654,-10376
8547,body,1058,10633,-9575
24342,way,788,9864,-9076
...,...,...,...,...
12544,fabric,1133,546,587
20486,sample,708,22,686
12794,film,954,230,724
9060,button,1145,312,833


In [62]:
# gen_median = merged_df["generation_count"].median()
# und_median = merged_df["understanding_count"].median()

# print(gen_median, und_median)

# interesting_objects = merged_df[
#     ((merged_df["generation_count"] <= gen_median) & (merged_df["understanding_count"] >= und_median)) |
#     ((merged_df["understanding_count"] <= und_median) & (merged_df["generation_count"] >= gen_median))
# ]

In [63]:
# interesting_objects = interesting_objects[~interesting_objects["Object"].str.lower().isin(non_concrete_words)]

In [64]:
# interesting_objects.to_csv("interesting_objects.csv", index=None)

In [65]:
# interesting_objects["Object"].to_csv("interesting_objects.txt", index=False, header=False)

In [66]:
from nltk.corpus import wordnet as wn

# Collect all unique hypernym lemma names for noun synsets
hypernym_keywords = set()

for syn in wn.all_synsets(pos=wn.NOUN):
    for hyper in syn.hypernyms():
        for lemma in hyper.lemmas():
            hypernym_keywords.add(lemma.name().lower())

# Sort and display
sorted_keywords = sorted(hypernym_keywords)
print(sorted_keywords)

