# checklist

In [14]:
import pandas as pd

In [15]:
data = pd.read_csv('../data/raw/Anon.csv', 
                #  sheet_name='Sheet3',
                #  header=1,
                 sep=',',
                 dtype=str,
                 on_bad_lines='skip')

In [16]:
data.head()

Unnamed: 0,Filename,Solution,Difficulty
0,1,A drop in the bucket,Easy
1,2,first edition,Easy
2,3,kick the bucket,Easy
3,4,feeling under the weather,Easy
4,5,don't cut corners,Hard


In [17]:
data_f = data[['Filename', 'Solution']].copy()

In [18]:
data_f.head()

Unnamed: 0,Filename,Solution
0,1,A drop in the bucket
1,2,first edition
2,3,kick the bucket
3,4,feeling under the weather
4,5,don't cut corners


In [19]:
print(data_f.columns.tolist())

['Filename', 'Solution']


In [20]:
print(data_f.dtypes)

Filename    object
Solution    object
dtype: object


In [21]:
print(data_f['Solution'].head())

0         A drop in the bucket
1                first edition
2              kick the bucket
3    feeling under the weather
4            don't cut corners
Name: Solution, dtype: object


In [22]:
data_f.dropna(how='all', inplace=True)

In [23]:
data_f.to_csv('../data/raw/annotations.csv', index=False)

In [24]:
import pandas as pd

In [25]:
annot = pd.read_csv("../data/raw/annotations.csv", 
                   quotechar='"',  # Specify quote character
                   skipinitialspace=True,  # Skip spaces after delimiter
                   encoding='utf-8')  # Specify encoding
annot.head()

Unnamed: 0,Filename,Solution
0,1,A drop in the bucket
1,2,first edition
2,3,kick the bucket
3,4,feeling under the weather
4,5,don't cut corners


In [26]:
annot.iloc[50:65]

Unnamed: 0,Filename,Solution
50,51,Side burns
51,52,Carbon footprint
52,53,Back to school
53,54,A wolf in sheep's clothing
54,55,Time is money
55,56,Hit the big time
56,57,"Year in, year out"
57,58,Down for the count
58,59,Get bent out of shape
59,60,Go down in a blaze of glory


In [27]:
print(annot.iloc[50:65])

    Filename                     Solution
50        51                   Side burns
51        52             Carbon footprint
52        53               Back to school
53        54   A wolf in sheep's clothing
54        55                Time is money
55        56             Hit the big time
56        57            Year in, year out
57        58           Down for the count
58        59        Get bent out of shape
59        60  Go down in a blaze of glory
60        61             No pain, No gain
61        62               The last straw
62        63            Take a rain check
63        64             Mile in a minute
64        65                 Golden rules


In [28]:
print(annot.columns.tolist())

['Filename', 'Solution']


In [29]:
annot.dropna(how='all', inplace=True)

In [30]:
annot.to_csv('../data/raw/annotationx.csv', index=False)

In [31]:
print(annot.tail())

     Filename                         Solution
167       168                 Handily defeated
168       169  Everything but the kitchen sink
169       170           Read between the lines
170       171         Think of the big picture
171       172                 Cut to the chase


# fixes

In [1]:
import pandas as pd
import os

In [4]:
data = pd.read_csv("../data/raw/annotations.csv", 
                   quotechar='"',  # Specify quote character
                   skipinitialspace=True,  # Skip spaces after delimiter
                   encoding='utf-8')  # Specify encoding

In [5]:
print("Original data shape:", data.shape)
print("Original data head:")
print(data.head())

Original data shape: (172, 2)
Original data head:
   Filename                   Solution
0         1       A drop in the bucket
1         2              first edition
2         3            kick the bucket
3         4  feeling under the weather
4         5          don't cut corners


In [6]:
print("\nFilename column values (first 10):")
print(data['Filename'].head(10).tolist())
print("\nFilename column data type:", data['Filename'].dtype)


Filename column values (first 10):
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

Filename column data type: int64


In [7]:
img_dir = '../data/raw/img'
if os.path.exists(img_dir):
    actual_files = sorted([f for f in os.listdir(img_dir) if f.lower().endswith(('.jpg', '.jpeg', '.png'))])
    print(f"\nActual image files (first 10): {actual_files[:10]}")
    print(f"Total image files: {len(actual_files)}")
    
    # Extract the numeric part from actual filenames
    actual_numbers = []
    for f in actual_files[:10]:
        name_without_ext = os.path.splitext(f)[0]
        print(f"File: {f} -> Without extension: {name_without_ext}")
        actual_numbers.append(name_without_ext)
else:
    print(f"\nImage directory {img_dir} doesn't exist!")


Actual image files (first 10): ['001.jpg', '002.jpg', '003.jpg', '004.jpg', '005.jpg', '006.jpg', '007.jpg', '008.jpg', '009.jpg', '010.jpg']
Total image files: 172
File: 001.jpg -> Without extension: 001
File: 002.jpg -> Without extension: 002
File: 003.jpg -> Without extension: 003
File: 004.jpg -> Without extension: 004
File: 005.jpg -> Without extension: 005
File: 006.jpg -> Without extension: 006
File: 007.jpg -> Without extension: 007
File: 008.jpg -> Without extension: 008
File: 009.jpg -> Without extension: 009
File: 010.jpg -> Without extension: 010


In [8]:
# Convert numbers to zero-padded format
data_fixed = data.copy()

# Convert filename to zero-padded 3-digit format
data_fixed['Filename'] = data_fixed['Filename'].astype(str).str.zfill(3)

print("\nAfter zero-padding:")
print("Fixed filename column (first 10):")
print(data_fixed['Filename'].head(10).tolist())


After zero-padding:
Fixed filename column (first 10):
['001', '002', '003', '004', '005', '006', '007', '008', '009', '010']


In [9]:
if os.path.exists(img_dir):
    # Check how many filenames now match actual image files
    actual_basenames = [os.path.splitext(f)[0] for f in actual_files]
    csv_filenames = data_fixed['Filename'].tolist()
    
    matches = set(csv_filenames) & set(actual_basenames)
    print(f"\nMatching files: {len(matches)} out of {len(csv_filenames)} CSV entries")
    print(f"Sample matches: {sorted(list(matches))[:10]}")
    
    # Show mismatches
    csv_set = set(csv_filenames)
    img_set = set(actual_basenames)
    
    only_in_csv = csv_set - img_set
    only_in_img = img_set - csv_set
    
    if only_in_csv:
        print(f"\nFilenames in CSV but not in images: {sorted(list(only_in_csv))[:10]}")
    if only_in_img:
        print(f"\nImages without CSV entries: {sorted(list(only_in_img))[:10]}")


Matching files: 172 out of 172 CSV entries
Sample matches: ['001', '002', '003', '004', '005', '006', '007', '008', '009', '010']


In [10]:
data_final = data_fixed[['Filename', 'Solution']].copy()

# Remove any rows with missing data
data_final = data_final.dropna()

# Remove any duplicate filenames (keep first occurrence)
data_final = data_final.drop_duplicates(subset=['Filename'], keep='first')

print(f"\nFinal dataset shape: {data_final.shape}")
print("Final dataset head:")
print(data_final.head())


Final dataset shape: (172, 2)
Final dataset head:
  Filename                   Solution
0      001       A drop in the bucket
1      002              first edition
2      003            kick the bucket
3      004  feeling under the weather
4      005          don't cut corners


In [11]:
output_path = '../data/raw/annotatx`ions.csv'
data_final.to_csv(output_path, index=False)

print(f"\n✅ Saved corrected annotations to: {output_path}")

# Verify the saved file
verification = pd.read_csv(output_path)
print(f"\nVerification - saved file shape: {verification.shape}")
print("Verification - first few rows:")
print(verification.head())


✅ Saved corrected annotations to: ../data/raw/annotations.csv

Verification - saved file shape: (172, 2)
Verification - first few rows:
   Filename                   Solution
0         1       A drop in the bucket
1         2              first edition
2         3            kick the bucket
3         4  feeling under the weather
4         5          don't cut corners


In [12]:
import sys
sys.path.append('../')

try:
    from data.load_data import load_dataset
    
    # Test loading the dataset
    dataset = load_dataset('../data/raw/img', '../data/raw/annotations.csv')
    print(f"\n🎉 SUCCESS! Loaded {len(dataset)} image-annotation pairs")
    
    # Show first few examples
    for i, (img_path, solution) in enumerate(dataset[:5]):
        img_name = os.path.basename(img_path)
        print(f"  {i+1}. {img_name} -> {solution}")
        
except Exception as e:
    print(f"\n❌ Error testing load_dataset: {e}")
    print("You may need to apply the other code fixes first")


🎉 SUCCESS! Loaded 172 image-annotation pairs
  1. 001.jpg -> A drop in the bucket
  2. 002.jpg -> first edition
  3. 003.jpg -> kick the bucket
  4. 004.jpg -> feeling under the weather
  5. 005.jpg -> don't cut corners


In [13]:
try:
    # Test if prompt builder can find examples (after you create the JSON file)
    if os.path.exists('../data/sample/rebus_prompts.json'):
        print("\n✅ rebus_prompts.json exists - prompt building should work")
    else:
        print("\n⚠️  Still need to create data/sample/rebus_prompts.json")
        print("Run the setup script or create it manually")
        
except Exception as e:
    print(f"\nNote: {e}")


✅ rebus_prompts.json exists - prompt building should work
