# Actually filtering the dataset to passages and passage text

In [1]:
import duckdb
from datasets import load_dataset
import pandas as pd

# Step 1: Load the dataset from Hugging Face
dataset = load_dataset("microsoft/ms_marco", "v1.1")  # Replace with your actual dataset name

# Step 2: Convert to Pandas DataFrame
df = dataset['train'].to_pandas()

# Step 3: Create a DuckDB connection
con = duckdb.connect()

# Optionally: Register the DataFrame as a DuckDB table
con.execute("CREATE TABLE v11_train AS SELECT * FROM df")

# Step 4: Query the dataset
query = """
SELECT *,
       passages->>'passage_text' AS passage_text,
       passages->'is_selected' AS is_selected,
       passages->>'url' AS url
FROM v11_train;
"""

# Execute the query
result = con.execute(query).fetchdf()

# Step 5: Save the results to a CSV file
result.to_csv('your_file.csv', index=False)

print("Data has been saved to 'your_file.csv'.")

# Close the DuckDB connection
con.close()



FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Data has been saved to 'your_file.csv'.


In [2]:
result

Unnamed: 0,answers,passages,query,query_id,query_type,wellFormedAnswers,passage_text,is_selected,url
0,[Results-Based Accountability is a disciplined...,"{'is_selected': [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]...",what is rba,19699,description,[],"[""Since 2007, the RBA's outstanding reputation...","[0,0,0,0,0,1,0,0,0,0]","[""https://en.wikipedia.org/wiki/Reserve_Bank_o..."
1,[Yes],"{'is_selected': [0, 1, 0, 0, 0, 0, 0], 'passag...",was ronald reagan a democrat,19700,description,[],"[""In his younger years, Ronald Reagan was a me...","[0,1,0,0,0,0,0]","[""http://www.history.com/topics/us-presidents/..."
2,[20-25 minutes],"{'is_selected': [0, 0, 0, 0, 1, 0, 0, 0, 0, 0]...",how long do you need for sydney and surroundin...,19701,numeric,[],"[""Sydney, New South Wales, Australia is locate...","[0,0,0,0,1,0,0,0,0,0]","[""https://en.wikipedia.org/wiki/Geography_of_S..."
3,[$11 to $22 per square foot],"{'is_selected': [0, 0, 0, 0, 0, 0, 0, 0, 1], '...",price to install tile in shower,19702,numeric,[],"[""In regards to tile installation costs, consu...","[0,0,0,0,0,0,0,0,1]","[""http://www.improvenet.com/r/costs-and-prices..."
4,[Due to symptoms in the body],"{'is_selected': [0, 0, 1, 0, 0, 0, 0, 0], 'pas...",why conversion observed in body,19703,description,[],"[""Conclusions: In adult body CT, dose to an or...","[0,0,1,0,0,0,0,0]","[""http://www.ncbi.nlm.nih.gov/pmc/articles/PMC..."
...,...,...,...,...,...,...,...,...,...
82321,[The act or action of propagating as a increas...,"{'is_selected': [1, 0, 0], 'passage_text': ['d...",meaning of propagation,102124,description,[],"[""definition of propagation the act or action ...","[1,0,0]","[""http://www.merriam-webster.com/dictionary/pr..."
82322,[Yes],"{'is_selected': [0, 0, 1, 0, 0, 0, 0, 0, 0], '...",do you have to do a phd to be a clinical psych...,102125,description,[],"[""The Path to Becoming a Psychologist. First, ...","[0,0,1,0,0,0,0,0,0]","[""http://www.capella.edu/online-psychology-deg..."
82323,[Chablis],"{'is_selected': [0, 1, 0, 0, 0, 0], 'passage_t...",what wine goes with oysters,102126,entity,[],"[""If you need a wine to pair with oysters or m...","[0,1,0,0,0,0]","[""https://www.hellovino.com/wine/pairing/shell..."
82324,[1 Lithium carbonate 150 mg capsules. Lithium ...,"{'is_selected': [0, 0, 0, 1, 0, 0, 0, 0, 0], '...",what strengths does lithium come in,102127,description,[],"[""Your doctor will want to take regular blood ...","[0,0,0,1,0,0,0,0,0]","[""http://www.webmd.com/bipolar-disorder/bipola..."


In [3]:
result['passage_text'][1]

'["In his younger years, Ronald Reagan was a member of the Democratic Party and campaigned for Democratic candidates; however, his views grew more conservative over time, and in the early 1960s he officially became a Republican. In November 1984, Ronald Reagan was reelected in a landslide, defeating Walter Mondale and his running mate Geraldine Ferraro (1935-), the first female vice-presidential candidate from a major U.S. political party.","From Wikipedia, the free encyclopedia. A Reagan Democrat is a traditionally Democratic voter in the United States, especially a white working-class Northerner, who defected from their party to support Republican President Ronald Reagan in either or both the 1980 and 1984 elections. During the 1980 election a dramatic number of voters in the U.S., disillusioned with the economic \'malaise\' of the 1970s and the presidency of Jimmy Carter (even more than, four years earlier, Liberal Republican Gerald Ford), supported former California governor (and f

In [4]:
import ast

# Convert string representation of list to actual list
result['passage_text'] = result['passage_text'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)


In [5]:
# Create an empty list to hold the exploded rows
exploded_data = []

# Iterate over each row in the DataFrame
for index, row in result.iterrows():
    query = row['query']
    passage_texts = row['passage_text']

    # Ensure passage_texts is a list and iterate over it
    if isinstance(passage_texts, list):
        for passage in passage_texts:
            exploded_data.append({'query': query, 'passage_text': passage})

# Convert the exploded data into a new DataFrame
result_exploded = pd.DataFrame(exploded_data)

# View the result
result_exploded


Unnamed: 0,query,passage_text
0,what is rba,"Since 2007, the RBA's outstanding reputation h..."
1,what is rba,The Reserve Bank of Australia (RBA) came into ...
2,what is rba,RBA Recognized with the 2014 Microsoft US Regi...
3,what is rba,The inner workings of a rebuildable atomizer a...
4,what is rba,Results-Based Accountability® (also known as R...
...,...,...
676188,what is polarity index definition,Water (H 2 O) is an example of a polar molecul...
676189,what is polarity index definition,Supplement. Molecules can either be polar or n...
676190,what is polarity index definition,Full Definition of POLARITY. 1. : the quality ...
676191,what is polarity index definition,Part of the Smart grid glossary: Also see bipo...


In [6]:
result_exploded.to_csv('extra_dict.csv')

In [9]:
import pandas as pd
import random

# Initialize a dictionary to keep track of negative samples used for each query
used_negatives = {}

# Function to generate negative samples
def get_negative_sample(row):
    query = row['query']
    passage = row['passage_text']
    
    if query not in used_negatives:
        used_negatives[query] = set()  # Track used negative samples for this query
    
    # Create a list of available negative samples (all passages except the current one)
    available_negatives = [p for p in result_exploded['passage_text'].unique() if p not in used_negatives[query] and p != passage]
    
    if available_negatives:
        # Select a random negative sample
        negative_sample = random.choice(available_negatives)
        used_negatives[query].add(negative_sample)  # Mark this negative sample as used
        return negative_sample
    else:
        return None  # No available negatives left for this query

# Apply the function to create a new column for negative samples
result_exploded['negative_sampling'] = result_exploded.apply(get_negative_sample, axis=1)

# Display the result
print(result_exploded)


KeyboardInterrupt: 

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm  # Import tqdm for the progress bar

# Assuming your DataFrame is already loaded
# result_exploded = pd.DataFrame({'query': ..., 'passage_text': ...})

# Step 1: Create numpy arrays for queries and passage_texts for fast access
queries = result_exploded['query'].values
passages = result_exploded['passage_text'].values

# Step 2: Create a DataFrame of all indices (for efficient random sampling)
indices = np.arange(len(queries))

# Step 3: Create an empty array for negative samples
negative_samples = np.empty_like(passages, dtype=object)

# Step 4: Efficiently generate negative samples with a progress bar
for i in tqdm(range(len(queries)), desc="Generating negative samples"):
    query = queries[i]
    # Get the index of all passages except the current query's passage
    other_indices = indices[queries != query]
    
    # Randomly select one passage from the non-associated ones
    negative_index = np.random.choice(other_indices)
    negative_samples[i] = passages[negative_index]

# Step 5: Create a DataFrame with negative samples and concatenate it
negative_samples_df = pd.DataFrame(negative_samples, columns=['negative_sample'])
result_with_negatives = pd.concat([result_exploded, negative_samples_df], axis=1)

# Now, `result_with_negatives` contains the original data with an added column for negative samples
print(result_with_negatives)



Generating negative samples:   0%|          | 0/676193 [00:00<?, ?it/s]

Generating negative samples:   3%|▎         | 18931/676193 [03:47<2:27:26, 74.29it/s] 