In [17]:
import os
import time

import pandas as pd
import requests


In [11]:
# --- Credentials ---
# The Stack Exchange API key is read from the STACKEXCHANGE_API_KEY environment
# variable so the secret is never stored in (or shared with) the notebook.
# When the variable is unset or empty, api_key is None and the cells below
# send their requests without a key.
api_key = os.environ.get('STACKEXCHANGE_API_KEY') or None

# --- API parameters ---
site = 'stackoverflow'
tagged = 'nlp'
pagesize = 100  # Max allowed
base_question_url = "https://api.stackexchange.com/2.3/questions"
base_answer_url = "https://api.stackexchange.com/2.3/answers/"
total_posts_needed = 20000

# --- Storage ---
all_posts = []            # one dict per collected question
accepted_answer_ids = []  # accepted_answer_id of every question that has one
page = 1                  # next results page to request

In [12]:
print("🚀 Starting question data collection...")

# Step 1: Pull questions
# Requests pages of questions (newest first) for the configured tag until at
# least total_posts_needed questions are stored, the API returns no items, the
# API explicitly reports has_more == False, or a request fails.
while len(all_posts) < total_posts_needed:
    params = {
        'order': 'desc',
        'sort': 'creation',
        'tagged': tagged,
        'site': site,
        'pagesize': pagesize,  # use the configured page size, not a repeated literal
        'page': page,
        # Custom filter; intended to include title, body, tags, accepted_answer_id,
        # view_count, creation_date -- NOTE(review): confirm the filter is still valid.
        'filter': '!)Q2B_A7F1D9bS7z6B3mJ0p80'
    }

    if api_key:
        params['key'] = api_key

    # A timeout keeps the cell from hanging forever on a stalled connection.
    try:
        response = requests.get(base_question_url, params=params, timeout=30)
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        print(f"❌ Request for page {page} failed: {exc}")
        break

    # The API describes failures (bad key, bad filter, throttling, ...) in the
    # JSON body; report them instead of treating them as "no more data".
    if 'error_id' in data:
        print(f"❌ API error {data.get('error_id')} ({data.get('error_name')}): {data.get('error_message')}")
        break

    if 'items' not in data or not data['items']:
        print("⚡ No more data available or error occurred.")
        break

    for item in data['items']:
        post = {
            'question_id': item.get('question_id', None),
            'title': item.get('title', ''),
            'body': item.get('body', ''),
            'tags': item.get('tags', []),
            'view_count': item.get('view_count', 0),
            'creation_date': item.get('creation_date', None),
            'accepted_answer_id': item.get('accepted_answer_id', None)
        }
        all_posts.append(post)

        # Remember the accepted answer so its body can be fetched in step 2.
        if post['accepted_answer_id']:
            accepted_answer_ids.append(post['accepted_answer_id'])

    page += 1
    print(f"✅ Collected posts so far: {len(all_posts)}")

    # Stop only when the API says so explicitly; a custom filter may omit the field.
    if data.get('has_more') is False:
        print("⚡ API reports no further pages.")
        break

    # Sleep at least 1 second, or longer when the API asks for a backoff.
    time.sleep(max(1, data.get('backoff', 0)))

print(f"✅✅ Finished collecting {len(all_posts)} questions!")

# Step 2: Pull accepted answers
# Fetches the accepted answers collected in step 1 in batches of ids and
# stores body and creation date per answer_id in accepted_answers.
print("\n🚀 Starting accepted answers collection...")

accepted_answers = {}
batch_size = 100  # Max IDs per request

for i in range(0, len(accepted_answer_ids), batch_size):
    batch_ids = accepted_answer_ids[i:i+batch_size]
    ids_str = ';'.join(str(aid) for aid in batch_ids)

    answer_params = {
        'order': 'desc',
        'sort': 'activity',
        'site': site,
        # Without an explicit pagesize the API applies its smaller default page
        # size, so most answers of a full batch would never be returned.
        'pagesize': batch_size,
        # Custom filter for answers; intended to include body and creation_date
        # -- NOTE(review): confirm the filter is still valid.
        'filter': '!9_bDE(B6I'
    }

    if api_key:
        answer_params['key'] = api_key

    # A timeout keeps the cell from hanging forever on a stalled connection.
    try:
        answer_response = requests.get(base_answer_url + ids_str, params=answer_params, timeout=30)
        answer_data = answer_response.json()
    except (requests.RequestException, ValueError) as exc:
        print(f"❌ Request for answer batch starting at index {i} failed: {exc}")
        break

    # Report API-level failures instead of silently treating them as "no answers".
    if 'error_id' in answer_data:
        print(f"❌ API error {answer_data.get('error_id')} ({answer_data.get('error_name')}): {answer_data.get('error_message')}")
        break

    for answer in answer_data.get('items', []):
        accepted_answers[answer['answer_id']] = {
            'answer_body': answer.get('body', ''),
            'answer_creation_date': answer.get('creation_date', None)
        }

    # Sleep at least 1 second, or longer when the API asks for a backoff.
    time.sleep(max(1, answer_data.get('backoff', 0)))

print(f"✅✅ Finished collecting {len(accepted_answers)} accepted answers!")

# Step 3: Merge accepted answers into posts
print("\n🔄 Merging accepted answers into posts...")

# Values written to posts whose accepted answer was not fetched (or that have none).
no_answer = {'answer_body': '', 'answer_creation_date': None}

for post in all_posts:
    # Look the answer up by id; fall back to the empty placeholder when absent.
    fetched = accepted_answers.get(post['accepted_answer_id'], no_answer)
    post['accepted_answer_body'] = fetched['answer_body']
    post['accepted_answer_creation_date'] = fetched['answer_creation_date']

🚀 Starting question data collection...
⚡ No more data available or error occurred.
✅✅ Finished collecting 0 questions!

🚀 Starting accepted answers collection...
✅✅ Finished collecting 0 accepted answers!

🔄 Merging accepted answers into posts...


In [None]:
# Step 4: Save everything to CSV
print("\nSaving to CSV...")

# Convert list of posts into a DataFrame
df = pd.DataFrame(all_posts)

# NOTE(review): absolute, machine-specific path; kept unchanged because a later
# cell reads the file back from this exact location.
output_filename = 'C:/Users/Mrunmayee Dixit/Documents/Uni/NLP/Assignments/2/stack_nlp_posts_full.csv'

if not all_posts:
    # An empty DataFrame has no 'tags' column (the conversion below would raise
    # KeyError), and writing it would overwrite an earlier export with nothing.
    print("⚠️ No posts were collected; nothing was saved.")
else:
    # Convert 'tags' list into a string
    df['tags'] = df['tags'].apply(lambda x: ', '.join(x) if isinstance(x, list) else '')

    # Save to CSV
    df.to_csv(output_filename, index=False, encoding='utf-8')

    print(f"✅ All data saved to: {output_filename}")


Saving to CSV...
✅ All data saved to: C:/Users/Mrunmayee Dixit/Documents/Uni/NLP/Assignments/2/stack_nlp_posts_full.csv


In [None]:
# Step 4 (second export): save the collected posts to the "_updated" CSV file
print("\nSaving to CSV...")

# Convert to DataFrame
df = pd.DataFrame(all_posts)

# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
output_filename = 'C:/Users/Mrunmayee Dixit/Documents/Uni/NLP/Assignments/2/stack_nlp_posts_full_updated.csv'

if not all_posts:
    # An empty DataFrame has no 'tags' column (the conversion below would raise
    # KeyError), and writing it would overwrite an earlier export with nothing.
    print("⚠️ No posts were collected; nothing was saved.")
else:
    # Convert 'tags' list into a string
    df['tags'] = df['tags'].apply(lambda x: ', '.join(x) if isinstance(x, list) else '')

    # Save
    df.to_csv(output_filename, index=False, encoding='utf-8')

    print(f"🏁 ALL DONE! Data saved to: {output_filename}")

In [19]:

# --- Your existing data ---
file_path = 'C:/Users/Mrunmayee Dixit/Documents/Uni/NLP/Assignments/2/stack_nlp_posts_full.csv'  # <-- Your original 20k CSV

# The API key is read from the STACKEXCHANGE_API_KEY environment variable so
# the secret is never stored in the notebook; None means "send no key".
api_key = os.environ.get('STACKEXCHANGE_API_KEY') or None

# --- Load the old data ---
df = pd.read_csv(file_path)

# --- Prepare lists ---
# Coerce both id columns to numbers; values that cannot be parsed become NaN.
df['question_id'] = pd.to_numeric(df['question_id'], errors='coerce')
df['accepted_answer_id'] = pd.to_numeric(df['accepted_answer_id'], errors='coerce')

# Drop the NaNs and convert to plain ints for building the request URLs.
question_ids = df['question_id'].dropna().astype(int).tolist()
accepted_answer_ids = df['accepted_answer_id'].dropna().astype(int).tolist()

# Report the totals once (the original cell printed this pair twice).
print(f"Total questions to fetch: {len(question_ids)}")
print(f"Total accepted answers to fetch: {len(accepted_answer_ids)}")

# --- API base URLs ---
question_api_base = "https://api.stackexchange.com/2.3/questions/"
answer_api_base = "https://api.stackexchange.com/2.3/answers/"

# --- Storage ---
question_updates = {}  # question_id -> refreshed question fields
answer_updates = {}    # answer_id -> refreshed accepted-answer fields

# --- Fetch questions data ---
# Re-downloads every known question in batches of ids and stores the
# refreshed fields per question_id in question_updates.
print("\n🚀 Fetching updated questions data...")
batch_size = 100

for i in range(0, len(question_ids), batch_size):
    batch = question_ids[i:i+batch_size]
    ids_str = ';'.join(map(str, batch))
    params = {
        'order': 'desc',
        'sort': 'activity',
        'site': 'stackoverflow',
        # Without an explicit pagesize the API applies its smaller default page
        # size, so most questions of a full batch would never be returned.
        'pagesize': batch_size,
        # Custom filter meant to pull the full question fields
        # -- NOTE(review): confirm the filter is still valid.
        'filter': '!)Q2B_A7F1D9bS7z6B3mJ0p80'
    }
    if api_key:
        params['key'] = api_key

    # A timeout keeps the cell from hanging forever on a stalled connection.
    try:
        response = requests.get(question_api_base + ids_str, params=params, timeout=30)
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        print(f"❌ Request for question batch starting at index {i} failed: {exc}")
        break

    # Report API-level failures instead of silently treating them as "no items".
    if 'error_id' in data:
        print(f"❌ API error {data.get('error_id')} ({data.get('error_name')}): {data.get('error_message')}")
        break

    for item in data.get('items', []):
        question_updates[item['question_id']] = {
            'title': item.get('title', ''),
            'body': item.get('body', ''),
            'tags': ', '.join(item.get('tags', [])),
            'view_count': item.get('view_count', 0),
            'creation_date': item.get('creation_date', None),
            'accepted_answer_id': item.get('accepted_answer_id', None)
        }

    # Sleep at least 1 second, or longer when the API asks for a backoff.
    time.sleep(max(1, data.get('backoff', 0)))

print(f"✅ Fetched {len(question_updates)} updated questions.")

# --- Fetch accepted answers data ---
# Re-downloads every known accepted answer in batches of ids and stores the
# refreshed fields per answer_id in answer_updates.
print("\n🚀 Fetching updated accepted answers data...")

for i in range(0, len(accepted_answer_ids), batch_size):
    batch = accepted_answer_ids[i:i+batch_size]
    ids_str = ';'.join(map(str, batch))
    params = {
        'order': 'desc',
        'sort': 'activity',
        'site': 'stackoverflow',
        # Without an explicit pagesize the API applies its smaller default page
        # size, so most answers of a full batch would never be returned.
        'pagesize': batch_size,
        # Custom filter meant to pull the full answer fields
        # -- NOTE(review): confirm the filter is still valid.
        'filter': '!9_bDE(B6I'
    }
    if api_key:
        params['key'] = api_key

    # A timeout keeps the cell from hanging forever on a stalled connection.
    try:
        response = requests.get(answer_api_base + ids_str, params=params, timeout=30)
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        print(f"❌ Request for answer batch starting at index {i} failed: {exc}")
        break

    # Report API-level failures instead of silently treating them as "no items".
    if 'error_id' in data:
        print(f"❌ API error {data.get('error_id')} ({data.get('error_name')}): {data.get('error_message')}")
        break

    for item in data.get('items', []):
        answer_updates[item['answer_id']] = {
            'accepted_answer_body': item.get('body', ''),
            'accepted_answer_creation_date': item.get('creation_date', None)
        }

    # Sleep at least 1 second, or longer when the API asks for a backoff.
    time.sleep(max(1, data.get('backoff', 0)))

print(f"✅ Fetched {len(answer_updates)} updated accepted answers.")

# --- Merge back into DataFrame ---
print("\n🔄 Merging updated fields into original DataFrame...")

# Columns overwritten from a refreshed question record.
question_columns = ('title', 'body', 'tags', 'view_count', 'creation_date', 'accepted_answer_id')

# Pass 1: overwrite the question columns of every row whose id was re-fetched.
for row_label, record in df.iterrows():
    question_key = record['question_id']
    if question_key not in question_updates:
        continue
    for column in question_columns:
        df.at[row_label, column] = question_updates[question_key][column]

# Pass 2: overwrite the accepted-answer columns. Runs after pass 1, so it sees
# the accepted_answer_id values written there.
for row_label, record in df.iterrows():
    answer_key = record['accepted_answer_id']
    if pd.isna(answer_key):
        continue  # row has no accepted answer
    if int(answer_key) not in answer_updates:
        continue  # answer was not re-fetched; keep the stored values
    refreshed = answer_updates[int(answer_key)]
    df.at[row_label, 'accepted_answer_body'] = refreshed['accepted_answer_body']
    df.at[row_label, 'accepted_answer_creation_date'] = refreshed['accepted_answer_creation_date']

# --- Save updated file ---
output_file = 'C:/Users/Mrunmayee Dixit/Documents/Uni/NLP/Assignments/2/stack_nlp_posts_final.csv'
df.to_csv(output_file, encoding='utf-8', index=False)

print(f"\n🏁 ALL DONE! Final updated data saved to: {output_file}")


Total questions to fetch: 20079
Total accepted answers to fetch: 8193
Total questions to fetch: 20079
Total accepted answers to fetch: 8193

🚀 Fetching updated questions data...
✅ Fetched 0 updated questions.

🚀 Fetching updated accepted answers data...
✅ Fetched 0 updated accepted answers.

🔄 Merging updated fields into original DataFrame...

🏁 ALL DONE! Final updated data saved to: C:/Users/Mrunmayee Dixit/Documents/Uni/NLP/Assignments/2/stack_nlp_posts_final.csv
