In [29]:
from openai import OpenAI
import pandas as pd
from dataProcessor import process_metadata, clean_text, correct_misinterpreted_characters, pew_metadata_path, statista_metadata_path, llava_description_path

In [31]:
#  get the combined dataframe
combined_df = process_metadata(pew_metadata_path, statista_metadata_path, llava_description_path)
# Apply text cleaning and correction functions
combined_df['title'] = combined_df['title'].apply(clean_text).apply(correct_misinterpreted_characters)
combined_df['caption'] = combined_df['caption'].apply(clean_text).apply(correct_misinterpreted_characters)


In [35]:
# Extract the 'title' and 'caption' columns
data = combined_df[['title', 'caption']].head(100)
# Display the new DataFrame
data

Unnamed: 0,title,caption
0,"Foreign-born population in the United States, ...",The foreign-born population residing in the U....
1,"English proficiency among U.S. immigrants, 198...","Since 1980, the share of immigrants who are pr..."
2,"Languages spoken among U.S. immigrants, 2018","Among the nation’s immigrants, Spanish is by f..."
3,"Hispanic population in the U.S., 2000-2017",There were nearly 60 million Latinos in the Un...
4,Weekly broadcast audience for top 20 NPR-affil...,The top 20 NPR-affiliated public radio station...
...,...,...
95,About four-in-ten Hispanics reported experienc...,"Shortly before the outbreak, about four-in-ten..."
96,Far more Americans favor keeping spending on p...,The survey finds little support for reducing s...
97,U.S Hispanic population reached nearly 61 mill...,The U.S. Hispanic population reached a record ...
98,U.S Hispanic population growth has slowed Aver...,Population growth among U.S. Hispanics has slo...


In [36]:
def context(title, caption):
    return f"Title: '{title}' \n Caption: '{caption}'"

# Example usage
content = data.apply(lambda row: context(row['title'], row['caption']), axis=1)

In [37]:
content

0     Title: 'Foreign-born population in the United ...
1     Title: 'English proficiency among U.S. immigra...
2     Title: 'Languages spoken among U.S. immigrants...
3     Title: 'Hispanic population in the U.S., 2000-...
4     Title: 'Weekly broadcast audience for top 20 N...
                            ...                        
95    Title: 'About four-in-ten Hispanics reported e...
96    Title: 'Far more Americans favor keeping spend...
97    Title: 'U.S Hispanic population reached nearly...
98    Title: 'U.S Hispanic population growth has slo...
99    Title: 'Around one-in-ten young adults in U.S....
Length: 100, dtype: object

In [42]:
# Create an OpenAI client with your deepinfra token and endpoint
openai = OpenAI(
    api_key="sfsdfdsfdsfsdfdffdfdxk3uMKrBh6S",
    base_url="https://api.deepinfra.com/v1/openai",
)

In [43]:
def generate_question(content):
    response = openai.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-70B-Instruct",
        messages=[
            {
                "role": "system",
                "content": (
                    "You are an AI assistant. Generate a 1 line question based on the following information.\n\n"
                    "Here are some example questions to illustrate the desired question format:\n"
                    "Are gas prices too high?\n"
                    "Is a college education worth it?\n"
                    "Is capitalism the best form of economy?\n"
                    "Do we need cash?\n"
                    "Should education be free?\n"
                    "Does lowering the federal corporate income tax rate create jobs?\n"
                    "Should everyone get a universal basic income?\n"
                    "Should the press be subsidized?\n"
                    "Does legal prostitution increase human trafficking?\n"
                    "Should the penny stay in circulation?\n"
                    "Should government spending be reduced?\n"
                    "Should blood donations be financially compensated?\n"
                    "Should prescription drugs be advertised directly to consumers?\n"
                    "Does poverty cause crime?"
                )    
             },
            {"role": "user", "content": content}
        ],
        max_tokens=20,  # Limit the response to ensure it's a single line
        temperature=0.5
    )
    question = (response.choices[0].message.content)
    return question

In [44]:
def classify_sector(question):
    response = openai.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-70B-Instruct",
        messages=[
            {"role": "system", "content": "You are an AI assistant. Classify the following question into a relevant sector in 1 word."},
            {"role": "user", "content": question}
        ],
        max_tokens=5,  # A short response is expected for the sector name
    )
    sector = (response.choices[0].message.content)
    return sector

In [45]:
from tqdm import tqdm
# Initialize lists to store the generated questions and sectors
questions = []
sectors = []

# Loop through each row in the DataFrame
for index, row in tqdm(data.iterrows(),total=data.shape[0], desc="Processing Rows"):
    content = context(row['title'], row['caption'])  # Using the context function to create the prompt
    # Generate the question
    question = generate_question(content)
    questions.append(question)
    
    # Classify the sector based on the generated question
    sector = classify_sector(question)
    sectors.append(sector)

Processing Rows: 100%|██████████| 100/100 [02:01<00:00,  1.21s/it]


In [48]:
# Adding the questions and sectors to the DataFrame
data['Question'] = questions
data['Sector'] = sectors
# Drop title and caption columns
data = data.drop(columns=['title', 'caption'], axis=1)
data

Unnamed: 0,Question,Sector
0,Should the U.S. continue to allow high levels ...,Politics
1,Should English proficiency be a requirement fo...,Politics
2,Should the US government provide official docu...,Politics
3,Should the growing Hispanic population in the ...,POLITICS
4,Should NPR be funded by the government?,Politics
...,...,...
95,Does widespread discrimination against Hispani...,Sociology
96,Should police department budgets be increased ...,Government
97,Is the slowing growth rate of the U.S. Hispani...,Demographics
98,Is the slowing growth rate of the U.S. Hispani...,Demographics


In [49]:
# Save the DataFrame to a CSV file
data.to_csv("../dataset/sample_questions.csv", index=False)

In [53]:
# Read the saved CSV file
Questions = pd.read_csv("../dataset/TopRelevant_topics.csv")
Questions

Unnamed: 0,Topic,Sector
0,"Is a 5.7 infant mortality rate per 1,000 live ...",Health
1,"Is a 15.7 infant mortality rate per 1,000 live...",Health
2,Does the travel and tourism industry have a si...,Economy
3,Has the COVID-19 pandemic increased online sho...,Ecommerce
4,"Is a 12.2 infant mortality rate per 1,000 live...",Health
...,...,...
12056,Should art education be more focused on teachi...,Education.
12057,"Should the Antarctic desert be considered a ""t...",Geography
12058,Does water boil at a lower temperature at high...,Physics
12059,Is gout more painful than kidney stones?,Medicine


In [1]:
import pandas as pd
import textwrap
import random

# Adjust display settings to show the entire row content
pd.set_option('display.max_colwidth', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_rows', None)

# Read the saved CSV file
Questions = pd.read_csv("../dataset/TopRelevant_topics.csv")

# Randomly select 20 rows
random_indices = random.sample(range(len(Questions)), 20)
random_sample = Questions.iloc[random_indices].reset_index(drop=True)

# Display the DataFrame
random_sample

Unnamed: 0,Topic,Sector
0,Are morning news programs on network TV still relevant?,Media
1,Should baby care products be sold in food retail stores?,Retail
2,Is urbanization in Kenya happening too quickly?,Environment
3,Should there be a limit to the height of skyscrapers in urban areas?,Urban planning
4,Are stricter laws and police enforcement the solution to India's rape problem?,Law
5,Should people with higher incomes spend a larger proportion of their household expenditure on sports and leisure activities?,Economics
6,Should NFL teams like the Cleveland Browns prioritize increasing revenue over improving team performance?,Sports
7,Should credit card lending be regulated more strictly to prevent excessive consumer debt?,Finance
8,Is the European Union out of touch with its citizens?,Politics
9,Should COVID-19 vaccination priority be given to people over 65 years old?,Healthcare
