In [5]:
import pandas as pd

# Locations of the three input CSV files (absolute paths)
wiki_articles_path = '/content/us_cities_by_state_SEPT.2023.csv'
state_populations_path = '/content/NST-EST2022-ALLDATA.csv'
states_by_region_path = '/content/US States by Region - US Census Bureau - Sheet1.csv'

# Read each file into its own DataFrame:
#   wiki_articles_df     - the Wikipedia articles about U.S. cities dataset
#   state_populations_df - the state populations dataset
#   states_by_region_df  - the states-by-region dataset
wiki_articles_df = pd.read_csv(filepath_or_buffer=wiki_articles_path)
state_populations_df = pd.read_csv(filepath_or_buffer=state_populations_path)
states_by_region_df = pd.read_csv(filepath_or_buffer=states_by_region_path)

In [7]:
# Remove rows that are not about individual cities (e.g., U.S. census articles).
# na=False treats a missing page_title as "does not mention census", so the mask is
# strictly boolean and the `~` negation below cannot fail on NaN entries.
is_census_page = wiki_articles_df['page_title'].str.contains('census', case=False, na=False)
wiki_articles_df = wiki_articles_df[~is_census_page]

# Remove duplicate city entries within the same state (first occurrence is kept)
wiki_articles_df = wiki_articles_df.drop_duplicates(subset=['state', 'page_title'])

# Show the first few rows of the cleaned dataset
wiki_articles_df.head()

Unnamed: 0,state,page_title,url
0,Alabama,"Abbeville, Alabama","https://en.wikipedia.org/wiki/Abbeville,_Alabama"
1,Alabama,"Adamsville, Alabama","https://en.wikipedia.org/wiki/Adamsville,_Alabama"
2,Alabama,"Addison, Alabama","https://en.wikipedia.org/wiki/Addison,_Alabama"
3,Alabama,"Akron, Alabama","https://en.wikipedia.org/wiki/Akron,_Alabama"
4,Alabama,"Alabaster, Alabama","https://en.wikipedia.org/wiki/Alabaster,_Alabama"


In [8]:
# Reduce the population table to the name and 2022 estimate columns, keeping only rows
# whose NAME matches a state that occurs in the Wikipedia article data
known_states = wiki_articles_df['state'].unique()
is_known_state = state_populations_df['NAME'].isin(known_states)
state_populations_df = state_populations_df.loc[is_known_state, ['NAME', 'POPESTIMATE2022']]

# Preview the cleaned population table
state_populations_df.head()

Unnamed: 0,NAME,POPESTIMATE2022
14,Alabama,5074296
15,Alaska,733583
16,Arizona,7359197
17,Arkansas,3045637
18,California,39029342


In [10]:
# Fill forward the REGION and DIVISION columns to associate them with each state.
# Series.ffill() is used instead of fillna(method='ffill'): the `method` argument of
# fillna is deprecated in recent pandas releases and emits a FutureWarning.
states_by_region_df['REGION'] = states_by_region_df['REGION'].ffill()
states_by_region_df['DIVISION'] = states_by_region_df['DIVISION'].ffill()

# Drop rows where the STATE column is NaN
states_by_region_df = states_by_region_df.dropna(subset=['STATE'])

# Show the first few rows of the cleaned dataset
states_by_region_df.head()

Unnamed: 0,REGION,DIVISION,STATE
2,Northeast,New England,Connecticut
3,Northeast,New England,Maine
4,Northeast,New England,Massachusetts
5,Northeast,New England,New Hampshire
6,Northeast,New England,Rhode Island


In [12]:
# Attach state-level attributes to every article row in one chain:
#   1. left-join the state populations on 'state' == 'NAME'
#   2. left-join the region/division table on 'state' == 'STATE'
#   3. drop the join keys from the right-hand tables ('NAME' and 'STATE'), which
#      duplicate the 'state' column
# Left joins keep every article row even when no state-level match exists.
merged_df = (
    wiki_articles_df
    .merge(state_populations_df, how='left', left_on='state', right_on='NAME')
    .merge(states_by_region_df, how='left', left_on='state', right_on='STATE')
    .drop(columns=['NAME', 'STATE'])
)

# Preview the merged dataset
merged_df.head()

Unnamed: 0,state,page_title,url,POPESTIMATE2022,REGION,DIVISION
0,Alabama,"Abbeville, Alabama","https://en.wikipedia.org/wiki/Abbeville,_Alabama",5074296.0,South,East South Central
1,Alabama,"Adamsville, Alabama","https://en.wikipedia.org/wiki/Adamsville,_Alabama",5074296.0,South,East South Central
2,Alabama,"Addison, Alabama","https://en.wikipedia.org/wiki/Addison,_Alabama",5074296.0,South,East South Central
3,Alabama,"Akron, Alabama","https://en.wikipedia.org/wiki/Akron,_Alabama",5074296.0,South,East South Central
4,Alabama,"Alabaster, Alabama","https://en.wikipedia.org/wiki/Alabaster,_Alabama",5074296.0,South,East South Central


In [55]:
# Import required libraries
import json, time
import pandas as pd
import requests

# Needed only to read the API access token from the environment (see ACCESS_TOKEN below)
import os

# Headers for the Wikipedia (MediaWiki) API page info requests
REQUEST_HEADERS = {
    'User-Agent': '<uwnetid@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2023',
}

# Template for Wikipedia API request parameters
PAGEINFO_PARAMS_TEMPLATE = {
    "action": "query",
    "format": "json",
    "titles": "",
    "prop": "info",
    "inprop": ""
}
API_ENWIKIPEDIA_ENDPOINT = "https://en.wikipedia.org/w/api.php"

USERNAME = "Mohammap22"
# SECURITY: the access token is a credential and must never be committed with the notebook.
# It is read from the WIKIMEDIA_ACCESS_TOKEN environment variable; when the variable is unset
# the token is the empty string and request_ores_score_per_article() refuses to make a request.
ACCESS_TOKEN = os.environ.get("WIKIMEDIA_ACCESS_TOKEN", "")
API_ORES_LIFTWING_ENDPOINT = "https://api.wikimedia.org/service/lw/inference/v1/models/{model_name}:predict"
API_ORES_EN_QUALITY_MODEL = "enwiki-articlequality"

#
#    The throttling rate is a function of the Access token that you are granted when you request the token. The constants
#    come from dissecting the token and getting the rate limits from the granted token.
#
#    This single definition is the wait applied before every request, both to the Wikipedia page info API
#    and to the ORES/LiftWing API.
#
#    NOTE(review): 60.0/5000.0 treats the 5000-request quota as per-minute. Wikimedia personal API tokens
#    typically carry an hourly quota; if that is the case here the wait should be 3600.0/5000.0 -
#    confirm against the rate limit declared in the granted token.
#
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = (60.0/5000.0)-API_LATENCY_ASSUMED

#    When making automated requests we should include something that is unique to the person making the request
#    This should include an email - your UW email would be good to put in there
#
#    Because all LiftWing API requests require some form of authentication, you need to provide your access token
#    as part of the header too
#
REQUEST_HEADER_TEMPLATE = {
    'User-Agent': "<{email_address}>, University of Washington, MSDS DATA 512 - AUTUMN 2023",
    'Content-Type': 'application/json',
    'Authorization': "Bearer {access_token}"
}
#
#    This is a template for the parameters that we need to supply in the headers of an API request
#
REQUEST_HEADER_PARAMS_TEMPLATE = {
    'email_address' : "petermohammadi@gmail.com",
    'access_token'  :  ACCESS_TOKEN
}

#
#    This is a template of the data required as a payload when making a scoring request of the ORES model
#
ORES_REQUEST_DATA_TEMPLATE = {
    "lang":        "en",     # required that its english - we're scoring English Wikipedia revisions
    "rev_id":      "",       # this request requires a revision id
    "features":    True
}

#
#    These are used later - defined here so they, at least, have empty values
#


In [57]:


def request_ores_score_per_article(article_revid = None, email_address=None, access_token=None,
                                   endpoint_url = API_ORES_LIFTWING_ENDPOINT,
                                   model_name = API_ORES_EN_QUALITY_MODEL,
                                   request_data = ORES_REQUEST_DATA_TEMPLATE,
                                   header_format = REQUEST_HEADER_TEMPLATE,
                                   header_params = REQUEST_HEADER_PARAMS_TEMPLATE):
    """Request the article quality prediction for one revision from the scoring endpoint.

    Parameters
    ----------
    article_revid : revision id to score; overrides any 'rev_id' already in `request_data`.
    email_address : overrides header_params['email_address'] when given.
    access_token  : overrides header_params['access_token'] when given.
    endpoint_url  : URL template containing a `{model_name}` placeholder.
    model_name    : value substituted into `endpoint_url`.
    request_data  : template of the JSON payload; it is copied, never modified.
    header_format : mapping of header name -> format string.
    header_params : values substituted into the header format strings; copied, never modified.

    Returns
    -------
    The value found at ['enwiki']['scores'][rev_id]['articlequality']['score']['prediction']
    in the JSON response, or None when the request fails or the response does not have that shape.

    Raises
    ------
    Exception when no revision id, email address or access token is available.
    """
    # Work on copies: the defaults are module-level templates shared by every call, so writing
    # into them would leak one call's revision id / credentials into the next call.
    request_data = dict(request_data)
    header_params = dict(header_params)

    #    Make sure we have an article revision id, email and token
    #    This approach prioritizes the parameters passed in when making the call
    if article_revid:
        request_data['rev_id'] = article_revid
    if email_address:
        header_params['email_address'] = email_address
    if access_token:
        header_params['access_token'] = access_token

    #   Making a request requires a revision id - an email address - and the access token
    if not request_data['rev_id']:
        raise Exception("Must provide an article revision id (rev_id) to score articles")
    if not header_params['email_address']:
        raise Exception("Must provide an 'email_address' value")
    if not header_params['access_token']:
        raise Exception("Must provide an 'access_token' value")

    # Create the request URL with the specified model parameter - default is a article quality score request
    request_url = endpoint_url.format(model_name=model_name)

    # Create a compliant request header from the template and the supplied parameters
    headers = {str(key): value.format(**header_params) for key, value in header_format.items()}

    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free data
        # source like ORES - or other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.post(request_url, headers=headers, data=json.dumps(request_data))
        json_response = response.json()
    except Exception as e:
        # Best effort: report the failure and let the caller treat it as a missing score
        print(e)
        return None

    # Look the score up under the revision id that was actually sent, so a rev_id supplied through
    # `request_data` works as well as one supplied through `article_revid`.
    rev_key = str(request_data['rev_id'])
    try:
        return json_response['enwiki']['scores'][rev_key]['articlequality']['score']['prediction']
    except (KeyError, TypeError):
        # The service answered, but not with a score (for example an error payload)
        print(f"Unexpected ORES response for revision {rev_key}: {json_response}")
        return None


In [None]:
def request_pageinfo_per_article(article_title):
    """Look up the latest revision id of a Wikipedia article through the page info API.

    Parameters
    ----------
    article_title : title placed in the 'titles' parameter of the request.

    Returns
    -------
    The 'lastrevid' value of the first page in the response, or None when the request
    fails, the response is not valid JSON, or no 'lastrevid' is present.
    """
    request_params = PAGEINFO_PARAMS_TEMPLATE.copy()
    request_params['titles'] = article_title

    # Wait before the request so the throttle also applies when the request itself fails
    if API_THROTTLE_WAIT > 0.0:
        time.sleep(API_THROTTLE_WAIT)

    try:
        response = requests.get(API_ENWIKIPEDIA_ENDPOINT, headers=REQUEST_HEADERS, params=request_params)
        json_response = response.json()
    except (requests.RequestException, ValueError) as e:
        # A network error or a non-JSON body for one title must not abort a long batch run;
        # report it and let the caller record the title as missing.
        print(e)
        return None

    try:
        pages = json_response['query']['pages']
        page_id = next(iter(pages))
        return pages[page_id]['lastrevid']
    except (KeyError, TypeError, StopIteration):
        # No 'query'/'pages'/'lastrevid' in the response, or an empty page mapping
        return None

def save_dataframe(df, filename):
    """Write `df` to `filename` in CSV format, leaving out the index."""
    df.to_csv(path_or_buf=filename, index=False)

# Main loop: for every article, fetch its latest revision ID and the quality prediction for it
missing_scores = []

# Nullable integer column for the scored revision, so rows without a revision stay <NA>
# instead of turning the whole column into floats
merged_df['revision_id'] = pd.Series(pd.NA, index=merged_df.index, dtype='Int64')

for index, row in merged_df.iterrows():
    page_title = row['page_title']
    print(f"Fetching data for {page_title}...")

    # Get the revision ID using the new function
    revision_id = request_pageinfo_per_article(page_title)

    if revision_id:
        # Record which revision was scored so each quality label can be traced back to it
        merged_df.loc[index, 'revision_id'] = revision_id

        quality = request_ores_score_per_article(article_revid=revision_id,
                                                 email_address="petermohammadi@gmail.com",
                                                 access_token=ACCESS_TOKEN)

        if quality:
            print(f"Quality for {page_title}: {quality}")
            merged_df.loc[index, 'Article_Quality'] = quality
        else:
            print(f"Missing quality score for {page_title}")
            missing_scores.append(page_title)
    else:
        print(f"Missing revision ID for {page_title}")
        missing_scores.append(page_title)

    # Checkpoint the DataFrame every 100 iterations
    if index % 100 == 0:
        save_dataframe(merged_df, 'merged_df_backup.csv')

# Final checkpoint, so rows processed after the last multiple of 100 are in the backup too
save_dataframe(merged_df, 'merged_df_backup.csv')

# Log articles with missing scores
print("Articles with missing scores:", missing_scores)

Fetching data for Abbeville, Alabama...
Quality for Abbeville, Alabama: C
Fetching data for Adamsville, Alabama...
Quality for Adamsville, Alabama: C
Fetching data for Addison, Alabama...
Quality for Addison, Alabama: C
Fetching data for Akron, Alabama...
Quality for Akron, Alabama: GA
Fetching data for Alabaster, Alabama...
Quality for Alabaster, Alabama: C
Fetching data for Albertville, Alabama...
Quality for Albertville, Alabama: C
Fetching data for Alexander City, Alabama...
Quality for Alexander City, Alabama: GA
Fetching data for Aliceville, Alabama...
Quality for Aliceville, Alabama: GA
Fetching data for Allgood, Alabama...
Quality for Allgood, Alabama: C
Fetching data for Altoona, Alabama...
Quality for Altoona, Alabama: C
Fetching data for Andalusia, Alabama...
Quality for Andalusia, Alabama: C
Fetching data for Anderson, Lauderdale County, Alabama...
Quality for Anderson, Lauderdale County, Alabama: Stub
Fetching data for Anniston, Alabama...
Quality for Anniston, Alabama: C


In [None]:
# Filter out non-states (assuming the state name is the key for filtering)
final_df = merged_df[merged_df['state'].isin(state_populations_df['NAME'])]

# Identify and list areas for which there are no matches
# NOTE(review): state_populations_df was earlier restricted to names present in the article data,
# so this difference is presumably always empty. The article states that the filter above drops are
# set(merged_df['state']) - set(state_populations_df['NAME']) - confirm which direction is intended.
non_matching_areas = set(state_populations_df['NAME']) - set(final_df['state'])
print("Non-matching areas:")
for area in sorted(non_matching_areas):  # sorted so the listing is stable between runs
    print(area)

# Rename columns to match the final schema.
# rename() returns a new frame; renaming a filtered slice in place can raise SettingWithCopyWarning.
# Keys that are not present in the frame are ignored by rename().
final_df = final_df.rename(columns={
    'page_title': 'article_title',
    'POPESTIMATE2022': 'population',
    'DIVISION': 'regional_division',
    'Article_Quality': 'article_quality'   # the scoring loop writes this column capitalised
})

# Assuming that 'revision_id' and 'article_quality' are added to the DataFrame from ORES data
# Save the consolidated data into a single CSV file
final_df.to_csv('wp_scored_city_articles_by_state.csv', index=False)