<h2>Initial Exploration of Wikipedia Pageview Data<h2>

In [15]:
import pandas as pd
import sqlite3
import plotly.express as px

In [16]:
conn = sqlite3.connect('raw_data.db')

sql_query = "SELECT * FROM pageviews"
df = pd.read_sql_query(sql_query, conn)

<h3>Filter out irrelevant search_keys and dates outside of 2024

In [17]:
df = df[~df['search_key'].isin(['Cookie_(informatique)', 'Main_Page'])]
df = df[df['date'].str[:4] == '2024']

<h3>Beginning of data (Jan 1, 2024)

In [18]:
df.head(10).style.hide(axis='index')

date,search_id,search_key,view_count
2024-01-01,18713,Lynyrd_Skynyrd,374849
2024-01-01,36792343,Elle_King,307689
2024-01-01,100691,Paul_Anka,269259
2024-01-01,80903,Billie_Joe_Armstrong,191331
2024-01-01,52726,Green_Day,170677
2024-01-01,54162498,Gypsy_Rose_Blanchard,164705
2024-01-01,170459,LL_Cool_J,162843
2024-01-01,218238,Rob_Schneider,150123
2024-01-01,1473365,Johnny_Van_Zant,140385
2024-01-01,12342282,List_of_Lynyrd_Skynyrd_members,132688


<h3>Pages with most views in a single day in 2024

In [19]:
df.sort_values(by='view_count', ascending=False).head(10).style.hide(axis='index')

date,search_id,search_key,view_count
2024-07-15,53396477,J.D._Vance,3397556
2024-12-25,569,Anthropology,3253747
2024-08-06,2216593,Gwen_Walz,2393644
2024-11-13,34076202,Hegseth,2146555
2024-07-16,53396477,J.D._Vance,2071310
2024-11-06,48410011,2020_U.S._presidential_election,1848918
2024-11-22,4550623,Pam_Bondi,1439397
2024-02-24,15580374,Google_Chrome,1430379
2024-11-16,51021695,Jake_Paul,1420154
2024-02-06,682527,Toby_Keith,1403258


<h3>Set dates to datetime objects for plotting

In [20]:
df['date'] = pd.to_datetime(df['date'])

<h3>Get top 10 most viewed pages per day

In [21]:
df_sorted = df.sort_values(['date', 'view_count'], ascending=[True, False])
df_top10 = df_sorted.groupby('date', group_keys=False).head(10)

<h3>Plot data

In [22]:
fig = px.scatter(
    df_top10,
    x='date',
    y='view_count',
    hover_data=['search_key', 'view_count'],
    size_max=20
)

fig.show()

In [23]:
import sqlite3
import db

In [24]:
# db.init_searchkeys()

In [28]:
from google import genai

client = genai.Client()

def machinedotlearn(search_key: str) -> str:
    contents = f"""
    Classify the political nature of the following Wikipeda page title: '{search_key}'. 
    Respond strictly with one of the following options: political, non-political, or not-sure.
    Titles marked 'political' should include: Political figures and issues, voting logistics,
    and general political topics (e.g. elections, climate change, abortion, etc...)"""
    response = client.models.generate_content(
        model="gemini-2.5-flash", 
        contents=contents
    )
    return response.text

def insert_nature(search_key: str) -> None:
    nature = machinedotlearn(search_key)
    print(f'{search_key}: {nature}')
    db.insert_searchkey_nature(search_key, nature)
    

In [29]:
df_unique = df['search_key'].unique()
df_unique[:10]

array(['Lynyrd_Skynyrd', 'Elle_King', 'Paul_Anka', 'Billie_Joe_Armstrong',
       'Green_Day', 'Gypsy_Rose_Blanchard', 'LL_Cool_J', 'Rob_Schneider',
       'Johnny_Van_Zant', 'List_of_Lynyrd_Skynyrd_members'], dtype=object)

In [None]:
for search_key in df_unique[10:]:
    insert_nature(search_key)


Jared_Leto: non-political
Saltburn_(film): non-political
Lainey_Wilson: non-political
Ronnie_Van_Zant: non-political
Anderson_Cooper: political
JellyRoll: non-political
Andy_Cohen: non-political
Rita_Ora: non-political
Thirty_Seconds_to_Mars: non-political
Rickey_Medlocke: non-political
Ivy_Queen: non-political


ClientError: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 10\nPlease retry in 2.706531811s.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerMinutePerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-2.5-flash'}, 'quotaValue': '10'}]}, {'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '2s'}]}}