## McDonalds NYT Data Collection

In [1]:
import json
import urllib.request
import urllib.parse
import time
import pandas as pd
import pickle

### Requesting from the NYT API

In [2]:
# pull all articles relevant to McDonald's in our time frame

years = range(2020, 2026) # Nov-15-2020 start date to match Google Trends data
allresults_mcd = []
baseurl="https://api.nytimes.com/svc/search/v2/articlesearch.json?"
apikey="GGCXEfbqfbBKk5Aii36FMKPK6De5udGe" # udpate API key for new users

# for loop to iterate through each year in our time frame
for year in years:
    begin = f"{year}0101"
    end   = f"{year}1231"

    page = 0

    while True:
        pa2 = {"api-key":apikey,
            "q": '"McDonald\'s"',
            "begin_date": begin,
            "end_date": end,
            "page": page}
    
        mcd_url = baseurl + urllib.parse.urlencode(pa2)
        request2 = urllib.request.urlopen(mcd_url).read()
        resd2 = json.loads(request2)
    
        response_block2 = resd2.get("response")

        
        # failsafe for API error
        if response_block2 is None:
            print(f"API returned no response for year {year}, page {page}. Retrying...")
            time.sleep(18)
            continue   # try again
    
        docs2 = response_block2.get("docs")
        
        # indicates end of articles
        if docs2 is None or len(docs2) == 0:
            print(f"Finished year {year}: no more articles at page {page}")
            break
    
        # collect the objects of interest
        for doc in docs2:
            allresults_mcd.append({
                "headline"         : doc.get('headline', {}).get('main', ''),
                "abstract"         : doc.get('abstract', ''),
                "publication_date" : doc.get('pub_date', ''),
                "document_type"    : doc.get('document_type', ''),
                "section_name"     : doc.get('section_name', ''),
                "subsection_name"  : doc.get('subsection_name', '')
            })
       
        page += 1  # scraping every page of articles
        time.sleep(18)  # time delay so we don't hit the article limit
        
    
mcd_df = pd.DataFrame(allresults_mcd)


Finished year 2020: no more articles at page 32
Finished year 2021: no more articles at page 30
Finished year 2022: no more articles at page 36
Finished year 2023: no more articles at page 24
Finished year 2024: no more articles at page 36
Finished year 2025: no more articles at page 24


In [1]:
mcd_df = mcd_df[
    (mcd_df['document_type'] == "article") ] # filtering the results down to article types only

NameError: name 'mcd_df' is not defined

In [None]:
mcd_df = mcd_df[
    mcd_df['headline'].str.contains("McCafé", case=False, na=False) |
    mcd_df['abstract'].str.contains("McCafé", case=False, na=False) |
    mcd_df['headline'].str.contains("McDonald’s", case=False, na=False) |
    mcd_df['abstract'].str.contains("McDonald’s", case=False, na=False)]
# filtering the output so the headline or abstract include the company name
# we also decided to include McCafe for relevance to coffee

mcd_df.head

### Sentiment analysis

In [4]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

# combine the headline and abstract into one field for sentiment analysis
mcd_df['text'] = mcd_df['headline'] + " " + mcd_df['abstract']

In [5]:
sent_pos2 = []
sent_neg2 = []
sent_neu2 = []
sent_comp2 = []
corpus2 = mcd_df['text']

# iterate through each sentence in corpus
for article in corpus2:
    
    # analyze the sentiment. ss is a dictionary
    ss = sia.polarity_scores(article)

    # append and store these sentiment scores into a list
    sent_pos2.append(ss['pos'])
    sent_neg2.append(ss['neg'])
    sent_neu2.append(ss['neu'])
    sent_comp2.append(ss['compound'])

In [6]:
# adding the list to the dataframe as column using assign(column_name = data)
mcd_df = mcd_df.assign(article_pos = sent_pos2)
mcd_df = mcd_df.assign(article_neg = sent_neg2)
mcd_df = mcd_df.assign(article_neu = sent_neu2)
mcd_df = mcd_df.assign(article_comp = sent_comp2)

### Summarizing data

In [7]:
mcd_df = mcd_df.copy()
mcd_df['publication_date'] = pd.to_datetime(mcd_df['publication_date'])

# find the Friday of the week for each publication date
mcd_df['fiscal_week'] = mcd_df['publication_date'] + pd.offsets.Week(weekday=4)
mcd_df['fiscal_week'] = mcd_df['fiscal_week'].dt.date

In [8]:
weekly_df_mcd = (
    mcd_df.groupby('fiscal_week')
          .agg(
               article_count = ('text', 'count'),
              pos_sentiment  = ('article_pos', 'sum'),
              neg_sentiment  = ('article_neg', 'sum'),
              neu_sentiment  = ('article_neu', 'sum'),
              comp_sentiment = ('article_comp', 'mean'),
          )
          .reset_index()
)
# group the rest of the columns by summing or averaging the values for the week

In [9]:
# create ratios to analyze article sentiment
# total
weekly_df_mcd['total_sent'] = weekly_df_mcd['pos_sentiment'] + weekly_df_mcd['neg_sentiment'] + weekly_df_mcd['neu_sentiment']
# ratios
weekly_df_mcd['pos_ratio'] = weekly_df_mcd['pos_sentiment'] / weekly_df_mcd['total_sent']
weekly_df_mcd['neg_ratio'] = weekly_df_mcd['neg_sentiment'] / weekly_df_mcd['total_sent']

weekly_df_mcd = weekly_df_mcd.drop(columns=['total_sent', 'pos_sentiment', 'neg_sentiment', 'neu_sentiment'])

In [10]:
# final dataset
weekly_df_mcd.head

<bound method NDFrame.head of     fiscal_week  article_count  comp_sentiment  pos_ratio  neg_ratio
0    2020-01-03              2        0.246950   0.056528   0.023012
1    2020-01-10              5       -0.012160   0.070614   0.091018
2    2020-01-17             11       -0.101209   0.052545   0.087273
3    2020-01-24              4       -0.136100   0.055750   0.099000
4    2020-01-31              8       -0.240062   0.047756   0.091261
..          ...            ...             ...        ...        ...
301  2025-10-31              1        0.557400   0.286000   0.000000
302  2025-11-07              8       -0.016400   0.081645   0.087647
303  2025-11-14              4       -0.147550   0.107223   0.124219
304  2025-11-21              5       -0.054300   0.051400   0.064600
305  2025-11-28              4       -0.758425   0.012750   0.210500

[306 rows x 5 columns]>

In [11]:
weekly_df_mcd.to_csv("mcd_nyt_data.csv", index=False)
