## Smuckers Company NYT data collection

In [1]:
import json
import urllib.request
import urllib.parse
import time
import pandas as pd
import pickle

### requesting from the NYT API

In [2]:
# pull all articles relevant to Dunkin' in our time frame

years = range(2020, 2026) # Nov-15-2020 start date to match Google Trends data
allresults_dunk = []
baseurl="https://api.nytimes.com/svc/search/v2/articlesearch.json?"
apikey="GGCXEfbqfbBKk5Aii36FMKPK6De5udGe"

# for loop to iterate through each year in our time frame
for year in years:
    begin = f"{year}0101" # changes the year for the api call for each loop
    end   = f"{year}1231"

    page = 0

    while True:
        pa3 = {"api-key": apikey,
            "q": '"Dunkin\'"',
            "begin_date": begin,
            "end_date": end,
            "page": page}

        
        dunk_url = baseurl + urllib.parse.urlencode(pa3)
        request3 = urllib.request.urlopen(dunk_url).read()
        resd3 = json.loads(request3)
    
        response_block3 = resd3.get("response")
        
        # failsafe troubleshooting for API error
        if response_block3 is None:
            print(f"API returned no response for year {year}, page {page}. Retrying...")
            time.sleep(18)
            continue   # try again
    
        docs3 = response_block3.get("docs")
        
        # indicates end of articles
        if docs3 is None or len(docs3) == 0:
            print(f"Finished year {year}: no more articles at page {page}")
            break
    
        # collect the objects of interest
        for doc in docs3:
            allresults_dunk.append({
                "headline"         : doc.get('headline', {}).get('main', ''),
                "abstract"         : doc.get('abstract', ''),
                "publication_date" : doc.get('pub_date', ''),
                "document_type"    : doc.get('document_type', ''),
                "section_name"     : doc.get('section_name', ''),
                "subsection_name"  : doc.get('subsection_name', '')
            })
       
        page += 1  # scraping every page of articles
        time.sleep(18)  # time delay so we don't hit the article limit
        
    
dunk_df = pd.DataFrame(allresults_dunk)


Finished year 2020: no more articles at page 8
Finished year 2021: no more articles at page 4
Finished year 2022: no more articles at page 4
Finished year 2023: no more articles at page 4
Finished year 2024: no more articles at page 6
Finished year 2025: no more articles at page 6


In [3]:
dunk_df = dunk_df[
    (dunk_df['document_type'] == "article") ]
# filtering the results down to article types only

dunk_df.head

<bound method NDFrame.head of                                               headline  \
0    Dunkin’ Brands Is in Talks to Sell Itself and ...   
1                   Met at Dunkin’, Married at Dunkin’   
2    Dunkin’ Brands has done well in the pandemic. ...   
3    Do Dunkin’ and Arby’s Go Together? Private Equ...   
4               Why Dunkin’ Is Worth Nearly $9 Billion   
..                                                 ...   
286                               Crypto’s Connections   
287  Are Markets Taking the Latest Trump Trade Figh...   
288  How Sleepy Bucks County Became a Rival to the ...   
289                         The Super Bowl Ads, Ranked   
290  What I Found on the 365-Mile Trail of a Lost F...   

                                              abstract      publication_date  \
0    The parent of Dunkin’ and Baskin Robbins is ne...  2020-10-25T17:25:19Z   
1    Sugar Good and John Thompson discovered there ...  2020-10-23T09:03:01Z   
2                                

In [9]:
# pull all articles relevant to Folgers in our time frame

years = range(2020, 2026) # Nov-15-2020 start date to match Google Trends data
allresults_folg = []
baseurl="https://api.nytimes.com/svc/search/v2/articlesearch.json?"
apikey="GGCXEfbqfbBKk5Aii36FMKPK6De5udGe"

time.sleep(6)
# for loop to iterate through each year in our time frame
for year in years:
    begin = f"{year}0101" # changes the year for the api call for each loop
    end   = f"{year}1231"

    page = 0

    while True:
        pa4 = {"api-key": apikey,
            "q": "Folgers",
            "begin_date": begin,
            "end_date": end,
            "page": page}

        
        folg_url = baseurl + urllib.parse.urlencode(pa4)
        request4 = urllib.request.urlopen(folg_url).read()
        resd4 = json.loads(request4)
    
        response_block4 = resd4.get("response")
        
        # failsafe troubleshooting for API error
        if response_block4 is None:
            print(f"API returned no response for year {year}, page {page}. Retrying...")
            time.sleep(18)
            continue   # try again
    
        docs4 = response_block4.get("docs")
        
        # indicates end of articles
        if docs4 is None or len(docs4) == 0:
            print(f"Finished year {year}: no more articles at page {page}")
            break
    
        # collect the objects of interest
        for doc in docs4:
            allresults_folg.append({
                "headline"         : doc.get('headline', {}).get('main', ''),
                "abstract"         : doc.get('abstract', ''),
                "publication_date" : doc.get('pub_date', ''),
                "document_type"    : doc.get('document_type', ''),
                "section_name"     : doc.get('section_name', ''),
                "subsection_name"  : doc.get('subsection_name', '')
            })
       
        page += 1  # scraping every page of articles
        time.sleep(24)  # time delay so we don't hit the article limit
        
    
folg_df = pd.DataFrame(allresults_folg)


Finished year 2020: no more articles at page 1
Finished year 2021: no more articles at page 2
Finished year 2022: no more articles at page 1
Finished year 2023: no more articles at page 1
Finished year 2024: no more articles at page 2
Finished year 2025: no more articles at page 1


In [10]:
folg_df = folg_df[
    (folg_df['document_type'] == "article") ]
# filtering the results down to article types only

folg_df.head

<bound method NDFrame.head of                                              headline  \
0   How Shakespeare Paperbacks Made Me Want to Be ...   
1               Is This a Livestream I See Before Me?   
2   Maurice Edwards, Busy Figure in Theater and Mu...   
3   City That Once Guided a Nation Now Shows Its C...   
4   ‘Minor Feelings’ Rescues Personal Experience F...   
6                             Is Coffee Good for You?   
7   A 15-Minute Grocery Delivery That Took 21 Minutes   
8   Transcript: Ezra Klein Answers Listener Questions   
9    Transcript: Ezra Klein Interviews Céline Gounder   
11  ‘The Two Noble Kinsmen’ Review: Shakespeare, W...   
12  Jason Sudeikis wins best comedic actor for ‘Te...   
13  A Scholarly Analysis of Shakespeare’s Life Tha...   
14  Karen Hastie Williams, Barrier-Breaking Lawyer...   
15  Congress, Vaccine Mandates, Baseball: Your Tue...   
16  Finding Memories, and Mom’s Sewing Stuff, in a...   
17  Finding Memories, and Mom’s Sewing Stuff, in a...   
1

In [15]:
smj_df = pd.concat([dunk_df, folg_df], ignore_index=True)
# combine the articles related to Dunkin' and those related to Folgers

In [17]:
smj_df = smj_df[
    smj_df['headline'].str.contains("Dunkin", case=False, na=False) |
    smj_df['abstract'].str.contains("Dunkin", case=False, na=False) |
    smj_df['headline'].str.contains("Folgers", case=False, na=False)|
    smj_df['abstract'].str.contains("Folgers", case=False, na=False)]
# filtering the output so the headline or abstract include the company name (Dunkin' or Folgers)

smj_df.head

<bound method NDFrame.head of                                               headline  \
0    Dunkin’ Brands Is in Talks to Sell Itself and ...   
1                   Met at Dunkin’, Married at Dunkin’   
2    Dunkin’ Brands has done well in the pandemic. ...   
3    Do Dunkin’ and Arby’s Go Together? Private Equ...   
4               Why Dunkin’ Is Worth Nearly $9 Billion   
5    Meet the private equity firm trying to buy Dun...   
20   CVS, Dunkin’, Lego: The Brands Pulling Ads Fro...   
112  Customer’s Racial Slur Drew a Fatal Punch. The...   
149    Restaurant Chains Make It Cost More to Be Loyal   
179        No Interviewing Voters in a Dunkin’ Donuts!   
183  How Alarmed Harris Staffers Went Rogue to Reac...   
229  2 Million Baked Goods Are Recalled Over Lister...   

                                              abstract      publication_date  \
0    The parent of Dunkin’ and Baskin Robbins is ne...  2020-10-25T17:25:19Z   
1    Sugar Good and John Thompson discovered there ... 

### Sentiment analysis

In [19]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()
# combine the headline and abstract into one field for sentiment analysis
smj_df['text'] = smj_df['headline'] + " " + smj_df['abstract']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  smj_df['text'] = smj_df['headline'] + " " + smj_df['abstract']


In [20]:
sent_pos4 = []
sent_neg4 = []
sent_neu4 = []
sent_comp4 = []
corpus4 = smj_df['text']

# iterate through each sentence in corpus
for article in corpus4:
    
    # analyze the sentiment. ss is a dictionary
    ss = sia.polarity_scores(article)

    # append and store these sentiment scores into a list
    sent_neg4.append(ss['neg'])
    sent_neu4.append(ss['neu'])
    sent_comp4.append(ss['compound'])

In [21]:
# adding the list to the dataframe as column using assign(column_name = data)
smj_df = smj_df.assign(article_pos = sent_pos4)
smj_df = smj_df.assign(article_neg = sent_neg4)
smj_df = smj_df.assign(article_neu = sent_neu4)
smj_df = smj_df.assign(article_comp = sent_comp4)

### Summarizing data

In [22]:
smj_df = smj_df.copy()
smj_df['publication_date'] = pd.to_datetime(smj_df['publication_date'])

# find the Friday of the week for each publication date
smj_df['fiscal_week'] = smj_df['publication_date'] + pd.offsets.Week(weekday=4)
smj_df['fiscal_week'] = smj_df['fiscal_week'].dt.date

In [23]:
weekly_df_smj = (
    smj_df.groupby('fiscal_week')
          .agg(
              article_count = ('text', 'count'),
              pos_sentiment  = ('article_pos', 'sum'),
              neg_sentiment  = ('article_neg', 'sum'),
              neu_sentiment  = ('article_neu', 'sum'),
              comp_sentiment = ('article_comp', 'mean'),
          )
          .reset_index()
)
# group the rest of the columns by summing or averaging the values for the week

In [24]:
# create ratios to analyze article sentiment
# total
weekly_df_smj['total_sent'] = weekly_df_smj['pos_sentiment'] + weekly_df_smj['neg_sentiment'] + weekly_df_smj['neu_sentiment']
# ratios
weekly_df_smj['pos_ratio'] = weekly_df_smj['pos_sentiment'] / weekly_df_smj['total_sent']
weekly_df_smj['neg_ratio'] = weekly_df_smj['neg_sentiment'] / weekly_df_smj['total_sent']

weekly_df_smj = weekly_df_smj.drop(columns=['total_sent', 'pos_sentiment', 'neg_sentiment', 'neu_sentiment'])

In [25]:
# final dataset
weekly_df_smj.head

<bound method NDFrame.head of   fiscal_week  article_count  comp_sentiment  pos_ratio  neg_ratio
0  2020-07-03              1        -0.92010   0.000000   0.256000
1  2020-10-30              5         0.33482   0.119600   0.007200
2  2020-11-06              1         0.58590   0.134000   0.000000
3  2022-03-11              1        -0.78450   0.032032   0.202202
4  2023-03-03              1         0.73220   0.220000   0.093000
5  2024-01-26              1        -0.50100   0.000000   0.246000
6  2024-12-13              1        -0.83600   0.024000   0.217000
7  2025-02-14              1         0.17790   0.094000   0.070000>

In [15]:
weekly_df_smj.to_csv("smj_nyt_data.csv", index=False)
