## Starbucks NYT Data Collection

In [1]:
import json
import urllib
import urllib.request
import urllib.parse
import time
import pandas as pd

### Requesting from the NYT API

In [2]:
# pull all articles relevant to Starbucks in our time frame

years = range(2020, 2026) # Nov-15-2020 start date to match Google Trends data
allresults = []
baseurl="https://api.nytimes.com/svc/search/v2/articlesearch.json?"
apikey="GGCXEfbqfbBKk5Aii36FMKPK6De5udGe"

# for loop to iterate through each year in our time frame
for year in years:
    begin = f"{year}0101" # changes the year for the api call for each loop
    end   = f"{year}1231"

    page = 0

    while True:
        
        pa1={"api-key":apikey,
            "q": '"Starbucks"',
            "begin_date": begin,
            "end_date": end,
            "page": page}

        sbuxurl = baseurl + urllib.parse.urlencode(pa1)
        request1 = urllib.request.urlopen(sbuxurl).read()
        resd1 = json.loads(request1)

        response_block1 = resd1.get("response")
    
        # failsafe troubleshooting for API error
        if response_block1 is None:
            print(f"API returned no response for year {year}, page {page}. Retrying...")
            time.sleep(18)
            continue   # try again
    
        docs = response_block1.get("docs")
        
        # indicates end of articles
        if docs is None or len(docs) == 0:
            print(f"Finished year {year}: no more articles at page {page}")
            break

        # collect the objects of interest
        for doc in docs:
            allresults.append({
                "headline"         : doc.get('headline', {}).get('main', ''),
                "abstract"         : doc.get('abstract', ''),
                "publication_date" : doc.get('pub_date', ''),
                "document_type"    : doc.get('document_type', ''),
                "section_name"     : doc.get('section_name', ''),
                "subsection_name"  : doc.get('subsection_name', '')
            })
       
        time.sleep(18)  # time delay so we don't hit the article limit
        page += 1  # scraping every page of articles
        
sbuxdf = pd.DataFrame(allresults)


Finished year 2020: no more articles at page 28
Finished year 2021: no more articles at page 28
Finished year 2022: no more articles at page 40
Finished year 2023: no more articles at page 27
Finished year 2024: no more articles at page 25
Finished year 2025: no more articles at page 17


In [26]:
sbuxdf = sbuxdf[
    (sbuxdf['document_type'] == "article") ] # filtering the results down to article types only

<bound method NDFrame.head of                                                headline  \
0     Starbucks sets goals to increase diversity thr...   
1     Starbucks Will Allow Employees to Wear Black L...   
2     Starbucks names Mellody Hobson as its board ch...   
3     Starbucks sales plunge 40 percent even as stor...   
4     Starbucks Barista Gets $87,000 in Donations Af...   
...                                                 ...   
1597            Teenagers on How Covid Has Changed Them   
1599  How David Henry Hwang Remade Theater in His Ow...   
1602  My Parents Expected to Be Retired. Instead, Th...   
1603                  Nothing Lasts. How Do We Face It?   
1604  Inside the Improbable, Audacious and (So Far) ...   

                                               abstract      publication_date  \
0                                                        2020-10-14T16:22:39Z   
1     In announcing the reversal, the coffee chain a...  2020-06-12T20:23:22Z   
2                 

In [None]:
sbuxdf = sbuxdf[
    sbuxdf['headline'].str.contains("Starbucks", case=False, na=False) |
    sbuxdf['abstract'].str.contains("Starbucks", case=False, na=False)]
# filtering the output so the headline or abstract include the company name

sbuxdf.head

### Sentiment analysis

In [14]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

# combine the headline and abstract into one field for sentiment analysis
sbuxdf['text'] = sbuxdf['headline'] + " " + sbuxdf['abstract']

In [15]:
sent_pos = []
sent_neg = []
sent_neu = []
sent_comp = []
corpus = sbuxdf['text']

# iterate through each sentence in corpus
for article in corpus:
    
    # analyze the sentiment. ss is a dictionary
    ss = sia.polarity_scores(article)

    # append and store these sentiment scores into a list
    sent_pos.append(ss['pos'])
    sent_neg.append(ss['neg'])
    sent_neu.append(ss['neu'])
    sent_comp.append(ss['compound'])

In [16]:
# adding the list to the dataframe as column using assign(column_name = data)
sbuxdf = sbuxdf.assign(article_pos = sent_pos)
sbuxdf = sbuxdf.assign(article_neg = sent_neg)
sbuxdf = sbuxdf.assign(article_neu = sent_neu)
sbuxdf = sbuxdf.assign(article_comp = sent_comp)

### Summarizing data

In [17]:
sbuxdf = sbuxdf.copy()
sbuxdf['publication_date'] = pd.to_datetime(sbuxdf['publication_date'])

# find the Friday of the week for each publication date
sbuxdf['fiscal_week'] = sbuxdf['publication_date'] + pd.offsets.Week(weekday=4)
sbuxdf['fiscal_week'] = sbuxdf['fiscal_week'].dt.date

In [18]:
weekly_df = (
    sbuxdf.groupby('fiscal_week')
          .agg(
               article_count = ('text', 'count'),
              pos_sentiment  = ('article_pos', 'sum'),
              neg_sentiment  = ('article_neg', 'sum'),
              neu_sentiment  = ('article_neu', 'sum'),
              comp_sentiment = ('article_comp', 'mean'),
          )
          .reset_index()
)
# group the rest of the columns by summing or averaging the values for the week

In [19]:
# create ratios to analyze article sentiment
# total
weekly_df['total_sent'] = weekly_df['pos_sentiment'] + weekly_df['neg_sentiment'] + weekly_df['neu_sentiment']
# ratios
weekly_df['pos_ratio'] = weekly_df['pos_sentiment'] / weekly_df['total_sent']
weekly_df['neg_ratio'] = weekly_df['neg_sentiment'] / weekly_df['total_sent']

weekly_df = weekly_df.drop(columns=['total_sent', 'pos_sentiment', 'neg_sentiment', 'neu_sentiment'])

In [20]:
# final dataset
weekly_df.head

<bound method NDFrame.head of     fiscal_week  article_count  comp_sentiment  pos_ratio  neg_ratio
0    2020-01-03              1       -0.214400   0.000000   0.062000
1    2020-01-10              4       -0.038875   0.076250   0.069750
2    2020-01-17              3        0.002333   0.105000   0.081333
3    2020-01-24              5       -0.305260   0.045800   0.107800
4    2020-01-31             13        0.029608   0.073615   0.043769
..          ...            ...             ...        ...        ...
297  2025-10-24              3       -0.117767   0.068000   0.072667
298  2025-10-31              6        0.222650   0.091485   0.044993
299  2025-11-07              3        0.436167   0.111333   0.000000
300  2025-11-14              5        0.476920   0.148630   0.018004
301  2025-11-28              1        0.680800   0.159000   0.035000

[302 rows x 5 columns]>

In [21]:
weekly_df.to_csv("sbux_nyt_data.csv", index=False)
