# Project Two Code 

In [1]:
import numpy as np
import nltk
from nltk.corpus import stopwords
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from itertools import chain
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import fuzz
from sklearn.linear_model import LogisticRegression
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


## Web Scraping NYT Articles Using NYT API 

In [2]:
import requests as req
import time
API_KEY='9kiWZKsGFtTkcTD4yYkyxWAKehiTKkzR' # your API key

In [3]:
import calendar

# Create an empty list to store articles
articles2 = []

# Loop through every month from Jan 2020 to Feb 2024
for year in range(2020, 2024):
    for month in range(1, 13):
        # Calculate the number of days in the current month
        num_days = calendar.monthrange(year, month)[1]

        # Construct the begin_date and end_date parameters for the query
        begin_date = f"{year:04d}-{month:02d}-01"
        end_date = f"{year:04d}-{month:02d}-{num_days:02d}"

        # Loop through 5 pages (max 10 results per page, hence 50 articles)
        for i in range(5):
            # Sleep to avoid exceeding the API rate limit
            time.sleep(10)
            try:
                # Construct the URL
                url = f'https://api.nytimes.com/svc/search/v2/articlesearch.json?q=artificial+intelligence&api-key={API_KEY}&sort=relevance&begin_date={begin_date}&end_date={end_date}&page={i+1}'

                # Make the request and get the response
                response = req.get(url).json()
                # Extract relevant information from the response and append to articles list
                docs = response['response']['docs']
                for doc in docs:
                    filteredDoc = {}
                    filteredDoc['title'] = doc['headline']['main']
                    filteredDoc['abstract'] = doc['abstract']
                    filteredDoc['paragraph'] = doc['lead_paragraph']
                    filteredDoc['date'] = f"{year}-{month:02d}"
                    articles2.append(filteredDoc)
                
                
            except KeyError as e:
                print(f"No response while processing page {i+1} for {begin_date} - {end_date}: {e}")


No response while processing page 3 for 2020-02-01 - 2020-02-29: 'response'
No response while processing page 4 for 2020-04-01 - 2020-04-30: 'response'
No response while processing page 5 for 2020-07-01 - 2020-07-31: 'response'
No response while processing page 1 for 2020-11-01 - 2020-11-30: 'response'
No response while processing page 2 for 2021-01-01 - 2021-01-31: 'response'
No response while processing page 3 for 2021-04-01 - 2021-04-30: 'response'
No response while processing page 4 for 2021-06-01 - 2021-06-30: 'response'
No response while processing page 5 for 2021-09-01 - 2021-09-30: 'response'
No response while processing page 1 for 2022-01-01 - 2022-01-31: 'response'
No response while processing page 2 for 2022-03-01 - 2022-03-31: 'response'
No response while processing page 3 for 2022-06-01 - 2022-06-30: 'response'
No response while processing page 4 for 2022-08-01 - 2022-08-31: 'response'
No response while processing page 5 for 2022-11-01 - 2022-11-30: 'response'
No response 

In [4]:
nyt_ai2 = pd.DataFrame(data=articles2)

In [5]:
nyt_ai2.shape

(1370, 4)

#### Note: 1370 articles in total means an average of ~28-29 articles scraped per month 

In [7]:
nyt_ai2.to_csv('r2_scraped_nyt.csv')

## Text Upload

In [11]:
nyt_data = pd.read_csv('r2_scraped_nyt.csv')

In [13]:
nyt_data.drop(columns=['Unnamed: 0'])

Unnamed: 0,title,abstract,paragraph,date
0,Soon a Robot Will Be Writing This Headline,"In “A World Without Work,” the economist Danie...","A WORLD WITHOUT WORK Technology, Automation, a...",2020-01
1,A.I. Comes to the Operating Room,Images made by lasers and read by computers ca...,Brain surgeons are bringing artificial intelli...,2020-01
2,Confessions of a Dating Profile,"My boss’s demands were very, very persuasive, ...","My boss’s demands were very, very persuasive, ...",2020-01
3,"Lesson of the Day: ‘The Machines Are Learning,...","In this lesson, students will learn about how ...","Featured Article: “The Machines Are Learning, ...",2020-01
4,Facebook Says It Will Ban ‘Deepfakes’,The company said it would remove videos altere...,WASHINGTON — Facebook says it will ban videos ...,2020-01
...,...,...,...,...
1365,Cities Foster Serendipity. But Can They Do It ...,Revisiting a theory about chance collisions an...,There is a thing that happens in cities — that...,2023-10
1366,Allied Spy Chiefs Warn of Chinese Espionage Ta...,F.B.I. officials say more than half of Chinese...,The United States and its allies vowed this we...,2023-10
1367,Researchers Say Guardrails Built Around A.I. S...,OpenAI now lets outsiders tweak what its chatb...,Before it released the A.I. chatbot ChatGPT la...,2023-10
1368,An Industry Insider Drives an Open Alternative...,"The nonprofit Allen Institute for AI, led by a...",Ali Farhadi is no tech rebel.,2023-10


## Preliminary Sentiment Analysis 

### Using Transfer-Learning Methods

### Using Lexicon-Based Methods (see https://www.analyticsvidhya.com/blog/2021/06/rule-based-sentiment-analysis-in-python/)

#### Data Cleaning, POS-tagging, Stop Word Removal, Stemming

In [17]:
# create an empty list for clean paragraphs 
cleanparagraph=[]
# for each rows in the df 
for i in range(len(nyt_data)):
    # replace unwanted characters 
    para=str(nyt_data['paragraph'].iloc[i])
    para=para.replace('\r',' ')
    para=para.replace('\n',' ')
    para=para.replace('. ',' ')
    para=para.replace(', ',' ')
    # lowercase 
    cleanparagraph.append(para.lower())
nyt_data['clean paragraph']= cleanparagraph

In [18]:
nyt_data.head()

Unnamed: 0.1,Unnamed: 0,title,abstract,paragraph,date,clean paragraph
0,0,Soon a Robot Will Be Writing This Headline,"In “A World Without Work,” the economist Danie...","A WORLD WITHOUT WORK Technology, Automation, a...",2020-01,a world without work technology automation and...
1,1,A.I. Comes to the Operating Room,Images made by lasers and read by computers ca...,Brain surgeons are bringing artificial intelli...,2020-01,brain surgeons are bringing artificial intelli...
2,2,Confessions of a Dating Profile,"My boss’s demands were very, very persuasive, ...","My boss’s demands were very, very persuasive, ...",2020-01,my boss’s demands were very very persuasive ap...
3,3,"Lesson of the Day: ‘The Machines Are Learning,...","In this lesson, students will learn about how ...","Featured Article: “The Machines Are Learning, ...",2020-01,featured article: “the machines are learning a...
4,4,Facebook Says It Will Ban ‘Deepfakes’,The company said it would remove videos altere...,WASHINGTON — Facebook says it will ban videos ...,2020-01,washington — facebook says it will ban videos ...
