In [1]:
import pandas as pd
import numpy as np
import requests
import json
from bs4 import BeautifulSoup
import re
import matplotlib.pyplot as plt
import seaborn as sns

# Part I : Working with HTML and JSON

In [2]:
# Load HTML file from GitHub into a pandas data frame
url_html = 'https://raw.githubusercontent.com/minjaelee0522/Project2/master/albums.html'
df_html = pd.read_html(url_html)[0]

# Load JSON file from GitHub into a pandas data frame
url_json = 'https://raw.githubusercontent.com/minjaelee0522/Project2/master/albums.json'
df_json = pd.read_json(url_json)

In [3]:
df_html

Unnamed: 0,Album,Artist,Year,Tracks
0,Daydream,Mariah Carey,1995,"Fantasy, One Sweet Day, Open Arms, Always Be M..."
1,Beyonce,Beyonce,2013,"Pretty Hurts, Haunted, Drunk in Love, Blow, Pa..."
2,Love Yourself: Answer,BTS,2018,"Euphoria, Trivia: Just Dance, I'm Fine, IDOL, ..."


In [4]:
df_json

Unnamed: 0,album,artist,year,tracks
0,Daydream,Mariah Carey,1995,"[Fantasy, One Sweet Day, Open Arms, Always Be ..."
1,Beyonce,Beyonce,2013,"[Pretty Hurts, Haunted, Drunk in Love, Blow, P..."
2,Love Yourself: Answer,BTS,2018,"[Euphoria, Trivia: Just Dance, I'm Fine, IDOL,..."


In [5]:
# Check if the two data frames are identical
print(df_html.equals(df_json))

False


It is not identical two dataframes Because
1. Formatting: The HTML and JSON files may have different formatting, leading to different interpretations of the data. For example, the HTML table may have additional formatting tags that are not present in the JSON file, causing the read_html() function to interpret the data differently.  
2. Encoding: The HTML and JSON files may have different character encoding, leading to differences in the data that is loaded. Make sure that the character encoding is consistent between the two files.  
3. Data types: The HTML and JSON files may have different data types for the same data. For example, the HTML table may have a date field formatted as a string, while the JSON file may have the same date field formatted as a datetime object. This can cause the read_html() and read_json() functions to interpret the data differently.

# Part II : Scraping the Katz School’s “Staff” Web Page

In [6]:
# Make a GET request to the Katz School's "Staff" web page
url_staff = "https://www.yu.edu/katz/staff"
response = requests.get(url_staff)

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, "html.parser")

# Extract the staff member information from the HTML content
staff_div = soup.find("div", class_="text-only")
staff_paragraphs = staff_div.find_all("p")

# Create a Pandas dataframe named staff_info with columns office, name, title, email, and phone
staff_info = pd.DataFrame(columns=["office", "name", "title", "email", "phone"])

# Loop through the extracted data and add each staff member's information as a new row to the staff_info dataframe
for p in staff_paragraphs:
    name_title = p.contents[0].strip()
    if not name_title:
        continue
    name, title = name_title.split(",", 1)
    title = title.strip()

    email_tag = p.find("a", href=re.compile(r"^mailto:"))
    if email_tag:
        email = email_tag.text
    else:
        email = "N/A"

    phone = re.search(r"\d{3}-\d{3}-\d{4}", p.text)
    if phone:
        phone = phone.group()
    else:
        phone = "N/A"

    office = p.find_previous("h3").text.strip()

    staff_info = staff_info.append({"office": office, "name": name, "title": title, "email": email, "phone": phone}, ignore_index=True)

In [7]:
# Print the staff_info dataframe
print(staff_info)

                                           office                    name  \
0                              Office of the Dean              Paul Russo   
1                              Office of the Dean              Aaron Ross   
2                              Office of the Dean         Jackie Hamilton   
3                              Office of the Dean  Ekaterina Davarashvili   
4                              Office of the Dean         Tabitha Collazo   
5                              Office of the Dean               Ann Leary   
6                             Graduate Admissions            Jared Hakimi   
7                             Graduate Admissions          Shayna Matzner   
8                             Graduate Admissions        Xavier Velasquez   
9                           Graduate Student Life            Rafael Reyes   
10  Academic Operations and Teaching and Learning             John Vivolo   
11  Academic Operations and Teaching and Learning    Nebahat Bayrakcioglu   

In [8]:
df_staff = pd.DataFrame(staff_info)
df_staff

Unnamed: 0,office,name,title,email,phone
0,Office of the Dean,Paul Russo,Vice Provost and Dean,,
1,Office of the Dean,Aaron Ross,Director of Strategic Initiatives and Deputy t...,aaron.ross2@yu.edu,646-592-4148
2,Office of the Dean,Jackie Hamilton,Director of Global Engagement and New Business...,jackie.hamilton@yu.edu,646-787-6194
3,Office of the Dean,Ekaterina Davarashvili,"Manager, Administration and Finance",ekaterina.davarashvili@yu.edu,646-592-4777
4,Office of the Dean,Tabitha Collazo,Business and Operations Coordinator,tabitha.collazo@yu.edu,646-592-4735
5,Office of the Dean,Ann Leary,,ann.leary@yu.edu,646-592-4724
6,Graduate Admissions,Jared Hakimi,Director,jared.hakimi@yu.edu,646-592-4722
7,Graduate Admissions,Shayna Matzner,Assistant Director,shayna.matzner@yu.edu,646-592-4726
8,Graduate Admissions,Xavier Velasquez,Assistant Director,xavier.velasquez@yu.edu,646-592-4737
9,Graduate Student Life,Rafael Reyes,Director of Graduate Student Life and Communit...,rafael.reyes@yu.edu,646-592-4729


I have not been able to produce a perfect match. There are two issues. 
- Sofia Binioris, Senior Project Manager and Advisor to the Dean. other staff are each contained within a single 'p'. However, Aaron Ross, Director of Strategic Initiatives and Deputy to the Dean, and Sofia Binioris, Senior Project Manager and Advisor to the Dean, are two people in a single 'p'. Sofia is contained in a 'br' but does not produce any output code.
- Ann Leary's title. All staff members have their name and title separated by a ','. However, Ann Leary's title is inside a 'href'. No output code was generated.   

Other than these two issues, the result is a perfect match.

# Part III : Working with Web API’s

In [9]:
from newsdataapi import NewsDataApiClient

# API key authorization, Initialize the client with your API key
api = NewsDataApiClient(apikey="pub_19425dfee6a8ad195df192f95453414c23554")

# You can pass empty or with request parameters {ex. (country = "us")}
response = api.news_api(q="lakers", language="en")

# You can go to next page by providing Page parameter
response = api.news_api(page = "1679720508e164d29f2f8314a77e596880a2aa0e2f")

# You can paginate till last page by providing Page parameter in Loop
page=None
while True:
    response = api.news_api(page = page)
    page = response.get('nextPage',None)
    if not page:
        break

# Create a DataFrame from the articles list
df_news = pd.DataFrame(articles)

KeyboardInterrupt: 

In [None]:
df_news

In [None]:
# Convert the list of keywords to separate rows
df_keywords = df_news['keywords'].explode().reset_index(drop=True)
df_keywords.name = 'keyword'
df_news = df_news.drop('keywords', axis=1).join(df_keywords)

# Count the frequency of each keyword
keyword_counts = df_news['keyword'].value_counts()

# Display the top 10 keywords
top_keywords = keyword_counts.head(10)
print(top_keywords)

In [None]:
# Count the number of null values in the video_url column
null_video_url_count = df_news['video_url'].isnull().sum()

# Display the result
print(f'There are {null_video_url_count} null values in the video_url column.')


In [None]:
# Count the number of null values in the video_url column
null_image_url_count = df_news['image_url'].isnull().sum()

# Display the result
print(f'There are {null_image_url_count} null values in the image_url column.')


In [None]:
# Count the number of articles from each source_id
source_id_counts = df_news['source_id'].value_counts()

# Display the result
print(source_id_counts)

In [None]:
# Count the number of articles in the "sports" category
sports_count = sum(['sports' in categories for categories in df_news['category']])

# Calculate the rate of articles in the "sports" category
sports_rate = sports_count / len(df_news)

# Display the result
print(f"The rate of articles in the 'sports' category is {sports_rate:.2%}")



In [None]:
# Create an empty dictionary to store the country counts
country_counts = {}

# Loop through each row in the column
for row in df_news['country']:
    # Loop through each country in the list of countries for the row
    for country in row:
        # If the country is not in the dictionary yet, add it with a count of 1
        if country not in country_counts:
            country_counts[country] = 1
        # If the country is already in the dictionary, increment its count by 1
        else:
            country_counts[country] += 1

# Convert the dictionary to a pandas Series object and sort it by the count
country_counts = pd.Series(country_counts).sort_values(ascending=False)

# Display the result
print(country_counts)


In [None]:
# Create a bar chart of the country counts
plt.bar(country_counts.index, country_counts.values)

# Set the chart title and axis labels
plt.title('Article Counts by Country')
plt.xlabel('Country')
plt.ylabel('Number of Articles')

# Rotate the x-axis labels by 45 degrees
plt.xticks(rotation=45, ha='right')

# Show the chart
plt.show()