In [None]:
"""
웹 요청을 통해 json 데이터를 받아 출력하기
"""
import requests

# Query the Open Library search endpoint and print "title - by author" for each hit.
url = "https://openlibrary.org/search.json"
params = {
    "subject": "books",
    "limit": 30,
}

# A timeout keeps a stalled connection from hanging the kernel indefinitely.
response = requests.get(url, params=params, timeout=10)
# Fail loudly on 4xx/5xx instead of trying to JSON-decode an error page.
response.raise_for_status()
data = response.json()

for item in data.get("docs", []):
    # Not every search document carries "author_name" (or "title"); fall back
    # to a placeholder rather than aborting the whole listing with a KeyError.
    authors = item.get("author_name") or ["(unknown author)"]
    print(item.get("title", "(untitled)"), "- by ", authors[0])

In [None]:
"""
xml 데이터 읽어오기
"""
import xml.etree.ElementTree as ET
# Inline sample document: three <book> records under a single <library> root.
xml_data = """
<library>
    <book id="1">
        <title>파이썬 데이터 분석</title>
        <author>홍길동</author>
        <category>IT</category>
        <loans>150</loans>
    </book>
    <book id="2">
        <title>2025 세계 경제 전망</title>
        <author>이세계</author>
        <category>경제</category>
        <loans>85</loans>
    </book>
    <book id="3">
        <title>명상과 치유</title>
        <author>김마음</author>
        <category>인문</category>
        <loans>210</loans>
    </book>
</library>
"""

# Parse the string into an element tree, collect the <title> text of every
# <book> that is a direct child of the root, then print one title per line.
root = ET.fromstring(xml_data)
book_titles = [entry.find("title").text for entry in root.findall("book")]
for book_title in book_titles:
    print(book_title)

In [10]:
"""
pandas로 데이터 프레임 만들기
"""
import pandas as pd

# Sample measurements: view counts recorded at five times of day.
hourly_views = [120, 340, 560, 430, 290]
time_slots = ["09시", "12시", "15시", "18시", "21시"]

# Build a one-dimensional Series from the view counts and show it.
views_series = pd.Series(hourly_views)
print(views_series)

# Build a two-column DataFrame from a column-name -> values mapping and show it.
data = {"time": time_slots, "views": hourly_views}
df = pd.DataFrame(data)
print(df)


0    120
1    340
2    560
3    430
4    290
dtype: int64
  time  views
0  09시    120
1  12시    340
2  15시    560
3  18시    430
4  21시    290


In [12]:

""" # issue_data.csv
date,media,views,comments
2025-01-01,A신문,520,34
2025-01-02,B신문,20,
"""
# Load the sample CSV and walk through the basic inspection / access helpers.
df = pd.read_csv("issue_data.csv")

print(df.head())  # quick look at the first few rows
print("-" * 10, "df.info()", "-" * 10)
# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()  # entry count, column dtypes and non-null counts

# Select a single column as a Series.
views = df["views"]
print(views)

print(df.iloc[0])  # first row, selected by position

print(df.loc[1, "media"])  # single cell: row label 1, column "media"

         date media  views  comments
0  2025-01-01   A신문    520      34.0
1  2025-01-02   B신문     20       NaN
---------- df.info() ----------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   date      2 non-null      object 
 1   media     2 non-null      object 
 2   views     2 non-null      int64  
 3   comments  1 non-null      float64
dtypes: float64(1), int64(1), object(2)
memory usage: 196.0+ bytes
None
0    520
1     20
Name: views, dtype: int64
date        2025-01-01
media              A신문
views              520
comments          34.0
Name: 0, dtype: object
B신문


In [13]:
# Boolean-mask filter: keep rows whose view count is greater than 300.
views = df.loc[df["views"].gt(300)]
print(views)

# Boolean-mask filter: keep rows whose media column equals 'B신문'.
media = df.loc[df["media"].eq('B신문')]
print(media)

# Order rows by view count, largest first, and show the top of the result.
sorted_df = df.sort_values("views", ascending=False)
print(sorted_df.head())

# Summary statistics (count, mean, std, min, quartiles, max).
print(df.describe())


         date media  views  comments
0  2025-01-01   A신문    520      34.0
         date media  views  comments
1  2025-01-02   B신문     20       NaN
         date media  views  comments
0  2025-01-01   A신문    520      34.0
1  2025-01-02   B신문     20       NaN
            views  comments
count    2.000000       1.0
mean   270.000000      34.0
std    353.553391       NaN
min     20.000000      34.0
25%    145.000000      34.0
50%    270.000000      34.0
75%    395.000000      34.0
max    520.000000      34.0


In [14]:
# Count the missing values in each column.
missing_per_column = df.isna().sum()
print(missing_per_column)

# New frame without any row that contains a missing value; df itself is unchanged.
df_clean = df.dropna()

# Replace missing comment counts with the mean of the comments column.
comments_mean = df["comments"].mean()
df["comments"] = df["comments"].fillna(comments_mean)
print(df.head())

date        0
media       0
views       0
comments    1
dtype: int64
         date media  views  comments
0  2025-01-01   A신문    520      34.0
1  2025-01-02   B신문     20      34.0


In [None]:
import numpy as np

# Standardize the view counts (z-score) and show them next to the raw values.
views_array = df["views"].to_numpy()

# np.std uses the population formula (ddof=0) by default.
views_mean = np.mean(views_array)
views_std = np.std(views_array)

if views_std == 0:
    # A single row or identical values have zero spread; dividing by it would
    # yield NaN/inf and a RuntimeWarning, so report every z-score as 0 instead.
    normalized = np.zeros(views_array.shape, dtype=float)
else:
    normalized = (views_array - views_mean) / views_std

result_df = pd.DataFrame({
    "views": views_array,
    "normalized_views": normalized
})

print(result_df.head())

   views  normalized_views
0    520               1.0
1     20              -1.0
