# Data Collection

### CSV

In [None]:
import pandas as pd

In [None]:
# Read the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='data/us_indicators.csv')

In [None]:
df

### Excel

In [None]:
import pandas as pd
# Reading .xlsx files also requires the openpyxl package to be installed

In [None]:
# Read worksheet 'Sheet1' of the Excel workbook into a DataFrame
df = pd.read_excel(
    io='data/us_indicators.xlsx',
    sheet_name='Sheet1',
)

In [None]:
df

### Database

In [None]:
import sqlite3
import pandas as pd

In [None]:
# Open a connection to the SQLite database file
conn = sqlite3.connect('data/data.db')

In [None]:
# SQL statement that selects the indicator columns from the us_indicators table
query = (
    'SELECT date, cpi, policy_rate, neer, money_supply '
    'FROM us_indicators'
)

In [None]:
# Run the query on the open connection and load the result into a DataFrame
df = pd.read_sql_query(sql=query, con=conn)

In [None]:
# Close the SQLite connection; the query result is already held in `df`
conn.close()

In [None]:
df

### JSON

In [None]:
import json
import pandas as pd

In [None]:
# Load the JSON file into a Python object (dict or list, depending on the file).
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default encoding.

with open('data/us_indicators.json', encoding='utf-8') as file:
    data = json.load(file)

In [None]:
data

In [None]:
# Build a DataFrame from the object loaded from the JSON file
df = pd.DataFrame(data=data)

In [None]:
df

### API

Get data from https://fred.stlouisfed.org/docs/api/fred/

In [None]:
import requests

In [None]:
import os

# Read the FRED API key from the environment so the secret is not stored in
# the notebook. Set FRED_API_KEY before starting Jupyter.
api_key = os.environ.get('FRED_API_KEY', '')
if not api_key:
    print('Warning: FRED_API_KEY is not set; the FRED request below needs a valid key.')

# ID of the FRED series to download
target_series = 'GNPCA'

In [None]:
# FRED API endpoint for series observations
url = 'https://api.stlouisfed.org/fred/series/observations'

# Query-string parameters sent with the request
params = dict(
    series_id=target_series,
    api_key=api_key,
    file_type='json',
)

In [None]:
# Send the GET request. The timeout (in seconds) keeps the cell from hanging
# indefinitely if the server never responds.
response = requests.get(url, params=params, timeout=30)

In [None]:
# Reset first so that a failed request cannot leave an older `data` value
# (assigned earlier in this notebook) in place for the cells below
data = None

# Check if the request was successful
if response.status_code == 200:
    # Parse the JSON body of the response
    data = response.json()
else:
    # If request was not successful, print the HTTP status code
    print(f"Error: {response.status_code}")

In [None]:
data

In [None]:
data['observations']

In [None]:
# Build a DataFrame from the 'observations' entry of the API response
df = pd.DataFrame(data=data['observations'])

In [None]:
df

### Web Scraping

Web scraping is not recommended because it often violates a website's terms of use.  
Whenever possible, use an API.

#### Table

Get data from https://en.wikipedia.org/wiki/List_of_Nobel_laureates_in_Physics

In [None]:
import pandas as pd
# pd.read_html also requires the lxml package to be installed

In [None]:
# Address of the Wikipedia page to read tables from
url = 'https://en.wikipedia.org/wiki/List_of_Nobel_laureates_in_Physics'

In [None]:
# Parse the HTML tables found at the URL; pandas returns a list of DataFrames
response = pd.read_html(io=url)

In [None]:
response

In [None]:
# Keep the first table returned by read_html
df = response[0]

In [None]:
df

#### Text

Get data from http://example.com

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
# Address of the page whose paragraph text will be extracted
url = 'http://example.com'

In [None]:
# Download the page. The timeout (in seconds) keeps the cell from hanging
# indefinitely if the server never responds.
response = requests.get(url, timeout=30)

In [None]:
# Start from an empty list so that a failed request cannot leave an older
# `data` value (assigned earlier in this notebook) in place for the cells below
data = []

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find every <p> element on the page
    paragraphs = soup.find_all('p')

    # Collect the text of each paragraph
    data = [paragraph.get_text() for paragraph in paragraphs]
else:
    # If request was not successful, print the HTTP status code
    print(f"Error: {response.status_code}")

In [None]:
data

### Text file

In [None]:
# Load every line of the text file into a list (line endings are kept).
# The encoding is stated explicitly so the result does not depend on the
# platform default; assumes the file is UTF-8 - adjust if it is not.

with open('data/fed_meeting.txt', 'r', encoding='utf-8') as file:
    data = file.readlines()

In [None]:
data

### Picture

In [None]:
import os
from PIL import Image

In [None]:
# Folder that contains the picture files to load
folder_path = 'data/nasdaq'

In [None]:
# Names of the entries in the folder. os.listdir returns them in arbitrary
# order, so sort them to make the order of the loaded pictures reproducible.
input_list = sorted(os.listdir(folder_path))

In [None]:
input_list

In [None]:
# Empty container that will receive the opened images
image_list = list()

In [None]:
# File extensions treated as pictures; matched case-insensitively so that
# names such as 'chart.JPG' or 'chart.jpeg' are not skipped
image_extensions = ('.jpg', '.jpeg', '.png')

for filename in input_list:
    # Check if a file is a picture
    if filename.lower().endswith(image_extensions):
        # Create full path to the file
        file_path = os.path.join(folder_path, filename)
        img = Image.open(file_path)
        image_list.append(img)

In [None]:
image_list

In [None]:
image_list[0].show()