# Data Input and Output

- CSV  
- Excel  
- HTML  
- SQL

In [8]:
import pandas as pd

In [9]:
data_dir = '../assets/data/'

## CSV

In [12]:
# reading a csv file
pd.read_csv(data_dir + 'data.csv') # reads csv as DataFrame # example

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110,130,409.1
1,60,117,145,479.0
2,60,103,135,340.0
3,45,109,175,282.4
4,45,117,148,406.0
...,...,...,...,...
164,60,105,140,290.8
165,60,110,145,300.0
166,60,115,145,310.2
167,75,120,150,320.4


In [14]:
df = pd.read_csv(data_dir + 'data.csv') # store in variable

In [15]:
df.to_csv(data_dir + 'my_output',index=False) # saving data to file

In [16]:
pd.read_csv(data_dir + 'my_output')

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110,130,409.1
1,60,117,145,479.0
2,60,103,135,340.0
3,45,109,175,282.4
4,45,117,148,406.0
...,...,...,...,...
164,60,105,140,290.8
165,60,110,145,300.0
166,60,115,145,310.2
167,75,120,150,320.4


## Excel

In [19]:
# reading excel files
# reading excel files works only if xlrd and openpyxl are installed
# pandas reads an excel file as a workbook
# a workbook is a bunch of sheets
# and each sheet is a DataFrame
# dropping "Unnamed: 0" with index_col=[0]
pd.read_excel(data_dir + 'file_example_XLS_50.xls',sheet_name='Sheet1',index_col=[0]) # file_example_XLS_50.xls # Excel_Sample.xlsx

Unnamed: 0_level_0,First Name,Last Name,Gender,Country,Age,Date,Id
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Dulce,Abril,Female,United States,32,15/10/2017,1562
2,Mara,Hashimoto,Female,Great Britain,25,16/08/2016,1582
3,Philip,Gent,Male,France,36,21/05/2015,2587
4,Kathleen,Hanner,Female,United States,25,15/10/2017,3549
5,Nereida,Magwood,Female,United States,58,16/08/2016,2468
6,Gaston,Brumm,Male,United States,24,21/05/2015,2554
7,Etta,Hurn,Female,Great Britain,56,15/10/2017,3598
8,Earlean,Melgar,Female,United States,27,16/08/2016,2456
9,Vincenza,Weiland,Female,United States,40,21/05/2015,6548
10,Fallon,Winward,Female,Great Britain,28,16/08/2016,5486


In [21]:
# saving DataFrame to Excel file
df.to_excel(data_dir + 'Excel_Sample2.xlsx',sheet_name='NewSheet')

## HTML

Libraies needed for the examples are lxml, html5lib and BeautifulSoup4. The web page with the list of failed banks that will be used as an example can be found at http://www.fdic.gov/bank/individual/failed/banklist.html

In [22]:
html_bank_list = 'http://www.fdic.gov/bank/individual/failed/banklist.html'

In [23]:
# the following code returns 'ValueError: No tables found'
# most likely because of dynamic page loading
# df = pd.read_html(html_bank_list) # returns ValueError

In [24]:
# Wikipedia example
html_wiki_page = 'https://en.wikipedia.org/wiki/Minnesota'

In [26]:
table_MN = pd.read_html(html_wiki_page) # read_html ready all tables on the page

In [27]:
print(f'Total tables: {len(table_MN)}')

Total tables: 29


In [28]:
df = table_MN[0] # first table on page
df.head()

Unnamed: 0,Minnesota,Minnesota.1
0,State,State
1,FlagSeal,FlagSeal
2,Nicknames: Mni Sóta Makoce (Land where the wat...,Nicknames: Mni Sóta Makoce (Land where the wat...
3,Motto: L'Étoile du Nord (French: The Star of t...,Motto: L'Étoile du Nord (French: The Star of t...
4,"Anthem: ""Hail! Minnesota""","Anthem: ""Hail! Minnesota"""


In [30]:
# bank list example with BeautifulSoup
# following https://www.youtube.com/watch?v=tnq-f88Yofg
from selenium import webdriver
from bs4 import BeautifulSoup
import time

In [31]:
options = webdriver.FirefoxOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

In [32]:
site = html_bank_list

In [33]:
wd = webdriver.Firefox(options=options)

The geckodriver version (0.33.0) detected in PATH at /opt/homebrew/bin/geckodriver might not be compatible with the detected firefox version (130.0); currently, geckodriver 0.35.0 is recommended for firefox 130.*, so it is advised to delete the driver in PATH and retry


In [34]:
wd.get

<bound method WebDriver.get of <selenium.webdriver.firefox.webdriver.WebDriver (session="fba74392-b042-4df7-93f3-be6f3c699f0b")>>

In [35]:
wd.get(site)
time.sleep(10)

In [36]:
html = wd.page_source

In [37]:
html

'<html lang="en" dir="ltr" prefix="og: https://ogp.me/ns#" class=" js" data-once="displace-header"><head>\n  <meta charset="utf-8">\n<link rel="canonical" href="https://www.fdic.gov/bank-failures/failed-bank-list">\n<meta name="Generator" content="Drupal 10 (https://www.drupal.org)">\n<meta name="MobileOptimized" content="width">\n<meta name="HandheldFriendly" content="true">\n<meta name="viewport" content="width=device-width, initial-scale=1.0">\n<script type="application/ld+json">{\n          "@context": "https://schema.org",\n          "@type": "BreadcrumbList",\n          "itemListElement": [{\n            "@type": "ListItem",\n            "position": "1",\n            "name": "Home",\n            "item": "https://www.fdic.gov/"\n          },{\n            "@type": "ListItem",\n            "position": "2",\n            "name": "Resources",\n            "item": "https://www.fdic.gov/resources"\n          },{\n            "@type": "ListItem",\n            "position": "3",\n          

In [38]:
df = pd.read_html(html)

  df = pd.read_html(html)


In [39]:
df[0]

Unnamed: 0,Bank Name,City,State,Cert,Aquiring Institution,Closing Date,Fund
0,Republic First Bank dba Republic Bank,Philadelphia,Pennsylvania,27332,"Fulton Bank, National Association","April 26, 2024",10546
1,Citizens Bank,Sac City,Iowa,8758,Iowa Trust & Savings Bank,"November 3, 2023",10545
2,Heartland Tri-State Bank,Elkhart,Kansas,25851,"Dream First Bank, N.A.","July 28, 2023",10544
3,First Republic Bank,San Francisco,California,59017,"JPMorgan Chase Bank, N.A.","May 1, 2023",10543
4,Signature Bank,New York,New York,57053,"Flagstar Bank, N.A.","March 12, 2023",10540
5,Silicon Valley Bank,Santa Clara,California,24735,First Citizens Bank & Trust Company,"March 10, 2023",10539
6,Almena State Bank,Almena,Kansas,15426,Equity Bank,"October 23, 2020",10538
7,First City Bank of Florida,Fort Walton Beach,Florida,16748,"United Fidelity Bank, fsb","October 16, 2020",10537
8,The First State Bank,Barboursville,West Virginia,14361,"MVB Bank, Inc.","April 3, 2020",10536
9,Ericson State Bank,Ericson,Nebraska,18265,Farmers and Merchants Bank,"February 14, 2020",10535


## SQL

In [40]:
from sqlalchemy import create_engine

In [41]:
# creating a very simple sql engine in memory
engine = create_engine('sqlite:///:memory:')

In [43]:
df = pd.read_csv(data_dir + 'data.csv')
df.to_sql('my_table',engine)

169

In [44]:
sqldf = pd.read_sql('my_table',con=engine)

In [45]:
sqldf

Unnamed: 0,index,Duration,Pulse,Maxpulse,Calories
0,0,60,110,130,409.1
1,1,60,117,145,479.0
2,2,60,103,135,340.0
3,3,45,109,175,282.4
4,4,45,117,148,406.0
...,...,...,...,...,...
164,164,60,105,140,290.8
165,165,60,110,145,300.0
166,166,60,115,145,310.2
167,167,75,120,150,320.4
