# 第3章：建立與保存DataFrame

## 3.1 從無到有建立DataFrame

In [1]:
import pandas as pd
import numpy as np
#pd.set_option('max_columns', 4, 'max_rows', 10, 'max_colwidth', 12)

fname = ['Paul', 'John', 'Richard', 'George']
lname = ['McCartney', 'Lennon', 'Starkey', 'Harrison']
birth = [1942, 1940, 1940, 1943]

In [3]:
people = {'first': fname, 'last': lname, 'birth': birth}

In [5]:
beatles = pd.DataFrame(people)
beatles

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [7]:
beatles.index

RangeIndex(start=0, stop=4, step=1)

In [9]:
pd.DataFrame(people, index=['a', 'b', 'c', 'd'])

Unnamed: 0,first,last,birth
a,Paul,McCartney,1942
b,John,Lennon,1940
c,Richard,Starkey,1940
d,George,Harrison,1943


In [11]:
pd.DataFrame([{"first":"Paul","last":"McCartney", "birth":1942},
              {"first":"John","last":"Lennon", "birth":1940},
              {"first":"Richard","last":"Starkey", "birth":1940},
              {"first":"George","last":"Harrison", "birth":1943}])

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [13]:
pd.DataFrame([{"first":"Paul","last":"McCartney", "birth":1942},
              {"first":"John","last":"Lennon", "birth":1940},
              {"first":"Richard","last":"Starkey", "birth":1940},
              {"first":"George","last":"Harrison", "birth":1943}],
              columns=['last', 'first', 'birth'])

Unnamed: 0,last,first,birth
0,McCartney,Paul,1942
1,Lennon,John,1940
2,Starkey,Richard,1940
3,Harrison,George,1943


## 3.2 存取CSV檔案

In [16]:
beatles

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [18]:
from io import StringIO
fout = StringIO()
beatles.to_csv(fout)  

In [20]:
print(fout.getvalue())

,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943



In [22]:
fout.seek(0)
pd.read_csv(fout)

Unnamed: 0.1,Unnamed: 0,first,last,birth
0,0,Paul,McCartney,1942
1,1,John,Lennon,1940
2,2,Richard,Starkey,1940
3,3,George,Harrison,1943


In [24]:
_ = fout.seek(0)
pd.read_csv(fout, index_col=0)

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [26]:
fout = StringIO()
beatles.to_csv(fout, index=False) 
print(fout.getvalue())

first,last,birth
Paul,McCartney,1942
John,Lennon,1940
Richard,Starkey,1940
George,Harrison,1943



## 3.3 讀取大型的CSV檔案

In [28]:
diamonds = pd.read_csv('data/diamonds.csv', nrows=1000)
diamonds

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
995,0.54,Ideal,D,VVS2,61.4,52.0,2897,5.30,5.34,3.26
996,0.72,Ideal,E,SI1,62.5,55.0,2897,5.69,5.74,3.57
997,0.72,Good,F,VS1,59.4,61.0,2897,5.82,5.89,3.48
998,0.74,Premium,D,VS2,61.8,58.0,2897,5.81,5.77,3.58


In [30]:
diamonds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    1000 non-null   float64
 1   cut      1000 non-null   object 
 2   color    1000 non-null   object 
 3   clarity  1000 non-null   object 
 4   depth    1000 non-null   float64
 5   table    1000 non-null   float64
 6   price    1000 non-null   int64  
 7   x        1000 non-null   float64
 8   y        1000 non-null   float64
 9   z        1000 non-null   float64
dtypes: float64(6), int64(1), object(3)
memory usage: 78.3+ KB


In [32]:
diamonds2 = pd.read_csv('data/diamonds.csv', nrows=1000,
                        dtype={'carat': np.float32, 'depth': np.float32,
                               'table': np.float32, 'x': np.float32,
                               'y': np.float32, 'z': np.float32,
                               'price': np.int16})

diamonds2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    1000 non-null   float32
 1   cut      1000 non-null   object 
 2   color    1000 non-null   object 
 3   clarity  1000 non-null   object 
 4   depth    1000 non-null   float32
 5   table    1000 non-null   float32
 6   price    1000 non-null   int16  
 7   x        1000 non-null   float32
 8   y        1000 non-null   float32
 9   z        1000 non-null   float32
dtypes: float32(6), int16(1), object(3)
memory usage: 49.0+ KB


In [34]:
diamonds.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.68928,61.7228,57.7347,2476.54,5.60594,5.59918,3.45753
std,0.195291,1.758879,2.467946,839.57562,0.625173,0.611974,0.389819
min,0.2,53.0,52.0,326.0,3.79,3.75,2.27
25%,0.7,60.9,56.0,2777.0,5.64,5.63,3.45
50%,0.71,61.8,57.0,2818.0,5.77,5.76,3.55
75%,0.79,62.6,59.0,2856.0,5.92,5.91,3.64
max,1.27,69.5,70.0,2898.0,7.12,7.05,4.33


In [36]:
diamonds2.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.68928,61.722801,57.734699,2476.54,5.60594,5.59918,3.45753
std,0.195291,1.758878,2.467944,839.57562,0.625173,0.611972,0.389819
min,0.2,53.0,52.0,326.0,3.79,3.75,2.27
25%,0.7,60.900002,56.0,2777.0,5.64,5.63,3.45
50%,0.71,61.799999,57.0,2818.0,5.77,5.76,3.55
75%,0.79,62.599998,59.0,2856.0,5.92,5.91,3.64
max,1.27,69.5,70.0,2898.0,7.12,7.05,4.33


In [38]:
diamonds2.cut.value_counts()

cut
Ideal        333
Premium      290
Very Good    226
Good          89
Fair          62
Name: count, dtype: int64

In [40]:
diamonds2.color.value_counts()

color
E    240
F    226
G    139
D    129
H    125
I     95
J     46
Name: count, dtype: int64

In [42]:
diamonds2.clarity.value_counts()

clarity
SI1     306
VS2     218
VS1     159
SI2     154
VVS2     62
VVS1     58
I1       29
IF       14
Name: count, dtype: int64

In [44]:
diamonds3 = pd.read_csv('data/diamonds.csv', nrows=1000,
                        dtype={'carat': np.float32, 'depth': np.float32,
                               'table': np.float32, 'x': np.float32,
                               'y': np.float32, 'z': np.float32,
                               'price': np.int16,
                               'cut': 'category', 'color': 'category',
                               'clarity': 'category'})

diamonds3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    1000 non-null   float32 
 1   cut      1000 non-null   category
 2   color    1000 non-null   category
 3   clarity  1000 non-null   category
 4   depth    1000 non-null   float32 
 5   table    1000 non-null   float32 
 6   price    1000 non-null   int16   
 7   x        1000 non-null   float32 
 8   y        1000 non-null   float32 
 9   z        1000 non-null   float32 
dtypes: category(3), float32(6), int16(1)
memory usage: 29.4 KB


In [46]:
cols = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price']
diamonds4 = pd.read_csv('data/diamonds.csv', nrows=1000,
                        dtype={'carat': np.float32, 'depth': np.float32,
                               'table': np.float32, 'price': np.int16,
                               'cut': 'category', 'color': 'category',
                               'clarity': 'category'},
                        usecols=cols)

diamonds4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    1000 non-null   float32 
 1   cut      1000 non-null   category
 2   color    1000 non-null   category
 3   clarity  1000 non-null   category
 4   depth    1000 non-null   float32 
 5   table    1000 non-null   float32 
 6   price    1000 non-null   int16   
dtypes: category(3), float32(3), int16(1)
memory usage: 17.6 KB


In [48]:
cols = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price']
diamonds_iter = pd.read_csv('data/diamonds.csv', nrows=1000,
                            dtype={'carat': np.float32, 'depth': np.float32,
                                   'table': np.float32, 'price': np.int16,
                                   'cut': 'category', 'color': 'category',
                                   'clarity': 'category'},
                            usecols=cols,
                            chunksize=200)

def process(df):
    return f'processed {df.size} items'

for chunk in diamonds_iter:
    process(chunk)

In [54]:
np.iinfo(np.int16)

iinfo(min=-32768, max=32767, dtype=int16)

### 小編補充

In [52]:
diamonds4['price'].min()

326

In [56]:
diamonds4['price'].max()

2898

In [58]:
np.finfo(np.float16)

finfo(resolution=0.001, min=-6.55040e+04, max=6.55040e+04, dtype=float16)

In [60]:
diamonds.price.memory_usage()

8132

In [62]:
diamonds.price.memory_usage(index=False)

8000

In [64]:
diamonds.cut.memory_usage(deep=True)

55465

In [66]:
!pip install pyarrow

Defaulting to user installation because normal site-packages is not writeable


In [68]:
diamonds4.to_feather('d.arr')
diamonds5 = pd.read_feather('d.arr')

In [70]:
diamonds4.to_parquet('d.pqt')

## 3.4 使用Excel檔案

In [72]:
!pip install xlwt

Defaulting to user installation because normal site-packages is not writeable
Collecting xlwt
  Downloading xlwt-1.3.0-py2.py3-none-any.whl.metadata (3.5 kB)
Downloading xlwt-1.3.0-py2.py3-none-any.whl (99 kB)
   ---------------------------------------- 0.0/100.0 kB ? eta -:--:--
   ---- ----------------------------------- 10.2/100.0 kB ? eta -:--:--
   ------------------------------------ --- 92.2/100.0 kB 1.1 MB/s eta 0:00:01
   ---------------------------------------- 100.0/100.0 kB 1.2 MB/s eta 0:00:00
Installing collected packages: xlwt
Successfully installed xlwt-1.3.0


In [74]:
!pip install openpyxl

Defaulting to user installation because normal site-packages is not writeable


In [76]:
!pip install xlrd

Defaulting to user installation because normal site-packages is not writeable
Collecting xlrd
  Downloading xlrd-2.0.1-py2.py3-none-any.whl.metadata (3.4 kB)
Downloading xlrd-2.0.1-py2.py3-none-any.whl (96 kB)
   ---------------------------------------- 0.0/96.5 kB ? eta -:--:--
   ---- ----------------------------------- 10.2/96.5 kB ? eta -:--:--
   -------------------------------------- - 92.2/96.5 kB 1.1 MB/s eta 0:00:01
   ---------------------------------------- 96.5/96.5 kB 918.0 kB/s eta 0:00:00
Installing collected packages: xlrd
Successfully installed xlrd-2.0.1


In [78]:
beatles.to_excel('beat.xls')

ValueError: No engine for filetype: 'xls'

In [144]:
beatles.to_excel('beat.xlsx')

In [146]:
beat2 = pd.read_excel('beat.xls')
beat2

FileNotFoundError: [Errno 2] No such file or directory: 'beat.xls'

In [148]:
beat2 = pd.read_excel('beat.xls', index_col=0)
beat2

FileNotFoundError: [Errno 2] No such file or directory: 'beat.xls'

In [150]:
beat2.dtypes

NameError: name 'beat2' is not defined

In [152]:
xl_writer = pd.ExcelWriter('beat.xlsx')
beatles.to_excel(xl_writer, sheet_name='All')
beatles[beatles.birth < 1941].to_excel(xl_writer, sheet_name='1940')
xl_writer.save()

AttributeError: 'OpenpyxlWriter' object has no attribute 'save'

## 3.5 讀取ZIP檔案中的資料 

In [82]:
autos = pd.read_csv('data/vehicles.csv.zip')
autos

  autos = pd.read_csv('data/vehicles.csv.zip')


Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,...,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb
0,15.695714,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
1,29.964545,0.0,0.0,0.0,9,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
2,12.207778,0.0,0.0,0.0,23,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
3,29.964545,0.0,0.0,0.0,10,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
4,17.347895,0.0,0.0,0.0,17,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39096,14.982273,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
39097,14.330870,0.0,0.0,0.0,20,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
39098,15.695714,0.0,0.0,0.0,18,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
39099,15.695714,0.0,0.0,0.0,18,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0


In [84]:
autos.modifiedOn.dtype

dtype('O')

In [86]:
autos.modifiedOn

0        Tue Jan 01 00:00:00 EST 2013
1        Tue Jan 01 00:00:00 EST 2013
2        Tue Jan 01 00:00:00 EST 2013
3        Tue Jan 01 00:00:00 EST 2013
4        Tue Jan 01 00:00:00 EST 2013
                     ...             
39096    Tue Jan 01 00:00:00 EST 2013
39097    Tue Jan 01 00:00:00 EST 2013
39098    Tue Jan 01 00:00:00 EST 2013
39099    Tue Jan 01 00:00:00 EST 2013
39100    Tue Jan 01 00:00:00 EST 2013
Name: modifiedOn, Length: 39101, dtype: object

In [88]:
pd.to_datetime(autos.modifiedOn)

  pd.to_datetime(autos.modifiedOn)


ValueError: time data "Fri Apr 01 00:00:00 EDT 2016" doesn't match format "%a %b %d %H:%M:%S EST %Y", at position 1. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [90]:
autos = pd.read_csv('data/vehicles.csv.zip', parse_dates=['modifiedOn'])  
autos.modifiedOn

  autos = pd.read_csv('data/vehicles.csv.zip', parse_dates=['modifiedOn'])
  autos = pd.read_csv('data/vehicles.csv.zip', parse_dates=['modifiedOn'])


0        Tue Jan 01 00:00:00 EST 2013
1        Tue Jan 01 00:00:00 EST 2013
2        Tue Jan 01 00:00:00 EST 2013
3        Tue Jan 01 00:00:00 EST 2013
4        Tue Jan 01 00:00:00 EST 2013
                     ...             
39096    Tue Jan 01 00:00:00 EST 2013
39097    Tue Jan 01 00:00:00 EST 2013
39098    Tue Jan 01 00:00:00 EST 2013
39099    Tue Jan 01 00:00:00 EST 2013
39100    Tue Jan 01 00:00:00 EST 2013
Name: modifiedOn, Length: 39101, dtype: object

In [92]:
import zipfile

In [94]:
with zipfile.ZipFile('data/kaggle-survey-2018.zip') as z:
    print('\n'.join(z.namelist()))
    kag = pd.read_csv(z.open('multipleChoiceResponses.csv'))
    kag_questions = kag.iloc[0]
    survey = kag.iloc[1:]

multipleChoiceResponses.csv
freeFormResponses.csv
SurveySchema.csv


  kag = pd.read_csv(z.open('multipleChoiceResponses.csv'))


In [96]:
survey.head(2).T

Unnamed: 0,1,2
Time from Start to Finish (seconds),710,434
Q1,Female,Male
Q1_OTHER_TEXT,-1,-1
Q2,45-49,30-34
Q3,United States of America,Indonesia
...,...,...
Q50_Part_5,,
Q50_Part_6,,
Q50_Part_7,,
Q50_Part_8,,


## 3.6 存取資料庫

In [None]:
import sqlite3
con = sqlite3.connect('data/beat.db')
with con:
    cur = con.cursor()
    cur.execute("""DROP TABLE Band""")
    cur.execute("""CREATE TABLE Band(id INTEGER PRIMARY KEY,
                   fname TEXT, lname TEXT, birthyear INT)""")
    cur.execute("""INSERT INTO Band VALUES(
                   0, 'Paul', 'McCartney', 1942)""")
    cur.execute("""INSERT INTO Band VALUES(
                   1, 'John', 'Lennon', 1940)""")
    _ = con.commit()

In [None]:
!pip install sqlalchemy

In [None]:
import sqlalchemy as sa
engine = sa.create_engine('sqlite:///data/beat.db', echo=True)
sa_connection = engine.connect()

beat = pd.read_sql('Band', sa_connection, index_col='id')
beat

In [None]:
sql = '''SELECT fname, birthyear from Band'''
fnames = pd.read_sql(sql, con)
fnames

## 3.7 存取JSON格式的資料

In [110]:
import json
encoded = json.dumps(people)
encoded

'{"first": ["Paul", "John", "Richard", "George"], "last": ["McCartney", "Lennon", "Starkey", "Harrison"], "birth": [1942, 1940, 1940, 1943]}'

In [112]:
json.loads(encoded)

{'first': ['Paul', 'John', 'Richard', 'George'],
 'last': ['McCartney', 'Lennon', 'Starkey', 'Harrison'],
 'birth': [1942, 1940, 1940, 1943]}

In [114]:
beatles = pd.read_json(encoded)
beatles

  beatles = pd.read_json(encoded)


Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [116]:
records = beatles.to_json(orient='records')
records

'[{"first":"Paul","last":"McCartney","birth":1942},{"first":"John","last":"Lennon","birth":1940},{"first":"Richard","last":"Starkey","birth":1940},{"first":"George","last":"Harrison","birth":1943}]'

In [120]:
pd.read_json(records, orient='records')

  pd.read_json(records, orient='records')


Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [122]:
split = beatles.to_json(orient='split')
split

'{"columns":["first","last","birth"],"index":[0,1,2,3],"data":[["Paul","McCartney",1942],["John","Lennon",1940],["Richard","Starkey",1940],["George","Harrison",1943]]}'

In [124]:
pd.read_json(split, orient='split')

  pd.read_json(split, orient='split')


Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [126]:
index = beatles.to_json(orient='index')
index

'{"0":{"first":"Paul","last":"McCartney","birth":1942},"1":{"first":"John","last":"Lennon","birth":1940},"2":{"first":"Richard","last":"Starkey","birth":1940},"3":{"first":"George","last":"Harrison","birth":1943}}'

In [128]:
pd.read_json(index, orient='index')

  pd.read_json(index, orient='index')


Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [130]:
values = beatles.to_json(orient='values')
values

'[["Paul","McCartney",1942],["John","Lennon",1940],["Richard","Starkey",1940],["George","Harrison",1943]]'

In [132]:
pd.read_json(values, orient='values')

  pd.read_json(values, orient='values')


Unnamed: 0,0,1,2
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [134]:
(pd.read_json(values, orient='values')
   .rename(columns=dict(enumerate(['first', 'last', 'birth'])))
)

  (pd.read_json(values, orient='values')


Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [136]:
table = beatles.to_json(orient='table')
table

'{"schema":{"fields":[{"name":"index","type":"integer"},{"name":"first","type":"string"},{"name":"last","type":"string"},{"name":"birth","type":"integer"}],"primaryKey":["index"],"pandas_version":"1.4.0"},"data":[{"index":0,"first":"Paul","last":"McCartney","birth":1942},{"index":1,"first":"John","last":"Lennon","birth":1940},{"index":2,"first":"Richard","last":"Starkey","birth":1940},{"index":3,"first":"George","last":"Harrison","birth":1943}]}'

In [138]:
pd.read_json(table, orient='table')

  pd.read_json(table, orient='table')


Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


In [140]:
output = beat.to_dict()
output

NameError: name 'beat' is not defined

In [142]:
output['version'] = '0.4.1'
json.dumps(output)

NameError: name 'output' is not defined

## 3.8 讀取HTML表格

In [162]:
!pip install lxml

Defaulting to user installation because normal site-packages is not writeable


In [164]:
url ='https://en.wikipedia.org/wiki/The_Beatles_discography'
dfs = pd.read_html(url)
len(dfs)

72

In [158]:
dfs[0]

Unnamed: 0,The Beatles,The Beatles.1
0,The Beatles in 1964; clockwise from top left: ...,The Beatles in 1964; clockwise from top left: ...
1,Background information,Background information
2,Origin,"Liverpool, England"
3,Genres,Rockpopbeatpsychedelia
4,Discography,Albumssinglessongs
5,Years active,1960–1970
6,Labels,PolydorParlophoneTollieVee-JayCapitolSwanUnite...
7,Spinoff of,The Quarrymen
8,,
9,Past members,John Lennon Paul McCartney George Harrison Rin...


In [170]:
# url ='https://en.wikipedia.org/wiki/The_Beatles_discography'
url="https://en.m.wikipedia.org/wiki/The_Beatles_albums_discography"
dfs = pd.read_html(url, match='List of studio albums', na_values='—')
len(dfs)

2

In [172]:
dfs[0].columns

MultiIndex([(               'Title',            'Title'),
            (    'Album details[A]', 'Album details[A]'),
            ('Peak chart positions',        'UK [8][9]'),
            ('Peak chart positions',         'AUS [10]'),
            ('Peak chart positions',         'CAN [11]'),
            ('Peak chart positions',         'FRA [12]'),
            ('Peak chart positions',         'GER [13]'),
            ('Peak chart positions',         'NOR [14]'),
            ('Peak chart positions',      'US [15][16]'),
            (      'Certifications',   'Certifications'),
            (               'Sales',            'Sales')],
           )

In [174]:
url ='https://en.m.wikipedia.org/wiki/The_Beatles_albums_discography'
dfs = pd.read_html(url, match='List of studio albums', na_values='—',
                   header=[0,1])
len(dfs)

2

In [176]:
dfs[0]

Unnamed: 0_level_0,Title,Album details[A],Peak chart positions,Peak chart positions,Peak chart positions,Peak chart positions,Peak chart positions,Peak chart positions,Peak chart positions,Certifications,Sales
Unnamed: 0_level_1,Title,Album details[A],UK [8][9],AUS [10],CAN [11],FRA [12],GER [13],NOR [14],US [15][16],Certifications,Sales
0,Please Please Me,Released: 22 March 1963 Label: Parlophone,1,,,5,5,,155,BPI: Platinum[17] ARIA: Gold[18] MC: Gold[19] ...,
1,With the Beatles[B],Released: 22 November 1963 Label: Parlophone (...,1,,,5,1,,179,BPI: Gold[17] ARIA: Gold[18] BVMI: Gold[21] MC...,
2,A Hard Day's Night,Released: 10 July 1964 Label: Parlophone,1,1,,,1,,,BPI: Platinum[17] ARIA: Gold[18],
3,Beatles for Sale,Released: 4 December 1964 Label: Parlophone,1,1,,,1,,,BPI: Gold[17] ARIA: Gold[18] MC: Gold[19] RIAA...,"UK: 750,000[22]"
4,Help!,Released: 6 August 1965 Label: Parlophone,1,1,,5,1,,,BPI: Platinum[17] ARIA: Gold[18],
5,Rubber Soul,Released: 3 December 1965 Label: Parlophone,1,1,,5,1,,,BPI: 2× Platinum[17] ARIA: Platinum[18] BVMI: ...,
6,Revolver,Released: 5 August 1966 Label: Parlophone,1,1,,5,1,14,,BPI: 2× Platinum[17] ARIA: Platinum[18],
7,Sgt. Pepper's Lonely Hearts Club Band,Released: 26 May 1967[23] Label: Parlophone,1,1,1,4,1,1,1,BPI: 18× Platinum[17] ARIA: 4× Platinum[18] BV...,"UK: 5,340,000[25]"
8,"The Beatles (""The White Album"")",Released: 22 November 1968 Label: Apple,1,1,1,1,1,1,1,BPI: 2× Platinum[17] ARIA: 2× Platinum[18] MC:...,
9,Yellow Submarine[C],Released: 17 January 1969 Label: Apple,3,4,1,4,5,1,2,BPI: Gold[17] MC: Gold[19] RIAA: Platinum[20],


In [178]:
df = dfs[0]
df.columns = ['Title', 'Release', 'UK', 'AUS', 'CAN', 'FRA', 'GER',
              'NOR', 'US', 'Certifications', 'Sales']
df

Unnamed: 0,Title,Release,UK,AUS,CAN,FRA,GER,NOR,US,Certifications,Sales
0,Please Please Me,Released: 22 March 1963 Label: Parlophone,1,,,5,5,,155,BPI: Platinum[17] ARIA: Gold[18] MC: Gold[19] ...,
1,With the Beatles[B],Released: 22 November 1963 Label: Parlophone (...,1,,,5,1,,179,BPI: Gold[17] ARIA: Gold[18] BVMI: Gold[21] MC...,
2,A Hard Day's Night,Released: 10 July 1964 Label: Parlophone,1,1,,,1,,,BPI: Platinum[17] ARIA: Gold[18],
3,Beatles for Sale,Released: 4 December 1964 Label: Parlophone,1,1,,,1,,,BPI: Gold[17] ARIA: Gold[18] MC: Gold[19] RIAA...,"UK: 750,000[22]"
4,Help!,Released: 6 August 1965 Label: Parlophone,1,1,,5,1,,,BPI: Platinum[17] ARIA: Gold[18],
5,Rubber Soul,Released: 3 December 1965 Label: Parlophone,1,1,,5,1,,,BPI: 2× Platinum[17] ARIA: Platinum[18] BVMI: ...,
6,Revolver,Released: 5 August 1966 Label: Parlophone,1,1,,5,1,14,,BPI: 2× Platinum[17] ARIA: Platinum[18],
7,Sgt. Pepper's Lonely Hearts Club Band,Released: 26 May 1967[23] Label: Parlophone,1,1,1,4,1,1,1,BPI: 18× Platinum[17] ARIA: 4× Platinum[18] BV...,"UK: 5,340,000[25]"
8,"The Beatles (""The White Album"")",Released: 22 November 1968 Label: Apple,1,1,1,1,1,1,1,BPI: 2× Platinum[17] ARIA: 2× Platinum[18] MC:...,
9,Yellow Submarine[C],Released: 17 January 1969 Label: Apple,3,4,1,4,5,1,2,BPI: Gold[17] MC: Gold[19] RIAA: Platinum[20],


In [180]:
!pip install html5lib

Defaulting to user installation because normal site-packages is not writeable


In [182]:
import html5lib
import lxml
from bs4 import BeautifulSoup
import pandas as pd

In [184]:
html5lib.__version__

'1.1'

In [186]:
url = 'https://github.com/mattharrison/datasets/blob/master/data/anscombes.csv'
dfs = pd.read_html(url, attrs={'class': 'csv-data'})
len(dfs)

ValueError: No tables found

In [None]:
dfs[0]