# DataFrame
DataFrame là một cấu trúc dữ liệu 2D, được tổ chức theo dòng và cột.

In [1]:
import numpy as np
import pandas as pd

## 1. Khởi tạo DataFrame
**`pandas.DataFrame(data, index, columns, dtype, copy)`**<br>
`data`: dữ liệu (list, dict, series, ndarray, dataframe)<br>
`index`: danh sách nhãn dòng<br>
`columns`: danh sách nhãn cột<br>
`dtype`: kiểu dữ liệu các cột<br>
`copy`: có copy dữ liệu hay không<br>

### 1.1. Khởi tạo từ list

In [2]:
# Build a single-column DataFrame from a flat (1-D) list.
# NOTE(review): the name `list` shadows the Python built-in; it is kept
# here only because the next two cells read this variable.
list = [1, 2, 3, 4]
df = pd.DataFrame(data=list)
df

Unnamed: 0,0
0,1
1,2
2,3
3,4


In [3]:
# Same 1-D data, this time with explicit row labels instead of the default 0..n-1.
# Reads the `list` variable bound by the previous cell.
row_labels = ['a', 'b', 'c', 'd']
df = pd.DataFrame(data=list, index=row_labels)
df

Unnamed: 0,0
a,1
b,2
c,3
d,4


In [4]:
# Same 1-D data with both row labels and a column name supplied.
# Reads the `list` variable bound by an earlier cell.
row_labels = ['a', 'b', 'c', 'd']
column_names = ['CotA']
df = pd.DataFrame(data=list, index=row_labels, columns=column_names)
df

Unnamed: 0,CotA
a,1
b,2
c,3
d,4


In [2]:
# Build from a 2-D list: each inner list is one row, `columns` names the fields.
# The data is stored as `rows` rather than `list` so this cell does not rebind
# the built-in `list` name.
rows = [['Nam', 9], ['Binh', 10]]
df = pd.DataFrame(rows, columns=['Name', 'Age'])
df

Unnamed: 0,Name,Age
0,Nam,9
1,Binh,10


### 1.2. Khởi tạo từ dictionary

In [3]:
# Build from a dict: each key becomes a column name and each value list
# supplies that column's entries (all lists must have the same length).
# Named `people` so the built-in `dict` type is not shadowed for later cells.
people = {
    'name': ['An', 'Tu', 'Nhi'],
    'age': [18, 20, 25],
    'mail': ['an@gmail.com', 'tu@gmail.com', 'nhi@gmail.com']
}
df = pd.DataFrame(people)
df

Unnamed: 0,name,age,mail
0,An,18,an@gmail.com
1,Tu,20,tu@gmail.com
2,Nhi,25,nhi@gmail.com


### 1.3. Khởi tạo từ Series

In [23]:
# Build from Series: one Series per column, combined through a
# column-name -> Series mapping. The three Series share the default 0..2 index.
name_series = pd.Series(['An', 'Tu', 'Nhi'])
age_series = pd.Series([18, 20, 25])
mail_series = pd.Series(['an@gmail.com', 'tu@gmail.com', 'nhi@gmail.com'])

series_by_column = {
    'name': name_series,
    'age': age_series,
    'mail': mail_series,
}
df = pd.DataFrame(series_by_column)
df

Unnamed: 0,name,age,mail
0,An,18,an@gmail.com
1,Tu,20,tu@gmail.com
2,Nhi,25,nhi@gmail.com


### 1.4. Khởi tạo từ file
**`read_csv`, `read_json`, `read_html`, `read_xml`, `read_excel`**

In [None]:
# CSV file read with the default options (comma separator, first line used as header)
insurance = pd.read_csv('insurance.csv')
# Preview the first rows only
insurance.head()

In [None]:
# Tab-separated file: `sep='\t'` switches the field separator from comma to tab
# (`delimiter` is an alias of `sep`).
shark = pd.read_csv('shark.tsv', sep='\t')
# Preview the first rows, as the other loading cells do, instead of echoing the whole frame
shark.head()

In [None]:
# Legacy .xls workbook: read the sheet at position 1 (sheet positions are
# zero-based), skipping the first 20 rows above the table and the last 2 rows
# below it.
# Stored as `canada_xls` rather than `df`: section 4 of this notebook displays
# and extends `df` expecting the name/age/mail frame, so `df` must not be
# rebound here.
canada_xls = pd.read_excel('Canada.xls', sheet_name=1, skiprows=20, skipfooter=2)
canada_xls.head()

In [None]:
# .xlsx workbook read with an explicitly chosen engine (openpyxl); same sheet
# position and header/footer trimming as the .xls example.
canada = pd.read_excel('Canada.xlsx', sheet_name=1, skiprows=20, skipfooter=2, engine='openpyxl')
# Preview the first rows (the empty subscript `canada[]` is a SyntaxError)
canada.head()

In [None]:
# .xlsx workbook read with the openpyxl engine; `index_col=0` uses the first
# column of the sheet as the row index instead of the default 0..n-1.
movies = pd.read_excel('movies.xlsx', index_col=0, engine='openpyxl')
# Preview the first rows, as the other loading cells do, instead of echoing the whole frame
movies.head()

In [9]:
# Load the Euro 2012 team statistics used in the following sections;
# index_col=0 takes the first CSV column as the row index.
euro = pd.read_csv('euro2012.csv', index_col=0)
# Preview the first rows
euro.head()

Unnamed: 0,Team,Goals,Shots on target,Shots off target,Shooting Accuracy,% Goals-to-shots,Total shots (inc. Blocked),Hit Woodwork,Penalty goals,Penalties not scored,...,Saves made,Saves-to-shots ratio,Fouls Won,Fouls Conceded,Offsides,Yellow Cards,Red Cards,Subs on,Subs off,Players Used
0,Croatia,4,13,12,51.9%,16.0%,32,0,0,0,...,13,81.3%,41,62,2,9,0,9,9,16
1,Czech Republic,4,13,18,41.9%,12.9%,39,0,0,0,...,9,60.1%,53,73,8,7,0,11,11,19
2,Denmark,4,10,10,50.0%,20.0%,27,1,0,0,...,10,66.7%,25,38,8,4,0,7,7,15
3,England,5,11,18,50.0%,17.2%,40,0,0,0,...,22,88.1%,43,45,6,5,0,11,11,16
4,France,3,22,24,37.9%,6.5%,65,1,0,0,...,6,54.6%,36,51,5,6,0,11,11,19


## 2. Xem thông tin DataFrame

In [None]:
# (number of rows, number of columns) as a tuple
euro.shape

In [None]:
# Column labels of the frame
euro.columns

In [None]:
# Data type of each column
euro.dtypes

In [None]:
# Full summary: index range, per-column non-null count and dtype, memory usage
euro.info()

In [None]:
# Summary statistics; with default arguments on a mixed-type frame only the
# numeric columns are described
euro.describe()

## 3. Truy xuất DataFrame

### 3.1. Truy xuất theo index

In [10]:
# Show the first rows again as a reference for the indexing examples below
euro.head()

Unnamed: 0,Team,Goals,Shots on target,Shots off target,Shooting Accuracy,% Goals-to-shots,Total shots (inc. Blocked),Hit Woodwork,Penalty goals,Penalties not scored,...,Saves made,Saves-to-shots ratio,Fouls Won,Fouls Conceded,Offsides,Yellow Cards,Red Cards,Subs on,Subs off,Players Used
0,Croatia,4,13,12,51.9%,16.0%,32,0,0,0,...,13,81.3%,41,62,2,9,0,9,9,16
1,Czech Republic,4,13,18,41.9%,12.9%,39,0,0,0,...,9,60.1%,53,73,8,7,0,11,11,19
2,Denmark,4,10,10,50.0%,20.0%,27,1,0,0,...,10,66.7%,25,38,8,4,0,7,7,15
3,England,5,11,18,50.0%,17.2%,40,0,0,0,...,22,88.1%,43,45,6,5,0,11,11,16
4,France,3,22,24,37.9%,6.5%,65,1,0,0,...,6,54.6%,36,51,5,6,0,11,11,19


In [13]:
# Row 0, column 0 by integer position (iloc).
# Not displayed: a cell only echoes its last expression.
euro.iloc[0, 0]
# Row labelled 0, column labelled Team (loc); this is the value shown as output
euro.loc[0, 'Team']

'Croatia'

In [15]:
# Row 0, column positions 0 and 1 (iloc; the end of the slice `:2` is exclusive) -> Series.
# Not displayed: a cell only echoes its last expression.
euro.iloc[0,:2]
# Row labelled 0, columns Team and Goals (loc) -> Series; this is the value shown as output
euro.loc[0, ['Team', 'Goals']]

Team     Croatia
Goals          4
Name: 0, dtype: object

In [16]:
# Row 0, column positions 0 and 1 (iloc); passing the row as a list [0] keeps
# the result a DataFrame. Not displayed: a cell only echoes its last expression.
euro.iloc[[0], [0,1]]
# Row labelled 0, columns Team and Goals (loc) -> DataFrame; this is the value shown as output
euro.loc[[0], ['Team', 'Goals']]

Unnamed: 0,Team,Goals
0,Croatia,4


In [25]:
# Rows labelled 2 through 6 and the columns Team, Goals.
# A label slice with .loc includes BOTH endpoints, so 2:6 returns five rows (2, 3, 4, 5, 6).
euro.loc[2:6, ['Team', 'Goals']]

Unnamed: 0,Team,Goals
2,Denmark,4
3,England,5
4,France,3
5,Germany,10
6,Greece,5


In [28]:
# Column at position 0 (iloc); the list [0] keeps the result a DataFrame.
# Only the last expression of the cell is displayed; the three forms select the same column.
euro.iloc[:, [0]].head(3)
# Column Team by label (loc)
euro.loc[:, ['Team']].head(3)
# Shorthand for the same selection
euro[['Team']].head(3)

Unnamed: 0,Team
0,Croatia
1,Czech Republic
2,Denmark


### 3.2. Truy xuất theo điều kiện

In [None]:
# Comparing a column with a scalar gives one True/False value per row;
# this mask is what the .loc examples below use as their row selector.
euro['Goals'] == 4 # -> boolean Series

In [31]:
# Teams that scored exactly 4 goals: build the boolean row mask first,
# then pass it to .loc together with the columns to keep.
scored_four = euro['Goals'] == 4
euro.loc[scored_four, ['Team', 'Goals']]

Unnamed: 0,Team,Goals
0,Croatia,4
1,Czech Republic,4
2,Denmark,4


In [32]:
# Goals scored by Croatia: select the matching row(s) with a mask on Team,
# then take the single column Goals (the result is a Series).
is_croatia = euro['Team'] == 'Croatia'
euro.loc[is_croatia, 'Goals']

0    4
Name: Goals, dtype: int64

In [33]:
# Team(s) with the highest goal count: compute the maximum once, then keep
# every row whose Goals equals it (ties would all be returned).
most_goals = euro['Goals'].max()
euro.loc[euro['Goals'] == most_goals, 'Team']

13    Spain
Name: Team, dtype: object

In [36]:
# Use the Team column as the row index; set_index returns a new frame,
# which is stored under a separate name here.
new_euro = euro.set_index('Team')
new_euro.head(3)

Unnamed: 0_level_0,Goals,Shots on target,Shots off target,Shooting Accuracy,% Goals-to-shots,Total shots (inc. Blocked),Hit Woodwork,Penalty goals,Penalties not scored,Headed goals,...,Saves made,Saves-to-shots ratio,Fouls Won,Fouls Conceded,Offsides,Yellow Cards,Red Cards,Subs on,Subs off,Players Used
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Croatia,4,13,12,51.9%,16.0%,32,0,0,0,2,...,13,81.3%,41,62,2,9,0,9,9,16
Czech Republic,4,13,18,41.9%,12.9%,39,0,0,0,0,...,9,60.1%,53,73,8,7,0,11,11,19
Denmark,4,10,10,50.0%,20.0%,27,1,0,0,3,...,10,66.7%,25,38,8,4,0,7,7,15


## 4. Thao tác với DataFrame

### 4.1. Thêm vào DataFrame

In [11]:
# Starting point for this section: `df` is expected to be the name/age/mail
# frame built in section 1; any cell that rebinds `df` in between changes what
# is shown here.
df

Unnamed: 0,name,age,mail
0,An,18,an@gmail.com
1,Tu,20,tu@gmail.com
2,Nhi,25,nhi@gmail.com


In [12]:
# Append a row with concat: wrap the new record in a one-row DataFrame that
# has the same columns as `df`.
# The age is the integer 32, not the string '32', so the `age` column keeps
# its integer dtype after concatenation instead of degrading to object.
new_df = pd.DataFrame([['Minh', 32, 'minh@gmail.com']], columns=['name', 'age', 'mail'])
# The appended row keeps its own index label (0), so the result contains label 0 twice
pd.concat([df, new_df])

Unnamed: 0,name,age,mail
0,An,18,an@gmail.com
1,Tu,20,tu@gmail.com
2,Nhi,25,nhi@gmail.com
0,Minh,32,minh@gmail.com


In [13]:
# Append with concat and renumber the rows: ignore_index=True discards the
# original labels and assigns a fresh 0..n-1 index to the result.
pd.concat([df, new_df], ignore_index=True)

Unnamed: 0,name,age,mail
0,An,18,an@gmail.com
1,Tu,20,tu@gmail.com
2,Nhi,25,nhi@gmail.com
3,Minh,32,minh@gmail.com


In [18]:
# Append with concat, giving the new row an explicit index label (11).
# The age is the integer 32, not the string '32', so the `age` column keeps
# its integer dtype after concatenation instead of degrading to object.
new_df2 = pd.DataFrame([['Minh', 32, 'minh@gmail.com']], columns=['name', 'age', 'mail'], index=[11])
pd.concat([df, new_df2])

Unnamed: 0,name,age,mail
0,An,18,an@gmail.com
1,Tu,20,tu@gmail.com
2,Nhi,25,nhi@gmail.com
11,Minh,32,minh@gmail.com


### 4.2. Sửa giá trị trong DataFrame

In [24]:
# Change An's email address.
# Index the rows by name first so a row can be addressed by its label.
df2 = df.set_index('name')
# .loc[row_label, column_label] = value assigns a single cell in place
df2.loc['An', 'mail'] = 'an@hotmail.com'
df2

Unnamed: 0_level_0,age,mail
name,Unnamed: 1_level_1,Unnamed: 2_level_1
An,18,an@hotmail.com
Tu,20,tu@gmail.com
Nhi,25,nhi@gmail.com


In [26]:
# Conditional update: for every row whose age is greater than 20, replace the
# mail value with '-'. The boolean mask selects the rows, 'mail' the column.
df2.loc[df2['age'] > 20, 'mail'] = '-'
df2

Unnamed: 0_level_0,age,mail
name,Unnamed: 1_level_1,Unnamed: 2_level_1
An,18,an@hotmail.com
Tu,20,tu@gmail.com
Nhi,25,-


### 4.3. Xóa trong DataFrame

In [27]:
# Remove the row labelled 'Nhi'. drop returns a new frame and leaves df2
# itself unchanged (the following cells still list Nhi).
df2.drop(index=['Nhi'])

Unnamed: 0_level_0,age,mail
name,Unnamed: 1_level_1,Unnamed: 2_level_1
An,18,an@hotmail.com
Tu,20,tu@gmail.com


In [28]:
# Remove the 'age' column. As with rows, the result is a new frame and df2
# is not modified.
df2.drop(columns=['age'])

Unnamed: 0_level_0,mail
name,Unnamed: 1_level_1
An,an@hotmail.com
Tu,tu@gmail.com
Nhi,-


In [29]:
# Rename the column 'mail' to 'email' through an old-name -> new-name mapping
# applied along the column axis; the renamed frame is returned, df2 keeps its
# original labels.
df2.rename({'mail': 'email'}, axis='columns')

Unnamed: 0_level_0,age,email
name,Unnamed: 1_level_1,Unnamed: 2_level_1
An,18,an@hotmail.com
Tu,20,tu@gmail.com
Nhi,25,-


## 5. Thống kê