# Pandas - DataFrame
DataFrame là một cấu trúc dữ liệu 2D, được tổ chức theo dòng và cột.

In [1]:
import numpy as np
import pandas as pd

## 1. Khởi tạo DataFrame
**`pandas.DataFrame(data, index, columns, dtype, copy)`**<br>
`data`: dữ liệu (list, dict, series, ndarray, dataframe)<br>
`index`: danh sách nhãn dòng<br>
`columns`: danh sách nhãn cột<br>
`dtype`: kiểu dữ liệu các cột<br>
`copy`: có copy dữ liệu hay không<br>

### 1.1. Khởi tạo từ list

In [2]:
# Tạo từ list
list = [1, 2, 3, 4]
df = pd.DataFrame(list)
df

Unnamed: 0,0
0,1
1,2
2,3
3,4


In [3]:
# Tạo từ list với index gắn nhãn
df = pd.DataFrame(list, index=['a','b','c','d'])
df

Unnamed: 0,0
a,1
b,2
c,3
d,4


In [4]:
# Create from a 2-D list: each inner list is one row.
# Named `rows` so the builtin `list` is not shadowed.
rows = [['Nam',9], ['Binh',10]]
df = pd.DataFrame(rows, columns=['Name', 'Age'], index=['hv1', 'hv2'])
df

Unnamed: 0,Name,Age
hv1,Nam,9
hv2,Binh,10


### 1.2. Khởi tạo từ dictionary

In [5]:
# Create from a dict of equal-length lists: each key becomes a column name.
# Named `data` so the builtin `dict` is not shadowed.
data = {
    'name': ['An', 'Tu', 'Nhi'],
    'age': [18, 20, 25],
    'mail': ['an@gmail.com', 'tu@gmail.com', 'nhi@gmail.com']
}
df = pd.DataFrame(data)
df

Unnamed: 0,name,age,mail
0,An,18,an@gmail.com
1,Tu,20,tu@gmail.com
2,Nhi,25,nhi@gmail.com


### 1.3. Khởi tạo từ Series

In [6]:
# Rebuild the same table from Series: one Series per column
name_series = pd.Series(['An', 'Tu', 'Nhi'])
age_series = pd.Series([18, 20, 25])
mail_series = pd.Series(['an@gmail.com', 'tu@gmail.com', 'nhi@gmail.com'])

# Map each column label to its Series, then hand the mapping to DataFrame
series_by_column = {
    'name': name_series,
    'age': age_series,
    'mail': mail_series,
}
df = pd.DataFrame(series_by_column)
df

Unnamed: 0,name,age,mail
0,An,18,an@gmail.com
1,Tu,20,tu@gmail.com
2,Nhi,25,nhi@gmail.com


### 1.4. Khởi tạo từ file
**`read_csv`, `read_json`, `read_html`, `read_xml`, `read_excel`**

In [7]:
# CSV file: load it into a DataFrame and preview the first rows
insurance = pd.read_csv('data/insurance.csv')
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [42]:
# TSV file: same reader, with a tab character as the field delimiter
shark = pd.read_csv('data/shark.tsv', delimiter='\t')
shark.head()

Unnamed: 0,Shark Attacks,USA,Australia,South Africa
0,Jan,28,94,68
1,Feb,27,78,32
2,Mar,66,63,34
3,Apr,103,54,25
4,May,106,21,25


In [9]:
# Excel (.xlsx) file read with the openpyxl engine: takes the sheet at
# position 1 (zero-indexed), skipping its first 20 rows and last 2 rows
canada = pd.read_excel('data/Canada.xlsx', sheet_name=1, skiprows=20, skipfooter=2, engine='openpyxl')
canada.head(3)

Unnamed: 0,Type,Coverage,OdName,AREA,AreaName,REG,RegName,DEV,DevName,1980,...,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013
0,Immigrants,Foreigners,Afghanistan,935,Asia,5501,Southern Asia,902,Developing regions,16,...,2978,3436,3009,2652,2111,1746,1758,2203,2635,2004
1,Immigrants,Foreigners,Albania,908,Europe,925,Southern Europe,901,Developed regions,1,...,1450,1223,856,702,560,716,561,539,620,603
2,Immigrants,Foreigners,Algeria,903,Africa,912,Northern Africa,902,Developing regions,80,...,3616,3626,4807,3623,4005,5393,4752,4325,3774,4331


In [10]:
# Excel (.xlsx) file read with the openpyxl engine, using the first column as the row index
movies = pd.read_excel('data/movies.xlsx', index_col=0, engine='openpyxl')
movies.head(3)

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance


In [11]:
# CSV file, using the first column of the file as the row index
euro = pd.read_csv('data/euro2012.csv', index_col=0)
euro.head()

Unnamed: 0,Team,Goals,Shots on target,Shots off target,Shooting Accuracy,% Goals-to-shots,Total shots (inc. Blocked),Hit Woodwork,Penalty goals,Penalties not scored,...,Saves made,Saves-to-shots ratio,Fouls Won,Fouls Conceded,Offsides,Yellow Cards,Red Cards,Subs on,Subs off,Players Used
0,Croatia,4,13,12,51.9%,16.0%,32,0,0,0,...,13,81.3%,41,62,2,9,0,9,9,16
1,Czech Republic,4,13,18,41.9%,12.9%,39,0,0,0,...,9,60.1%,53,73,8,7,0,11,11,19
2,Denmark,4,10,10,50.0%,20.0%,27,1,0,0,...,10,66.7%,25,38,8,4,0,7,7,15
3,England,5,11,18,50.0%,17.2%,40,0,0,0,...,22,88.1%,43,45,6,5,0,11,11,16
4,France,3,22,24,37.9%,6.5%,65,1,0,0,...,6,54.6%,36,51,5,6,0,11,11,19


## 2. Xem thông tin DataFrame

In [12]:
# (number of rows, number of columns) as a tuple
euro.shape

(16, 35)

In [13]:
# Column labels
euro.columns

Index(['Team', 'Goals', 'Shots on target', 'Shots off target',
       'Shooting Accuracy', '% Goals-to-shots', 'Total shots (inc. Blocked)',
       'Hit Woodwork', 'Penalty goals', 'Penalties not scored', 'Headed goals',
       'Passes', 'Passes completed', 'Passing Accuracy', 'Touches', 'Crosses',
       'Dribbles', 'Corners Taken', 'Tackles', 'Clearances', 'Interceptions',
       'Clearances off line', 'Clean Sheets', 'Blocks', 'Goals conceded',
       'Saves made', 'Saves-to-shots ratio', 'Fouls Won', 'Fouls Conceded',
       'Offsides', 'Yellow Cards', 'Red Cards', 'Subs on', 'Subs off',
       'Players Used'],
      dtype='object')

In [14]:
# Data type of each column
euro.dtypes

Team                           object
Goals                           int64
Shots on target                 int64
Shots off target                int64
Shooting Accuracy              object
% Goals-to-shots               object
Total shots (inc. Blocked)      int64
Hit Woodwork                    int64
Penalty goals                   int64
Penalties not scored            int64
Headed goals                    int64
Passes                          int64
Passes completed                int64
Passing Accuracy               object
Touches                         int64
Crosses                         int64
Dribbles                        int64
Corners Taken                   int64
Tackles                         int64
Clearances                      int64
Interceptions                   int64
Clearances off line           float64
Clean Sheets                    int64
Blocks                          int64
Goals conceded                  int64
Saves made                      int64
Saves-to-sho

In [15]:
# Full summary: index range, columns, non-null counts and dtypes
euro.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16 entries, 0 to 15
Data columns (total 35 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Team                        16 non-null     object 
 1   Goals                       16 non-null     int64  
 2   Shots on target             16 non-null     int64  
 3   Shots off target            16 non-null     int64  
 4   Shooting Accuracy           16 non-null     object 
 5   % Goals-to-shots            16 non-null     object 
 6   Total shots (inc. Blocked)  16 non-null     int64  
 7   Hit Woodwork                16 non-null     int64  
 8   Penalty goals               16 non-null     int64  
 9   Penalties not scored        16 non-null     int64  
 10  Headed goals                16 non-null     int64  
 11  Passes                      16 non-null     int64  
 12  Passes completed            16 non-null     int64  
 13  Passing Accuracy            16 non-nu

In [16]:
# Summary statistics; the result covers the numeric columns
euro.describe()

Unnamed: 0,Goals,Shots on target,Shots off target,Total shots (inc. Blocked),Hit Woodwork,Penalty goals,Penalties not scored,Headed goals,Passes,Passes completed,...,Goals conceded,Saves made,Fouls Won,Fouls Conceded,Offsides,Yellow Cards,Red Cards,Subs on,Subs off,Players Used
count,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,...,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0
mean,4.75,17.125,24.9375,54.9375,1.25,0.1875,0.0625,1.375,1763.375,1467.375,...,4.75,12.125,52.9375,55.625,8.125,7.4375,0.1875,10.875,10.875,17.25
std,2.886751,10.582218,10.680005,26.065223,1.612452,0.403113,0.25,1.024695,906.177898,827.580721,...,1.983263,4.573474,22.915697,18.973227,4.910872,3.265348,0.403113,3.53789,3.53789,1.527525
min,1.0,7.0,10.0,27.0,0.0,0.0,0.0,0.0,851.0,606.0,...,1.0,6.0,25.0,30.0,2.0,4.0,0.0,7.0,7.0,15.0
25%,2.75,9.75,18.0,36.5,0.0,0.0,0.0,0.75,1190.75,951.5,...,3.0,9.75,35.75,44.5,4.0,5.0,0.0,8.5,8.5,16.0
50%,4.5,13.0,23.5,44.0,1.0,0.0,0.0,1.5,1522.0,1211.5,...,5.0,11.0,45.5,51.0,7.5,6.5,0.0,10.5,10.5,17.0
75%,5.25,22.0,32.25,68.75,2.0,0.0,0.0,2.0,1934.75,1546.5,...,6.0,13.5,64.0,64.75,11.25,9.0,0.0,12.5,12.5,18.25
max,12.0,42.0,45.0,110.0,6.0,1.0,1.0,3.0,4317.0,3820.0,...,9.0,22.0,102.0,90.0,19.0,16.0,1.0,18.0,18.0,20.0


## 3. Truy xuất DataFrame

### 3.1. Truy xuất theo index
``df.iloc[row-index, col-index]`` <br>
``df.loc[row-label, col-label]``

In [17]:
# Preview the first rows used by the indexing examples below
euro.head()

Unnamed: 0,Team,Goals,Shots on target,Shots off target,Shooting Accuracy,% Goals-to-shots,Total shots (inc. Blocked),Hit Woodwork,Penalty goals,Penalties not scored,...,Saves made,Saves-to-shots ratio,Fouls Won,Fouls Conceded,Offsides,Yellow Cards,Red Cards,Subs on,Subs off,Players Used
0,Croatia,4,13,12,51.9%,16.0%,32,0,0,0,...,13,81.3%,41,62,2,9,0,9,9,16
1,Czech Republic,4,13,18,41.9%,12.9%,39,0,0,0,...,9,60.1%,53,73,8,7,0,11,11,19
2,Denmark,4,10,10,50.0%,20.0%,27,1,0,0,...,10,66.7%,25,38,8,4,0,7,7,15
3,England,5,11,18,50.0%,17.2%,40,0,0,0,...,22,88.1%,43,45,6,5,0,11,11,16
4,France,3,22,24,37.9%,6.5%,65,1,0,0,...,6,54.6%,36,51,5,6,0,11,11,19


In [18]:
# Row position 0, column position 0 (iloc) -> scalar
euro.iloc[0, 0]

'Croatia'

In [19]:
# Row label 0, column label 'Team' (loc) -> scalar
euro.loc[0, 'Team']

'Croatia'

In [20]:
# Row position 0, column positions 0-1 (iloc) -> Series
euro.iloc[0,:2]

Team     Croatia
Goals          4
Name: 0, dtype: object

In [21]:
# Row label 0, columns 'Team' and 'Goals' (loc) -> Series
euro.loc[0, ['Team', 'Goals']]

Team     Croatia
Goals          4
Name: 0, dtype: object

In [22]:
# Row position 0, column positions 0-1 (iloc); list selectors on both axes -> DataFrame
euro.iloc[[0], [0,1]]

Unnamed: 0,Team,Goals
0,Croatia,4


In [23]:
# Row label 0, columns 'Team' and 'Goals' (loc); list selectors on both axes -> DataFrame
euro.loc[[0], ['Team', 'Goals']]

Unnamed: 0,Team,Goals
0,Croatia,4


In [24]:
# Rows labelled 2 through 6 (a loc slice includes its end label), columns 'Team' and 'Goals'
euro.loc[2:6, ['Team', 'Goals']]

Unnamed: 0,Team,Goals
2,Denmark,4
3,England,5
4,France,3
5,Germany,10
6,Greece,5


In [25]:
# Select the column at position 0 (iloc) and show its first 3 rows
euro.iloc[:, [0]].head(3)

Unnamed: 0,Team
0,Croatia
1,Czech Republic
2,Denmark


In [26]:
# Select the 'Team' column by label (loc)
euro.loc[:, ['Team']].head(3)
# Shorthand for the same selection; only this last expression is displayed
euro[['Team']].head(3)

Unnamed: 0,Team
0,Croatia
1,Czech Republic
2,Denmark


### 3.2. Truy xuất theo điều kiện

In [27]:
# Element-wise comparison used as a row filter in the cells below
euro['Goals'] == 4 # -> boolean Series, one entry per row

0      True
1      True
2      True
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
Name: Goals, dtype: bool

In [28]:
# Teams with Goals == 4: boolean mask for rows, label list for columns
euro.loc[euro['Goals'] == 4, ['Team', 'Goals']]

Unnamed: 0,Team,Goals
0,Croatia,4
1,Czech Republic,4
2,Denmark,4


In [29]:
# Goals scored by Croatia
euro.loc[euro['Team'] == 'Croatia', 'Goals']

0    4
Name: Goals, dtype: int64

In [30]:
# Team(s) whose goal count equals the maximum
euro.loc[euro['Goals'] == euro['Goals'].max(), 'Team']

13    Spain
Name: Team, dtype: object

In [31]:
# Use the 'Team' column as the index; the result is stored in new_euro
new_euro = euro.set_index('Team')
new_euro.head(3)

Unnamed: 0_level_0,Goals,Shots on target,Shots off target,Shooting Accuracy,% Goals-to-shots,Total shots (inc. Blocked),Hit Woodwork,Penalty goals,Penalties not scored,Headed goals,...,Saves made,Saves-to-shots ratio,Fouls Won,Fouls Conceded,Offsides,Yellow Cards,Red Cards,Subs on,Subs off,Players Used
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Croatia,4,13,12,51.9%,16.0%,32,0,0,0,2,...,13,81.3%,41,62,2,9,0,9,9,16
Czech Republic,4,13,18,41.9%,12.9%,39,0,0,0,0,...,9,60.1%,53,73,8,7,0,11,11,19
Denmark,4,10,10,50.0%,20.0%,27,1,0,0,3,...,10,66.7%,25,38,8,4,0,7,7,15


## 4. Thao tác với DataFrame

### 4.1. Thêm vào DataFrame

In [32]:
# Current contents of df, before rows and columns are added
df

Unnamed: 0,name,age,mail
0,An,18,an@gmail.com
1,Tu,20,tu@gmail.com
2,Nhi,25,nhi@gmail.com


In [33]:
# Append a row with concat; the appended row keeps its own index label (0)
# NOTE(review): age is passed as the string '23', unlike the integer ages above — confirm intended
# NOTE(review): df is reassigned from itself, so re-running this cell appends the row again
new_df = pd.DataFrame([['Minh', '23', 'minh@gmail.com']], columns=['name', 'age', 'mail'])
df = pd.concat([df, new_df])
df

Unnamed: 0,name,age,mail
0,An,18,an@gmail.com
1,Tu,20,tu@gmail.com
2,Nhi,25,nhi@gmail.com
0,Minh,23,minh@gmail.com


In [34]:
# Append with concat and renumber the index from 0 (result is displayed, not assigned)
pd.concat([df, new_df], ignore_index=True)

Unnamed: 0,name,age,mail
0,An,18,an@gmail.com
1,Tu,20,tu@gmail.com
2,Nhi,25,nhi@gmail.com
3,Minh,23,minh@gmail.com
4,Minh,23,minh@gmail.com


In [35]:
# Append with concat, giving the new row a custom index label (11)
new_df2 = pd.DataFrame([['Minh', '32', 'minh@gmail.com']], columns=['name', 'age', 'mail'], index=[11])
pd.concat([df, new_df2])

Unnamed: 0,name,age,mail
0,An,18,an@gmail.com
1,Tu,20,tu@gmail.com
2,Nhi,25,nhi@gmail.com
0,Minh,23,minh@gmail.com
11,Minh,32,minh@gmail.com


In [36]:
# Add columns: a constant value for every row, and one derived from 'age'
df['sex'] = 'male'
# 'age' is cast to float before the division
df['avg'] = df['age'].astype(float)/2
df

Unnamed: 0,name,age,mail,sex,avg
0,An,18,an@gmail.com,male,9.0
1,Tu,20,tu@gmail.com,male,10.0
2,Nhi,25,nhi@gmail.com,male,12.5
0,Minh,23,minh@gmail.com,male,11.5


In [37]:
# Add a column computed per value: 'dat' when avg >= 10, otherwise 'khong dat'
df['ketqua'] = df['avg'].map(lambda x: 'dat' if x >= 10 else 'khong dat')
df

Unnamed: 0,name,age,mail,sex,avg,ketqua
0,An,18,an@gmail.com,male,9.0,khong dat
1,Tu,20,tu@gmail.com,male,10.0,dat
2,Nhi,25,nhi@gmail.com,male,12.5,dat
0,Minh,23,minh@gmail.com,male,11.5,dat


In [38]:
# Add a column with apply
def salutation(x):
    """Build a greeting line for one record.

    Args:
        x: A row-like mapping with ``'sex'`` and ``'name'`` keys, e.g. a
            DataFrame row passed by ``df.apply(..., axis=1)`` or a plain dict.

    Returns:
        ``'Dear Mr. <name>'`` when ``x['sex'] == 'male'``, otherwise
        ``'Dear Ms. <name>'``.
    """
    # Key access for both fields: on a Series, attribute access is unreliable
    # (``x.name`` is the row label, not the 'name' column), and a dict has no
    # attribute access at all.
    title = 'Mr.' if x['sex'] == 'male' else 'Ms.'
    return 'Dear ' + title + ' ' + x['name']
# Call salutation once per row (axis=1) and store the results as a new column
df['salutation'] = df.apply(salutation, axis=1)
df.head()

Unnamed: 0,name,age,mail,sex,avg,ketqua,salutation
0,An,18,an@gmail.com,male,9.0,khong dat,Dear Mr. An
1,Tu,20,tu@gmail.com,male,10.0,dat,Dear Mr. Tu
2,Nhi,25,nhi@gmail.com,male,12.5,dat,Dear Mr. Nhi
0,Minh,23,minh@gmail.com,male,11.5,dat,Dear Mr. Minh


In [39]:
# Add a rank column (1 = highest avg) and show the rows ordered by it.
# method='min' gives tied values the same whole-number rank; the default
# method='average' can produce fractional ranks (e.g. 2.5) that the int cast
# would silently truncate.
df['hang'] = df['avg'].rank(ascending=False, method='min').astype(int)
df.sort_values(by='hang')

Unnamed: 0,name,age,mail,sex,avg,ketqua,salutation,hang
2,Nhi,25,nhi@gmail.com,male,12.5,dat,Dear Mr. Nhi,1
0,Minh,23,minh@gmail.com,male,11.5,dat,Dear Mr. Minh,2
1,Tu,20,tu@gmail.com,male,10.0,dat,Dear Mr. Tu,3
0,An,18,an@gmail.com,male,9.0,khong dat,Dear Mr. An,4


### 4.2. Sửa giá trị trong DataFrame

In [40]:
# Change An's email on a separate, name-indexed copy so df stays untouched
df2 = df.copy().set_index('name')
df2.loc['An', 'mail'] = 'an@hotmail.com'
df2

Unnamed: 0_level_0,age,mail,sex,avg,ketqua,salutation,hang
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
An,18,an@hotmail.com,male,9.0,khong dat,Dear Mr. An,4
Tu,20,tu@gmail.com,male,10.0,dat,Dear Mr. Tu,3
Nhi,25,nhi@gmail.com,male,12.5,dat,Dear Mr. Nhi,1
Minh,23,minh@gmail.com,male,11.5,dat,Dear Mr. Minh,2


In [45]:
# Convert the listed columns to a numeric dtype.
# errors='ignore' is deprecated in pandas and silently returns the input
# unchanged when parsing fails; the default errors='raise' surfaces bad values.
cols = ['age']
df2[cols] = df2[cols].apply(pd.to_numeric)
df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, An to Minh
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   age         4 non-null      int64  
 1   mail        4 non-null      object 
 2   sex         4 non-null      object 
 3   avg         4 non-null      float64
 4   ketqua      4 non-null      object 
 5   salutation  4 non-null      object 
 6   hang        4 non-null      int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 428.0+ bytes


In [46]:
# Conditional update: set 'mail' to '-' on rows where age > 20
df2.loc[df2['age'] > 20, 'mail'] = '-'
df2.head()

Unnamed: 0_level_0,age,mail,sex,avg,ketqua,salutation,hang
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
An,18,an@hotmail.com,male,9.0,khong dat,Dear Mr. An,4
Tu,20,tu@gmail.com,male,10.0,dat,Dear Mr. Tu,3
Nhi,25,-,male,12.5,dat,Dear Mr. Nhi,1
Minh,23,-,male,11.5,dat,Dear Mr. Minh,2


### 4.3. Xóa trong DataFrame

In [47]:
# Drop a row by index label (result is displayed, not assigned back to df2)
df2.drop(['Nhi'])

Unnamed: 0_level_0,age,mail,sex,avg,ketqua,salutation,hang
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
An,18,an@hotmail.com,male,9.0,khong dat,Dear Mr. An,4
Tu,20,tu@gmail.com,male,10.0,dat,Dear Mr. Tu,3
Minh,23,-,male,11.5,dat,Dear Mr. Minh,2


In [48]:
# Drop a column by label (result is displayed, not assigned back to df2)
df2.drop(columns=['age'])

Unnamed: 0_level_0,mail,sex,avg,ketqua,salutation,hang
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
An,an@hotmail.com,male,9.0,khong dat,Dear Mr. An,4
Tu,tu@gmail.com,male,10.0,dat,Dear Mr. Tu,3
Nhi,-,male,12.5,dat,Dear Mr. Nhi,1
Minh,-,male,11.5,dat,Dear Mr. Minh,2


In [49]:
# Rename a column (result is displayed, not assigned back to df2)
df2.rename(columns={'mail': 'email'})

Unnamed: 0_level_0,age,email,sex,avg,ketqua,salutation,hang
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
An,18,an@hotmail.com,male,9.0,khong dat,Dear Mr. An,4
Tu,20,tu@gmail.com,male,10.0,dat,Dear Mr. Tu,3
Nhi,25,-,male,12.5,dat,Dear Mr. Nhi,1
Minh,23,-,male,11.5,dat,Dear Mr. Minh,2


## 5. Thống kê

In [50]:
# Summary statistics for df
df.describe()

Unnamed: 0,avg,hang
count,4.0,4.0
mean,10.75,2.5
std,1.554563,1.290994
min,9.0,1.0
25%,9.75,1.75
50%,10.75,2.5
75%,11.75,3.25
max,12.5,4.0


In [58]:
# Detect missing values: True wherever a cell is null
insurance.isnull()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...
1333,False,False,False,False,False,False,False
1334,False,False,False,False,False,False,False
1335,False,False,False,False,False,False,False
1336,False,False,False,False,False,False,False


In [59]:
# Count missing values per column
insurance.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [60]:
# Rows where 'region' is null
insurance[insurance['region'].isnull()]

Unnamed: 0,age,sex,bmi,children,smoker,region,charges


## 6. Sort

In [51]:
# Preview the data before sorting
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [57]:
# Sort ascending by the 'age' column
insurance.sort_values(by='age')

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1248,18,female,39.820,0,no,southeast,1633.96180
482,18,female,31.350,0,no,southeast,1622.18850
492,18,female,25.080,0,no,northeast,2196.47320
525,18,female,33.880,0,no,southeast,11482.63485
529,18,male,25.460,0,no,northeast,1708.00140
...,...,...,...,...,...,...,...
398,64,male,25.600,2,no,southwest,14988.43200
335,64,male,34.500,0,no,southwest,13822.80300
378,64,female,30.115,3,no,northwest,16455.70785
1265,64,male,23.760,0,yes,southeast,26926.51440


In [56]:
# Sort ascending by 'age', then descending by 'charges'
insurance.sort_values(by=['age', 'charges'], ascending=[True, False])

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
803,18,female,42.240,0,yes,southeast,38792.68560
759,18,male,38.170,0,yes,southeast,36307.79830
161,18,female,36.850,0,yes,southeast,36149.48350
623,18,male,33.535,0,yes,northeast,34617.84065
57,18,male,31.680,2,yes,southeast,34303.16720
...,...,...,...,...,...,...,...
768,64,female,39.700,0,no,southwest,14319.03100
801,64,female,35.970,0,no,southeast,14313.84630
752,64,male,37.905,0,no,northwest,14210.53595
534,64,male,40.480,0,no,southeast,13831.11520


## 7. Utils

In [61]:
# Flag duplicated rows (boolean Series, one entry per row)
insurance.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
1333    False
1334    False
1335    False
1336    False
1337    False
Length: 1338, dtype: bool

In [62]:
# Show rows flagged as duplicates when only the 'charges' column is compared
insurance[insurance.duplicated(subset='charges')]

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
581,19,male,30.59,0,no,northwest,1639.5631


In [63]:
# Drop duplicated rows, keeping the first occurrence (result is displayed, not assigned)
insurance.drop_duplicates(keep='first')

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500
