## 遺失值檢測與丟棄
#### 檢測遺失值

In [1]:
import numpy as np
import pandas as pd

# Demo frame: only the first two rows carry a first_name, the other three are NaN.
raw_data = {
    'first_name': ['Jason', 'Molly'] + [np.nan] * 3,
    'nationality': ['USA', 'USA', 'France', 'UK', 'UK'],
    'age': [42, 52, 36, 24, 70],
}
df = pd.DataFrame(raw_data)
df

Unnamed: 0,first_name,nationality,age
0,Jason,USA,42
1,Molly,USA,52
2,,France,36
3,,UK,24
4,,UK,70


In [4]:
# Continuing from above: element-wise null mask; the three missing entries show up as True.
df.isna()

Unnamed: 0,first_name,nationality,age
0,False,False,False
1,False,False,False
2,True,False,False
3,True,False,False
4,True,False,False


In [5]:
# Continuing from above: chain any() after the null mask to learn, per column,
# whether the data contains a missing value.
# any(): reports whether an iterable holds at least one truthy element.
df.isna().any(axis=0)

first_name      True
nationality    False
age            False
dtype: bool

In [6]:
# Continuing from above: count the missing values down each column (axis 0 = index direction).
df.isna().sum(axis='index')

first_name     3
nationality    0
age            0
dtype: int64

In [7]:
# Continuing from above: count the missing values across each row (axis 1 = columns direction).
df.isna().sum(axis='columns')

0    0
1    0
2    1
3    1
4    1
dtype: int64

In [1]:
# Practice: turn inline text into a pandas DataFrame.
# StringIO wraps the string in a stream, so pandas reads it like a (simulated) file.

from io import StringIO

import numpy as np
import pandas as pd

csv_data = '''
A,B,C,D
1.0,2.0,3.5,4
5.5,34,3.4
10,,11.5,8.5
'''
text_stream = StringIO(csv_data)
df = pd.read_csv(text_stream)
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.5,4.0
1,5.5,34.0,3.4,
2,10.0,,11.5,8.5


#### 忽略遺失值

In [2]:
# Practice: small frame with one missing value in B and one in D.

from io import StringIO

import numpy as np
import pandas as pd

csv_data = '''
A,B,C,D
2,3,4,5
6,34,6
10,,11,8
'''
text_stream = StringIO(csv_data)
df = pd.read_csv(text_stream)
df

Unnamed: 0,A,B,C,D
0,2,3.0,4,5.0
1,6,34.0,6,
2,10,,11,8.0


In [3]:
# Column A is complete while column B holds one NaN. sum(), mean() and count()
# leave NaN out of the calculation, so B is aggregated over its present values only.
for stat_name in ('sum', 'mean', 'count'):
    for column in ('A', 'B'):
        print(getattr(df[[column]], stat_name)())
# Number of missing values per column.
print(df.isnull().sum())

A    18
dtype: int64
B    37.0
dtype: float64
A    6.0
dtype: float64
B    18.5
dtype: float64
A    3
dtype: int64
B    2
dtype: int64
A    0
B    1
C    0
D    1
dtype: int64


#### 丟棄遺失值

In [4]:
# Practice: six-row frame with scattered missing values, including one fully empty row.

from io import StringIO

import numpy as np
import pandas as pd

csv_data = '''
A,B,C,D
2,3,4,5
6,34,6
10,,11,8
,,,
3,3,,
,5,,
'''
text_stream = StringIO(csv_data)
df = pd.read_csv(text_stream)
df

Unnamed: 0,A,B,C,D
0,2.0,3.0,4.0,5.0
1,6.0,34.0,6.0,
2,10.0,,11.0,8.0
3,,,,
4,3.0,3.0,,
5,,5.0,,


In [5]:
# Drop every row that contains at least one missing value.
df1 = df.dropna(axis='index')
df1

Unnamed: 0,A,B,C,D
0,2.0,3.0,4.0,5.0


In [6]:
# Drop every column that contains at least one missing value.
df2 = df.dropna(axis='columns')
df2

0
1
2
3
4
5


In [7]:
# how='all': a row is dropped only when every value in it is missing.
df3 = df.dropna(how='all', axis='index')
df3

Unnamed: 0,A,B,C,D
0,2.0,3.0,4.0,5.0
1,6.0,34.0,6.0,
2,10.0,,11.0,8.0
4,3.0,3.0,,
5,,5.0,,


In [8]:
# Practice: same kind of data plus a column E that is empty in every row.

from io import StringIO

import numpy as np
import pandas as pd

csv_data = '''
A,B,C,D,E
2,3,4,5,
6,34,6,
10,,11,8,
,,,
3,,3,,
,5,,,
'''
text_stream = StringIO(csv_data)
df = pd.read_csv(text_stream)
df

Unnamed: 0,A,B,C,D,E
0,2.0,3.0,4.0,5.0,
1,6.0,34.0,6.0,,
2,10.0,,11.0,8.0,
3,,,,,
4,3.0,,3.0,,
5,,5.0,,,


In [9]:
# thresh=N: rows holding fewer than N observed (non-missing) values are dropped.
df1 = df.dropna(axis=0, thresh=3)
df1

Unnamed: 0,A,B,C,D,E
0,2.0,3.0,4.0,5.0,
1,6.0,34.0,6.0,,
2,10.0,,11.0,8.0,


In [10]:
# Only columns C and D are inspected: a row is dropped when either of them is missing.
required_columns = ['C', 'D']
df2 = df.dropna(subset=required_columns)
df2

Unnamed: 0,A,B,C,D,E
0,2.0,3.0,4.0,5.0,
2,10.0,,11.0,8.0,


In [11]:
# Within columns C and D, keep the rows that have at least one observed value.
required_columns = ['C', 'D']
df3 = df.dropna(thresh=1, subset=required_columns)
df3

Unnamed: 0,A,B,C,D,E
0,2.0,3.0,4.0,5.0,
1,6.0,34.0,6.0,,
2,10.0,,11.0,8.0,
4,3.0,,3.0,,


#### 填充遺失值

In [12]:
# Practice
# fillna(): supply the value that should replace the missing entries.

from io import StringIO

import numpy as np
import pandas as pd

csv_data = '''
A,B,C,D
2,3,4,5
6,34,6
10,,11,8
,,,
3,3,,
,5,,
'''
text_stream = StringIO(csv_data)
df = pd.read_csv(text_stream)
df

Unnamed: 0,A,B,C,D
0,2.0,3.0,4.0,5.0
1,6.0,34.0,6.0,
2,10.0,,11.0,8.0
3,,,,
4,3.0,3.0,,
5,,5.0,,


In [13]:
# Replace every missing value with 0.
df1 = df.fillna(value=0)
df1

Unnamed: 0,A,B,C,D
0,2.0,3.0,4.0,5.0
1,6.0,34.0,6.0,0.0
2,10.0,0.0,11.0,8.0
3,0.0,0.0,0.0,0.0
4,3.0,3.0,0.0,0.0
5,0.0,5.0,0.0,0.0


In [14]:
# Fill with the per-column median: recommended over the mean because the median
# is comparatively unaffected by extreme values.
median1 = df.median(axis='index')
df2 = df.fillna(value=median1)
df2

Unnamed: 0,A,B,C,D
0,2.0,3.0,4.0,5.0
1,6.0,34.0,6.0,6.5
2,10.0,4.0,11.0,8.0
3,4.5,4.0,6.0,6.5
4,3.0,3.0,6.0,6.5
5,4.5,5.0,6.0,6.5


In [15]:
# Practice: reload the six-row sample used for the limit examples.

from io import StringIO

import numpy as np
import pandas as pd

csv_data = '''
A,B,C,D
2,3,4,5
6,34,6
10,,11,8
,,,
3,3,,
,5,,
'''
text_stream = StringIO(csv_data)
df = pd.read_csv(text_stream)
df

Unnamed: 0,A,B,C,D
0,2.0,3.0,4.0,5.0
1,6.0,34.0,6.0,
2,10.0,,11.0,8.0
3,,,,
4,3.0,3.0,,
5,,5.0,,


In [16]:
# Fill with 0, but at most one missing value per column.
df1 = df.fillna(value=0, limit=1)
df1

Unnamed: 0,A,B,C,D
0,2.0,3.0,4.0,5.0
1,6.0,34.0,6.0,0.0
2,10.0,0.0,11.0,8.0
3,0.0,,0.0,
4,3.0,3.0,,
5,,5.0,,


In [17]:
# Per-column medians, then fill at most one missing value in each column.
median1 = df.median(axis='index')
df2 = df.fillna(value=median1, limit=1)
df2

Unnamed: 0,A,B,C,D
0,2.0,3.0,4.0,5.0
1,6.0,34.0,6.0,6.5
2,10.0,4.0,11.0,8.0
3,4.5,,6.0,
4,3.0,3.0,,
5,,5.0,,


In [18]:
# Practice: reload the six-row sample used for the directional fill examples.

from io import StringIO

import numpy as np
import pandas as pd

csv_data = '''
A,B,C,D
2,3,4,5
6,34,6
10,,11,8
,,,
3,3,,
,5,,
'''
text_stream = StringIO(csv_data)
df = pd.read_csv(text_stream)
df

Unnamed: 0,A,B,C,D
0,2.0,3.0,4.0,5.0
1,6.0,34.0,6.0,
2,10.0,,11.0,8.0
3,,,,
4,3.0,3.0,,
5,,5.0,,


In [19]:
# Forward fill (formerly fillna(method='pad') / method='ffill'): each missing value
# takes the last valid value above it in the same column.
# ffill() replaces the deprecated `method` argument of fillna().
df1 = df.ffill()
df1

Unnamed: 0,A,B,C,D
0,2.0,3.0,4.0,5.0
1,6.0,34.0,6.0,5.0
2,10.0,34.0,11.0,8.0
3,10.0,34.0,11.0,8.0
4,3.0,3.0,11.0,8.0
5,3.0,5.0,11.0,8.0


In [20]:
# Backward fill (formerly fillna(method='bfill') / method='backfill'): each missing
# value takes the next valid value below it in the same column; missing values with
# no valid value below them stay NaN.
# bfill() replaces the deprecated `method` argument of fillna(). The median that was
# computed here before was never used, so it is no longer calculated.
df2 = df.bfill()
df2

Unnamed: 0,A,B,C,D
0,2.0,3.0,4.0,5.0
1,6.0,34.0,6.0,8.0
2,10.0,3.0,11.0,8.0
3,3.0,3.0,,
4,3.0,3.0,,
5,,5.0,,


In [21]:
# Backward fill, filling at most one consecutive missing value per gap.
# bfill() replaces the deprecated fillna(method='backfill', ...).
df3 = df.bfill(limit=1)
df3

Unnamed: 0,A,B,C,D
0,2.0,3.0,4.0,5.0
1,6.0,34.0,6.0,8.0
2,10.0,,11.0,8.0
3,3.0,3.0,,
4,3.0,3.0,,
5,,5.0,,


## 重複值操作
#### 重複值偵測

In [1]:
import pandas as pd

# Display the HR workbook that the duplicate-value examples are based on.
hr_path = 'hr.xlsx'
pd.read_excel(hr_path)

Unnamed: 0,姓名,身分證,專長,學歷,經驗,可用
0,tom,a123456789,SQL,PHD,10.0,1
1,ken,b112234567,JAVA,BA,4.0,0
2,lewis,C123456789,PHP,Master,,0
3,ada,D123456789,SQL,Master,6.0,0
4,Julia,E123456789,SQL,Master,4.0,0
5,jean,f234567891,MYSQL,Master,2.0,1
6,jim,X123456789,SQL,Master,5.0,0
7,annie,Y123456789,BI,PHD,10.0,1


In [4]:
# 安裝連線資料庫套件
!pip3 install pyodbc



In [6]:
# 查看套件版本
!pip3 list

Package                            Version
---------------------------------- -------------------
absl-py                            0.12.0
alabaster                          0.7.12
anaconda-client                    1.7.2
anaconda-navigator                 2.0.3
anaconda-project                   0.9.1
anyio                              2.2.0
appdirs                            1.4.4
applaunchservices                  0.2.1
appnope                            0.1.2
appscript                          1.1.2
argh                               0.26.2
asn1crypto                         1.4.0
astroid                            2.5
astropy                            4.2.1
astunparse                         1.6.3
async-generator                    1.10
atomicwrites                       1.4.0
attrs                              20.3.0
autopep8                           1.5.6
Babel                              2.9.0
backcall                           0.2.0
backports.functoo

In [None]:
# Download / installation pages for the Microsoft ODBC driver used by the pyodbc cells.
# NOTE(review): the Windows link is presumably the driver download page — verify it still resolves.
# Windows: https://www.microsoft.com/en-us/download/details.aspx?id=56567
# Mac: https://docs.microsoft.com/zh-tw/sql/connect/odbc/linux-mac/install-microsoft-odbc-driver-sql-server-macos?view=sql-server-ver15

In [7]:
# Connect to SQL Server and load the whole job-seeker table.
# (The recorded run failed because the ODBC driver was missing.)

import os
from contextlib import closing

import pandas as pd
import pyodbc

# Credentials are read from the environment (AZSQL_UID / AZSQL_PWD) so that no
# secret is stored in the notebook. The password that used to be written here
# should be considered leaked and rotated.
conn_str = (
    "DRIVER={ODBC Driver 17 for SQL Server};"
    "SERVER=azsql0706.database.windows.net;"
    "Database=azsql0706;"
    f"UID={os.environ['AZSQL_UID']};"
    f"PWD={os.environ['AZSQL_PWD']}"
)
stmt = "SELECT * FROM [dbo].[求職者]"
# Run the query and fetch the rows; closing() releases the connection afterwards.
with closing(pyodbc.connect(conn_str)) as conn:
    df = pd.read_sql(stmt, conn)
df

Error: ('01000', "[01000] [unixODBC][Driver Manager]Can't open lib 'ODBC Driver 17 for SQL Server' : file not found (0) (SQLDriverConnect)")

In [1]:
# Explanation: NaN in pandas corresponds to NULL in the database.

import pandas as pd

hr_path = 'hr.xlsx'
df = pd.read_excel(hr_path)
# Missing values per column.
df.isna().sum(axis='index')

姓名     0
身分證    0
專長     0
學歷     0
經驗     1
可用     0
dtype: int64

In [2]:
# SQL approach
# Remove the NULL rows directly on the database side.

import os
from contextlib import closing

import pandas as pd
import pyodbc

# Credentials are read from the environment (AZSQL_UID / AZSQL_PWD) so that no
# secret is stored in the notebook. The password that used to be written here
# should be considered leaked and rotated.
conn_str = (
    "DRIVER={ODBC Driver 17 for SQL Server};"
    "SERVER=azsql0706.database.windows.net;"
    "Database=azsql0706;"
    f"UID={os.environ['AZSQL_UID']};"
    f"PWD={os.environ['AZSQL_PWD']}"
)

stmt = "SELECT * FROM [dbo].[求職者] WHERE [經驗] is not null"
# Run the query and fetch the rows; closing() releases the connection afterwards.
with closing(pyodbc.connect(conn_str)) as conn:
    df = pd.read_sql(stmt, conn)
df

Error: ('01000', "[01000] [unixODBC][Driver Manager]Can't open lib 'ODBC Driver 17 for SQL Server' : file not found (0) (SQLDriverConnect)")

In [3]:
# SQL approach: replace NULL experience with 0 on the database side via ISNULL.

import os
from contextlib import closing

import pandas as pd
import pyodbc

# Credentials are read from the environment (AZSQL_UID / AZSQL_PWD) so that no
# secret is stored in the notebook. The password that used to be written here
# should be considered leaked and rotated.
conn_str = (
    "DRIVER={ODBC Driver 17 for SQL Server};"
    "SERVER=azsql0706.database.windows.net;"
    "Database=azsql0706;"
    f"UID={os.environ['AZSQL_UID']};"
    f"PWD={os.environ['AZSQL_PWD']}"
)
stmt = """
SELECT [姓名]
      ,[身分證]
      ,[專長]
      ,[學歷]
      ,ISNULL([經驗],0) AS [經驗]
      ,[可用]
  FROM [dbo].[求職者]
"""
# Run the query and fetch the rows; closing() releases the connection afterwards.
with closing(pyodbc.connect(conn_str)) as conn:
    df = pd.read_sql(stmt, conn)
df

Error: ('01000', "[01000] [unixODBC][Driver Manager]Can't open lib 'ODBC Driver 17 for SQL Server' : file not found (0) (SQLDriverConnect)")

In [4]:
# How many people there are for each skill.

import pandas as pd

df = pd.read_excel('hr.xlsx')
# Count the non-missing entries per skill group, then keep only the first count column.
counts_by_skill = df.groupby('專長').count()
counts_by_skill.iloc[:, [0]]

Unnamed: 0_level_0,姓名
專長,Unnamed: 1_level_1
BI,1
JAVA,1
MYSQL,1
PHP,1
SQL,4


In [5]:
# SQL approach
# How many people there are for each skill.

import os
from contextlib import closing

import pandas as pd
import pyodbc

# Credentials are read from the environment (AZSQL_UID / AZSQL_PWD) so that no
# secret is stored in the notebook. The password that used to be written here
# should be considered leaked and rotated.
conn_str = (
    "DRIVER={ODBC Driver 17 for SQL Server};"
    "SERVER=azsql0706.database.windows.net;"
    "Database=azsql0706;"
    f"UID={os.environ['AZSQL_UID']};"
    f"PWD={os.environ['AZSQL_PWD']}"
)
stmt = """
  SELECT 
       [專長],count(*) as [總數]
  FROM [dbo].[求職者]
  GROUP BY 專長
"""
# SQL: database syntax combined with Python.

# Run the query and fetch the rows; closing() releases the connection afterwards.
with closing(pyodbc.connect(conn_str)) as conn:
    df = pd.read_sql(stmt, conn)
df

Error: ('01000', "[01000] [unixODBC][Driver Manager]Can't open lib 'ODBC Driver 17 for SQL Server' : file not found (0) (SQLDriverConnect)")

In [6]:
# Explanation of DataFrame.dropna:
# whenever any value in a row is NA, the whole row is removed (axis=0 is the default).

import pandas as pd

hr_path = 'hr.xlsx'
df = pd.read_excel(hr_path)
df.dropna(axis=0)

Unnamed: 0,姓名,身分證,專長,學歷,經驗,可用
0,tom,a123456789,SQL,PHD,10.0,1
1,ken,b112234567,JAVA,BA,4.0,0
3,ada,D123456789,SQL,Master,6.0,0
4,Julia,E123456789,SQL,Master,4.0,0
5,jean,f234567891,MYSQL,Master,2.0,1
6,jim,X123456789,SQL,Master,5.0,0
7,annie,Y123456789,BI,PHD,10.0,1


In [8]:
# fillna revisited: replace the missing experience value with 99.

import pandas as pd

df = pd.read_excel('hr.xlsx')
experience_filled = df['經驗'].fillna(value=99)
df['經驗'] = experience_filled
df

Unnamed: 0,姓名,身分證,專長,學歷,經驗,可用
0,tom,a123456789,SQL,PHD,10.0,1
1,ken,b112234567,JAVA,BA,4.0,0
2,lewis,C123456789,PHP,Master,99.0,0
3,ada,D123456789,SQL,Master,6.0,0
4,Julia,E123456789,SQL,Master,4.0,0
5,jean,f234567891,MYSQL,Master,2.0,1
6,jim,X123456789,SQL,Master,5.0,0
7,annie,Y123456789,BI,PHD,10.0,1


In [9]:
# Advanced: modify data in the database.

import os
from contextlib import closing

import pyodbc

# Credentials are read from the environment (AZSQL_UID / AZSQL_PWD) so that no
# secret is stored in the notebook. The password that used to be written here
# should be considered leaked and rotated.
conn_str = (
    "DRIVER={ODBC Driver 17 for SQL Server};"
    "SERVER=azsql0706.database.windows.net;"
    "Database=azsql0706;"
    f"UID={os.environ['AZSQL_UID']};"
    f"PWD={os.environ['AZSQL_PWD']}"
)

# closing() releases the cursor and the connection even if the statement fails;
# the commit only happens after a successful execute.
with closing(pyodbc.connect(conn_str)) as conn:
    with closing(conn.cursor()) as cursor:
        # Add one year of experience to the row with the given ID number.
        cursor.execute('''
               update [dbo].[求職者] set [經驗] = [經驗] + 1 where [身分證]='a123456789'
               ''')
    conn.commit()

Error: ('01000', "[01000] [unixODBC][Driver Manager]Can't open lib 'ODBC Driver 17 for SQL Server' : file not found (0) (SQLDriverConnect)")

In [10]:
# Do the processing in SQL first, then continue in Python.

import os
from contextlib import closing

import pandas as pd
import pyodbc

# Credentials are read from the environment (AZSQL_UID / AZSQL_PWD) so that no
# secret is stored in the notebook. The password that used to be written here
# should be considered leaked and rotated.
conn_str = (
    "DRIVER={ODBC Driver 17 for SQL Server};"
    "SERVER=azsql0706.database.windows.net;"
    "Database=azsql0706;"
    f"UID={os.environ['AZSQL_UID']};"
    f"PWD={os.environ['AZSQL_PWD']}"
)
stmt = """
  SELECT * FROM [dbo].[求職者]
"""
# SQL: database syntax combined with Python.

# Run the query and fetch the rows; closing() releases the connection afterwards.
with closing(pyodbc.connect(conn_str)) as conn:
    df = pd.read_sql(stmt, conn)
df

Error: ('01000', "[01000] [unixODBC][Driver Manager]Can't open lib 'ODBC Driver 17 for SQL Server' : file not found (0) (SQLDriverConnect)")

#### 重複值丟棄

In [12]:
import pandas as pd

# Load the HR workbook and show only the skill column (as a one-column frame).
hr_path = 'hr.xlsx'
df = pd.read_excel(hr_path)
df.loc[:, ['專長']]

Unnamed: 0,專長
0,SQL
1,JAVA
2,PHP
3,SQL
4,SQL
5,MYSQL
6,SQL
7,BI


In [15]:
# How many people hold each skill.
skill_frame = df[['專長']]
skill_frame.value_counts()

專長   
SQL      4
BI       1
JAVA     1
MYSQL    1
PHP      1
dtype: int64

In [17]:
# Which distinct skills exist.
skill_series = df['專長']
skill_series.unique()

array(['SQL', 'JAVA', 'PHP', 'MYSQL', 'BI'], dtype=object)

In [19]:
# duplicated(): checks whether rows of the DataFrame repeat an earlier row.
# drop_duplicates(): removes the duplicated rows (duplicates, not missing values).
# -> keep='first' is the default and keeps the first occurrence.
# -> keep='last' keeps the last occurrence instead.
# -> keep=False keeps none of the repeated rows.

df[['專長']].drop_duplicates(keep='first')

Unnamed: 0,專長
0,SQL
1,JAVA
2,PHP
5,MYSQL
7,BI


In [4]:
# Duplicated data (the duplicate rows were generated at random).

import os
from contextlib import closing

import pandas as pd
import pyodbc

# Credentials are read from the environment (AZSQL_UID / AZSQL_PWD) so that no
# secret is stored in the notebook. The password that used to be written here
# should be considered leaked and rotated.
conn_str = (
    "DRIVER={ODBC Driver 17 for SQL Server};"
    "SERVER=azsql0706.database.windows.net;"
    "Database=azsql0706;"
    f"UID={os.environ['AZSQL_UID']};"
    f"PWD={os.environ['AZSQL_PWD']}"
)
stmt = """
  SELECT *
  FROM [dbo].[求職者5]
  
"""
# SQL: database syntax combined with Python.

# Run the query and fetch the rows; closing() releases the connection afterwards.
with closing(pyodbc.connect(conn_str)) as conn:
    df = pd.read_sql(stmt, conn)
df
# Uncomment to export the result to the workbook loaded by the following cell:
# df.to_excel('hr_duplicated.xlsx', index = False)

Error: ('01000', "[01000] [unixODBC][Driver Manager]Can't open lib 'ODBC Driver 17 for SQL Server' : file not found (0) (SQLDriverConnect)")

In [5]:
# Load the file produced above and flag repeated (skill, education) pairs; True marks a duplicate.

import pandas as pd

dfd = pd.read_excel('hr_duplicated.xlsx')
key_columns = ['專長', '學歷']
dfd.duplicated(subset=key_columns)

0     False
1     False
2     False
3     False
4      True
5     False
6      True
7     False
8      True
9      True
10     True
11     True
12     True
13     True
14     True
15     True
16     True
17     True
18     True
19     True
20     True
21     True
22     True
23     True
dtype: bool

In [6]:
# Practice_1: four-row frame in which rows 1 and 2 are identical.

from io import StringIO

import numpy as np
import pandas as pd

csv_data = '''
A,B,C,D
2,3,5,5
5,5,5,5
5,5,5,5
13,23,5,5
'''
text_stream = StringIO(csv_data)
df = pd.read_csv(text_stream)
df

Unnamed: 0,A,B,C,D
0,2,3,5,5
1,5,5,5,5
2,5,5,5,5
3,13,23,5,5


In [7]:
# Practice_1: flag rows that repeat an earlier row.
df.duplicated(keep='first')

0    False
1    False
2     True
3    False
dtype: bool

In [8]:
# Practice_1: same check, with ~ inverting the mask so True marks non-duplicates.
is_repeat = df.duplicated()
~is_repeat

0     True
1     True
2    False
3     True
dtype: bool

In [9]:
# Practice_1: look for duplicates within individual columns.
for column in ('A', 'D'):
    print(df.duplicated(column))

0    False
1    False
2     True
3    False
dtype: bool
0    False
1     True
2     True
3     True
dtype: bool


In [11]:
# Practice_1: drop the duplicated rows, keeping the first occurrence.
df.drop_duplicates(keep='first')

Unnamed: 0,A,B,C,D
0,2,3,5,5
1,5,5,5,5
3,13,23,5,5


In [12]:
# Practice_1: keep the last occurrence of each repeated row.
repeated_before_last = df.duplicated(keep='last')
df[~repeated_before_last]

Unnamed: 0,A,B,C,D
0,2,3,5,5
2,5,5,5,5
3,13,23,5,5


In [14]:
# Practice_1: keep none of the rows that occur more than once.
occurs_more_than_once = df.duplicated(keep=False)
df[~occurs_more_than_once]

Unnamed: 0,A,B,C,D
0,2,3,5,5
3,13,23,5,5


In [None]:
# SQL syntax reference. The statements are kept inside a string literal, so this
# cell does not send anything to the database. In order, they:
#   1. select every row of [dbo].[求職者]
#   2. list the distinct values of [專長]
#   3. list the distinct ([專長], [學歷]) pairs by grouping
#   4. copy [dbo].[求職者] into a new table [dbo].[求職者5]
#   5. show the copied table
#   6. append 4 rows taken in NEWID() (random) order to [dbo].[求職者5]

'''
SELECT *
FROM [dbo].[求職者]

SELECT DISTINCT [專長]
FROM [dbo].[求職者]


SELECT   [專長], [學歷]
FROM     [dbo].[求職者]
GROUP BY [專長], [學歷]


SELECT * INTO [dbo].[求職者5]
FROM [dbo].[求職者]

select * from [dbo].[求職者5]

insert into [dbo].[求職者5]
select top(4) * from [求職者] order by newid()

'''

# 依不同欄位進行平均計算

請整理出七項經常性薪資資料  
請顯示行業筆數，以及這七項薪資資料的個別平均值  
* 行業筆數
* 經常性的薪資平均值
* 專業人員-經常性的薪資平均值
* 技術員及助理專業人員-經常性的薪資平均值
* 事務支援人員-經常性的薪資平均值
* 服務及銷售工作人員-經常性的薪資平均值
* 技藝_機械設備操作及組裝人員-經常性的薪資平均值
* 基層技術工及勞力工-經常性的薪資平均值

In [11]:
# Homework: load the salary-by-industry data set.

import pandas as pd

homework_path = 'HW14__Data.csv'
df = pd.read_csv(homework_path)
df

Unnamed: 0,行業別,經常性薪資,專業人員-經常性薪資,技術員及助理專業人員-經常性薪資,事務支援人員-經常性薪資,服務及銷售工作人員-經常性薪資,技藝_機械設備操作及組裝人員-經常性薪資,基層技術工及勞力工-經常性薪資
0,工業及服務業部門,27055,34003,28646,26068,25012,25338,22824
1,工業部門,26860,33788,28103,25906,25811,24919,22784
2,礦業及土石採取業,26170,35768,27744,23441,23625,26988,22440
3,石油及天然氣礦業,27696,37125,25313,23625,23625,23625,0
4,砂_石及黏土採取業,25661,30000,29185,23405,0,27629,22440
...,...,...,...,...,...,...,...,...
106,運動_娛樂及休閒服務業,24943,30582,27136,24401,23996,26269,22621
107,其他服務業,23848,32250,27270,24338,22272,24938,22708
108,個人及家庭用品維修業,25049,35000,27257,24652,22905,25115,22011
109,美髮及美容美體業,21918,0,0,23050,21782,0,0


In [12]:
# Number of industry rows in the data set.
industry_count = len(df)
print('行業筆數為', industry_count, '筆')

行業筆數為 111 筆


In [13]:
# Average of each salary column, method one (NumPy).
# Only the numeric columns are averaged: the frame also holds the text column
# 行業別, which newer pandas versions no longer drop silently, and axis=0 keeps
# the result per column instead of a single overall mean.

import numpy as np

np.mean(df.select_dtypes(include='number'), axis=0)

經常性薪資                   27764.801802
專業人員-經常性薪資              33162.009009
技術員及助理專業人員-經常性薪資        28917.549550
事務支援人員-經常性薪資            26588.432432
服務及銷售工作人員-經常性薪資         22083.828829
技藝_機械設備操作及組裝人員-經常性薪資    23405.243243
基層技術工及勞力工-經常性薪資         19570.288288
dtype: float64

In [14]:
# Average of each salary column, method two.
# The DataFrame offers the same reduction as its own mean() method.
# numeric_only=True skips the text column 行業別, which would otherwise raise a
# TypeError on pandas 2.0 and later.

import numpy as np

df.mean(numeric_only=True)

經常性薪資                   27764.801802
專業人員-經常性薪資              33162.009009
技術員及助理專業人員-經常性薪資        28917.549550
事務支援人員-經常性薪資            26588.432432
服務及銷售工作人員-經常性薪資         22083.828829
技藝_機械設備操作及組裝人員-經常性薪資    23405.243243
基層技術工及勞力工-經常性薪資         19570.288288
dtype: float64

In [15]:
# Exam
# Q1: can the averages be presented as a DataFrame?
# reset_index() turns the Series of means into two columns, which are then renamed.
# numeric_only=True skips the text column 行業別, which would otherwise raise a
# TypeError on pandas 2.0 and later.

result = df.mean(numeric_only=True).reset_index()
result.columns = ['說明', '數值']
result

Unnamed: 0,說明,數值
0,經常性薪資,27764.801802
1,專業人員-經常性薪資,33162.009009
2,技術員及助理專業人員-經常性薪資,28917.54955
3,事務支援人員-經常性薪資,26588.432432
4,服務及銷售工作人員-經常性薪資,22083.828829
5,技藝_機械設備操作及組裝人員-經常性薪資,23405.243243
6,基層技術工及勞力工-經常性薪資,19570.288288


In [16]:
# Q2: how to combine two DataFrames.
# A flat list becomes a single column, so the label and the count end up in two rows.

import pandas as pd

pd.DataFrame(data=['行業筆數', '111'])

Unnamed: 0,0
0,行業筆數
1,111


In [19]:
# Continuing Q2: bring the label and the count into one row, matching the layout of the averages.

result1 = pd.DataFrame(['行業筆數', '111']).T
result1
# Alternatively ---> pd.DataFrame([['行業筆數', '111']])

Unnamed: 0,0,1
0,行業筆數,111


In [22]:
# Combine the two DataFrames: the industry row count followed by the salary averages.

import pandas as pd

result1 = pd.DataFrame([['行業筆數', df.shape[0]]])
result1.columns = ['說明', '數值']

# numeric_only=True skips the text column 行業別, which would otherwise raise a
# TypeError on pandas 2.0 and later.
result = df.mean(numeric_only=True).reset_index()
result.columns = ['說明', '數值']

# How to merge two data sets: stack them and renumber the rows.
final = pd.concat([result1, result]).reset_index(drop = True)

# How to convert the data type of a column.
final['數值'] = final['數值'].astype('float32')

print(final.dtypes)
final.to_excel('result.xlsx', index = False)

說明     object
數值    float32
dtype: object


In [3]:
# Always round up to the next integer.

import math

tuple(math.ceil(value) for value in (123.456, 123.987))

(124, 124)

In [4]:
# Always round down to the previous integer.

import math

tuple(math.floor(value) for value in (123.456, 123.987))

(123, 123)

In [5]:
# Keep two decimal places. round() rounds to the nearest value rather than
# discarding the remaining digits, which is why the result is 123.46.

round(123.456, ndigits=2)

123.46

In [8]:
# Discard everything beyond N decimal places; works the same for any N.
# Scale up by 10**N, floor, then scale back down.

import math

N = 2
data = 123.4567
scale = 10 ** N
math.floor(data * scale) / scale

123.45