In [1]:
import pandas as pd
import numpy as np

# Series

## 列表创建Series

默认索引

In [2]:
t = pd.Series([1,2,31,12,3,4])
t

0     1
1     2
2    31
3    12
4     3
5     4
dtype: int64

In [3]:
type(t)

pandas.core.series.Series

指定索引

In [4]:
t2 = pd.Series([1,2,31,12,3,4], index=list('abcdef'))
t2

a     1
b     2
c    31
d    12
e     3
f     4
dtype: int64

## 字典创建Series

In [5]:
temp_dict = {'name':'xiaohong', 'age':30, 'tel':10086}
t3 = pd.Series(temp_dict)
t3

name    xiaohong
age           30
tel        10086
dtype: object

In [6]:
t2.dtype

dtype('int64')

In [7]:
t2.astype('float')

a     1.0
b     2.0
c    31.0
d    12.0
e     3.0
f     4.0
dtype: float64

## Series的切片和索引

In [8]:
t3['age']

30

In [9]:
t3[1]

30

### 取多行

In [10]:
t3[:2]

name    xiaohong
age           30
dtype: object

In [11]:
t3[[0,2]]

name    xiaohong
tel        10086
dtype: object

In [12]:
t3[['age', 'name']]

age           30
name    xiaohong
dtype: object

In [13]:
t

0     1
1     2
2    31
3    12
4     3
5     4
dtype: int64

In [14]:
t[t>3] # 布尔索引

2    31
3    12
5     4
dtype: int64

In [15]:
t3.index

Index(['name', 'age', 'tel'], dtype='object')

In [16]:
for i in t3.index:
    print(i)

name
age
tel


In [17]:
type(t3.index)

pandas.core.indexes.base.Index

In [18]:
len(t3.index)

3

In [19]:
list(t3.index)[:2]

['name', 'age']

In [20]:
t3.index[:2]

Index(['name', 'age'], dtype='object')

In [21]:
t3.values

array(['xiaohong', 30, 10086], dtype=object)

In [22]:
type(t3.values)

numpy.ndarray

# DataFrame

DataFrame是Series的容器

行索引叫index，axis=0
列索引叫columns，axis=1

In [23]:
pd.DataFrame(np.arange(12).reshape(3,4))

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [24]:
t1 = pd.DataFrame(np.arange(12).reshape(3,4), index=list('abc'), columns=list('xyzw'))
t1

Unnamed: 0,x,y,z,w
a,0,1,2,3
b,4,5,6,7
c,8,9,10,11


## 字典方式创建

In [25]:
d1 = {'name':['xiaoming', 'xiaohong'], 'age':[20,32], 'tel':[10086, 10010]}
d1

{'name': ['xiaoming', 'xiaohong'], 'age': [20, 32], 'tel': [10086, 10010]}

In [26]:
t1 = pd.DataFrame(d1)
t1

Unnamed: 0,name,age,tel
0,xiaoming,20,10086
1,xiaohong,32,10010


In [27]:
type(t1)

pandas.core.frame.DataFrame

缺失值

In [28]:
d2 = [{'name':'xiaohong', 'age':32, 'tel':10086}, {'name':'xiaowang', 'tel':10010}, {'name':'xiaogong', 'age':22}]
d2

[{'name': 'xiaohong', 'age': 32, 'tel': 10086},
 {'name': 'xiaowang', 'tel': 10010},
 {'name': 'xiaogong', 'age': 22}]

In [29]:
t2 = pd.DataFrame(d2)
t2

Unnamed: 0,name,age,tel
0,xiaohong,32.0,10086.0
1,xiaowang,,10010.0
2,xiaogong,22.0,


## DateFrame的基础属性

In [32]:
t2.index # 行索引

RangeIndex(start=0, stop=3, step=1)

In [33]:
t2.columns # 列索引

Index(['name', 'age', 'tel'], dtype='object')

In [34]:
t2.values

array([['xiaohong', 32.0, 10086.0],
       ['xiaowang', nan, 10010.0],
       ['xiaogong', 22.0, nan]], dtype=object)

In [35]:
t2.shape

(3, 3)

In [36]:
t2.dtypes

name     object
age     float64
tel     float64
dtype: object

In [48]:
t2.ndim

2

## DataFrame的基础方法

In [49]:
t2.head(2)

Unnamed: 0,name,age,tel
0,xiaohong,32.0,10086.0
1,xiaowang,,10010.0


In [50]:
t2.tail(2)

Unnamed: 0,name,age,tel
1,xiaowang,,10010.0
2,xiaogong,22.0,


In [51]:
t2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   name    3 non-null      object 
 1   age     2 non-null      float64
 2   tel     2 non-null      float64
dtypes: float64(2), object(1)
memory usage: 200.0+ bytes


In [52]:
t2.describe() # 基本统计信息

Unnamed: 0,age,tel
count,2.0,2.0
mean,27.0,10048.0
std,7.071068,53.740115
min,22.0,10010.0
25%,24.5,10029.0
50%,27.0,10048.0
75%,29.5,10067.0
max,32.0,10086.0


In [54]:
t2.sort_values(by='tel')

Unnamed: 0,name,age,tel
1,xiaowang,,10010.0
0,xiaohong,32.0,10086.0
2,xiaogong,22.0,


## 索引

In [58]:
data = pd.read_csv('./dogNames2.csv')
df = pd.DataFrame(data)
df

Unnamed: 0,Row_Labels,Count_AnimalName
0,1,1
1,2,2
2,40804,1
3,90201,1
4,90203,1
...,...,...
16215,37916,1
16216,38282,1
16217,38583,1
16218,38948,1


In [71]:
df[:20] # 取行

Unnamed: 0,Row_Labels,Count_AnimalName
0,1,1
1,2,2
2,40804,1
3,90201,1
4,90203,1
5,102201,1
6,3010271,1
7,MARCH,2
8,APRIL,51
9,AUGUST,14


In [70]:
df['Row_Labels'] # 取列

0            1
1            2
2        40804
3        90201
4        90203
         ...  
16215    37916
16216    38282
16217    38583
16218    38948
16219    39743
Name: Row_Labels, Length: 16220, dtype: object

In [66]:
type(df['Row_Labels'])

pandas.core.series.Series

In [69]:
type(df[:20])

pandas.core.frame.DataFrame

### pandas的loc和iloc

In [80]:
df = pd.DataFrame(np.arange(12).reshape(3,4), index=list('abc'), columns=list('WXYZ'))
df

Unnamed: 0,W,X,Y,Z
a,0,1,2,3
b,4,5,6,7
c,8,9,10,11


In [81]:
df.loc['b', 'Y']

6

In [82]:
type(df.loc['b', 'Y'])

numpy.int64

In [83]:
df.loc['b']

W    4
X    5
Y    6
Z    7
Name: b, dtype: int64

In [86]:
df.loc['b',:]

W    4
X    5
Y    6
Z    7
Name: b, dtype: int64

In [85]:
df.loc[:,'Y']

a     2
b     6
c    10
Name: Y, dtype: int64

In [88]:
df.loc[['a', 'c'], ["W", 'Y', 'Z']]

Unnamed: 0,W,Y,Z
a,0,2,3
c,8,10,11


In [89]:
df.iloc[1]

W    4
X    5
Y    6
Z    7
Name: b, dtype: int64

In [91]:
df.iloc[[1,2],[2,3]]

Unnamed: 0,Y,Z
b,6,7
c,10,11


### 布尔索引

In [93]:
data = pd.read_csv('./dogNames2.csv')
df = pd.DataFrame(data)
df

Unnamed: 0,Row_Labels,Count_AnimalName
0,1,1
1,2,2
2,40804,1
3,90201,1
4,90203,1
...,...,...
16215,37916,1
16216,38282,1
16217,38583,1
16218,38948,1


In [96]:
df[ (df['Count_AnimalName']>800) & (df['Count_AnimalName']<1000) ]

Unnamed: 0,Row_Labels,Count_AnimalName
2660,CHARLIE,856
3251,COCO,852
12368,ROCKY,823


In [142]:
d1 = {'info':['zhao/shixin', 'yu/suya', 'xiang/chouchou'], 'age':[26,np.nan,33], 'tel':[2011,2019,np.nan]}
df = pd.DataFrame(d1)
df

Unnamed: 0,info,age,tel
0,zhao/shixin,26.0,2011.0
1,yu/suya,,2019.0
2,xiang/chouchou,33.0,


In [143]:
df['info'].str.split('/')

0       [zhao, shixin]
1           [yu, suya]
2    [xiang, chouchou]
Name: info, dtype: object

In [144]:
df['info'].str.split('/').to_list()

[['zhao', 'shixin'], ['yu', 'suya'], ['xiang', 'chouchou']]

## 缺失数据处理

In [145]:
pd.isnull(df)

Unnamed: 0,info,age,tel
0,False,False,False
1,False,True,False
2,False,False,True


In [146]:
pd.notnull(df)

Unnamed: 0,info,age,tel
0,True,True,True
1,True,False,True
2,True,True,False


In [147]:
df[pd.notnull(df['age'])] # 输出age不为nan的行

Unnamed: 0,info,age,tel
0,zhao/shixin,26.0,2011.0
2,xiang/chouchou,33.0,


In [148]:
df[pd.notnull(df['tel'])] # 输出tel不为nan的行

Unnamed: 0,info,age,tel
0,zhao/shixin,26.0,2011.0
1,yu/suya,,2019.0


In [149]:
df.dropna(axis=0) # 删除有nan的行

Unnamed: 0,info,age,tel
0,zhao/shixin,26.0,2011.0


In [150]:
df.dropna(axis=0, how='any') # how默认为any，只要有nan的就删除

Unnamed: 0,info,age,tel
0,zhao/shixin,26.0,2011.0


In [151]:
df.dropna(axis=0, how='all') # 全部为nan时才删除

Unnamed: 0,info,age,tel
0,zhao/shixin,26.0,2011.0
1,yu/suya,,2019.0
2,xiang/chouchou,33.0,


In [152]:
df

Unnamed: 0,info,age,tel
0,zhao/shixin,26.0,2011.0
1,yu/suya,,2019.0
2,xiang/chouchou,33.0,


In [156]:
df.dropna(axis=0, how='any', inplace=True) #原地替换，就是df进行替换
df

Unnamed: 0,info,age,tel
0,zhao/shixin,26.0,2011.0


In [157]:
d1 = {'info':['zhao/shixin', 'yu/suya', 'xiang/chouchou'], 'age':[26,np.nan,33], 'tel':[2011,2019,np.nan]}
df = pd.DataFrame(d1)
df

Unnamed: 0,info,age,tel
0,zhao/shixin,26.0,2011.0
1,yu/suya,,2019.0
2,xiang/chouchou,33.0,


In [158]:
df.fillna(0)

Unnamed: 0,info,age,tel
0,zhao/shixin,26.0,2011.0
1,yu/suya,0.0,2019.0
2,xiang/chouchou,33.0,0.0


In [170]:
df.fillna(df.mean()) # 全部替换为均值，含有非数值列会报警告

  df.fillna(df.mean()) # 全部替换为均值，含有非数值列会报警告


Unnamed: 0,info,age,tel
0,zhao/shixin,26.0,2011.0
1,yu/suya,29.5,2019.0
2,xiang/chouchou,33.0,2015.0


In [174]:
df['age'] = df['age'].fillna(df['age'].mean())
df['tel'] = df['tel'].fillna(df['tel'].mean())
df

Unnamed: 0,info,age,tel
0,zhao/shixin,26.0,2011.0
1,yu/suya,29.5,2019.0
2,xiang/chouchou,33.0,2015.0
