## Pandas中的DataFrame

In [2]:
import pandas as pd

In [3]:
import numpy as np

#### 可通过传入以下类型构造DataFrame
    Dict of 1D ndarrays, lists, dicts, or Series   一维数组，列表，字典，序列形式的字典
    2-D numpy.ndarray   2维数组
    Structured or record ndarray   结构化或记录 ndarray
    A Series   序列
    Another DataFrame   另一个DataFrame

In [4]:
# Build a 3x4 frame holding 0..11; no labels are passed, so pandas numbers
# the rows and columns from 0.
df = pd.DataFrame(
    data=np.arange(12).reshape(3, 4),
)

In [5]:
# Display the frame; rows and columns carry the default labels starting at 0.
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


#### 行索引  &  列索引 
    行索引：索引不同行，横向索引，index，0轴，axis=0
    列索引：索引不同列，纵向索引，columns， 1轴， axis=1

In [11]:
import string

In [12]:
# Build a 3x4 frame holding 0..11 with letter labels: the first three
# uppercase letters (A-C) name the rows, the last four (W-Z) name the columns.
t1 = pd.DataFrame(
    data=np.arange(12).reshape(3, 4),
    index=list(string.ascii_uppercase[:3]),
    columns=list(string.ascii_uppercase[-4:]),
)

In [13]:
# Display t1 with its letter row and column labels.
t1

Unnamed: 0,W,X,Y,Z
A,0,1,2,3
B,4,5,6,7
C,8,9,10,11


####  设置index  &  columns

In [26]:
# Build a 12x4 frame holding 0..47; index and columns are left at their
# default integer labels.
t2 = pd.DataFrame(
    data=np.arange(48).reshape(12, 4),
)

In [27]:
# Display t2, which keeps the default integer row and column labels.
t2

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19
5,20,21,22,23
6,24,25,26,27
7,28,29,30,31
8,32,33,34,35
9,36,37,38,39


####  不设置index  &  columns 

### DataFrame基础属性 

In [17]:
t1.shape  # shape of the data as (rows, columns)

(3, 4)

In [19]:
t1.dtypes  # data type of each column

W    int32
X    int32
Y    int32
Z    int32
dtype: object

In [20]:
t1.ndim  # number of dimensions of the data

2

In [21]:
t1.index  # row labels (row index)

Index(['A', 'B', 'C'], dtype='object')

In [22]:
t1.columns  # column labels (column index)

Index(['W', 'X', 'Y', 'Z'], dtype='object')

In [23]:
t1.values  # the underlying values as a NumPy array

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

### DataFrame整体情况查询 

In [28]:
t2.head()  # first rows of the frame; 5 rows by default

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [29]:
t2.head(3)  # first 3 rows

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [30]:
t2.tail()  # last rows of the frame; 5 rows by default

Unnamed: 0,0,1,2,3
7,28,29,30,31
8,32,33,34,35
9,36,37,38,39
10,40,41,42,43
11,44,45,46,47


In [31]:
t2.tail(3)  # last 3 rows

Unnamed: 0,0,1,2,3
9,36,37,38,39
10,40,41,42,43
11,44,45,46,47


In [34]:
t2.info()  # summary of t2: row count, column count, column labels, non-null count and dtype per column, memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 4 columns):
0    12 non-null int32
1    12 non-null int32
2    12 non-null int32
3    12 non-null int32
dtypes: int32(4)
memory usage: 272.0 bytes


In [35]:
t2.describe()  # per-column statistics: count, mean, std (standard deviation), min, max, and the 25% / 50% / 75% quartiles

Unnamed: 0,0,1,2,3
count,12.0,12.0,12.0,12.0
mean,22.0,23.0,24.0,25.0
std,14.422205,14.422205,14.422205,14.422205
min,0.0,1.0,2.0,3.0
25%,11.0,12.0,13.0,14.0
50%,22.0,23.0,24.0,25.0
75%,33.0,34.0,35.0,36.0
max,44.0,45.0,46.0,47.0


### DataFrame读取本地csv文件数据 

In [37]:
# Relative path to the dog-names CSV that is passed to pd.read_csv below.
file_path = "./dogNames2.csv"

In [38]:
# Load the CSV at file_path into a DataFrame using read_csv's default settings.
dog_name = pd.read_csv(file_path)

#### 提取使用名字次数最多的5个名字 

In [41]:
# Sort by usage count with the largest first (ascending=False means
# descending order), then keep the first five rows.
(
    dog_name
    .sort_values(by="Count_AnimalName", ascending=False)
    .head(5)
)

Unnamed: 0,Row_Labels,Count_AnimalName
1156,BELLA,1195
9140,MAX,1153
2660,CHARLIE,856
3251,COCO,852
12368,ROCKY,823


####  先对数据进行排序，然后通过  .head(5)   取前5个名字

### DataFrame索引数据
    loc[]  通过标签索引数据（行标签、列标签）
    iloc[] 通过位置索引数据，如第0行到第2行  iloc[0:3]（不包含结束位置3）

In [61]:
# Rebind df to a 6x8 frame holding 0..47: the first six uppercase letters
# (A-F) name the rows, the last eight (S-Z) name the columns.
df = pd.DataFrame(
    data=np.arange(48).reshape(6, 8),
    index=list(string.ascii_uppercase[:6]),
    columns=list(string.ascii_uppercase[-8:]),
)

In [64]:
df.loc[:]  # a bare slice selects every row

Unnamed: 0,S,T,U,V,W,X,Y,Z
A,0,1,2,3,4,5,6,7
B,8,9,10,11,12,13,14,15
C,16,17,18,19,20,21,22,23
D,24,25,26,27,28,29,30,31
E,32,33,34,35,36,37,38,39
F,40,41,42,43,44,45,46,47


In [65]:
df.loc[:"D"]  # rows from the start through label "D"; the end label is included

Unnamed: 0,S,T,U,V,W,X,Y,Z
A,0,1,2,3,4,5,6,7
B,8,9,10,11,12,13,14,15
C,16,17,18,19,20,21,22,23
D,24,25,26,27,28,29,30,31


In [72]:
df.loc[["A","C"]]  # select rows "A" and "C" by passing a list of labels

Unnamed: 0,S,T,U,V,W,X,Y,Z
A,0,1,2,3,4,5,6,7
C,16,17,18,19,20,21,22,23


In [73]:
df.loc[["A","C"],["S","X","Y"]]  # rows "A" and "C", columns "S", "X" and "Y"

Unnamed: 0,S,X,Y
A,0,5,6
C,16,21,22


In [74]:
df.loc["A":"C", "V":"Z"]  # label slices on both axes; both end labels are included

Unnamed: 0,V,W,X,Y,Z
A,3,4,5,6,7
B,11,12,13,14,15
C,19,20,21,22,23


In [76]:
df.iloc[0:3,3:8]  # iloc takes integer positions; the end position is excluded (rows 0-2, columns 3-7)

Unnamed: 0,V,W,X,Y,Z
A,3,4,5,6,7
B,11,12,13,14,15
C,19,20,21,22,23


#### 赋值 

In [77]:
# Overwrite the single cell at row "A", column "V" (modifies df in place).
df.loc["A","V"] = 999

In [78]:
# Display df to see the effect of the assignment.
df

Unnamed: 0,S,T,U,V,W,X,Y,Z
A,0,1,2,999,4,5,6,7
B,8,9,10,11,12,13,14,15
C,16,17,18,19,20,21,22,23
D,24,25,26,27,28,29,30,31
E,32,33,34,35,36,37,38,39
F,40,41,42,43,44,45,46,47


In [80]:
# Set column "S" to 111 for rows "A" through "C" inclusive (modifies df in place).
df.loc["A":"C","S"] = 111

In [81]:
# Display df to see the effect of the assignment.
df

Unnamed: 0,S,T,U,V,W,X,Y,Z
A,111,1,2,999,4,5,6,7
B,111,9,10,11,12,13,14,15
C,111,17,18,19,20,21,22,23
D,24,25,26,27,28,29,30,31
E,32,33,34,35,36,37,38,39
F,40,41,42,43,44,45,46,47


### DataFrame中的布尔索引 
   获取dogsName中name使用次数超过800的狗的名字

In [85]:
# Peek at the first five rows with a positional slice.
dog_name[:5]

Unnamed: 0,Row_Labels,Count_AnimalName
0,1,1
1,2,2
2,40804,1
3,90201,1
4,90203,1


In [89]:
dog_name[dog_name["Count_AnimalName"]>800]  # boolean indexing: keep the rows whose name was used more than 800 times

Unnamed: 0,Row_Labels,Count_AnimalName
1156,BELLA,1195
2660,CHARLIE,856
3251,COCO,852
9140,MAX,1153
12368,ROCKY,823


#### 找到所有的使用次数超过700并且名字的字符串的长度大于4的狗的名字

In [91]:
# Combine two boolean masks with & (and): usage count above 700 and a name
# longer than 4 characters. Each condition needs its own parentheses.
dog_name[
    (dog_name["Count_AnimalName"] > 700)
    & (dog_name["Row_Labels"].str.len() > 4)
]

Unnamed: 0,Row_Labels,Count_AnimalName
1156,BELLA,1195
2660,CHARLIE,856
8552,LUCKY,723
12368,ROCKY,823


####  数据字符串长度: data["列名"].str.len()
####  有多个条件时可使用    &(且)     |(或)
####  不同条件必须使用括号包裹

####  字符串的方法
![image.png](attachment:image.png)

### 缺失数据的处理

#### ![image.png](attachment:image.png) 

#### 一种缺失数据表示：采用 NaN，与numpy中的np.nan一样
####  另一种缺失数据表示：用 0 表示（上图蓝色框）

#### 判断数据中是否存在NaN
    pd.isnull(df)   逐元素判断是否为NaN，是NaN的位置返回True
    pd.notnull(df)  逐元素判断是否不为NaN，不是NaN的位置返回True

#### 处理NaN
    1.df.dropna(axis=0, how="any", inplace=False)  # 删除含有NaN的行（axis=0按行，how="any"表示只要有一个NaN就删除）
    2.df.fillna(df.mean()) # 填充均值    df.fillna(df.median()) # 填充中位数    df.fillna(0)  # 给NaN填充数据