In [20]:
%matplotlib inline 
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import decimal
import sys
import os


def myprint(msg):
    print("  ")
    print("============== %s ==============" % msg)
    print("  ")


def get_data(s: int = 10):
    boolean = [True, False]
    gender = ["男", "女"]
    color = ["white", "black", "yellow"]
    df = pd.DataFrame({
            "height": np.random.randint(150, 190, s),
            "weight": np.random.randint(40, 90, s),
            "smoker": [boolean[x] for x in np.random.randint(0, 2, s)],
            "gender": [gender[x] for x in np.random.randint(0, 2, s)],
            "age": np.random.randint(15, 90, s),
            "color": [color[x] for x in np.random.randint(0, len(color), s)]
          })
    return df


def get_data2(s: int = 10):
    company = ['A', 'B', 'C']
    gender = ["男", "女"]
    data = {"company": [company[x] for x in np.random.randint(0, len(company), s)],
            "salary": np.random.randint(5, 50, s),
            "gender": [gender[x] for x in np.random.randint(0, 2, s)],
            "age": np.random.randint(15, 50, s)
            }
    return pd.DataFrame(data)


def demo_basis():
    df = get_data(5)
    #print(df)

    myprint("info: 打印索引和列的数据类型、内存、存储等基础信息")
    print(df.info())

    myprint("describe: 生成描述性统计汇总")
    print(df.describe())

    myprint("value_counts: 统计分类变量中每个类的数量")
    print(df['color'].describe())

    myprint("isna: 判断数据是否为缺失值")
    print(df.isna().any())

    myprint("dropna: 删掉含有缺失值的数据")
    print(df.dropna())

    myprint("fillna: 填充缺失数据")
    print(df.fillna('B'))

    myprint("sort_values: 按照某列进行排序")
    print(df.sort_values(by = 'age'))

    myprint("astype: 修改字段的数据类型")
    print(df['age'].astype(int))

    myprint("rename: 修改DataFrame的列名")
    print(df.rename(columns = {'age': 'number'}, inplace=True))

    myprint("set_index: 将DataFrame中的某一（多）个字段设置为索引")
    print(df.rename(columns = {'age': 'number'}, inplace=True))

    myprint("reset_index: 重置索引，默认重置后的索引为0~len(df)-1")
    print(df.reset_index(drop = True))

    # drop_duplicates()  去掉重复值
    # drop() 删掉DataFrame中的某些字段
    # isin() 构建布尔索引，对DataFrame的数据进行条件筛选 data.loc[data['company'].isin(['A','C'])]
    # pd.cut() | pd.qcut() 将连续变量离散化，比如将人的年龄划分为各个区间
    # print(pd.cut(df.age, bins = 5))
    # where() 将不符合条件的值替换掉成指定值，相当于执行了一个if-else
    # pd.concat() 将多个Series或DataFrame拼起来
    myprint("透视表")
    print(get_data2().pivot_table(values = 'salary', index = 'company', columns = 'gender', aggfunc = np.mean))


def gender_map(x):
    if x == 1:
        return "女"
    else:
        return "男"


def demo_map():
    # map函数：传入一个参数，对该参数进行处理。不能传入多个参数
    df = get_data()
    #print(df)

    myprint("修改列值")
    df["gender"] = df["gender"].map({"男": 1, "女": 2})
    print(df)

    myprint("根据条件修改列值，男女对调")
    df["gender"] = df["gender"].map(gender_map)
    print(df)


def demo_apply():
    # apply是自由度最高的函数，但是效率相对会低一些
    # axis = 0 代表 columns 轴，axis = 1 代表 row 轴
    df = get_data()
    # print(df)

    myprint("沿着 y 轴求和")
    df1 = df[["height", "weight", "age"]].apply(np.sum, axis=0)
    print(df1)

    myprint("沿着 y 轴求对数")
    df2 = df[["height", "weight", "age"]].apply(np.log, axis=0)
    print(df2)


def demo_applymap():
    # 对每个单元格执行指定函数的操作
    df = pd.DataFrame({
        "A": np.random.randn(5),
        "B": np.random.randn(5),
        "C": np.random.randn(5),
        "D": np.random.randn(5),
        "E": np.random.randn(5),
    })

    print(df)

    myprint("applymap: 对单元格里值保留2位小数")
    print(df.applymap(lambda x: "%.2f" % x))



demo_basis()

  
  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   height  5 non-null      int32 
 1   weight  5 non-null      int32 
 2   smoker  5 non-null      bool  
 3   gender  5 non-null      object
 4   age     5 non-null      int32 
 5   color   5 non-null      object
dtypes: bool(1), int32(3), object(2)
memory usage: 273.0+ bytes
None
  
  
           height     weight        age
count    5.000000   5.000000   5.000000
mean   164.000000  64.200000  52.200000
std     10.416333   8.288546  28.700174
min    155.000000  53.000000  18.000000
25%    158.000000  60.000000  33.000000
50%    158.000000  65.000000  49.000000
75%    169.000000  68.000000  72.000000
max    180.000000  75.000000  89.000000
  
  
count         5
unique        3
top       white
freq          3
Name: color, dtype: object
  
  
height    False
weight    False
smoker    False
gender    False
age 