In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 001使用list构造Series

In [2]:
# A list becomes the Series values; pandas supplies the default 0..n-1 integer index.
courses = [
    '语文',
    '数学',
    '英语',
    '计算机',
]
s1 = pd.Series(courses)
s1

0     语文
1     数学
2     英语
3    计算机
dtype: object

# 002使用dict构造Series

In [3]:
# Dict keys become the index labels and dict values become the data.
grades = {
    '语文': 80,
    '数学': 90,
    '英语': 85,
    '计算机': 100,
}
s2 = pd.Series(data=grades)
s2

语文      80
数学      90
英语      85
计算机    100
dtype: int64

# 003Series转换成list

In [4]:
# tolist() converts the Series values to a plain Python list (the index is dropped).
grd = s2.tolist()
grd

[80, 90, 85, 100]

# 004将Series转换成DataFrame

In [5]:
# Wrap the Series in a one-column frame named 'grade'; the Series index
# (the course names) is kept as the row index.
df1 = pd.DataFrame(data=s2, columns=['grade'])
df1

Unnamed: 0,grade
语文,80
数学,90
英语,85
计算机,100


# 005借助numpy创建Series

In [6]:
# Values 10, 20, ..., 90 labelled 101..109; dtype converts the integers to float64.
s3 = pd.Series(
    data=np.arange(10, 100, 10),
    index=np.arange(101, 110),
    dtype='float64',
)
s3

101    10.0
102    20.0
103    30.0
104    40.0
105    50.0
106    60.0
107    70.0
108    80.0
109    90.0
dtype: float64

# 006转换Series的数据类型

In [7]:
s4 = pd.Series(['001', '002', '003', '004'], index=['a', 'b', 'c', 'd'])
# Equivalent alternative: s4.astype('int64')
# map() applies the built-in int function to every string element.
s4.map(int)

a    1
b    2
c    3
d    4
dtype: int64

# 007给Series添加新的元素

## 方法1：通过Series添加

In [10]:
new_grades = {'物理':88, '化学':96}
s5 = pd.Series(new_grades)
# Series.append was deprecated in pandas 1.4 (hence the warning in the old output)
# and removed in pandas 2.0; pd.concat is the supported replacement.
# Like append, concat returns a new Series and leaves s2 unmodified.
s2_new1 = pd.concat([s2, s5])
s2_new1

  s2_new1 = s2.append(s5)


语文      80
数学      90
英语      85
计算机    100
物理      88
化学      96
dtype: int64

## 方法2：直接添加

In [11]:
# Work on a copy so the original s2 stays unchanged.
s2_new2 = s2.copy()
# Assigning to a label that does not exist yet appends a new element.
s2_new2.loc['物理'] = 88
s2_new2.loc['化学'] = 96
s2_new2

语文      80
数学      90
英语      85
计算机    100
物理      88
化学      96
dtype: int64

# 008用reset_index将Series转换成DataFrame

In [16]:
# reset_index moves the index labels into a regular column. With drop=False (the
# default, spelled out here) the result is a DataFrame; with drop=True the labels
# are discarded and a Series is returned; with inplace=True nothing is returned.
df2 = s2.reset_index(drop=False)
df2

Unnamed: 0,index,0
0,语文,80
1,数学,90
2,英语,85
3,计算机,100


更改列名

In [18]:
# Assigning to .columns renames the columns of df2 in place
# (they were 'index' and 0 after reset_index).
df2.columns = ['course', 'grade']
df2

Unnamed: 0,course,grade
0,语文,80
1,数学,90
2,英语,85
3,计算机,100


# 009使用字典创建DataFrame

In [33]:
# Each dict key is a column name; each list holds that column's values, row by row.
info_dict = {
    '姓名': ['小张', '小王', '小李', '小赵'],
    '性别': ['男', '女', '男', '女'],
    '年龄': [18, 19, 20, 18],
}
df3 = pd.DataFrame(data=info_dict)
df3

Unnamed: 0,姓名,性别,年龄
0,小张,男,18
1,小王,女,19
2,小李,男,20
3,小赵,女,18


# 010给DataFrame设置索引列

In [37]:
# set_index returns a new frame by default, so df3 itself is left untouched.
# drop=True (the default) removes '姓名' from the data columns once it is the index.
df4 = df3.set_index('姓名', drop=True)
df4

Unnamed: 0_level_0,性别,年龄
姓名,Unnamed: 1_level_1,Unnamed: 2_level_1
小张,男,18
小王,女,19
小李,男,20
小赵,女,18


# 011生成一个月份所有日期

In [51]:
# Option 1 (used here): give both end points; date_range includes both of them.
df6 = pd.date_range(start='2021-10-01', end='2021-10-31', freq='D')
# Option 2: give the start and the number of days instead.
# df6 = pd.date_range(start='2021-10-01', periods=31, freq='D')
df6

DatetimeIndex(['2021-10-01', '2021-10-02', '2021-10-03', '2021-10-04',
               '2021-10-05', '2021-10-06', '2021-10-07', '2021-10-08',
               '2021-10-09', '2021-10-10', '2021-10-11', '2021-10-12',
               '2021-10-13', '2021-10-14', '2021-10-15', '2021-10-16',
               '2021-10-17', '2021-10-18', '2021-10-19', '2021-10-20',
               '2021-10-21', '2021-10-22', '2021-10-23', '2021-10-24',
               '2021-10-25', '2021-10-26', '2021-10-27', '2021-10-28',
               '2021-10-29', '2021-10-30', '2021-10-31'],
              dtype='datetime64[ns]', freq='D')

# 012生成一年的所有周一日期

In [59]:
# 'W-MON' anchors the weekly frequency on Mondays. Bounding the range by the
# calendar year yields the same 52 Mondays (2021-01-04 .. 2021-12-27).
df7 = pd.date_range(start='2021-01-01', end='2021-12-31', freq='W-MON')
df7

DatetimeIndex(['2021-01-04', '2021-01-11', '2021-01-18', '2021-01-25',
               '2021-02-01', '2021-02-08', '2021-02-15', '2021-02-22',
               '2021-03-01', '2021-03-08', '2021-03-15', '2021-03-22',
               '2021-03-29', '2021-04-05', '2021-04-12', '2021-04-19',
               '2021-04-26', '2021-05-03', '2021-05-10', '2021-05-17',
               '2021-05-24', '2021-05-31', '2021-06-07', '2021-06-14',
               '2021-06-21', '2021-06-28', '2021-07-05', '2021-07-12',
               '2021-07-19', '2021-07-26', '2021-08-02', '2021-08-09',
               '2021-08-16', '2021-08-23', '2021-08-30', '2021-09-06',
               '2021-09-13', '2021-09-20', '2021-09-27', '2021-10-04',
               '2021-10-11', '2021-10-18', '2021-10-25', '2021-11-01',
               '2021-11-08', '2021-11-15', '2021-11-22', '2021-11-29',
               '2021-12-06', '2021-12-13', '2021-12-20', '2021-12-27'],
              dtype='datetime64[ns]', freq='W-MON')

# 013生成一天的所有小时

In [67]:
# Alternative: fix the number of periods instead of the end point.
# df8 = pd.date_range(start='2021-10-01', periods=24, freq='h')
# Lower-case 'h' is the hourly alias; upper-case 'H' is deprecated since pandas 2.2.
# inclusive='left' keeps the start bound and drops the end bound (2021-10-02 00:00).
df8 = pd.date_range(start='2021-10-01', end='2021-10-02', freq='h', inclusive='left')
df8

DatetimeIndex(['2021-10-01 00:00:00', '2021-10-01 01:00:00',
               '2021-10-01 02:00:00', '2021-10-01 03:00:00',
               '2021-10-01 04:00:00', '2021-10-01 05:00:00',
               '2021-10-01 06:00:00', '2021-10-01 07:00:00',
               '2021-10-01 08:00:00', '2021-10-01 09:00:00',
               '2021-10-01 10:00:00', '2021-10-01 11:00:00',
               '2021-10-01 12:00:00', '2021-10-01 13:00:00',
               '2021-10-01 14:00:00', '2021-10-01 15:00:00',
               '2021-10-01 16:00:00', '2021-10-01 17:00:00',
               '2021-10-01 18:00:00', '2021-10-01 19:00:00',
               '2021-10-01 20:00:00', '2021-10-01 21:00:00',
               '2021-10-01 22:00:00', '2021-10-01 23:00:00'],
              dtype='datetime64[ns]', freq='H')

# 014用日期生成DataFrame

In [75]:
day = pd.date_range(start='2021-10-01', periods=31, freq='D')
# dayofyear gives each date's ordinal within its year (2021-10-01 is day 274).
df9 = pd.DataFrame(data=dict(day=day, day_of_year=day.dayofyear))
df9

Unnamed: 0,day,day_of_year
0,2021-10-01,274
1,2021-10-02,275
2,2021-10-03,276
3,2021-10-04,277
4,2021-10-05,278
5,2021-10-06,279
6,2021-10-07,280
7,2021-10-08,281
8,2021-10-09,282
9,2021-10-10,283


# 015使用日期和随机数生成DataFrame

In [94]:
# Draw from a seeded Generator so that Restart & Run All reproduces the same
# sample; the unseeded global np.random state gave different data on every run.
rng = np.random.default_rng(42)
date = pd.date_range(start='2021-01-01', periods=1000, freq='D')
data = {
    'norm' : rng.normal(loc=0, scale=1, size=1000),        # standard normal
    'uniform' : rng.uniform(low=0, high=1, size=1000),     # uniform on [0, 1)
    'binomial' : rng.binomial(n=1, p=0.2, size=1000)       # 0/1 draws with P(1) = 0.2
}
# One row per calendar day, indexed by the date.
df10 = pd.DataFrame(data=data, index=date)
df10

Unnamed: 0,norm,uniform,binomial
2021-01-01,-0.943707,0.738327,0
2021-01-02,-0.111825,0.968524,1
2021-01-03,0.071392,0.390900,0
2021-01-04,-1.403814,0.615796,0
2021-01-05,1.520362,0.018564,0
...,...,...,...
2023-09-23,0.077795,0.400052,0
2023-09-24,-1.814570,0.050924,0
2023-09-25,1.013824,0.805262,0
2023-09-26,-0.061858,0.808770,0


# 016打印DataFrame的前后数据行

In [96]:
# First 10 rows, a blank gap, then the last 5 rows, printed as plain text.
print(df10.iloc[:10])
print('\n')
print(df10.iloc[-5:])

                norm   uniform  binomial
2021-01-01 -0.943707  0.738327         0
2021-01-02 -0.111825  0.968524         1
2021-01-03  0.071392  0.390900         0
2021-01-04 -1.403814  0.615796         0
2021-01-05  1.520362  0.018564         0
2021-01-06  0.419204  0.509082         0
2021-01-07  0.676923  0.160707         0
2021-01-08 -0.446566  0.274765         0
2021-01-09  2.507684  0.611809         0
2021-01-10  1.066659  0.334029         1


                norm   uniform  binomial
2023-09-23  0.077795  0.400052         0
2023-09-24 -1.814570  0.050924         0
2023-09-25  1.013824  0.805262         0
2023-09-26 -0.061858  0.808770         0
2023-09-27 -0.398729  0.736480         0


# 017DataFrame的信息和基本数据统计

In [99]:
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
df10.info()
print('\n')
# describe() returns a summary-statistics frame, so this one does need print().
print(df10.describe())

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1000 entries, 2021-01-01 to 2023-09-27
Freq: D
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   norm      1000 non-null   float64
 1   uniform   1000 non-null   float64
 2   binomial  1000 non-null   int64  
dtypes: float64(2), int64(1)
memory usage: 31.2 KB
None


              norm      uniform     binomial
count  1000.000000  1000.000000  1000.000000
mean     -0.018324     0.507354     0.203000
std       0.980199     0.293294     0.402434
min      -2.679493     0.000859     0.000000
25%      -0.712928     0.250209     0.000000
50%      -0.060179     0.507055     0.000000
75%       0.648760     0.762867     0.000000
max       2.914688     0.999570     1.000000


# 018统计数据列的值出现次数

In [103]:
# Count how many times each distinct value occurs in the 'binomial' column.
df10.loc[:, 'binomial'].value_counts()

0    797
1    203
Name: binomial, dtype: int64

# 019DataFrame前N行存入csv文件

In [113]:
# Write only the first 100 rows; the date index is saved as the first CSV column.
df10.iloc[:100].to_csv('df10_100.csv')

# 020加载csv文件到DataFrame

In [121]:
# index_col=0 turns the first CSV column (the saved dates) back into the index.
# NOTE: without parse_dates=True those labels are read back as plain strings.
df10_100 = pd.read_csv(filepath_or_buffer='./df10_100.csv', index_col=0)
df10_100

Unnamed: 0,norm,uniform,binomial
2021-01-01,-0.943707,0.738327,0
2021-01-02,-0.111825,0.968524,1
2021-01-03,0.071392,0.390900,0
2021-01-04,-1.403814,0.615796,0
2021-01-05,1.520362,0.018564,0
...,...,...,...
2021-04-06,-0.037234,0.722495,0
2021-04-07,-0.396256,0.606541,0
2021-04-08,-0.545420,0.670817,0
2021-04-09,0.196980,0.645179,0


# 021加载股票数据csv文件

In [139]:
# No index_col is given, so 'Date' stays a regular column and rows get a 0..n-1 index.
df11 = pd.read_csv(filepath_or_buffer='./00700.HK.csv')
df11

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,2021-09-30,456.000,464.600,453.800,461.400,17335451
1,2021-09-29,461.600,465.000,450.200,465.000,18250450
2,2021-09-28,467.000,476.200,464.600,469.800,20947276
3,2021-09-27,459.000,473.000,455.200,464.600,17966998
4,2021-09-24,461.400,473.400,456.200,460.200,16656914
...,...,...,...,...,...,...
4262,2004-06-23,4.050,4.450,4.025,4.425,55016000
4263,2004-06-21,4.125,4.125,3.950,4.000,22817000
4264,2004-06-18,4.200,4.250,3.950,4.025,36598000
4265,2004-06-17,4.150,4.375,4.125,4.225,83801500


# 022查看基本信息和数据统计

In [144]:
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
df11.info()
print()
# describe() returns a summary-statistics frame, so this one does need print().
print(df11.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4267 entries, 0 to 4266
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    4267 non-null   object 
 1   Open    4267 non-null   float64
 2   High    4267 non-null   float64
 3   Low     4267 non-null   float64
 4   Close   4267 non-null   float64
 5   Volume  4267 non-null   int64  
dtypes: float64(4), int64(1), object(1)
memory usage: 200.1+ KB
None

              Open         High          Low        Close        Volume
count  4267.000000  4267.000000  4267.000000  4267.000000  4.267000e+03
mean    211.010729   213.575773   208.063849   210.778935  1.198943e+07
std     170.116303   172.173404   167.716394   169.807145  1.413407e+07
min       3.375000     3.500000     3.375000     3.375000  4.523670e+05
25%      58.000000    59.200000    56.050000    57.500000  3.600698e+06
50%     170.900000   173.300000   168.000000   170.700000  7.647284e+06
75%     330.700000   333

# 023更改索引列为普通数据列

In [189]:
# Load with the first CSV column ('Date') as the index ...
df11 = pd.read_csv('./00700.HK.csv', index_col=0)
print(df11.head())
# ... then move it back into an ordinary column, leaving a fresh 0..n-1 index.
df11 = df11.reset_index()
print(df11.head())

             Open   High    Low  Close    Volume
Date                                            
2021-09-30  456.0  464.6  453.8  461.4  17335451
2021-09-29  461.6  465.0  450.2  465.0  18250450
2021-09-28  467.0  476.2  464.6  469.8  20947276
2021-09-27  459.0  473.0  455.2  464.6  17966998
2021-09-24  461.4  473.4  456.2  460.2  16656914
         Date   Open   High    Low  Close    Volume
0  2021-09-30  456.0  464.6  453.8  461.4  17335451
1  2021-09-29  461.6  465.0  450.2  465.0  18250450
2  2021-09-28  467.0  476.2  464.6  469.8  20947276
3  2021-09-27  459.0  473.0  455.2  464.6  17966998
4  2021-09-24  461.4  473.4  456.2  460.2  16656914


# 024给股票数据新增年份和月份

先将日期列的字符串类型转换为日期类型

In [194]:
df11 = pd.read_csv('./00700.HK.csv')
print(df11['Date'].dtype)  # dtype as loaded from the CSV
# Option 1: df11['Date'] = pd.to_datetime(df11['Date'])
# Option 2 (used here): cast the string column straight to datetime64[ns].
df11['Date'] = df11['Date'].astype('datetime64[ns]')
print(df11['Date'].dtype)  # dtype after the cast

object
datetime64[ns]


In [195]:
# The .dt accessor exposes the datetime components of the 'Date' column;
# each assignment adds a new column to df11 in place.
df11['Year'] = df11['Date'].dt.year
df11['Month'] = df11['Date'].dt.month
# Show both ends of the frame to check the new columns.
print(df11.head())
print(df11.tail())

        Date   Open   High    Low  Close    Volume  Year  Month
0 2021-09-30  456.0  464.6  453.8  461.4  17335451  2021      9
1 2021-09-29  461.6  465.0  450.2  465.0  18250450  2021      9
2 2021-09-28  467.0  476.2  464.6  469.8  20947276  2021      9
3 2021-09-27  459.0  473.0  455.2  464.6  17966998  2021      9
4 2021-09-24  461.4  473.4  456.2  460.2  16656914  2021      9
           Date   Open   High    Low  Close     Volume  Year  Month
4262 2004-06-23  4.050  4.450  4.025  4.425   55016000  2004      6
4263 2004-06-21  4.125  4.125  3.950  4.000   22817000  2004      6
4264 2004-06-18  4.200  4.250  3.950  4.025   36598000  2004      6
4265 2004-06-17  4.150  4.375  4.125  4.225   83801500  2004      6
4266 2004-06-16  4.375  4.625  4.075  4.150  439775000  2004      6


# 025计算股票每年的平均收盘价

In [213]:
# Group rows by calendar year and average the closing price within each group.
df11.groupby('Year')['Close'].mean()

Year
2004      4.338686
2005      6.568927
2006     15.865951
2007     37.882724
2008     54.818367
2009     96.369679
2010    157.299598
2011    189.737398
2012    228.987045
2013    337.136066
2014    271.291498
2015    144.824291
2016    176.562041
2017    291.066667
2018    372.678862
2019    346.225203
2020    479.141129
2021    586.649189
Name: Close, dtype: float64

# 026股票数据找出收盘价最低行

In [223]:
# df11['Close'].min() # returns the minimum value itself
# idxmin() returns the index *label* of the minimum, which is what .loc expects.
# argmin() returns an integer *position*; it only matched here because df11 has a
# default 0..n-1 index and would pick the wrong row (or raise) on any other index.
df11.loc[df11['Close'].idxmin()]

Date      2004-07-26 00:00:00
Open                     3.45
High                      3.5
Low                     3.375
Close                   3.375
Volume                7439000
Year                     2004
Name: 4240, dtype: object

In [226]:
# Passing a list of labels to .loc returns a one-row DataFrame; a bare label
# returns a Series. idxmin() supplies the label of the minimum (argmin() would
# supply a position, which label-based .loc can misinterpret).
df11.loc[[df11['Close'].idxmin()]]

Unnamed: 0,Date,Open,High,Low,Close,Volume,Year
4240,2004-07-26,3.45,3.5,3.375,3.375,7439000,2004


# 027筛选部分数据列

In [230]:
# Keep only the listed columns (in this order) and preview the first five rows.
df11.loc[:, ['Date', 'Open', 'Close', 'Volume']].head()

Unnamed: 0,Date,Open,Close,Volume
0,2021-09-30,456.0,461.4,17335451
1,2021-09-29,461.6,465.0,18250450
2,2021-09-28,467.0,469.8,20947276
3,2021-09-27,459.0,464.6,17966998
4,2021-09-24,461.4,460.2,16656914


# 028设置日期列为索引列

方法1：读取时设置

In [240]:
# index_col names the CSV column that becomes the row index while loading.
df11 = pd.read_csv(filepath_or_buffer='./00700.HK.csv', index_col='Date')
df11.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-09-30,456.0,464.6,453.8,461.4,17335451
2021-09-29,461.6,465.0,450.2,465.0,18250450
2021-09-28,467.0,476.2,464.6,469.8,20947276
2021-09-27,459.0,473.0,455.2,464.6,17966998
2021-09-24,461.4,473.4,456.2,460.2,16656914


方法2：后续设置

In [238]:
# Load first, then promote 'Date' to the index in a second step.
df11 = pd.read_csv('./00700.HK.csv').set_index('Date')
df11.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-09-30,456.0,464.6,453.8,461.4,17335451
2021-09-29,461.6,465.0,450.2,465.0,18250450
2021-09-28,467.0,476.2,464.6,469.8,20947276
2021-09-27,459.0,473.0,455.2,464.6,17966998
2021-09-24,461.4,473.4,456.2,460.2,16656914


# 029删除不需要的数据列

方法1：用pop逐列删除

In [252]:
df11 = pd.read_csv('./00700.HK.csv')
# pop removes one column per call, modifying df11 in place.
for _col in ('High', 'Low'):
    df11.pop(_col)
df11.head()

Unnamed: 0,Date,Open,Close,Volume
0,2021-09-30,456.0,461.4,17335451
1,2021-09-29,461.6,465.0,18250450
2,2021-09-28,467.0,469.8,20947276
3,2021-09-27,459.0,464.6,17966998
4,2021-09-24,461.4,460.2,16656914


方法2：用drop一次删除多列

In [253]:
# drop(columns=[...]) removes several columns in a single call and returns a new frame.
df11 = pd.read_csv('./00700.HK.csv').drop(columns=['High', 'Low'])
df11.head()

Unnamed: 0,Date,Open,Close,Volume
0,2021-09-30,456.0,461.4,17335451
1,2021-09-29,461.6,465.0,18250450
2,2021-09-28,467.0,469.8,20947276
3,2021-09-27,459.0,464.6,17966998
4,2021-09-24,461.4,460.2,16656914


# 030对列进行重命名