### Pandas 中的数据结构

#### Pandas Series

In [23]:
import numpy as np
import pandas as pd
from imageio.v3 import improps
from pandas import DataFrame, Series

arr = np.arange(6, 11)
s1 = Series(arr, index=['A', 'B', 'C', 'D', 'E'])
print(s1, '\n')

array2 = np.array([100, 200, 300, 400, 500])
Series2 = Series(array2, index=[0, 1, 2, 3, 4])
print(Series2, '\n')

data = {
    'name': 'xinyi',
    'student_id': 'ZF2408D41'
}

data_series = Series(data, index=['student_id', 1])
print(data_series, '\n')

ss2 = Series(33, index=np.arange(1, 13))
print(ss2, '\n')

# access Series with index and using slice same as array in NumPy

ss3 = Series([100, 200, 300, 400, 500], index=['a', 'b', 'c', 'd', 'e'])
print(ss3, '\n')

# deprecated
# print(ss3[0])

ss3.iloc[0] = 33
print(ss3, '\n')

print(ss3[2:3], '\n')
print(ss3[2:], '\n')
print(ss3[-1:], '\n')

print(data_series['student_id'], '\n')

# head() tail()
ss4 = pd.Series([100, 200, 300, 400, 500], index=['a', 'b', 'c', 'd', 'e'])
ss5 = ss4.head(2)
print(ss5, '\n')

ss6 = ss5.tail(1)
print(ss6, '\n')

A     6
B     7
C     8
D     9
E    10
dtype: int32 

0    100
1    200
2    300
3    400
4    500
dtype: int32 

student_id    ZF2408D41
1                   NaN
dtype: object 

1     33
2     33
3     33
4     33
5     33
6     33
7     33
8     33
9     33
10    33
11    33
12    33
dtype: int64 

a    100
b    200
c    300
d    400
e    500
dtype: int64 

a     33
b    200
c    300
d    400
e    500
dtype: int64 

c    300
dtype: int64 

c    300
d    400
e    500
dtype: int64 

e    500
dtype: int64 

ZF2408D41 

a    100
b    200
dtype: int64 

b    200
dtype: int64 



#### Pandas DataFrame

In [3]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

s = Series(np.arange(1, 10))
print(s, '\n')

DataFrame(s, columns=['val'])

# ...
df = DataFrame((np.arange(20) * 2).reshape(4, 5), columns=['col1', 'col2', 'col3', 'col4', 'col5'])
print(df, '\n')

# ...
person_info = [['xin', 10], ['bob', 20], ['haha', 30]]
person_info_dataframe = DataFrame(person_info, columns=['Name', 'Age'])  # 添加dtype报错
print(person_info_dataframe, '\n')

# build with dict instance
data = {
    'Name': ['Tom', 'Jack', 'Steve', 'Ricky'],
    'Age': [21, 22, 23, 24]
}

frame = DataFrame(data, columns=['Name', 'Age'], index=['rank1', 'rank2', 'rank3', 'rank4'])
print(frame, '\n')

# build DataFrame from Series
# key is column in dataframe
d = {
    'one': Series([1, 2, 3], index=['a', 'b', 'c']),
    'two': Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
}

dd = DataFrame(d)
print(dd, '\n')
print(dd.T, '\n')

dff = DataFrame(np.random.rand(8, 4), columns=['A', 'B', 'C', 'D'])
print(dff.C, '\n')
# print(dff['C', 'D'], '\n') # error
print(dff.C, '\n')
print(dff[['C', 'D']], '\n')
print(dff[1:3], '\n')

#
ddff = DataFrame(
    np.random.rand(8, 4),
    index=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'],
    columns=['A', 'B', 'C', 'D']
)

print(ddff, '\n')

print(ddff['a':'d'], '\n')
print(ddff.loc['a':'d'], '\n')
print(ddff.loc[['a', 'c', 'd']], '\n')

print(ddff.loc[:, ['A', 'C']], '\n')
print(ddff.loc[:, 'A':'C'], '\n')

# ----

ipl_data = {
    'Team': ['Riders', 'Riders', 'Devils', 'Devils', 'Kings', 'kings', 'Kings',
             'Kings', 'Riders', 'Royals', 'Royals', 'Riders'],
    'Rank': [1, 2, 2, 3, 3, 4, 1, 1, 2, 4, 1, 2],
    'Year': [2014, 2015, 2014, 2015, 2014, 2015, 2016, 2017, 2016, 2014, 2015, 2017],
    'Points': [876, 789, 863, 673, 741, 812, 756, 788, 694, 701, 804, 690]
}

ff = DataFrame(ipl_data, columns=['Team', 'Rank', 'Year', 'Points'])
print(ff, '\n')
print(ff.groupby('Team').groups, '\n')

# from csv
# https://blog.csdn.net/FrankieHello/article/details/97272990
csv_dataframe = pd.read_csv('./data.csv')
print(csv_dataframe, '\n')

# groupby function return DataFrameGroupBy instance
grouped = csv_dataframe.groupby('Gender')
grouped_muti = csv_dataframe.groupby(['Gender','Age'])

print(grouped.size(), '\n')
print(grouped_muti.size(), '\n')

# get_group function return DataFrame instance
print(grouped.get_group('Female'), '\n')



0    1
1    2
2    3
3    4
4    5
5    6
6    7
7    8
8    9
dtype: int32 

   col1  col2  col3  col4  col5
0     0     2     4     6     8
1    10    12    14    16    18
2    20    22    24    26    28
3    30    32    34    36    38 

   Name  Age
0   xin   10
1   bob   20
2  haha   30 

        Name  Age
rank1    Tom   21
rank2   Jack   22
rank3  Steve   23
rank4  Ricky   24 

   one  two
a  1.0    1
b  2.0    2
c  3.0    3
d  NaN    4 

       a    b    c    d
one  1.0  2.0  3.0  NaN
two  1.0  2.0  3.0  4.0 

0    0.191131
1    0.221236
2    0.514270
3    0.363147
4    0.387799
5    0.340266
6    0.622136
7    0.878015
Name: C, dtype: float64 

0    0.191131
1    0.221236
2    0.514270
3    0.363147
4    0.387799
5    0.340266
6    0.622136
7    0.878015
Name: C, dtype: float64 

          C         D
0  0.191131  0.150563
1  0.221236  0.644417
2  0.514270  0.290360
3  0.363147  0.520527
4  0.387799  0.229385
5  0.340266  0.609483
6  0.622136  0.269507
7  0.878015  0.009092 

  

#### DataFrame 日期数据

In [5]:
date_list = pd.date_range('20250910', periods=6)
date_list

DatetimeIndex(['2025-09-10', '2025-09-11', '2025-09-12', '2025-09-13',
               '2025-09-14', '2025-09-15'],
              dtype='datetime64[ns]', freq='D')

In [7]:
dataframe = pd.DataFrame(np.random.randn(6, 4), index=date_list,columns=['A', 'B', 'C', 'D'])

ddd = pd.read_csv('./data.csv', index_col=0)
ddd

Unnamed: 0,Name,Gender,Age,Score
0,Alen,Male,18,80
1,Bob,Male,19,90
2,Cidy,Female,18,93
3,Daniel,Male,20,87
4,Ellen,Female,17,96
5,Frankie,Male,21,100
6,Gate,Male,20,88
7,Hebe,Female,22,98
