In [184]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号

In [185]:
courses = ['语文', '数学', '英语', '计算机']

ser1 = pd.Series(data=courses)
ser1

0     语文
1     数学
2     英语
3    计算机
dtype: object

In [186]:
grades = {"语文": 80, "数学": 90, "英语": 85, "计算机": 100}

ser2 = pd.Series(data=grades)
ser2

语文      80
数学      90
英语      85
计算机    100
dtype: int64

In [187]:
list1 = ser2.to_list()
list1

[80, 90, 85, 100]

In [188]:
df1 = pd.DataFrame(data=ser2, columns=['grade'])
df1

Unnamed: 0,grade
语文,80
数学,90
英语,85
计算机,100


In [189]:
ser3 = pd.Series(data=np.arange(10, 100, 10), index=np.arange(101, 110, 1), dtype=np.float64)
ser3

101    10.0
102    20.0
103    30.0
104    40.0
105    50.0
106    60.0
107    70.0
108    80.0
109    90.0
dtype: float64

In [190]:
ser4 = pd.Series(data=["001", "002", "003", "004"], index=["a", "b", "c", "d"])
# 转换类型
ser4 = ser4.astype(dtype=np.int64)
ser4

a    1
b    2
c    3
d    4
dtype: int64

In [191]:
ser2 = ser2.append(pd.Series({"物理": 95, "化学": 90}))
ser2

  ser2 = ser2.append(pd.Series({"物理": 95, "化学": 90}))


语文      80
数学      90
英语      85
计算机    100
物理      95
化学      90
dtype: int64

In [192]:
df1.reset_index(names=['course', 'grade'])

Unnamed: 0,course,grade
0,语文,80
1,数学,90
2,英语,85
3,计算机,100


In [193]:
df2 = pd.DataFrame(data={
    "姓名": ["小张", "小王", "小李", "小赵"],
    "性别": ["男", "女", "男", "女"],
    "年龄": [18, 19, 20, 18],
})
df2

Unnamed: 0,姓名,性别,年龄
0,小张,男,18
1,小王,女,19
2,小李,男,20
3,小赵,女,18


In [194]:
df2 = df2.set_index("姓名")
df2

Unnamed: 0_level_0,性别,年龄
姓名,Unnamed: 1_level_1,Unnamed: 2_level_1
小张,男,18
小王,女,19
小李,男,20
小赵,女,18


In [195]:
# 生成一个月份的所有天
pd.date_range(start="2021-10-01", end="2021-10-31", freq="D")


DatetimeIndex(['2021-10-01', '2021-10-02', '2021-10-03', '2021-10-04',
               '2021-10-05', '2021-10-06', '2021-10-07', '2021-10-08',
               '2021-10-09', '2021-10-10', '2021-10-11', '2021-10-12',
               '2021-10-13', '2021-10-14', '2021-10-15', '2021-10-16',
               '2021-10-17', '2021-10-18', '2021-10-19', '2021-10-20',
               '2021-10-21', '2021-10-22', '2021-10-23', '2021-10-24',
               '2021-10-25', '2021-10-26', '2021-10-27', '2021-10-28',
               '2021-10-29', '2021-10-30', '2021-10-31'],
              dtype='datetime64[ns]', freq='D')

In [196]:
pd.date_range(start="2021-01-01", end="2021-12-31", freq="W-MON")

DatetimeIndex(['2021-01-04', '2021-01-11', '2021-01-18', '2021-01-25',
               '2021-02-01', '2021-02-08', '2021-02-15', '2021-02-22',
               '2021-03-01', '2021-03-08', '2021-03-15', '2021-03-22',
               '2021-03-29', '2021-04-05', '2021-04-12', '2021-04-19',
               '2021-04-26', '2021-05-03', '2021-05-10', '2021-05-17',
               '2021-05-24', '2021-05-31', '2021-06-07', '2021-06-14',
               '2021-06-21', '2021-06-28', '2021-07-05', '2021-07-12',
               '2021-07-19', '2021-07-26', '2021-08-02', '2021-08-09',
               '2021-08-16', '2021-08-23', '2021-08-30', '2021-09-06',
               '2021-09-13', '2021-09-20', '2021-09-27', '2021-10-04',
               '2021-10-11', '2021-10-18', '2021-10-25', '2021-11-01',
               '2021-11-08', '2021-11-15', '2021-11-22', '2021-11-29',
               '2021-12-06', '2021-12-13', '2021-12-20', '2021-12-27'],
              dtype='datetime64[ns]', freq='W-MON')

In [197]:
pd.date_range(start="2021-01-01", periods=24, freq="H")

DatetimeIndex(['2021-01-01 00:00:00', '2021-01-01 01:00:00',
               '2021-01-01 02:00:00', '2021-01-01 03:00:00',
               '2021-01-01 04:00:00', '2021-01-01 05:00:00',
               '2021-01-01 06:00:00', '2021-01-01 07:00:00',
               '2021-01-01 08:00:00', '2021-01-01 09:00:00',
               '2021-01-01 10:00:00', '2021-01-01 11:00:00',
               '2021-01-01 12:00:00', '2021-01-01 13:00:00',
               '2021-01-01 14:00:00', '2021-01-01 15:00:00',
               '2021-01-01 16:00:00', '2021-01-01 17:00:00',
               '2021-01-01 18:00:00', '2021-01-01 19:00:00',
               '2021-01-01 20:00:00', '2021-01-01 21:00:00',
               '2021-01-01 22:00:00', '2021-01-01 23:00:00'],
              dtype='datetime64[ns]', freq='H')

In [198]:
day = pd.date_range(start="2021-10-01", end="2021-10-31", freq="D")
df3 = pd.DataFrame(data=day, columns=["day"])
df3['day_of_year'] = df3.day.dt.dayofyear
df3

Unnamed: 0,day,day_of_year
0,2021-10-01,274
1,2021-10-02,275
2,2021-10-03,276
3,2021-10-04,277
4,2021-10-05,278
5,2021-10-06,279
6,2021-10-07,280
7,2021-10-08,281
8,2021-10-09,282
9,2021-10-10,283


In [199]:
day = pd.date_range(start="2021-01-01", freq="D", periods=1000)

data = {
    'normal': np.random.normal(loc=0, scale=1, size=1000),
    'uniform': np.random.uniform(low=0, high=1, size=1000),
    'binomial': np.random.binomial(n=1, p=0.2, size=1000)
}
df4 = pd.DataFrame(data=data, index=day)
df4

Unnamed: 0,normal,uniform,binomial
2021-01-01,-0.281058,0.676171,1
2021-01-02,-1.001062,0.837030,1
2021-01-03,0.193402,0.064085,0
2021-01-04,0.105596,0.562649,0
2021-01-05,1.106827,0.690843,0
...,...,...,...
2023-09-23,-0.041311,0.347304,0
2023-09-24,-0.173578,0.967672,0
2023-09-25,-0.689982,0.959804,1
2023-09-26,-0.533933,0.939226,1


In [200]:
df4.head(10)

Unnamed: 0,normal,uniform,binomial
2021-01-01,-0.281058,0.676171,1
2021-01-02,-1.001062,0.83703,1
2021-01-03,0.193402,0.064085,0
2021-01-04,0.105596,0.562649,0
2021-01-05,1.106827,0.690843,0
2021-01-06,0.097741,0.072218,1
2021-01-07,-1.015255,0.723153,0
2021-01-08,-0.00071,0.94643,1
2021-01-09,2.41825,0.144694,0
2021-01-10,0.375052,0.882115,1


In [201]:
# 后5行
df4.tail(5)

Unnamed: 0,normal,uniform,binomial
2023-09-23,-0.041311,0.347304,0
2023-09-24,-0.173578,0.967672,0
2023-09-25,-0.689982,0.959804,1
2023-09-26,-0.533933,0.939226,1
2023-09-27,-1.009373,0.26577,1


In [202]:
df4.to_csv(r"C:\Users\wdl\Data-analysis\Practice_03\data\分布数据前50.csv")

In [203]:
df5 = pd.read_csv(r"C:\Users\wdl\Data-analysis\Practice_03\data\百度股票数据.csv", index_col=0)
df5.info()

<class 'pandas.core.frame.DataFrame'>
Index: 558 entries, 2021/1/4 to 2023/4/21
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   open    558 non-null    float64
 1   close   558 non-null    float64
 2   high    558 non-null    float64
 3   low     558 non-null    float64
 4   volume  558 non-null    int64  
 5   code    558 non-null    int64  
dtypes: float64(4), int64(2)
memory usage: 30.5+ KB


In [204]:
df5.head()

Unnamed: 0_level_0,open,close,high,low,volume,code
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021/1/4,8.75,8.8,8.84,8.66,629069,600000
2021/1/5,8.79,8.79,8.79,8.63,538592,600000
2021/1/6,8.73,8.93,8.94,8.73,618813,600000
2021/1/7,8.94,8.92,9.04,8.77,570904,600000
2021/1/8,8.94,8.94,9.01,8.84,558015,600000


In [205]:
df5.reset_index(inplace=True)
df5

Unnamed: 0,date,open,close,high,low,volume,code
0,2021/1/4,8.75,8.80,8.84,8.66,629069,600000
1,2021/1/5,8.79,8.79,8.79,8.63,538592,600000
2,2021/1/6,8.73,8.93,8.94,8.73,618813,600000
3,2021/1/7,8.94,8.92,9.04,8.77,570904,600000
4,2021/1/8,8.94,8.94,9.01,8.84,558015,600000
...,...,...,...,...,...,...,...
553,2023/4/17,7.26,7.39,7.40,7.26,360278,600000
554,2023/4/18,7.38,7.54,7.59,7.37,774123,600000
555,2023/4/19,7.49,7.53,7.58,7.43,522698,600000
556,2023/4/20,7.50,7.68,7.69,7.50,723407,600000


In [206]:
df5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 558 entries, 0 to 557
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    558 non-null    object 
 1   open    558 non-null    float64
 2   close   558 non-null    float64
 3   high    558 non-null    float64
 4   low     558 non-null    float64
 5   volume  558 non-null    int64  
 6   code    558 non-null    int64  
dtypes: float64(4), int64(2), object(1)
memory usage: 30.6+ KB


In [207]:
df5['date'] = df5.date.astype(dtype=np.datetime64)

In [208]:
df5['year'] = df5.date.dt.year
df5['month'] = df5.date.dt.month

In [209]:
np.round(df5.groupby('year').mean(), 2)

  np.round(df5.groupby('year').mean(), 2)


Unnamed: 0_level_0,open,close,high,low,volume,code,month
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021,9.01,9.01,9.09,8.93,493553.44,600000.0,6.61
2022,7.45,7.44,7.5,7.4,296075.83,600000.0,6.62
2023,7.26,7.26,7.3,7.22,246995.75,600000.0,2.48


In [210]:
# 输出最小收盘价的那行
df5[df5.close == df5.close.min()]

Unnamed: 0,date,open,close,high,low,volume,code,year,month
440,2022-10-31,6.73,6.64,6.75,6.63,322374,600000,2022,10


In [211]:
df6 = df5.drop(columns=['year', 'month', 'high', 'low'])
df6

Unnamed: 0,date,open,close,volume,code
0,2021-01-04,8.75,8.80,629069,600000
1,2021-01-05,8.79,8.79,538592,600000
2,2021-01-06,8.73,8.93,618813,600000
3,2021-01-07,8.94,8.92,570904,600000
4,2021-01-08,8.94,8.94,558015,600000
...,...,...,...,...,...
553,2023-04-17,7.26,7.39,360278,600000
554,2023-04-18,7.38,7.54,774123,600000
555,2023-04-19,7.49,7.53,522698,600000
556,2023-04-20,7.50,7.68,723407,600000


In [212]:
df6.set_index('date', inplace=True)
df6

Unnamed: 0_level_0,open,close,volume,code
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-01-04,8.75,8.80,629069,600000
2021-01-05,8.79,8.79,538592,600000
2021-01-06,8.73,8.93,618813,600000
2021-01-07,8.94,8.92,570904,600000
2021-01-08,8.94,8.94,558015,600000
...,...,...,...,...
2023-04-17,7.26,7.39,360278,600000
2023-04-18,7.38,7.54,774123,600000
2023-04-19,7.49,7.53,522698,600000
2023-04-20,7.50,7.68,723407,600000


In [213]:
df5.rename({'date': 'D', 'open': 'O', 'close': 'C', 'high': 'H', 'low': 'L', 'volume': 'V'}, inplace=True, axis=1)
df5

Unnamed: 0,D,O,C,H,L,V,code,year,month
0,2021-01-04,8.75,8.80,8.84,8.66,629069,600000,2021,1
1,2021-01-05,8.79,8.79,8.79,8.63,538592,600000,2021,1
2,2021-01-06,8.73,8.93,8.94,8.73,618813,600000,2021,1
3,2021-01-07,8.94,8.92,9.04,8.77,570904,600000,2021,1
4,2021-01-08,8.94,8.94,9.01,8.84,558015,600000,2021,1
...,...,...,...,...,...,...,...,...,...
553,2023-04-17,7.26,7.39,7.40,7.26,360278,600000,2023,4
554,2023-04-18,7.38,7.54,7.59,7.37,774123,600000,2023,4
555,2023-04-19,7.49,7.53,7.58,7.43,522698,600000,2023,4
556,2023-04-20,7.50,7.68,7.69,7.50,723407,600000,2023,4


In [214]:
df7 = pd.read_csv(r'C:\Users\wdl\Data-analysis\Practice_03\data\Telco-Customer-Churn.csv', )
df7.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [215]:
df7.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [216]:
df7.TotalCharges.value_counts()

          11
20.2      11
19.75      9
20.05      8
19.9       8
          ..
6849.4     1
692.35     1
130.15     1
3211.9     1
6844.5     1
Name: TotalCharges, Length: 6531, dtype: int64

In [217]:
median = df7.TotalCharges[df7.TotalCharges != ' '].median()
df7.TotalCharges.replace(' ', median, inplace=True)

In [218]:
df7.TotalCharges.value_counts()

1397.475    11
20.2        11
19.75        9
20.05        8
19.9         8
            ..
6849.4       1
692.35       1
130.15       1
3211.9       1
6844.5       1
Name: TotalCharges, Length: 6531, dtype: int64