# 用于对本月的历史数据按照日期和板块名称进行分组统计，结果按总市值区间分别保存到 result_<区间>_<日期>.csv 文件

In [109]:
import pandas as pd
from datetime import datetime
# import sys
# sys.path.append('../utils')
# import constants
from functools import lru_cache
# Market-cap buckets keyed by label. Each value is a (lower, upper) pair;
# the export loop multiplies both bounds by 100_000_000 and compares them
# inclusively against the 总市值 column.
OPTION_DICT = {
    "all": (-float("inf"), float("inf")),
    "0-100": (0, 100),
    "100-500": (100, 500),
    "500-1000": (500, 1000),
    "1000-30000": (1000, 30000),
}

# Column labels assigned positionally to the nine 涨跌幅 bins returned by
# get_range, ordered from the lowest bin to the highest.
RANGE = [
    "跌停",
    "跌<-5%",
    "-3%<-5%",
    "-3<-1%",
    "平盘",
    "<3%",
    "3-5%",
    "5%-涨停",
    "涨停",
]


In [79]:
def get_data() -> pd.DataFrame:
    """
    Load today's merged quote history and attach market capitalisation.

    Reads ``../data/merge_<YYYY-MM-DD>.csv`` (stamped with today's date),
    using its first column as the index and parsing ``日期`` as dates, then
    looks up every row's ``股票代码`` in ``../data/总股本.csv`` to add:

    * ``总股本`` - the value found in the lookup file, NaN when the code
      is not present there.
    * ``总市值`` - ``总股本 * 收盘``.

    Returns:
        The quote DataFrame with the two extra columns.

    Raises:
        FileNotFoundError: if either CSV file is missing.
    """
    today = datetime.now().strftime('%Y-%m-%d')
    df = pd.read_csv(
        f"../data/merge_{today}.csv",
        parse_dates=['日期'],
        index_col=0,
        dtype={"股票代码": object},
    )

    # The lookup is keyed by the first column of 总股本.csv
    # (presumably the stock code -- verify against the file).
    value = pd.read_csv(
        "../data/总股本.csv", index_col=0, dtype={"股票代码": object})
    value_dict = value['总股本'].to_dict()

    # Series.map fills unknown codes with NaN, so the multiplication below
    # stays numeric even when codes are missing from the lookup.
    df['总股本'] = df['股票代码'].map(value_dict)
    df['总市值'] = df['总股本'] * df['收盘']

    return df


In [110]:
# Show the configured market-cap buckets.
print(OPTION_DICT)

{'all': (-inf, inf), '0-100': (0, 100), '100-500': (100, 500), '500-1000': (500, 1000), '1000-30000': (1000, 30000)}


In [81]:
# Load the quote data once; later cells reuse this frame.
df = get_data()
# Print the column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 161314 entries, 2023-03-01 to 2023-03-23
Data columns (total 15 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   开盘      161314 non-null  float64
 1   收盘      161314 non-null  float64
 2   最高      161314 non-null  float64
 3   最低      161314 non-null  float64
 4   成交量     161314 non-null  int64  
 5   成交额     161314 non-null  float64
 6   振幅      161314 non-null  float64
 7   涨跌幅     161314 non-null  float64
 8   涨跌额     161314 non-null  float64
 9   换手率     161314 non-null  float64
 10  股票代码    161314 non-null  object 
 11  股票名称    161314 non-null  object 
 12  板块名称    161314 non-null  object 
 13  总股本     154485 non-null  float64
 14  总市值     154485 non-null  float64
dtypes: float64(11), int64(1), object(3)
memory usage: 19.7+ MB


In [82]:
# List the available column names.
df.columns

Index(['开盘', '收盘', '最高', '最低', '成交量', '成交额', '振幅', '涨跌幅', '涨跌额', '换手率', '股票代码',
       '股票名称', '板块名称', '总股本', '总市值'],
      dtype='object')

## 按日期、板块名称分组，并统计涨幅大于0和小于0的股票数量

In [91]:

def get_count(cur_df):
    """
    Count rising and falling stocks for every date and sector.

    Args:
        cur_df: DataFrame that provides ``日期`` and ``板块名称`` (as columns
            or index levels) and a numeric ``涨跌幅`` column.

    Returns:
        DataFrame indexed by (``日期``, ``板块名称``) with the columns
        ``涨的数量`` (rows with 涨跌幅 > 0), ``跌的数量`` (rows with
        涨跌幅 < 0) and ``涨幅比`` (rising count as a percentage of rising
        plus falling; NaN when a group has neither).
    """
    grouped = cur_df.groupby(['日期', '板块名称'])['涨跌幅']
    # Sum the boolean masks with the vectorized Series.sum instead of
    # iterating element by element with the Python builtin.
    result = grouped.agg(
        涨的数量=lambda s: (s > 0).sum(),
        跌的数量=lambda s: (s < 0).sum(),
    )
    # Unchanged rows (涨跌幅 == 0) are left out of the denominator.
    result['涨幅比'] = result['涨的数量'] / (result['涨的数量'] + result['跌的数量']) * 100
    return result


## 按日期、板块名称分组，并统计涨幅平均值以及总市值求和

In [92]:

def get_sum(cur_df):
    """
    Aggregate price change and market cap for every date and sector.

    Args:
        cur_df: DataFrame that provides ``日期`` and ``板块名称`` (as columns
            or index levels) plus the ``涨跌幅`` and ``总市值`` columns.

    Returns:
        DataFrame indexed by (``日期``, ``板块名称``) holding the mean of
        ``涨跌幅`` and the sum of ``总市值`` for each group.
    """
    how = {"涨跌幅": "mean", "总市值": "sum"}
    by_date_sector = cur_df.groupby(['日期', "板块名称"])
    return by_date_sector.agg(how)


## 合并结果

## 按照涨跌幅的区间进行统计

In [85]:

def get_range(cur_df):
    """
    Count stocks per 涨跌幅 bin for every date and sector.

    Args:
        cur_df: DataFrame that provides ``日期`` and ``板块名称`` (as columns
            or index levels) and a numeric ``涨跌幅`` column.

    Returns:
        DataFrame indexed by (``日期``, ``板块名称``) with one column per
        bin (nine interval-labelled columns, lowest bin first) holding the
        number of rows whose 涨跌幅 falls into that bin.
    """
    # pd.cut builds right-closed intervals (a, b] from consecutive edges.
    # NOTE(review): values <= -20 or > 20 fall outside every bin and are
    # therefore not counted -- confirm this is intended.
    bins = [-20, -10, -5, -3, -0.099, 0.099, 3, 5, 10, 20]
    cuts = pd.cut(cur_df['涨跌幅'], bins=bins)
    # observed=False is passed explicitly so that bins with no rows still
    # appear as zero-count columns; callers relabel the nine columns by
    # position, which breaks if unobserved bins are dropped.
    counts = cur_df.groupby(
        ["日期", "板块名称", cuts], observed=False)['涨跌幅'].count()
    return counts.unstack()


### 用于测试

In [93]:
# Trial run on the full data set, without any market-cap filter.
cur_df = df.copy()
# The market-cap filter is disabled here; the commented lines show its form.
# cur_df = cur_df[(cur_df['总市值'] >= (start_value)*100_000_000)
#                 & (cur_df['总市值'] <= (end_value)*100_000_000)]
result = get_count(cur_df)
value_df = get_sum(cur_df)
# Display the per-date, per-sector rise/fall counts.
result


Unnamed: 0_level_0,Unnamed: 1_level_0,涨的数量,跌的数量,涨幅比
日期,板块名称,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-03-01,IT服务,120,7,94.488189
2023-03-01,LED,31,7,81.578947
2023-03-01,专业工程,31,3,91.176471
2023-03-01,专业服务,20,6,76.923077
2023-03-01,专业连锁,6,1,85.714286
...,...,...,...,...
2023-03-23,饮料制造,37,9,80.434783
2023-03-23,饰品,4,11,26.666667
2023-03-23,饲料,5,13,27.777778
2023-03-23,高速公路,5,13,27.777778


In [94]:
# Display the per-date, per-sector mean 涨跌幅 and summed 总市值.
value_df

Unnamed: 0_level_0,Unnamed: 1_level_0,涨跌幅,总市值
日期,板块名称,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-03-01,IT服务,4.119685,1.235165e+12
2023-03-01,LED,1.099000,2.977415e+11
2023-03-01,专业工程,1.343529,3.725464e+11
2023-03-01,专业服务,0.585556,1.848354e+11
2023-03-01,专业连锁,0.792500,6.418736e+10
...,...,...,...
2023-03-23,饮料制造,0.896809,4.807890e+12
2023-03-23,饰品,-0.560625,1.222653e+11
2023-03-23,饲料,-0.498889,2.795893e+11
2023-03-23,高速公路,-0.145000,2.334328e+11


In [96]:
# Put the counts and the aggregates side by side, matched on date and sector.
final_df = result.join(value_df, on=["日期", "板块名称"])
final_df

Unnamed: 0_level_0,Unnamed: 1_level_0,涨的数量,跌的数量,涨幅比,涨跌幅,总市值
日期,板块名称,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-03-01,IT服务,120,7,94.488189,4.119685,1.235165e+12
2023-03-01,LED,31,7,81.578947,1.099000,2.977415e+11
2023-03-01,专业工程,31,3,91.176471,1.343529,3.725464e+11
2023-03-01,专业服务,20,6,76.923077,0.585556,1.848354e+11
2023-03-01,专业连锁,6,1,85.714286,0.792500,6.418736e+10
...,...,...,...,...,...,...
2023-03-23,饮料制造,37,9,80.434783,0.896809,4.807890e+12
2023-03-23,饰品,4,11,26.666667,-0.560625,1.222653e+11
2023-03-23,饲料,5,13,27.777778,-0.498889,2.795893e+11
2023-03-23,高速公路,5,13,27.777778,-0.145000,2.334328e+11


In [114]:
# Add a display column: 总市值 divided by 100000000, rendered as a string
# with two decimals.
final_df['总市值亿元'] = final_df['总市值'].map(
    lambda cap: f"{cap / 100000000:.2f}")

# Attach the per-bin counts, then turn 日期 / 板块名称 back into columns.
db = get_range(cur_df)
result = pd.merge(final_df, db, on=["日期", '板块名称']).reset_index(level=[0, 1])

# The first eight columns keep their names; the remaining bin columns take
# the RANGE labels by position.
a = [*result.columns[:8], *RANGE]
result.columns = a


In [115]:
# Check the final column names after relabelling.
result.columns

Index(['日期', '板块名称', '涨的数量', '跌的数量', '涨幅比', '涨跌幅', '总市值', '总市值亿元', '跌停',
       '跌<-5%', '-3%<-5%', '-3<-1%', '平盘', '<3%', '3-5%', '5%-涨停', '涨停'],
      dtype='object')

## 正式运行

In [116]:
# Write one result file per market-cap bucket in OPTION_DICT.
# The date stamp is computed once so every file of a run carries the same
# date, even if the run crosses midnight.
run_date = datetime.now().strftime('%Y%m%d')
for key, (start_value, end_value) in OPTION_DICT.items():
    # Bucket bounds are given in units of 100 million; both ends inclusive.
    # NOTE(review): rows whose 总市值 is NaN fail both comparisons, so they
    # are excluded even from the "all" bucket -- confirm this is intended.
    in_bucket = ((df['总市值'] >= start_value * 100_000_000)
                 & (df['总市值'] <= end_value * 100_000_000))
    # Boolean indexing already returns a new frame; no prior copy needed.
    cur_df = df[in_bucket]

    final_df = get_count(cur_df).join(
        get_sum(cur_df), on=["日期", "板块名称"])
    # Display column: 总市值 divided by 100000000, two decimals, as a string.
    final_df['总市值亿元'] = final_df['总市值'].apply(
        lambda x: '{:.2f}'.format(x / 100000000))

    db = get_range(cur_df)
    # Label the bin columns before merging rather than counting on their
    # position afterwards; raises ValueError if the bin count != len(RANGE).
    db.columns = RANGE
    result = pd.merge(final_df, db, on=["日期", '板块名称'])
    result.reset_index(inplace=True, level=[0, 1])

    result.to_csv(f"../data/result_{key}_{run_date}.csv", index=False)


In [None]:
# db = pd.DataFrame(pct_chg_list)
# db.reset_index(inplace=True,level=[0, 1])



## 校验一下数据结果

In [None]:
# Reload an exported file for validation. The export loop writes one file
# per bucket as result_<key>_<date>.csv, so a bucket key must be part of the
# name; the "all" bucket is read here.
# NOTE: this rebinds `df`, replacing the raw quote data loaded earlier.
df = pd.read_csv(
    f"../data/result_all_{datetime.now().strftime('%Y%m%d')}.csv", index_col=0, parse_dates=True)
df.info()

In [None]:
# Preview the first rows of the reloaded result.
df.head()

In [None]:
# One colour per bin label, lowest bin first: four green, one yellow,
# four red.
color = ["green"] * 4 + ["yellow"] + ["red"] * 4


In [None]:
# Build an x / y / color table for one date and one sector.
my_df = df.loc["2023-03-01"]
my_df = my_df[my_df['板块名称'] == "IT服务"]
# The exported bin columns are named after RANGE, so select them with the
# same labels; a hand-written list that differs from RANGE raises KeyError.
x = list(RANGE)
color = ["green", "green", "green", "green",
         "yellow", "red", "red", "red", "red"]

# unstack() turns the selected columns into rows keyed by (label, date).
# NOTE(review): assumes exactly one row matches the date and sector, so
# that `y` and `color` have the same length -- confirm.
my = my_df[x].unstack()
my = my.reset_index()
my.columns = ['x', "date", "y"]
data = pd.DataFrame({"x": my["x"], "y": my["y"], "color": color})
data

In [None]:
# Per-date totals of the numeric columns. numeric_only=True keeps text
# columns such as 板块名称 out of the sum instead of concatenating them.
df.groupby(df.index).sum(numeric_only=True)