In [1]:
import os
import sys
import json
import codecs

import time
from datetime import datetime

In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext, SQLContext
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql.window import Window
from pyspark import SparkFiles
from pyspark.sql.functions import col, lit, length, row_number, when

In [3]:
from lib import spark_utils

In [4]:
# Obtain the Spark session from the project helper; how it is configured is
# decided inside lib.spark_utils, which is not shown in this notebook.
spark = spark_utils.get_spark()

25/03/18 15:22:22 WARN Utils: Your hostname, Mac-MD2XX1D4WV.local resolves to a loopback address: 127.0.0.1; using 192.168.11.215 instead (on interface en0)
25/03/18 15:22:22 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/18 15:22:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Echo the session object so the notebook renders its summary.
spark

In [18]:
import glob
import pandas as pd

In [19]:
# Glob pattern matching the part* files under every *_form13f/alltrading directory.
# NOTE(review): absolute, machine-specific path; consider building it from a
# configurable data directory so the notebook runs elsewhere.
hedge_data_files = '/Users/liuda/Local/data/trading/data/hedgeparsing/*_form13f/alltrading/part*'

In [20]:
def read_multiple_files(file_paths, sep='\t', index_col=False, **read_csv_kwargs):
    """Read every file matching one or more glob patterns into one DataFrame.

    Parameters
    ----------
    file_paths : str
        A glob pattern, or several patterns separated by commas. Whitespace
        around each pattern is ignored and empty entries are skipped.
    sep : str, default tab
        Field delimiter forwarded to ``pd.read_csv``.
    index_col : default False
        Forwarded to ``pd.read_csv``.
    **read_csv_kwargs
        Any further keyword arguments for ``pd.read_csv`` (for example
        ``dtype={'CUSIP': str}`` to keep identifiers as text).

    Returns
    -------
    pandas.DataFrame
        The rows of all matched files, concatenated in sorted path order and
        re-indexed from 0.

    Raises
    ------
    ValueError
        If no file matches any of the patterns.
    """
    # A set removes files matched by more than one pattern, which would
    # otherwise be read twice and double-counted in later sums.
    matched_paths = set()
    for pattern in file_paths.split(','):
        pattern = pattern.strip()
        if pattern:
            matched_paths.update(glob.glob(pattern))

    if not matched_paths:
        # Same exception type pd.concat raises on an empty list, but with a
        # message that points at the actual cause.
        raise ValueError(f'No files matched: {file_paths!r}')

    # glob returns paths in arbitrary order; sorting makes the row order
    # reproducible between runs and machines.
    frames = [
        pd.read_csv(path, sep=sep, index_col=index_col, **read_csv_kwargs)
        for path in sorted(matched_paths)
    ]
    return pd.concat(frames, ignore_index=True)


In [21]:
# Load every matched tab-separated part file and stack them into one pandas DataFrame.
hedge_data = read_multiple_files(hedge_data_files, sep='\t', index_col=False)

In [23]:
# File providing the CUSIP and SYMBOL columns used to map holdings to tickers.
# It is read as tab-separated below despite the .csv extension.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
mapping_file = '/Users/liuda/Local/data/trading/data/cnsfails/merge/cnsfails.csv'

In [24]:
def load_ticker_mapping(input_file):
    """Load the distinct (CUSIP, SYMBOL) pairs from a tab-separated file.

    Parameters
    ----------
    input_file : str
        Path to a tab-separated file containing at least the columns
        ``CUSIP`` and ``SYMBOL``.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``CUSIP`` (lower-cased) and ``SYMBOL``, with exact
        duplicate pairs removed. A CUSIP may still appear on several rows
        when the file lists more than one symbol for it.
    """
    # Read both columns as text. With type inference, CUSIPs made only of
    # digits are parsed as integers, which strips leading zeros and breaks
    # the .str accessor below.
    mapping = pd.read_csv(input_file, sep='\t', dtype={'CUSIP': str, 'SYMBOL': str})
    mapping['CUSIP'] = mapping['CUSIP'].str.lower()
    return mapping[['CUSIP', 'SYMBOL']].drop_duplicates()

In [25]:
# Distinct CUSIP/SYMBOL pairs, with CUSIP lower-cased by the loader.
ticker_mapping = load_ticker_mapping(mapping_file)

In [27]:
# (rows, columns) of the loaded holdings data, before the ticker join.
hedge_data.shape

(1912293, 21)

In [28]:
# Attach SYMBOL to every holdings row by CUSIP; unmatched rows keep a missing SYMBOL.
# NOTE(review): the mapping can list several symbols for one CUSIP, in which case
# this join emits one output row per symbol and inflates any later total.
# NOTE(review): the mapping's CUSIP is lower-cased; presumably the left-hand
# CUSIP column is lower case as well. Confirm, otherwise matches are missed.
hedge_data_1 = hedge_data.merge(ticker_mapping, on='CUSIP', how='left')

In [29]:
# (rows, columns) after the join; a row count above the input's means some
# CUSIPs matched more than one symbol.
hedge_data_1.shape

(1971645, 22)

In [45]:
# For each CUSIP: the number of mapping rows it has and the list of its symbols.
test_df = (
    ticker_mapping
    .groupby(['CUSIP'])
    .agg(CUSIP_count=('CUSIP', 'count'), ticker_ids=('SYMBOL', list))
    .reset_index()
)

In [47]:
test_df[test_df['CUSIP_count'] > 1]
# Some CUSIPs are mapped to two or more ticker ids, for example:
# 00180g304		[AMTD, AMTDZZZZ]
# y6430l160		[OP, OPXXXX, OPZZZZ]

Unnamed: 0,CUSIP,CUSIP_count,ticker_ids
5,000375204,2,"[ABBNY, ABB]"
80,001431303,2,"[AISSF, AISSD]"
124,00180g205,2,"[AMTD, AMTDXXXX]"
125,00180g304,2,"[AMTD, AMTDZZZZ]"
145,002120202,2,"[LIFE, ATYR]"
...,...,...,...
21440,y6430l103,2,"[OP, OPXXXX]"
21441,y6430l160,3,"[OP, OPXXXX, OPZZZZ]"
21442,y6430l202,2,"[OP, OPZZZZ]"
21459,y8564w103,2,"[TK, TKXXXX]"


In [48]:
# The aggregation below groups on 'TICKER', so rename the joined SYMBOL column.
hedge_data_1 = hedge_data_1.rename(columns={'SYMBOL': 'TICKER'})

In [49]:
def redefine_level(data):
    """Aggregate holdings per manager level and add a combined level 0.

    Only rows whose ``MANAGER_LEVEL`` is 1-5 are kept. Each kept row is
    duplicated with ``MANAGER_LEVEL`` set to 0, so level 0 carries the totals
    over levels 1-5. The stacked rows are then summed per
    (CUSIP, TICKER, MANAGER_LEVEL, YEAR, QUARTER, DATA_YEAR, DATA_QURT).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the grouping columns above and the value columns summed
        below. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per key combination, with the grouping columns first and the
        summed value columns after them.

    Notes
    -----
    ``groupby`` is called with its default NaN handling, so rows with a
    missing value in any grouping column (e.g. a TICKER left empty by an
    unmatched join) are absent from the result.
    NOTE(review): confirm that dropping those rows is intended.
    """
    group_keys = ['CUSIP', 'TICKER', 'MANAGER_LEVEL', 'YEAR', 'QUARTER', 'DATA_YEAR', 'DATA_QURT']
    summed_columns = [
        'INC_VALUE', 'DRC_VALUE', 'INC_COUNT', 'DRC_COUNT', 'TOTAL_VALUE',
        'TOTAL_PREVALUE', 'TOTAL_SSHPRNAMT', 'TOTAL_PRESSHPRNAMT',
        'HIS_AVG_DRC_VAL', 'HIS_DRC_COUNT',
        'HIS_AVG_INC_VAL', 'HIS_INC_COUNT',
        'TOP_HOLD_COUNT',
    ]

    per_level = data.loc[data['MANAGER_LEVEL'].isin([1, 2, 3, 4, 5])]
    # Level 0 stands for "all data": the same rows relabelled so they sum together.
    all_levels = per_level.assign(MANAGER_LEVEL=0)

    stacked = pd.concat([per_level, all_levels], ignore_index=True)
    totals = stacked.groupby(group_keys).agg({name: 'sum' for name in summed_columns})
    return totals.reset_index()

In [51]:
# Keep the rows for levels 1-5 and derive the combined level-0 rows from them.
hedge_data_1 = redefine_level(hedge_data_1)

In [52]:
# Total INC_VALUE per manager level, as a quick sanity check on the aggregation.
hedge_data_1.groupby(['MANAGER_LEVEL']).agg({'INC_VALUE': 'sum'})

Unnamed: 0_level_0,INC_VALUE
MANAGER_LEVEL,Unnamed: 1_level_1
0.0,102275869801958
1.0,10533791386051
2.0,6252450839622
3.0,2847287028665
4.0,9017447485961
5.0,73624893061659


In [55]:
# (rows, columns) after the per-level aggregation.
hedge_data_1.shape

(1423787, 20)

In [53]:
# Sum the listed value columns per key combination; columns not listed are dropped.
hedge_data_2 = (
    hedge_data_1
    .groupby(['CUSIP', 'TICKER', 'MANAGER_LEVEL', 'YEAR', 'QUARTER', 'DATA_YEAR', 'DATA_QURT'])
    .agg(dict.fromkeys(
        [
            # Author's note: the columns up to TOTAL_PRESSHPRNAMT are the evaluation of the fund.
            'INC_VALUE', 'DRC_VALUE', 'INC_COUNT', 'DRC_COUNT', 'TOTAL_VALUE',
            'TOTAL_PREVALUE', 'TOTAL_SSHPRNAMT', 'TOTAL_PRESSHPRNAMT',
            # Author's note: the columns from here on describe the current fund's
            # situation on the current ticker.
            'HIS_AVG_DRC_VAL', 'HIS_DRC_COUNT',
            'HIS_AVG_INC_VAL', 'HIS_INC_COUNT',
        ],
        'sum',
    ))
    .reset_index()
)


In [54]:
# Total INC_VALUE per manager level after the re-aggregation.
hedge_data_2.groupby(['MANAGER_LEVEL']).agg({'INC_VALUE': 'sum'})

Unnamed: 0_level_0,INC_VALUE
MANAGER_LEVEL,Unnamed: 1_level_1
0.0,102275869801958
1.0,10533791386051
2.0,6252450839622
3.0,2847287028665
4.0,9017447485961
5.0,73624893061659


In [56]:
# (rows, columns) of the re-aggregated frame.
hedge_data_2.shape

(1423787, 19)

In [73]:
# Independent copy of the level 1-5 rows only (existing level-0 rows are left out).
# Author's note (translated, meaning to be confirmed): level 0 here refers to the
# top-99-and-above group, treated as outliers but still counted in the total.
hedge_data_3 = hedge_data_2.loc[hedge_data_2['MANAGER_LEVEL'].isin([1, 2, 3, 4, 5])].copy()

In [76]:
# Relabel every remaining row as level 0 so the grouping below sums levels 1-5 together.
# NOTE(review): this mutates hedge_data_3 in place, so the cell is only meaningful
# right after the cell that creates hedge_data_3.
hedge_data_3['MANAGER_LEVEL'] = 0

In [77]:
# Total INC_VALUE of the relabelled rows (a single level, 0).
hedge_data_3.groupby(['MANAGER_LEVEL']).agg({'INC_VALUE': 'sum'})

Unnamed: 0_level_0,INC_VALUE
MANAGER_LEVEL,Unnamed: 1_level_1
0,102275869801958


In [78]:
# Collapse the relabelled rows: one summed row per key combination at level 0.
hedge_data_3 = (
    hedge_data_3
    .groupby(['CUSIP', 'TICKER', 'MANAGER_LEVEL', 'YEAR', 'QUARTER', 'DATA_YEAR', 'DATA_QURT'])
    .agg(dict.fromkeys(
        [
            # Author's note: INC_VALUE and DRC_VALUE are per quarter.
            # Author's note: the columns up to TOTAL_PRESSHPRNAMT are the evaluation of the fund.
            'INC_VALUE', 'DRC_VALUE', 'INC_COUNT', 'DRC_COUNT', 'TOTAL_VALUE',
            'TOTAL_PREVALUE', 'TOTAL_SSHPRNAMT', 'TOTAL_PRESSHPRNAMT',
            # Author's open question: aggregated by year? See backend.hedge.preprocess,
            # output_path = '%s%sq%s_form13f/ticker/'.
            # Author's note: the columns from here on describe the current fund's
            # situation on the current ticker.
            'HIS_AVG_DRC_VAL', 'HIS_DRC_COUNT',
            'HIS_AVG_INC_VAL', 'HIS_INC_COUNT',
        ],
        'sum',
    ))
    .reset_index()
)


In [79]:
# Total INC_VALUE of the recomputed level-0 rows, for comparison with the level-0 total above.
hedge_data_3.groupby(['MANAGER_LEVEL']).agg({'INC_VALUE': 'sum'})

Unnamed: 0_level_0,INC_VALUE
MANAGER_LEVEL,Unnamed: 1_level_1
0,102275869801958


In [87]:
# Lift pandas' display limits for the rest of the session so the slice below
# is rendered without truncation.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000000)
pd.set_option('display.width', 4000)

# Value history for one ticker (TSM) across manager levels and quarters.
hedge_data_2.loc[
    hedge_data_2['TICKER'] == 'TSM',
    ['TICKER', 'MANAGER_LEVEL', 'YEAR', 'QUARTER', 'TOTAL_VALUE', 'TOTAL_PREVALUE'],
].reset_index(drop=True)

Unnamed: 0,TICKER,MANAGER_LEVEL,YEAR,QUARTER,TOTAL_VALUE,TOTAL_PREVALUE
0,TSM,0.0,2013,3,3276809000,8384000
1,TSM,0.0,2013,4,2484983000,3263494000
2,TSM,0.0,2014,1,2100321000,2366166000
3,TSM,0.0,2014,2,2522280000,2095875000
4,TSM,0.0,2014,3,2585623000,2512858000
5,TSM,0.0,2014,4,2387733000,2578583000
6,TSM,0.0,2015,1,2954282000,2384440000
7,TSM,0.0,2015,2,3584266000,2922217000
8,TSM,0.0,2015,3,3002579000,3582776000
9,TSM,0.0,2015,4,2724778000,2868571000
