# Realtion extraction

Posline = 둘다 긍정

Negline = 둘다 부정

Under = 앞에 부정 뒤에 긍정

Over= 앞에 긍정 뒤에 부정

Pos = 앞에만 긍정

Neg = 앞에만 부정

No-rel : 연관없음

In [1]:
# from a2t.data import Dataset
from a2t.tasks import BinaryTask, BinaryFeatures
import pandas as pd
from itertools import combinations
import numpy as np
from tqdm import tqdm



templates = {
    ## Business (Found, Buy, Sell, Merge, Operate, IPO, Privatize, Invest, Cancel, Bid)
    # Found (Posline)
    "Posline-Found": [
        "{X} founded {Y}",
        "{X} is founded by {Y}"
        ],     # "A" founded B / "A" is founded by B / 둘다 긍정
    # Buy (Under, Posline, Pos)
    "Under-Buy": [
        "{X} buys {Y}"
        ],            # "A" buys B / 앞에꺼 부정 뒤에꺼 긍정
    "Posline-Buy": [
        "{X} and {Y} are bought",
        "{X} buys {Y}",
        ],          # "A" buys B #돌다긍정
    "Pos-Buy": [
        "{X} buys {Y}"
        ],    # "A" is bought by B #앞에꺼만 긍정
    # Sell (Over, Under, Posline, Negline, Pos, Neg) - Distinction ?
    "Over-Sell": [
        "{X} is sold to {Y}"
        ],    # "A" is sold to B # 앞에꺼만 긍정 뒤에꺼 부정 
    "Posline-Sell": [ 
        "{X} and {Y} are sold",
        "{X} is sold to {Y}",
        "{X} spuns {Y} off"
        ],        # "A" sells B # 둘다 긍정
    "Negline-Sell": [
        "{X} winds down {Y}"
        ], # "A" is sold to B # 둘다 부정
    "Pos-Sell": [
        "{X} sells {Y}",
        "{X} is sold to {Y}",
        "{X} divests {Y}"],            # "A" sells B / "A" is sold to B # 앞에꺼만 긍정
    "Neg-Sell": [
        "{X} sells {Y}",
        "{X} leaves {Y}",
        "{X} spinoffs {Y}"],            # "A" sells B # 앞에꺼만 부정
    # Merge (Posline)
    "Posline-Merge": [
        "{X} merges with {Y}"
        ],    # "A" is merged with B #둘다 긍정
    # Operate (Posline, Negline, Pos, Neg)
    "Posline-Operate": [
        "{X} increases operation (of) {Y}"
        ],  # "A" increased operation of B #둘다 긍정 
    "Negline-Operate": [
        "{X} decreases operation (of) {Y}"
        ],  # "A" decreased operation of B # 둘다 부정 
    "Pos-Operate": [
        "{X} increases operation (in) {Y}"
        ],                   # "A" increased operation # 앞에꺼 긍정
    "Neg-Operate": [
        "{X} decreases operation (in) {Y}"
        ],                   # "A" decreased operation (+ recall) # 뒤에꺼 부정
    # IPO (Posline)
    "Posline-IPO": [
        "{X} IPO {Y}"
        ],   # "A" IPO # 둘다 긍정 
    # Privatize (Over, Under, Posline)
    "Over-Privatize": [
        "{X} is taken private by {Y}"
        ],  # "A" is taken private by B # 앞에꺼만 긍정 뒤에꺼 부정
    "Under-Privatize": [ 
        "{X} privatizes {Y}"
        ],            # "A" privatizes B #앞에꺼마 부정 뒤에꺼 긍정
    "Posline-Privatize": [
        "{X} and {Y} are privatized"
        ],          # "A" privatizes # 둘다 긍정
    # Invest (Posline)
    "Posline-Invest": [
        "{X} invests (in) {Y}",
        "{X} is invested (by) {Y}"
        ], # "A" invested in B / "A" is invested by B ? # 둘다 긍정
    # Cancel (Neg, Over)
    "Neg-Cancel": [
        "{X} cancels {Y}"
        ],        # "A" cancels B / "A" is canceled by B / "A" rejects (denies) "B" # 앞에꺼만 부정
    "Over-Cancel": [
        "{X} rejects {Y}"
        ],       # "A" cancels B ("A" doesn't support B) / "A" leaves B # 앞에꺼만 긍정 뒤에꺼 부정
    # Bid (Over, Under, Pos)
    "Over-Bid": [
        "{X} wins the bid (against) {Y}"
        ],   # "A" wins the bid against B #앞에꺼 긍정 뒤에꺼 부정
    "Under-Bid": [
        "{X} loses the bid (to) {Y}"
        ],      # "A" loses the bid to B #앞에께 부정 뒤에꼐 긍정
    "Pos-Bid": [
        "{X} wins the bid (in) {Y}"
        ],         # "A" wins the bid in B #앞에꺼만 긍정
    
    ## Family / Ownership
    "Posline-Family": [
        "{X} same family as {Y}"
        ],     # "A" Same Family as B #둘다 긍정
    
    ## Cooperation (win-win)
    "Posline-Cooperate": [
        "{X} cooperates with {Y}"
        ], # "A" cooperates with B / "A" uses B #둘다 긍정
    
    ## Performance (Outperform, Underperform, Inline, Rate)
    # Perform (Over, Under, Pos, Neg)
    "Over-Perform": [
        "{X} outperforms {Y}"
        ],          # "A" outperforms B 앞에꺼 긍정 뒤에꺼 부정
    "Under-Perform": [
        "{X} underperforms {Y}"
        ],       # "A" underperforms B 앞에꺼 부정 뒤에꺼 긍정
    "Pos-Perform": [
        "{X} Performs well (in) {Y}"
        ],         # "A" performs well 앞에만 긍정
    "Neg-Perform": [
        "{X} Performs bad (in) {Y}"
        ],          # "A" performs bad 앞에만 부정
    # Inline (Posline, Negline)
    "Posline-Inline": [
        "{X} performs positively as {Y}"
        ],   # "A" performs positively as B
    "Negline-Inline": [
        "{X} performs negatively as {Y}"
        ],   # "A" performs negatively as B
    # Rate (Over, Under, Posline, Negline, Pos, Neg)
    "Over-Rate": [
        "{X} undervalues {Y}",
        "{X} is skeptical of {Y}"],              # "A" undervalues B
    "Posline-Rate": [
        "{X} recognizes {Y}",
        "{X} retains its rating on {Y}",
        "{X} upgrades its rating on {Y}"],           # "A" recognizes B
    "Negline-Rate": [
        "{X} is undervalued as {Y}",
        "{X} and {Y} are undervalued"
        ],    # "A" is undervalued as B
    "Pos-Rate": [
        "{X} is recognized (by) {Y}",
        "{X} positive analysis (by) {Y}"
        "{X} is awarded (by) {Y}",
        "{X} is named (to) {Y}",
        "{X} earns {Y}"],       # "A" is recognized by B
    "Neg-Rate": [
        "{X} is undervalued (by) {Y}",
        "{X} is downgraded its rating (by) {Y}"
        "{X} is rejected (by) {Y}",
        "{X} is invalidated (by) {Y}"
        ],      # "A" is undervalued by B
    
    ## Recruitment (Hire, Fire, Quit, Lose)
    # Hire (Over, Posline)
    "Over-Hire": [
        "{X} hires from {Y}",
        "{X} poaches from {Y}"
        ],  # "A" hires from B
    "Posline-Hire": [
        "{X} hires {Y}",
        "{X} poaches {Y}"
        ],    # "A" hires B
    # Fire
    "Over-Fire": [
        "{X} fires {Y}"
        ],   # "A" fires B
    # Quit
    "Over-Quit": [
        "{X} quits {Y}",
        "{X} retires (from) {Y}"
        ],   # "A" quits B
    # Lose
    "Under-Lose": [
        "{X} loses (to) {Y}"
        ], # "A" loses (to) B

    ## Legal (Allege, Regulate, Verdict)
    # Allege (Over, Under, Negline, Neg)
    "Over-Allege": [
        "{X} files a lawsuit (against) {Y}",
        "{X} sues {Y}",
        "{X} persues a legal challenge (against) {Y}",
        "{X} appels the ruling (against) {Y}",
        "{X} alleges {Y}",
        "{X} accuses {Y}",
        "{X} urges {Y}",
        "{X} lashes out at {Y}"
        ],           # "A" is alleged (by) B
    "Under-Allege": [
        "{X} is alleged by {Y}",
        "{X} is sued by {Y}",
        "{X} is indicted by {Y}",
        "{X} is accused by {Y}"
        ],           # "A" is alleged by B
    "Negline-Allege": [
        "{X} is alleged as {Y}",
        "{X} is accused of {Y}"
        ],       # "A" is alleged as B
    "Neg-Allege": [
        "{X} angers {Y}"],           # "A" is alleged (by) B
    # Regulate (Over, Neg)
    "Over-Regulate": [
        "{X} regulates {Y}",
        "{X} orders {Y}",
        "{X} investigates {Y}",
        "{X} probes {Y}",
        "{X} requires {Y}"
        ],               # "A" regulates B
    "Neg-Regulate": [
        "{X} is subponaed (by) {Y}"
        ],
    # Verdict (Over, Under, Posline, Negline, Pos, Neg)
    "Over-Verdict": [
        "{X} wins against {Y}",
        "{X} disallows {Y}",
        "{X} fined {Y}",
        "{X} sentences {Y}"
        ],                 # "A" wins against B
    "Under-Verdict": [
        "{X} loses to {Y}"
        ],                   # "A" loses to B
    "Posline-Verdict": [
        "{X} allows {Y}",
        "{X} settles (with) {Y}"],                # "A" allows B 
    "Negline-Verdict": [
        "{X} is order by {Y}"
        ],                   # "A" loses (in) B
    "Pos-Verdict": [
        "{X} wins (in) {Y}",
        "{X} settles {Y}"
        ],                     # "A" wins (in) B
    "Neg-Verdict": [
        "{X} loses (in) {Y}",
        "{X} is ordered as {Y}"
        ],                   # "A" loses (in) B

    ## News release (Launch, Patent, Authorize)
    # Launch
    "Posline-Launch": [
        "{X} launches (as) {Y}"
        ],          # "A" launches (as) B
    # Patent
    "Posline-Patent": [
        "{X} files a patent (for) {Y}"
        ],   # "A" files a patent for B
    # Authorize (Pos, Neg)
    "Pos-Authorize": [
        "{X} is authorized (by) {Y}"
        ],      # "A" is authorized by B [Medecin]
    "Neg-Authorize": [
        "{X} not authorized (by) {Y}"
        ],     # "A" not authorized by B [Medecin]

    ## Bankruptcy
    "Pos-Bankruptcy": [
        "{X} exits bankruptcy {Y}"
        ], # "A" exits bankrupt
    "Neg-Bankruptcy": [
        "{X} goes bankrupt {Y}"
        ],  # "A" goes bankrupt
    # "No-rel": "Unrelated",
}

labels = list(templates.keys())

In [2]:
valid_conditions = {key:['ORGANIZATION:ORGANIZATION'] for key in labels}

In [3]:
labels = list(dict.fromkeys(['No-rel']+labels))

In [4]:
task = BinaryTask(
    name="Relation Classification task",
    required_variables=["X", "Y"],
    additional_variables=["inst_type"],
    labels=labels,
    templates=templates,
    valid_conditions=valid_conditions,
    negative_label_id=0,
    multi_label=True,
    features_class=BinaryFeatures
)

In [5]:
from a2t.base import EntailmentClassifier

nlp = EntailmentClassifier(
    # "microsoft/deberta-v2-xlarge-mnli",
    "roberta-large-mnli",
    use_tqdm=False,
    use_cuda=True, 
    half=True
)

In [6]:
ticker_df=pd.read_csv('C:/Users/Quantec/finBERT-master_pro/notebooks/0811_finner.csv',encoding='utf-8-sig')
ticker_df.drop(columns=['Unnamed: 0'],inplace=True)
ticker_df.drop('prediction',axis=1,inplace=True)

ticker_df.entity_list=ticker_df.entity_list.astype(str)
ticker_df.entity_list=ticker_df['entity_list'].str.replace('[',"",regex=True)
ticker_df.entity_list=ticker_df['entity_list'].str.replace(']',"",regex=True)
ticker_df.entity_list=ticker_df['entity_list'].str.replace('"',"",regex=True)
ticker_df.entity_list=ticker_df['entity_list'].str.replace('\'',"",regex=True)

In [54]:
# symbol_df=pd.read_csv('symbol_dictionary.csv')
# symbol_df=symbol_df.drop(columns='Unnamed: 0')

# symbol_df.text=symbol_df.text.astype(str)
# symbol_df.text=symbol_df['text'].str.replace('[',"",regex=True)
# symbol_df.text=symbol_df['text'].str.replace(']',"",regex=True)
# symbol_df.text=symbol_df['text'].str.replace('"',"",regex=True)
# symbol_df.text=symbol_df['text'].str.replace('\'',"",regex=True)

In [7]:
#티커 변환 사전 load
import pickle
with open('symbol_dict.pickle', 'rb') as fr:
    symbol_dict = pickle.load(fr)
    
def ticker_change(x):
    for symbol, names in symbol_dict.items():
        if x in names:
            return symbol
    return np.nan


In [8]:
# entity_list 2개이상인것만 추출
Relation_df=ticker_df[ticker_df.entity_list.apply(lambda x: True if len(x.split(', ')) >=2 else False)].reset_index(drop=True)

In [9]:
# ticker_dfticker_df.entity_list.apply(lambda x : list(dict.fromkeys(x.split(', '))))

In [10]:
#entity_list 1개인것 추출
no_Relation_df=ticker_df[ticker_df.entity_list.apply(lambda x: False if len(x.split(', ')) >=2 else True)]

In [11]:
x_list=[]
y_list=[]
index_list=[]

for i in range(Relation_df.shape[0]):
    items=[i for i in range(len(Relation_df.entity_list[i].split(', ')))]
    items_tuple=list(combinations(items, 2))
    for j,k in items_tuple:
        X=Relation_df.entity_list[i].split(', ')[j]
        Y=Relation_df.entity_list[i].split(', ')[k]
        x_list.append(X)
        y_list.append(Y)
        index_list.append(i)

                       
        

In [12]:
text_list=[]
news_label_list=[]
#ticker_list=[]
sentiment_score=[]
for i in range(len(index_list)):
    idx=index_list[i]
    text=Relation_df.contents[idx]
    text_list.append(text)
    label=Relation_df.news_label[idx]
    news_label_list.append(label)
    #ticker=ticker_df.ticker[idx]
    #ticker_list.append(ticker)
    score=Relation_df.sentiment_score[idx]
    sentiment_score.append(score)

In [13]:
x_y_df=pd.DataFrame([news_label_list,text_list,x_list,y_list,sentiment_score]).T
x_y_df.columns=['news_label','contents','X','Y','sentiment_score']

In [14]:
same_x_y=x_y_df[x_y_df.X == x_y_df.Y]#.loc[:,['index','score']]
same_x_y=same_x_y.drop('Y',axis=1).rename(columns={'X':'entity_list'})

In [15]:
no_Relation_df=pd.concat([no_Relation_df,same_x_y])
no_Relation_df.reset_index(drop=True,inplace=True)

In [16]:
no_Relation_df['entity_list'] = no_Relation_df['entity_list'].apply(ticker_change)
no_relation_ticker_change=no_Relation_df.dropna(how='any',axis=0).reset_index(drop=True)

In [17]:
#단일 티커 socre
no_relation_ticker_change

Unnamed: 0,news_label,contents,entity_list,sentiment_score
0,0,DDOG reported revenue growth that significantl...,DD,0.940325
1,0,Barclays analyst Raimo Lenschow maintained an ...,BCS,-0.039824
2,2,Ball Corp management has indicated that the de...,BLL,-0.963595
3,2,Deutsche Bank analyst Kyle White downgraded Ba...,DB,-0.78644
4,4,It was business as usual at Tesla’s (TSLA) ann...,TSLA,0.006504
...,...,...,...,...
1199,443,Why tech workers are quitting great jobs at co...,GOOG,-0.377019
1200,443,Why tech workers are quitting great jobs at co...,GOOG,-0.377019
1201,443,Why tech workers are quitting great jobs at co...,GOOG,-0.377019
1202,443,Why tech workers are quitting great jobs at co...,CVX,-0.377019


In [18]:
filter_Relation=x_y_df[x_y_df.X !=x_y_df.Y]
filter_Relation.reset_index(drop=True,inplace=True)

In [19]:
BinaryFeatures_list=[]

for i in range(filter_Relation.shape[0]):
    
    Binarytext=BinaryFeatures(X=filter_Relation.X[i],Y=filter_Relation.Y[i],inst_type="ORGANIZATION:ORGANIZATION",context=filter_Relation.contents[i])
    BinaryFeatures_list.append(Binarytext)

In [20]:
output_list=[]
for i in tqdm(range(0,filter_Relation.shape[0],100)):
    test_sample=BinaryFeatures_list[i:i+100]
    output=nlp(task=task,features=test_sample,return_labels=True, return_confidencces=True)
    output_list.append(output)

100%|██████████| 37/37 [41:40<00:00, 67.57s/it]  


In [21]:
result=sum(output_list,[])

In [22]:
filter_Relation['RE']=result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filter_Relation['RE']=result


In [23]:
filter_Relation=filter_Relation[filter_Relation.X!='Inc']
filter_Relation=filter_Relation[filter_Relation.Y!='Inc']
filter_Relation.reset_index(drop=True,inplace=True)

In [25]:
filter_Relation.to_csv('중간RE_0811.csv',encoding='utf-8-sig')

In [74]:
import pickle
with open('symbol_dict.pickle', 'rb') as fr:
    symbol_dict = pickle.load(fr)
    
def ticker_change(x):
    for symbol, names in symbol_dict.items():
        if x in names:
            return symbol
    return np.nan

filter_Relation_cp = filter_Relation.copy()
filter_Relation[['X','Y']] = filter_Relation[['X','Y']].applymap(ticker_change)

In [73]:
#위의 데이터 저장한거 불러오기
filter_Relation=pd.read_csv('중간RE_0811.csv')
filter_Relation.drop('Unnamed: 0',axis=1,inplace=True)

In [75]:
def sentiment_score(x):
    if 'Pos' == x.split('-')[0]:
        return 1
    elif 'Posline' == x.split('-')[0]:
        return 2
    elif 'No' == x.split('-')[0]:
        return 0 
    elif 'Neg' == x.split('-')[0]:
        return -1
    elif 'Negline' == x.split('-')[0]:
        return -2
    elif 'Over' == x.split('-')[0]:
        return 3
    elif 'Under' == x.split('-')[0]:
        return 4
    

In [76]:
filter_Relation['RE_label']=filter_Relation.RE.apply(sentiment_score)

In [77]:
#x_y_df.to_csv('week_RE.csv',encoding='utf-8-sig')

In [78]:
#pd.read_csv('week_RE.csv',encoding='utf-8-sig')

In [79]:
def RE_sentiment_X(RE):

    if RE == 0 :
        X= 0
        Y= 0
        return X

    elif RE == 1 :
        X= 1
        Y= 0
        return X

    elif RE == 2 :
        X= 1
        Y= 1
        return X

    elif RE == 3 :
        X= 1
        Y= -1
        return X

    elif RE == 4 :
        X= -1
        Y= 1
        return X

    elif RE == -1 :
        X= -1
        Y= 0
        return X

    elif RE == -2 :
        X= -1
        Y= -1
        return X

    
    
    
    
    

In [80]:
def RE_sentiment_Y(RE):

    if RE == 0 :
        X= 0
        Y= 0
        return Y

    elif RE == 1 :
        X= 1
        Y= 0
        return Y

    elif RE == 2 :
        X= 1
        Y= 1
        return Y

    elif RE == 3 :
        X= 1
        Y= -1
        return Y

    elif RE == 4 :
        X= -1
        Y= 1
        return Y

    elif RE == -1 :
        X= -1
        Y= 0
        return Y

    elif RE == -2 :
        X= -1
        Y= -1
        return Y

    
    
    
    
    

In [81]:
filter_Relation['X_sentiment']=filter_Relation['RE_label'].apply(RE_sentiment_X)
filter_Relation['Y_sentiment']=filter_Relation['RE_label'].apply(RE_sentiment_Y)


In [82]:
from sklearn.preprocessing import minmax_scale
filter_Relation['scale']=abs(filter_Relation.sentiment_score)


In [83]:
filter_Relation['X_score']=filter_Relation.scale * filter_Relation.X_sentiment

filter_Relation['Y_score']=filter_Relation.scale * filter_Relation.Y_sentiment


In [84]:
no_relation_ticker_change

Unnamed: 0,news_label,contents,symbol,sentiment_score
0,0,DDOG reported revenue growth that significantl...,DD,0.940325
1,0,Barclays analyst Raimo Lenschow maintained an ...,BCS,-0.039824
2,2,Ball Corp management has indicated that the de...,BLL,-0.963595
3,2,Deutsche Bank analyst Kyle White downgraded Ba...,DB,-0.78644
4,4,It was business as usual at Tesla’s (TSLA) ann...,TSLA,0.006504
...,...,...,...,...
1199,443,Why tech workers are quitting great jobs at co...,GOOG,-0.377019
1200,443,Why tech workers are quitting great jobs at co...,GOOG,-0.377019
1201,443,Why tech workers are quitting great jobs at co...,GOOG,-0.377019
1202,443,Why tech workers are quitting great jobs at co...,CVX,-0.377019


In [85]:
ticker_df=pd.read_csv('C:/Users/Quantec/quantec/0811_unique_topics.csv',encoding='utf-8-sig')
ticker_df.rename(columns={'Unnamed: 0':'news_labels'},inplace=True)
only_ticker=ticker_df.loc[:,['news_labels','topics','Name']]

In [86]:
Re_topic_df=pd.merge(filter_Relation,only_ticker,how='left',left_on='news_label',right_on='news_labels')
No_Re_topic_df=pd.merge(no_relation_ticker_change,only_ticker,how='left',left_on='news_label',right_on='news_labels')
No_Re_topic_df.rename(columns={'entity_list':'symbol','Name':'theme'},inplace=True)

In [87]:
X_last=Re_topic_df.loc[:,['X','X_score','Name']]
X_last.columns=['symbol','sentiment_score','theme']
Y_last=Re_topic_df.loc[:,['Y','Y_score','Name']]
Y_last.columns=['symbol','sentiment_score','theme']

symbol_df=pd.concat([X_last,Y_last]).reset_index(drop=True)

In [88]:
all_topic_no_re=No_Re_topic_df.loc[:,['symbol','sentiment_score','theme']]

In [89]:
sum_all_topic=pd.concat([symbol_df,all_topic_no_re]).dropna(axis=0).reset_index(drop=True)

In [90]:
sum_all_topic

Unnamed: 0,symbol,sentiment_score,theme
0,DDOG,-0.938656,15_target_suisse_mizuho_rating
1,GS,0.170234,15_target_suisse_mizuho_rating
2,MS,0.170234,15_target_suisse_mizuho_rating
3,PFE,0.061066,8_pfizer_gbt_sickle_cell
4,PFE,-0.061066,8_pfizer_gbt_sickle_cell
...,...,...,...
4235,GOOG,-0.377019,3_vehicle_tesla_chip_baidu
4236,GOOG,-0.377019,3_vehicle_tesla_chip_baidu
4237,GOOG,-0.377019,3_vehicle_tesla_chip_baidu
4238,CVX,-0.377019,3_vehicle_tesla_chip_baidu


In [91]:
all_theme_count_symbol=pd.DataFrame(sum_all_topic.groupby('theme')['symbol'].value_counts())
all_theme_count_symbol.rename(columns={'symbol':'count'},inplace=True)
all_theme_count_symbol.reset_index(inplace=True)

In [92]:
all_real_last=sum_all_topic.groupby(['theme','symbol'])['sentiment_score'].agg(['mean'])
#real_last.rename(columns={'mean':'count'},inplace=True)
all_real_last.reset_index(inplace=True)

In [93]:
all_last=pd.merge(all_real_last,all_theme_count_symbol,how='left',left_on=['theme','symbol'],right_on=['theme','symbol'])


In [94]:
all_last=all_last.sort_values(by=['theme','count'],ascending=False)

In [97]:
all_last.to_csv('last_test_all_0811.csv',encoding='utf-8-sig')

In [98]:
all_last

Unnamed: 0,theme,symbol,mean,count
557,9_monkeypox_vaccine_shot_novavax,NVAX,-0.320314,37
558,9_monkeypox_vaccine_shot_novavax,PFE,0.534511,16
553,9_monkeypox_vaccine_shot_novavax,BNTX,0.268434,14
556,9_monkeypox_vaccine_shot_novavax,MRNA,-0.274427,11
554,9_monkeypox_vaccine_shot_novavax,GILD,0.075286,7
...,...,...,...,...
102,0_berkshire_inflation_rate_amc,TWLO,-0.966292,1
104,0_berkshire_inflation_rate_amc,UNP,-0.946144,1
110,0_berkshire_inflation_rate_amc,XP,-0.151251,1
111,0_berkshire_inflation_rate_amc,Y,0.728947,1


# 끝

In [50]:
topic_re_sa=pd.DataFrame(symbol_df.groupby(['symbol','theme'])['sentiment_score'].mean())

In [51]:
topic_sa=pd.DataFrame(No_Re_topic_df.groupby(['symbol','theme'])['sentiment_score'].mean())

In [52]:
ri_topic_sa=topic_sa.reset_index()

In [53]:
ri_topic_re_sa=topic_re_sa.reset_index()

In [54]:
symbol_theme_sum=pd.concat([ri_topic_sa,ri_topic_re_sa])


In [55]:
last_df=pd.DataFrame(symbol_theme_sum.groupby(['symbol','theme'])['sentiment_score'].mean())

In [56]:
theme_count_symbol=pd.DataFrame(symbol_theme_sum.groupby('theme')['symbol'].value_counts())
theme_count_symbol.rename(columns={'symbol':'count'},inplace=True)
theme_count_symbol.reset_index(inplace=True)

In [57]:
test=last_df.reset_index()

In [58]:
real_last=test.groupby(['theme','symbol'])['sentiment_score'].agg(['mean'])
#real_last.rename(columns={'mean':'count'},inplace=True)
real_last.reset_index(inplace=True)

In [59]:
last_mean_count=pd.merge(real_last,theme_count_symbol,how='left',left_on=['theme','symbol'],right_on=['theme','symbol'])

In [61]:
last_mean_count.to_csv('last_test0811.csv',encoding='utf-8-sig')

In [62]:
theme_count_symbol

Unnamed: 0,theme,symbol,count
0,0_berkshire_inflation_rate_amc,A,2
1,0_berkshire_inflation_rate_amc,AAPL,2
2,0_berkshire_inflation_rate_amc,AIG,2
3,0_berkshire_inflation_rate_amc,AMC,2
4,0_berkshire_inflation_rate_amc,BAC,2
...,...,...,...
2355,9_monkeypox_vaccine_shot_novavax,The Human Rights Campaign (HRC,1
2356,9_monkeypox_vaccine_shot_novavax,World Health Organization,1
2357,9_monkeypox_vaccine_shot_novavax,the National Black Justice Coalition,1
2358,9_monkeypox_vaccine_shot_novavax,the National Center for Lesbian Rights,1


In [63]:
symbol_df=symbol_df.dropna(how='any',axis=0).reset_index(drop=True)

In [64]:
no_relation_ticker_change=no_relation_ticker_change.rename(columns={'entity_list':'symbol'})

In [65]:
no_re_symbol=no_relation_ticker_change.loc[:,['symbol','sentiment_score']]

In [66]:
sum_sentiment_df=pd.concat([symbol_df,no_re_symbol])

In [67]:
week_re_sentiment_mean=pd.DataFrame(sum_sentiment_df.groupby('symbol')['sentiment_score'].mean()).reset_index()

In [338]:
week_re_sentiment_mean.to_csv('0729_ticker_sentiment.csv',encoding='utf-8-sig')

In [68]:
week_re_sentiment_mean

Unnamed: 0,symbol,sentiment_score
0,(AMZN,0.851448
1,) Ford India Private Limited,0.303740
2,. Mersana,0.139378
3,.S. Treasury Department,0.009806
4,1Life Healthcare,0.081598
...,...,...
1836,the US Department of Commerce,0.035542
1837,the University of Chicago,0.020437
1838,the Vision Fund,-0.739225
1839,the Wall Street Journal,0.394289


# 생성된 데이터 라벨링

In [18]:
import pandas as pd
sentiment_df=pd.read_csv('C:/Users/Quantec/finBERT-master_pro/notebooks/finber.csv',encoding='utf-8-sig')
sentiment_df.drop('Unnamed: 0',axis=1,inplace=True)

In [19]:
import pandas as pd
x_y_df=pd.read_csv('week_RE.csv',encoding='utf-8-sig',usecols=[idx for idx in range(1,7)])

In [248]:
import pickle
with open('symbol_dict.pickle', 'rb') as fr:
    symbol_dict = pickle.load(fr)

In [249]:
def ticker_change(x):
    for symbol, names in symbol_dict.items():
        if x in names:
            return symbol
    return np.nan

In [251]:
x_y_df_cp = x_y_df.copy()
x_y_df[['X','Y']] = x_y_df[['X','Y']].applymap(ticker_change)

In [252]:
def sentiment_score(x):
    if 'Pos' == x.split('-')[0]:
        return 1
    elif 'Posline' == x.split('-')[0]:
        return 2
    elif 'No' == x.split('-')[0]:
        return 0 
    elif 'Neg' == x.split('-')[0]:
        return -1
    elif 'Negline' == x.split('-')[0]:
        return -2
    elif 'Over' == x.split('-')[0]:
        return 3
    elif 'Under' == x.split('-')[0]:
        return 4
    

In [253]:
x_y_df.RE=x_y_df.RE.apply(sentiment_score)

In [254]:
def RE_sentiment_X(RE):

    if RE == 0 :
        X= 0
        Y= 0
        return X

    elif RE == 1 :
        X= 1
        Y= 0
        return X

    elif RE == 2 :
        X= 1
        Y= 1
        return X

    elif RE == 3 :
        X= 1
        Y= -1
        return X

    elif RE == 4 :
        X= -1
        Y= 1
        return X

    elif RE == -1 :
        X= -1
        Y= 0
        return X

    elif RE == -2 :
        X= -1
        Y= -1
        return X

    
    
    
    
    

In [255]:
def RE_sentiment_Y(RE):

    if RE == 0 :
        X= 0
        Y= 0
        return Y

    elif RE == 1 :
        X= 1
        Y= 0
        return Y

    elif RE == 2 :
        X= 1
        Y= 1
        return Y

    elif RE == 3 :
        X= 1
        Y= -1
        return Y

    elif RE == 4 :
        X= -1
        Y= 1
        return Y

    elif RE == -1 :
        X= -1
        Y= 0
        return Y

    elif RE == -2 :
        X= -1
        Y= -1
        return Y

    
    
    
    
    

In [256]:
x_y_df['X_sentiment']=x_y_df['RE'].apply(RE_sentiment_X)
x_y_df['Y_sentiment']=x_y_df['RE'].apply(RE_sentiment_Y)


In [257]:
from sklearn.preprocessing import minmax_scale
x_y_df['scale']=minmax_scale(x_y_df.score)


In [258]:
x_y_df['X_score']=x_y_df.scale * x_y_df.X_sentiment

x_y_df['Y_score']=x_y_df.scale * x_y_df.Y_sentiment


In [259]:
X_last=x_y_df.loc[:,['X','X_score']]
X_last.columns=['symbol','sentiment_score']
Y_last=x_y_df.loc[:,['Y','Y_score']]
Y_last.columns=['symbol','sentiment_score']

symbol_df=pd.concat([X_last,Y_last]).reset_index(drop=True)

In [264]:
symbol_df=symbol_df.dropna(how='any',axis=0).reset_index(drop=True)

In [272]:
week_re_sentiment=pd.DataFrame(symbol_df.groupby('symbol')['sentiment_score'].mean()).reset_index()

In [280]:
week_re_sentiment.to_csv('week_re_sa.csv',encoding='utf-8-sig')

In [281]:
x_y_df.to_csv('week_re_sum.csv',encoding='utf-8-sig')

# example

In [38]:
test_examples = [
    BinaryFeatures(X='Apple', Y='Watch', inst_type='ORGANIZATION:ORGANIZATION', context='Apple launched a new edition of Watch on WWDC 2020'),
    BinaryFeatures(X='GM', Y='Chevrolet', inst_type='ORGANIZATION:ORGANIZATION', context='GM halted production of Chevrolet due to mechanical failure.'),
    # BinaryFeatures(X='He', Y='University of Maryland in College Park', inst_type='PERSON:ORGANIZATION', context='He received an undergraduate degree from Morgan State University in 1950 and applied for admission to graduate school at the University of Maryland in College Park.', label='no_relation')
]

nlp(task=task, features=test_examples, return_labels=True, return_confidences=True)

[('Posline-Launch', 0.7734375), ('Negline-Operate', 0.97021484375)]