# Introduction:
## In this file, we create a dyadic dataset from the word embeddings of 10K filing reports. To do so, in the beginning, there are some file reading and filtering, which might be time-consuming to run. So, if you have the file titled "words_vector_filtered," you can skip to the section mentioning the beginning of the code.

Note: The difference between this file and 10K_embeddings is that we combine the Compustat file and filter firms with low total assets and the number of analysts' coverage.

---

In [7]:
import pandas as pd
import numpy as np
import os
import jason
import jsonlines
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
pd.set_option('display.max_columns', None)


In [2]:
wrds_vector_filter= pd.read_csv('.../wrds_vector_filter.csv')

In [3]:
wrds_vector_filter.shape[0]

103447

In [4]:
wrds_index= pd.read_csv('.../wrds_index.csv')

## Excluding firm having not been covered by any analyst in a given year

In [4]:
wrds_vector_filter.rename(columns={'cusip_8': 'CUSIP'}, inplace=True)
wrds_vector_filter ['CUSIP'] = wrds_vector_filter['CUSIP'].str.strip()
wrds_vector_filter ['CUSIP'] = wrds_vector_filter['CUSIP'].str.upper()
wrds_vector_filter ['CUSIP'] = wrds_vector_filter['CUSIP'].astype(str)
wrds_vector_filter ['CUSIP'] = wrds_vector_filter ['CUSIP'].astype(str).str.zfill(8)

# Reading the analysts estimate to eliminate firms with no analysts coverage:

In [5]:
eps_estimate= pd.read_csv('.../EPS_estimate_1FPI.csv') # analyst-by-analyst forecast file

eps_estimate ['CUSIP'] = eps_estimate['CUSIP'].str.strip()
eps_estimate ['CUSIP'] = eps_estimate['CUSIP'].str.upper()
eps_estimate ['CUSIP'] = eps_estimate['CUSIP'].astype(str)

  eps_estimate= pd.read_csv('/Users/milad/Desktop/Dynamic Dyadic OD/10K embeddings Dataset/EPS_estimate_1FPI.csv') # analyst-by-analyst forecast file


In [8]:
eps_estimate.iloc[0:2]

Unnamed: 0,TICKER,CUSIP,OFTIC,CNAME,ACTDATS,ESTIMATOR,ANALYS,CURRFL,PDF,FPI,MEASURE,VALUE,CURR,USFIRM,FPEDATS,ACTTIMS,REVDATS,REVTIMS,ANNDATS,ANNTIMS,ACTUAL,ACTDATS_ACT,ACTTIMS_ACT,ANNDATS_ACT,ANNTIMS_ACT,CURR_ACT,report_curr
0,0,87482X10,TLMR,TALMER BANCORP,2014-03-11,149,119962,,D,1,EPS,0.73,,1,2014-12-31,15:17:12,2014-03-11,15:17:12,2014-03-09,17:05:00,1.21,2015-01-30,16:54:47,2015-01-30,16:30:00,USD,USD
1,0,87482X10,TLMR,TALMER BANCORP,2014-03-11,228,80474,,D,1,EPS,0.83,,1,2014-12-31,15:49:22,2014-03-11,15:49:22,2014-03-10,6:48:00,1.21,2015-01-30,16:54:47,2015-01-30,16:30:00,USD,USD


In [7]:
analysts_year_firm= eps_estimate.groupby(['CUSIP', 'FPEDATS'])['ANALYS'].unique().reset_index(name='analysts_year_firm')

analysts_year_firm['datadate']= analysts_year_firm['FPEDATS']

analysts_year_firm ['analysts_focal_number'] = analysts_year_firm ['analysts_year_firm'].apply(lambda x: len(x) if isinstance(x, (list, np.ndarray)) else 0)


In [8]:
analysts_year_firm.iloc[0:3]

Unnamed: 0,CUSIP,FPEDATS,analysts_year_firm,datadate,analysts_focal_number
0,0,2010-12-31,"[125563, 122859, 113263, 113260, 113264, 113271]",2010-12-31,6
1,0,2016-06-30,[155036],2016-06-30,1
2,0,2016-12-31,"[142052, 137068]",2016-12-31,2


# Eliminating firms with no analyst coverage

In [9]:
analysts_year_firm= analysts_year_firm [analysts_year_firm['analysts_focal_number'] >= 1]

In [10]:
wrds_vector_filter = pd.merge(wrds_vector_filter, analysts_year_firm, on=['CUSIP', 'datadate'], how='inner')

# Excluding financial and utility sector

In [11]:
wrds_index ['datadate'] = pd.to_datetime(wrds_index['datadate'])
wrds_vector_filter['datadate'] = pd.to_datetime(wrds_vector_filter['datadate'])

  wrds_index ['datadate'] = pd.to_datetime(wrds_index['datadate'])


In [12]:
wrds_vector_filter = pd.merge(wrds_vector_filter, wrds_index [['GVKEY','datadate', 'naicsh2', 'naicsh4']] , on=['GVKEY','datadate'], how='left')

In [13]:
wrds_vector_filter = wrds_vector_filter [ (wrds_vector_filter['naicsh2'] != 22) &
                                         (wrds_vector_filter['naicsh2'] != 52) ]

In [14]:
wrds_vector_filter.shape[0]

52234

## This wrds_vector_filter only contains firms with a condition on analysts coverage (more than 1) and the exclusion of naicsh2 equal to 22 and 52

# Finding competitors in each year/ Firm-year

In [15]:
duplicated_df = wrds_vector_filter [wrds_vector_filter.duplicated(subset=['GVKEY', 'year_modified'], keep=False)]

duplicated_df

# They published a 10-K report in less than a year (around 6 month)

Unnamed: 0,GVKEY,datadate,cik,conm,rdate,fdate,file_id,regsic,CUSIP,embedding_0,...,yearr,monthr,yeard,monthd,year_modified,FPEDATS,analysts_year_firm,analysts_focal_number,naicsh2,naicsh4


In [16]:
print(wrds_vector_filter['year_modified'].min())
print(wrds_vector_filter['year_modified'].max())

1993
2022


# Finding Competitors or Dyads
We can skip this section as we have saved the results_nest

In [17]:
results = {}
results_25 = {}


## Finding the 10 most similar:


### I temporally changed the code to 10. As we have the data for 10, we can sort for other number of rivals.
### I created two datasets: one restricts the firms based on NAICS H2 and similaraity at the same time, while the other focuses only on the similarity indices among firms.


In [None]:
# Get the unique years and iterate over each year
# The for loop starts from the tM1

for year in sorted(wrds_vector_filter['year_modified'].unique())[0:]: 
    
    t0_data = wrds_vector_filter[wrds_vector_filter['year_modified'] == year]
    
    t1_data = wrds_vector_filter[wrds_vector_filter['year_modified'] == year+1]
    
    t2_data = wrds_vector_filter[wrds_vector_filter['year_modified'] == year+2]

    common_gvkeys = set(t1_data['GVKEY']).intersection(set(t2_data['GVKEY']))
    
    # common_gvkeys = set(previous_year_data['GVKEY']).intersection(set(next_year_data['GVKEY']), set(before_previous_year_data['GVKEY']))
    t0_data= t0_data[t0_data['GVKEY'].isin(common_gvkeys)]
    
    # Get the unique firms in current year
    for firm in t0_data['GVKEY'].unique():

        t0_firm = t0_data [t0_data['GVKEY'] == firm]
        
        firm_naics_code = t0_firm['naicsh2'].iloc[0]
                
        t0_other_firms = t0_data [ (t0_data['GVKEY'] != firm)] 
        
        #Additional condition for finding competitors
        # t0_other_firms = t0_data [ (t0_data['GVKEY'] != firm) & (t0_data['naicsh2'] == firm_naics_code)] 

        
        if t0_other_firms.empty:
            continue  
        
        # Compute cosine similarity between current firm and others
        embedding_columns = [column for column in wrds_vector_filter.columns if 'embedding' in column]
        
        similarity_scores = cosine_similarity(t0_firm [embedding_columns], t0_other_firms [embedding_columns])
        
        t0_other_firms ['similarity'] = similarity_scores[0] # all the similarity scores (check the above example)

        # Get the top 5 firms with highest similarity scores
        top_10_firms = t0_other_firms.sort_values(by='similarity', ascending=False).head(10)['GVKEY'].values

        # Store in result
        results[(firm, year)] = top_10_firms.tolist()

## Finding the 25 most similar:

In [None]:
# Get the unique years and iterate over each year
# The for loop starts from the tM1

for year in sorted(wrds_vector_filter['year_modified'].unique())[0:]: 
    
    t0_data = wrds_vector_filter[wrds_vector_filter['year_modified'] == year]
    
    t1_data = wrds_vector_filter[wrds_vector_filter['year_modified'] == year+1]
    
    t2_data = wrds_vector_filter[wrds_vector_filter['year_modified'] == year+2]

    common_gvkeys = set(t1_data['GVKEY']).intersection(set(t2_data['GVKEY']))
    
    # common_gvkeys = set(previous_year_data['GVKEY']).intersection(set(next_year_data['GVKEY']), set(before_previous_year_data['GVKEY']))
    
    t0_data= t0_data[t0_data['GVKEY'].isin(common_gvkeys)]

    # Get the unique firms in current year
    for firm in t0_data['GVKEY'].unique():

        t0_firm = t0_data [t0_data['GVKEY'] == firm]
        
        # t0_other_firms = t0_data [t0_data['GVKEY'] != firm] 
        firm_naics_code = t0_firm['naicsh2'].iloc[0]
                
        t0_other_firms = t0_data [ (t0_data['GVKEY'] != firm)] 
        
        # The additional condition for finding competitors
        # t0_other_firms = t0_data [ (t0_data['GVKEY'] != firm) & (t0_data['naicsh2'] == firm_naics_code)] 

        
        if t0_other_firms.empty:
            continue  
        
        # Get those firms that had observation in the t0 and t1 years
                
        # Compute cosine similarity between current firm and others
        embedding_columns = [column for column in wrds_vector_filter.columns if 'embedding' in column]

        similarity_scores = cosine_similarity(t0_firm [embedding_columns], t0_other_firms [embedding_columns])
        
        t0_other_firms ['similarity'] = similarity_scores[0] # all the similarity scores (check the above example)

        # Get the top 25 firms with highest similarity scores
        top_25_firms = t0_other_firms.sort_values(by='similarity', ascending=False).head(25)['GVKEY'].values

        # Store in result
        results_25 [(firm, year)] = top_25_firms.tolist()

In [22]:
results_nest = {}

# Loop over the keys and values in the original dictionary
for (gv, year), value in results.items():

    if year not in results_nest:
        results_nest[year] = {}
    
    # Add the firm and value to the appropriate year in the new dictionary
    results_nest[year][gv] = value


In [20]:
results_nest_25 = {}

# Loop over the keys and values in the original dictionary
for (gv, year), value in results_25.items():

    if year not in results_nest_25:
        results_nest_25[year] = {}
    
    # Add the firm and value to the appropriate year in the new dictionary
    results_nest_25[year][gv] = value


------

# Building the main data frame

In [24]:
# For building the 10

main_dict={'year_modified':[], 'datadateF':[], 'fdate':[], 'GVKEY focal':[],'GVKEY peer':[], 'Cos_f1_p0':[],
           'Cos_f1_p1':[], 'Cos_f0_p0':[], 'Cos_f0_p1':[], 'Cos_f2_p1':[], 'Cos_f2_p2':[], 'Cos_f1_p2':[], 
           'Cos_f0_p2':[], 'Cos_f4_p2':[], 'Cos_f3_p1':[]
           }

In [21]:
# For building the 25
main_dict_25 = {'year_modified':[], 'datadateF':[], 'fdate':[], 'GVKEY focal':[],'GVKEY peer':[], 'Cos_f1_p1':[]}

# Intended Variables: 
Change_similarity_f0p0 (willingness for imitation): The comparison of change between the focal and peer at t0

Change_similarity_f1p0: The comparison of change between the change of the focal at t1 and peer at t0

Peer_change_toward_f0p0 (The extent of imitation): The magnitude of peer firms' changes (at t0) is multiplied by the change_similarity_f0p0
In other words, to what extent the peer firm's change is similar to the focal firm's at t0

Focal_change_toward_f1p0: The magnitude of focal firms' changes (at t1) is multiplied by the change_similarity_f1p0
In other words, to what extent the focal firm's change t1 change is similar to the peer firm's t0 change

Cos_f0_pM1: The cosine between the position of the peer firm at (t-1) and the focal at t0

Cos_f0_p0: The cosine between the position of the peer firm at (t0) and the focal at t0

Cos_f1_p0: The cosine between the position of the peer firm at (t0) and the focal at t1

## Explanations of files used: 

results[1013, 2007] find the competitors of a firm in a given year with GVKEY

results_nest [2007][1013] a nested version of the above format.

wrds_vector_filter for finding embeddings of focal and peer firm in different years. 

Starts from the t0 (between 't-2' and 't1')


In [22]:
wrds_vector_filter ['datadate']= pd.to_datetime(wrds_vector_filter['datadate'])
wrds_vector_filter['fdate']= pd.to_datetime(wrds_vector_filter['fdate'])

In [23]:
embed_cols = ['embedding_' + str(i) for i in range(768)]

In [None]:
print (min(results_nest.keys()))
print (max (results_nest.keys()))

# print (min(results_nest_25.keys()))
# print (max (results_nest_25.keys()))

# Run the following for building the 25 rival firms:

In [25]:
# t0_min= 1993
# t0_max= 2020 

embed_cols = ['embedding_' + str(i) for i in range(768)]

for year, focals in results_nest_25.items():    
    
    for focal, peers in focals.items():        
        
        for peer in peers:

            #year column
            year_mod = wrds_vector_filter['year_modified'][(wrds_vector_filter['GVKEY'] == focal) & 
                                                          (wrds_vector_filter['year_modified']== year+1)].values[0]
            main_dict_25 ['year_modified'].append(year_mod)

            main_dict_25 ['GVKEY focal'].append(focal)

            main_dict_25 ['GVKEY peer'].append(peer)
            
            datadate = wrds_vector_filter['datadate'][(wrds_vector_filter['GVKEY']== focal) & (wrds_vector_filter['year_modified']== year+1)].dt.strftime('%Y-%m-%d').values[0]
            main_dict_25 ['datadateF'].append(datadate)
            
            fdate= wrds_vector_filter['fdate'][(wrds_vector_filter['GVKEY']== focal) & (wrds_vector_filter['year_modified']== year+1)].dt.strftime('%Y-%m-%d').values[0]
            main_dict_25 ['fdate'].append(fdate)
            
            
            # For building the 5
            companies = [focal, peer]
            
            # filter the data for companies (focal and peer) at t2
            filtered_data_t1 = wrds_vector_filter[(wrds_vector_filter['year_modified'] == year+1) & (wrds_vector_filter['GVKEY'].isin(companies))]
            
            # find the vector of embeddings
            embeddings_focal_t1 = filtered_data_t1 [filtered_data_t1 ['GVKEY'] == companies[0]][embed_cols].values
                        
            embeddings_peer_t1 = filtered_data_t1 [filtered_data_t1['GVKEY'] == companies[1]][embed_cols].values
            
            #find the similaty at time t2
            similarity_f1p1 = cosine_similarity(embeddings_focal_t1, embeddings_peer_t1)[0][0]
            main_dict_25['Cos_f1_p1'].append(similarity_f1p1)
            
                 

In [26]:
main_dataframe_25 = pd.DataFrame(main_dict_25)

# Run the following for building the 10 rival firms:

In [31]:
# t0_min= 1993
# t0_max= 2020 


embed_cols = ['embedding_' + str(i) for i in range(768)]

for year, focals in results_nest.items():    
    
    for focal, peers in focals.items():        
        
        for peer in peers:

            #year column
            year_mod_t1 = wrds_vector_filter['year_modified'][(wrds_vector_filter['GVKEY'] == focal) & 
                                                          (wrds_vector_filter['year_modified']== year+1)].values[0]
            main_dict['year_modified'].append(year_mod_t1)
            main_dict['GVKEY focal'].append(focal)
            main_dict['GVKEY peer'].append(peer)
            datadate = wrds_vector_filter['datadate'][(wrds_vector_filter['GVKEY']== focal) & (wrds_vector_filter['year_modified']== year+1)].dt.strftime('%Y-%m-%d').values[0]
            main_dict['datadateF'].append(datadate)
            fdate= wrds_vector_filter['fdate'][(wrds_vector_filter['GVKEY']== focal) & (wrds_vector_filter['year_modified']== year+1 )].dt.strftime('%Y-%m-%d').values[0]
            main_dict['fdate'].append(fdate)
            
            
            companies = [focal, peer]
            filtered_data_t1 = wrds_vector_filter[(wrds_vector_filter['year_modified'] == year+1) & (wrds_vector_filter['GVKEY'].isin(companies))]
            
            # find the vector of embeddings
            embeddings_focal_t1 = filtered_data_t1[filtered_data_t1['GVKEY'] == companies[0]][embed_cols].values
            embeddings_peer_t1 = filtered_data_t1[filtered_data_t1['GVKEY'] == companies[1]][embed_cols].values
            
            embeddings_focal_t0 = wrds_vector_filter[(wrds_vector_filter['year_modified'] == year) & (wrds_vector_filter['GVKEY'] == focal)][embed_cols].values
            embeddings_peer_t0 = wrds_vector_filter[(wrds_vector_filter['year_modified'] == year) & (wrds_vector_filter['GVKEY'] == peer)][embed_cols].values

            similarity_f0p1 = cosine_similarity(embeddings_focal_t0, embeddings_peer_t1)[0][0]
            similarity_f1p1 = cosine_similarity(embeddings_focal_t1, embeddings_peer_t1)[0][0]
            similarity_f0p0 = cosine_similarity(embeddings_focal_t0, embeddings_peer_t0)[0][0]
            similarity_f1p0 = cosine_similarity(embeddings_focal_t1, embeddings_peer_t0)[0][0]

            main_dict['Cos_f0_p1'].append(similarity_f0p1)
            main_dict['Cos_f1_p1'].append(similarity_f1p1)
            main_dict['Cos_f0_p0'].append(similarity_f0p0)
            main_dict['Cos_f1_p0'].append(similarity_f1p0)

            #focal and peer firm embeddings t1
            embeddings_focal_t2 = wrds_vector_filter[(wrds_vector_filter['year_modified'] == year+2) & (wrds_vector_filter['GVKEY'] == focal)][embed_cols].values
            embeddings_peer_t2 = wrds_vector_filter[(wrds_vector_filter['year_modified'] == year+2) & (wrds_vector_filter['GVKEY'] == peer)][embed_cols].values

            similarity_f2p1 = cosine_similarity (embeddings_focal_t2, embeddings_peer_t1 )[0][0]
            similarity_f2p2 = cosine_similarity (embeddings_focal_t2, embeddings_peer_t2 )[0][0]
            similarity_f1p2 = cosine_similarity (embeddings_focal_t1, embeddings_peer_t2 )[0][0]

            main_dict['Cos_f2_p1'].append(similarity_f2p1)
            main_dict['Cos_f2_p2'].append(similarity_f2p2)
            main_dict['Cos_f1_p2'].append(similarity_f1p2)
                     
            
            # Robustness Check:
            
            similarity_f0_p2= cosine_similarity(embeddings_focal_t0, embeddings_peer_t2)[0][0]
            main_dict['Cos_f0_p2'].append(similarity_f0_p2)
            
            # Emeddings for t3 and t4
                            
            embeddings_focal_t4 = wrds_vector_filter[(wrds_vector_filter['year_modified'] == year+4) & (wrds_vector_filter['GVKEY'] == focal)][embed_cols].values
            
            if embeddings_focal_t4.size > 0:
                similarity_f4_p2 = cosine_similarity(embeddings_focal_t4, embeddings_peer_t2)[0][0]
            else:
                similarity_f4_p2 = np.nan  # Assign NaN if embeddings are not found

            main_dict['Cos_f4_p2'].append(similarity_f4_p2)


            embeddings_focal_t3 = wrds_vector_filter[(wrds_vector_filter['year_modified'] == year+3) & (wrds_vector_filter['GVKEY'] == focal)][embed_cols].values
            
            if embeddings_focal_t3.size > 0:
                similarity_f3_p1 = cosine_similarity(embeddings_focal_t3, embeddings_peer_t1)[0][0]
            else:
                similarity_f3_p1 = np.nan  # Assign NaN if embeddings are not found

            main_dict['Cos_f3_p1'].append(similarity_f3_p1)

            

In [32]:
main_dataframe = pd.DataFrame(main_dict)

In [39]:
print (main_dataframe.shape[0])
print (main_dataframe_25.shape[0])

348520
871300


In [34]:
# It excludes the finance and utility companies
# Only includes firms having at least one analyst covering them.  
from datetime import date

today = date.today()

# Format the date as YYYYMMDD
formatted_date = today.strftime('%Y%m%d')

# Create a filename using the date
filename = f"main_dataframe_10R_{formatted_date}.csv"

main_dataframe.to_csv(filename, index=False) 

In [28]:
# It excludes the finance and utility companies
# Only includes firms having at least one analyst covering them. 

from datetime import date

today = date.today()

# Format the date as YYYYMMDD
formatted_date = today.strftime('%Y%m%d')

# Create a filename using the date
filename25 = f"main_dataframe_t2_25R_{formatted_date}.csv"

main_dataframe_25.to_csv(filename25, index=False) 