# Demonstration of the evomap Package

Replication notebook for the paper

Matthe, Maximilian (2023), "evomap: A Python Package for Dynamic Mapping". 

Python environment: evomap-package

ToDo List
- Shepard diagram: check whether the ordering used in the isotonic regression is really correct (for the Shepard diagram AND the MDS code).

# Setup

First, create path variables for each subfolder and load the essential packages.

In [7]:
import pandas as pd
import numpy as np
import os

# Folder layout: the home folder is the parent of the current working directory.
PATH_HOME = os.path.dirname(os.getcwd())
PATH_OUT = os.path.join(PATH_HOME, 'gen')  # generated output goes here

# Font settings for titles and axis labels (regular and large variants).
title_fontdict = dict(size=12, family='Arial')
title_fontdict_large = dict(size=16, family='Arial')
label_fontdict = dict(size=10, family='Arial')
label_fontdict_large = dict(size=14, family='Arial')

# Seed numpy's global random number generator.
np.random.seed(42)

## TEST: GET TNIC DATA

In [8]:
# Data directory inside the home folder.
PATH_DATA = os.path.join(PATH_HOME, 'data')

# Load the merged TNIC file from the data directory.
data = pd.read_csv(
    os.path.join(PATH_DATA, 'tnic2_data_merged.csv')
)

In [9]:
# Keep the pair-level columns used below (year, score, firm names, market
# values, SIC codes and gvkey identifiers) and drop rows with missing values.
# NOTE: `data` is overwritten, so this cell must be run exactly once after
# the load cell above.
data = (
    data[['year', 'score', 'name1', 'name2', 'mkvalt1', 'mkvalt2', 'sic1', 'sic2', 'gvkey1', 'gvkey2']]
    .dropna()
    # Reduce SIC codes to their first two digits. The integer round-trip
    # strips leading zeros, so the code is zero-padded back to four digits
    # first; otherwise e.g. 0100 would become '10' instead of '01'.
    # NOTE(review): assumes sic1/sic2 hold standard four-digit SIC codes.
    .assign(
        sic1=lambda df: df['sic1'].astype(int).astype(str).str.zfill(4).str[:2],
        sic2=lambda df: df['sic2'].astype(int).astype(str).str.zfill(4).str[:2],
    )
    # Rename the two columns mkvalt1 and mkvalt2 to size1 and size2 ...
    .rename(columns={'mkvalt1': 'size1', 'mkvalt2': 'size2'})
    # ... and take their square root.
    .assign(
        size1=lambda df: np.sqrt(df['size1']),
        size2=lambda df: np.sqrt(df['size2']),
    )
)
data.head()

Unnamed: 0,year,score,name1,name2,size1,size2,sic1,sic2,gvkey1,gvkey2
0,1998,0.0035,AAR CORP,AIR T INC,23.254564,3.165932,50,45,1004,1210
1,1999,0.0229,AAR CORP,AIR T INC,19.306783,3.096773,50,45,1004,1210
2,2000,0.0312,AAR CORP,AIR T INC,19.419526,3.289377,50,45,1004,1210
3,2001,0.0483,AAR CORP,AIR T INC,19.094313,3.087718,50,45,1004,1210
4,2002,0.0568,AAR CORP,AIR T INC,11.97203,1.959806,50,45,1004,1210


# Section 1: Quick Illustration

This first section provides a quick illustration of how EvoMap can be applied using the evomap package. The section also highlights the differences between applying EvoMap and applying existing static mapping methods.

## Loading the Data

For this illustration, we use a small sample from the TNIC data available through the dataset submodule

In [10]:
# Disabled: loading the packaged sample would overwrite the `data` frame that
# was read from 'tnic2_data_merged.csv' above. Kept for reference only.
#from evomap.datasets import load_tnic_sample_small
#data = load_tnic_sample_small()
#labels = data[data['sic1'].map(lambda x: x.startswith("36"))].groupby('name1').agg({'year':'nunique'}).query('year == 20').index.values
#labels = [label for label in labels if not label == "CALAMP CORP"] # Exclude calamp corp, as it has no connections to the others in one period
#labels
#data.head()

The sample is organized as an edgelist where each observation corresponds to a single firm x firm pair.

For this simple example, we select only a few firms and drop all other firms. The gvkey identifiers are kept, because they are used further below to match these firms in the full TNIC data.

#TODO: Save either the full data (including all 10 firms), or the smaller sample including 8 firms as another dataset in the dataset module!

In [11]:
# Candidate firm labels. The selection below uses `selected_names`, not this array.
labels = np.array([
    'ADOBE INC', 'MICROSOFT CORP', 'CITRIX SYSTEMS INC', 'HP INC',
    'WESTERN DIGITAL CORP', 'NETAPP INC', 'SONIC FOUNDRY INC',
    'NORTONLIFELOCK INC', 'NUANCE COMMUNICATIONS INC', 'TIVO CORP',
    'MICROSTRATEGY INC', 'FALCONSTOR SOFTWARE INC'])

# Firms kept for the illustration; commented-out entries are excluded.
# A stray '' literal (without a trailing comma) used to sit before
# 'ORACLE CORP' and was silently merged with it by implicit string
# concatenation. It has been removed; the resulting array is unchanged.
selected_names = np.array([
    'US CELLULAR CORP',
    'AT&T INC',
    # 'COMCAST CORP',
    'MICROSOFT CORP',
    'WESTERN DIGITAL CORP',
    # 'HP INC',
    'ORACLE CORP',
    'EBAY INC',
    'MICRON TECHNOLOGY INC',
    'INTUIT INC',
    'APPLE INC'])


# Keep only pairs in which both firms are among the selected names.
data = data[data['name1'].isin(selected_names) & data['name2'].isin(selected_names)]

# Identifiers of the remaining firms (taken before the columns are reordered).
gvkeys = data['gvkey1'].unique()

# Reorder the columns, sort by year and first firm, and renumber the rows from 0.
data = (
    data[['year', 'name1', 'name2', 'score', 'sic1', 'sic2', 'size1', 'size2', 'gvkey1', 'gvkey2']]
    .sort_values(['year', 'name1'])
    .reset_index(drop=True)
)

# Distinct years in the sample and their count.
periods = data['year'].unique()
n_periods = len(periods)
data.head()

Unnamed: 0,year,name1,name2,score,sic1,sic2,size1,size2,gvkey1,gvkey2
0,1998,APPLE INC,WESTERN DIGITAL CORP,0.0657,36,35,71.792988,32.294908,1690,11399
1,1998,APPLE INC,MICROSOFT CORP,0.0601,36,73,71.792988,517.384045,1690,12141
2,1998,APPLE INC,ORACLE CORP,0.0355,36,73,71.792988,188.441005,1690,12142
3,1998,AT&T INC,US CELLULAR CORP,0.0761,48,48,324.142356,57.620292,9899,14369
4,1998,EBAY INC,MICROSOFT CORP,0.0281,73,73,98.54459,517.384045,114524,12141


In [12]:
# Write the selected sample to the data directory, without the row index.
data.to_csv(
    os.path.join(PATH_DATA, 'tnic_sample_technology.csv'),
    index=False,
)

## Unbalanced Data

In [13]:
# Add gvkey 147579 to the firm identifiers (it is mapped to 'NETFLIX INC'
# further below). The membership check keeps the cell idempotent: re-running
# it does not append the identifier a second time.
if 147579 not in gvkeys:
    gvkeys = np.append(gvkeys, 147579)
gvkeys

array([  1690,   7343,   9899,  11399,  12141,  12142,  14369,  27928,
       114524, 147579])

In [14]:
# Read the tab-separated file 'tnic2_data.txt' and keep only the pairs in
# which both gvkeys belong to `gvkeys`.
data_full = pd.read_csv(
    os.path.join(PATH_DATA, 'tnic_data', 'tnic2_data.txt'),
    sep='\t',
)
data_full = data_full[data_full['gvkey1'].isin(gvkeys) & data_full['gvkey2'].isin(gvkeys)]
data_full.head()

Unnamed: 0,year,gvkey1,gvkey2,score
305455,2006,1690,9899,0.0029
305552,1996,1690,11399,0.0253
305553,1997,1690,11399,0.0479
305554,1998,1690,11399,0.0657
305555,1999,1690,11399,0.0549


In [15]:
# Drop all observations from years before 1998.
data_full = data_full[data_full['year'] >= 1998]

In [16]:
# Number of distinct years observed for each gvkey1.
data_full.groupby('gvkey1')[['year']].nunique()

Unnamed: 0_level_0,year
gvkey1,Unnamed: 1_level_1
1690,20
7343,20
9899,20
11399,20
12141,20
12142,20
14369,20
27928,20
114524,20
147579,16


In [17]:
# Display the filtered pair data.
data_full

Unnamed: 0,year,gvkey1,gvkey2,score
305455,2006,1690,9899,0.0029
305554,1998,1690,11399,0.0657
305555,1999,1690,11399,0.0549
305556,2000,1690,11399,0.0813
305557,2001,1690,11399,0.0479
...,...,...,...,...
32253422,2013,147579,147579,
32253423,2014,147579,147579,
32253424,2015,147579,147579,
32253425,2016,147579,147579,


In [18]:
# Lookup table with one row per firm name and its gvkey, renumbered from 0.
data_names = (
    data[['name1', 'gvkey1']]
    .drop_duplicates('name1')
    .reset_index(drop=True)
)
data_names

Unnamed: 0,name1,gvkey1
0,APPLE INC,1690
1,AT&T INC,9899
2,EBAY INC,114524
3,INTUIT INC,27928
4,MICRON TECHNOLOGY INC,7343
5,MICROSOFT CORP,12141
6,ORACLE CORP,12142
7,US CELLULAR CORP,14369
8,WESTERN DIGITAL CORP,11399


In [19]:
# 'NETFLIX INC' (gvkey 147579) is not part of `data`, so its name/gvkey pair
# is added to the lookup table by hand. The check keeps the cell idempotent:
# re-running it does not append a second Netflix row, which would otherwise
# duplicate rows in the merges that use this table.
if not data_names['gvkey1'].eq(147579).any():
    data_names = pd.concat(
        [data_names, pd.DataFrame([{'name1': 'NETFLIX INC', 'gvkey1': 147579}])],
        ignore_index=True,
    )

# Mirror the identifier and name columns so the same table can be merged on
# either side of a firm pair.
data_names = data_names.assign(
    gvkey2=data_names['gvkey1'],
    name2=data_names['name1'],
)
data_names

Unnamed: 0,name1,gvkey1,gvkey2,name2
0,APPLE INC,1690,1690,APPLE INC
1,AT&T INC,9899,9899,AT&T INC
2,EBAY INC,114524,114524,EBAY INC
3,INTUIT INC,27928,27928,INTUIT INC
4,MICRON TECHNOLOGY INC,7343,7343,MICRON TECHNOLOGY INC
5,MICROSOFT CORP,12141,12141,MICROSOFT CORP
6,ORACLE CORP,12142,12142,ORACLE CORP
7,US CELLULAR CORP,14369,14369,US CELLULAR CORP
8,WESTERN DIGITAL CORP,11399,11399,WESTERN DIGITAL CORP
9,NETFLIX INC,147579,147579,NETFLIX INC


In [20]:
# Attach the name of the first firm in each pair. `validate` makes pandas
# raise if a gvkey1 occurs more than once in the lookup table, instead of
# silently multiplying rows of `data_full`.
data_full = data_full.merge(
    data_names[['gvkey1', 'name1']],
    on='gvkey1',
    how='left',
    validate='many_to_one',
)
data_full

Unnamed: 0,year,gvkey1,gvkey2,score,name1
0,2006,1690,9899,0.0029,APPLE INC
1,1998,1690,11399,0.0657,APPLE INC
2,1999,1690,11399,0.0549,APPLE INC
3,2000,1690,11399,0.0813,APPLE INC
4,2001,1690,11399,0.0479,APPLE INC
...,...,...,...,...,...
707,2013,147579,147579,,NETFLIX INC
708,2014,147579,147579,,NETFLIX INC
709,2015,147579,147579,,NETFLIX INC
710,2016,147579,147579,,NETFLIX INC


In [21]:
# Attach the name of the second firm in each pair. `validate` makes pandas
# raise if a gvkey2 occurs more than once in the lookup table, instead of
# silently multiplying rows of `data_full`.
data_full = data_full.merge(
    data_names[['gvkey2', 'name2']],
    on='gvkey2',
    how='left',
    validate='many_to_one',
)
data_full

Unnamed: 0,year,gvkey1,gvkey2,score,name1,name2
0,2006,1690,9899,0.0029,APPLE INC,AT&T INC
1,1998,1690,11399,0.0657,APPLE INC,WESTERN DIGITAL CORP
2,1999,1690,11399,0.0549,APPLE INC,WESTERN DIGITAL CORP
3,2000,1690,11399,0.0813,APPLE INC,WESTERN DIGITAL CORP
4,2001,1690,11399,0.0479,APPLE INC,WESTERN DIGITAL CORP
...,...,...,...,...,...,...
707,2013,147579,147579,,NETFLIX INC,NETFLIX INC
708,2014,147579,147579,,NETFLIX INC,NETFLIX INC
709,2015,147579,147579,,NETFLIX INC,NETFLIX INC
710,2016,147579,147579,,NETFLIX INC,NETFLIX INC


In [22]:
# Number of distinct years observed for each firm name.
data_full.groupby('name1')[['year']].nunique()

Unnamed: 0_level_0,year
name1,Unnamed: 1_level_1
APPLE INC,20
AT&T INC,20
EBAY INC,20
INTUIT INC,20
MICRON TECHNOLOGY INC,20
MICROSOFT CORP,20
NETFLIX INC,16
ORACLE CORP,20
US CELLULAR CORP,20
WESTERN DIGITAL CORP,20


In [23]:
# Write the named pair data to the data directory, without the row index.
data_full.to_csv(
    os.path.join(PATH_DATA, 'tnic_sample_technology_with_netflix.csv'),
    index=False,
)