In [1]:
import os

import numpy as np
import pandas as pd
import xlrd

pd.set_option("display.max_rows", 50000)

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
stocks = pd.read_csv("/content/drive/MyDrive/Fiverr/PROJECT_MIMI/Stocks_DS_tab_delimited.dat", sep="\t")

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
fundamentals = pd.read_csv("/content/drive/MyDrive/Fiverr/PROJECT_MIMI/Fundamentals_Full.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
ratings = pd.read_csv("/content/drive/MyDrive/Fiverr/PROJECT_MIMI/Ratings_Full.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
securities = pd.read_csv("/content/drive/MyDrive/Fiverr/PROJECT_MIMI/Securities_Full.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
settlements = pd.read_excel("/content/drive/MyDrive/Fiverr/PROJECT_MIMI/Settlements.xlsx", na_values="#NULL!")

In [8]:
def excel_to_dictionary(sheetname, filename="/content/drive/MyDrive/Fiverr/PROJECT_MIMI/DataDictionary.xlsx"):
    abbreviation_dict = dict()
    
    abbreviation = xlrd.open_workbook(os.path.abspath(filename))
    abbreviation_sheet = abbreviation.sheet_by_name(sheetname)
    
    for i in range(1, abbreviation_sheet.nrows):
        abb = abbreviation_sheet.cell(i, 0).value
        abb_definition = abbreviation_sheet.cell(i, 1).value
        abbreviation_dict[abb] = abb_definition
        
    return abbreviation_dict

In [9]:
stocks_dict = excel_to_dictionary(sheetname="Stocks")
fundamentals_dict = excel_to_dictionary(sheetname="Fundamentals")
ratings_dict = excel_to_dictionary(sheetname="Ratings")
securities_dict = excel_to_dictionary(sheetname="Securities")

In [10]:
def rename_columns(df, df_dict):
  df.rename(columns=df_dict, inplace=True)
  df.columns = [column.lower().replace(" ", "_") for column in df.columns]
  return df

In [11]:
stocks = rename_columns(stocks, stocks_dict)
fundamentals = rename_columns(fundamentals, fundamentals_dict)
ratings = rename_columns(ratings, ratings_dict)
securities = rename_columns(securities, securities_dict)

In [12]:
for name, df in zip(["stocks", "fundamentals", "ratings", "securities", "settlements"], [stocks, fundamentals, ratings, securities, settlements]):
  print("{}: {}\n".format(name, df.shape[0]))

stocks: 4187047

fundamentals: 61429

ratings: 502561

securities: 915386

settlements: 1892



In [13]:
sorted(stocks.columns)

['active/inactive_status_marker',
 'address_line_1',
 'address_line_2',
 'address_line_3',
 'address_line_4',
 'adjustment_factor_(issue)-cumulative_by_ex-date',
 'adr_ratio_-_daily',
 'capital_gains_-_daily',
 'capital_gains_payment_date',
 'cash_dividends_-_daily',
 'cash_dividends_-_daily_payment_date',
 'cash_dividends_-_daily_payment_date_indicator',
 'cash_equivalent_distributions',
 'cash_equivalent_distributions_per_share_payment_date',
 'cik_number',
 'city',
 'company_initial_public_offering_date',
 'company_legal_name',
 'company_name',
 'county_code',
 'current_eps',
 'current_eps_month',
 'current_fiscal_year_end_month',
 'current_iso_country_code_-_headquarters',
 'current_iso_country_code_-_incorporation',
 'current_primary_issue_tag_-_canada',
 'current_primary_issue_tag_-_us',
 'current_state/province_of_incorporation_code',
 'cusip',
 'daily_total_return_factor',
 'data_date_-_dividends',
 'dividend_declaration_date',
 'dividend_payment_date',
 'dividend_payment_date_

In [14]:
shared_stocks = list()

for column in stocks.columns:
  if column in fundamentals.columns:
    shared_stocks.append(column)

sorted(shared_stocks)

['active/inactive_status_marker',
 'address_line_1',
 'address_line_2',
 'address_line_3',
 'address_line_4',
 'city',
 'company_initial_public_offering_date',
 'company_legal_name',
 'company_name',
 'county_code',
 'current_fiscal_year_end_month',
 'current_iso_country_code_-_headquarters',
 'current_iso_country_code_-_incorporation',
 'current_primary_issue_tag_-_canada',
 'current_primary_issue_tag_-_us',
 'current_state/province_of_incorporation_code',
 'employer_identification_number',
 'fax_number',
 'gic_groups',
 'gic_industries',
 'gic_sectors',
 'gic_sub-industries',
 'global_company_key',
 'international,_domestic,_both_indicator',
 'north_american_industry_classification_code',
 'phone_number',
 'postal_code',
 'primary_issue_tag_-_rest_of_world',
 'research_co_reason_for_deletion',
 'research_company_deletion_date',
 's&p_business_description',
 's&p_economic_sector_code',
 's&p_industry_sector_code',
 's&p_quality_ranking_-_current',
 'standard_industry_classification_co

In [15]:
stocks.groupby(["global_company_key", "ticker_symbol"])["ticker_symbol"].agg("count")

global_company_key  ticker_symbol
1003                ANTQ              788
1021                IWKS             1138
1076                AAN              1138
                    AAN.2             238
1078                ABT              1138
1109                WAFR             1138
1177                AET              1138
1183                IDAI              747
1190                AGDY             1138
1234                ATRI             1138
1239                ACV               341
1259                ACEL              836
1266                ALCO             1138
1363                BSWY              226
1408                BEAM             1088
                    BEAM.PA           719
1444                AVCS              899
1468                1717B              15
                    AM.1              907
1518                HIST               91
1523                AMAC              499
1559                AMS              1138
1602                AMGN             1138


In [16]:
pd.pivot_table(data=stocks,
               index=["global_company_key", "ticker_symbol"],
               values="company_name",
               aggfunc="count")

Unnamed: 0_level_0,Unnamed: 1_level_0,company_name
global_company_key,ticker_symbol,Unnamed: 2_level_1
1003,ANTQ,788
1021,IWKS,1138
1076,AAN,1138
1076,AAN.2,238
1078,ABT,1138
1109,WAFR,1138
1177,AET,1138
1183,IDAI,747
1190,AGDY,1138
1234,ATRI,1138
