# Initial exploration of data

The main objective here is to get an understanding of how the data is arranged, and what useful features can be extracted from the data

In [1]:
import glob

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.core.pylabtools import figsize

import seaborn as sns
import plotly.express as px

import numpy as np
import pandas as pd
import polars as pl

import statsmodels.formula.api as smf


In [2]:
# Directory holding the 2023 Q4 data set; the table loads below prepend it to each file name.
datadir = 'data/2023q4/'

In [3]:
ls data/2023q4/

num.txt     pre.txt     readme.htm  sub.txt     tag.txt


In [4]:
# Read the four tab-separated tables of the data set, one DataFrame per table.
num, pre, sub, tag = (
    pd.read_csv(f'{datadir}{table}.txt', sep='\t')
    for table in ('num', 'pre', 'sub', 'tag')
)

sub - forms submitted  
tag - descriptions of fields  




## Overall composition of the forms received

In [5]:
# How many filings were received? Shape is (rows, columns) of the submissions table.
sub.shape

(24097, 36)

In [6]:
# How many companies were filings received from?
# NOTE(review): this counts distinct `name` values, not distinct `cik` identifiers.
sub['name'].unique().shape

(6641,)

In [7]:
# Number of filings per form type.
sub.groupby('form').size()

form
10-K         392
10-K/A        89
10-KT          1
10-Q        5726
10-Q/A       146
20-F          54
20-F/A        30
20FR12B        1
40-F          11
424B2         25
424B3         51
424B5          1
424B7          2
6-K          128
6-K/A          8
8-K        16301
8-K/A        383
8-K12B         3
8-K12G3        4
DEF 14A      146
DEFA14A        1
DEFR14A        3
F-1           10
F-1/A         20
F-4            1
F-4/A          1
N-2           16
N-2/A         17
N-2ASR         2
N-4           21
N-4/A          2
N-6/A          4
N-CSR         44
N-CSR/A        1
N-CSRS        35
POS 8C        13
POS AM        23
POS AMI        3
POS EX        21
PRE 14A       17
S-1           82
S-1/A        128
S-11/A         2
S-3            3
S-3/A          1
S-4           20
S-4/A        104
dtype: int64

The most filed forms are 8-K, which notifies investors of some change, and form 10-Q, which is a summary of financial information that is required by the SEC from all public companies.

Form 10-Q composition

In [8]:
# Shape of the subset of filings that are 10-Q forms.
sub.loc[sub['form'] == '10-Q'].shape

(5726, 36)

In [9]:
# Distinct cik values among the 10-Q filings.
sub.loc[sub['form'] == '10-Q', 'cik'].unique().shape

(5643,)

Some companies did file multiple 10-Q forms

In [10]:
# Distinct values of the `fp` column in the loaded submissions table.
sub.fp.unique()

array([nan, 'FY', 'Q1', 'Q2', 'Q3'], dtype=object)

_None_ of the forms filed in this data set are for quarter 4, which is odd since this is supposed to be the q4 data set.

In [11]:
# Submissions table of the 2023 Q3 data set, loaded for comparison.
subq3 = pd.read_csv('data/2023q3/sub.txt', sep='\t')

In [12]:
# Distinct values of the `fp` column in the 2023q3 submissions table.
subq3.fp.unique()

array([nan, 'FY', 'Q2', 'Q1', 'Q3'], dtype=object)

In [13]:
# Sorted arrays of the distinct cik values seen in each quarter's submissions.
companiesQ4 = np.sort(sub['cik'].unique())
companiesQ3 = np.sort(subq3['cik'].unique())

In [14]:
# Number of cik values present in both quarters.
len(set(companiesQ3) & set(companiesQ4))

6194

It's good to see that there is a lot of overlap between the companies filing in each quarter (at least 6,194 companies appear in both the Q3 and Q4 data sets).

In [15]:
# Shape of the 2023q3 subset of 10-Q filings whose `fp` is 'FY'.
subq3.loc[subq3['form'].eq('10-Q') & subq3['fp'].eq('FY')].shape

(0, 36)

In [16]:
# Submissions table of the 2023 Q1 data set.
subq1 = pd.read_csv('data/2023q1/sub.txt', sep='\t')

In [17]:
# Shape of the subset of 2023q1 filings that are 10-Q forms.
subq1.loc[subq1['form'] == '10-Q'].shape

(1090, 36)

There are no Q4 results and no FY results for form 10-Q because this form is not required by the SEC at the end of the fiscal year; only form 10-K is filed then.

In [18]:
# Count of filings per form type (rows) and fiscal period `fp` (columns);
# combinations that never occur are shown as 0.
sub.pivot_table(values='cik', index='form', columns='fp', aggfunc='count').fillna(0)

fp,FY,Q1,Q2,Q3
form,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10-K,392.0,0.0,0.0,0.0
10-K/A,89.0,0.0,0.0,0.0
10-KT,1.0,0.0,0.0,0.0
10-Q,0.0,392.0,361.0,4973.0
10-Q/A,0.0,41.0,54.0,51.0
20-F,54.0,0.0,0.0,0.0
20-F/A,30.0,0.0,0.0,0.0
20FR12B,0.0,0.0,1.0,0.0
40-F,11.0,0.0,0.0,0.0
6-K,0.0,2.0,63.0,41.0


In [19]:
# Count of filings per form type (rows) and fiscal year `fy` (columns);
# combinations that never occur are shown as 0.
sub.pivot_table(values='cik', index='form', columns='fy', aggfunc='count').fillna(0)

fy,2012.0,2013.0,2015.0,2016.0,2019.0,2020.0,2021.0,2022.0,2023.0,2024.0
form,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
10-K,1.0,0.0,1.0,0.0,1.0,0.0,1.0,16.0,372.0,0.0
10-K/A,0.0,0.0,0.0,0.0,0.0,0.0,3.0,51.0,35.0,0.0
10-KT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
10-Q,0.0,2.0,0.0,2.0,0.0,4.0,0.0,8.0,5050.0,660.0
10-Q/A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,118.0,10.0
20-F,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5.0,48.0,0.0
20-F/A,0.0,0.0,0.0,0.0,0.0,1.0,1.0,23.0,5.0,0.0
20FR12B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
40-F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0
6-K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,96.0,10.0


Aha. The key point here is that the data set covers forms received (?) in q4, which are not necessarily related to q4.

Some go back as far as 2012 (!), and some forms appear to be pre-filed (?) for quarters that haven't even occurred yet?

In [20]:
# Which columns of the submissions table contain at least one missing value?
sub.isna().any()

adsh          False
cik           False
name          False
sic            True
countryba      True
stprba         True
cityba         True
zipba          True
bas1           True
bas2           True
baph           True
countryma      True
stprma         True
cityma         True
zipma          True
mas1           True
mas2           True
countryinc     True
stprinc        True
ein           False
former         True
changed        True
afs            True
wksi          False
fye            True
form          False
period         True
fy             True
fp             True
filed         False
accepted      False
prevrpt       False
detail        False
instance      False
nciks         False
aciks          True
dtype: bool

In [21]:
ls data

[1m[36m2022q1[m[m/ [1m[36m2022q2[m[m/ [1m[36m2022q3[m[m/ [1m[36m2022q4[m[m/ [1m[36m2023q1[m[m/ [1m[36m2023q2[m[m/ [1m[36m2023q3[m[m/ [1m[36m2023q4[m[m/


## Analysis of form 10-K

It seems like the form 10-k statements should work for our purposes. How many can we find for 2022?

In [22]:
# Paths of every quarter's submissions table under data/, in sorted order.
submissions = sorted(glob.glob('data/*/sub.txt'))


In [23]:
# For each quarterly submissions file, keep the names on 10-K filings whose
# fiscal year is 2022, and report how many were found in that file.
frames_fy2022_10k = []

for fname in submissions:
    data = pd.read_csv(fname, delimiter='\t')

    companies_seen = data.loc[(data.fy == 2022.) & (data.form == '10-K'), ['name']]
    frames_fy2022_10k.append(companies_seen)

    print(fname, companies_seen.shape)

# Concatenate once instead of inside the loop: this avoids re-copying the
# accumulated frame on every iteration and avoids concatenating onto an empty
# DataFrame. The fallback keeps the result a DataFrame when no files matched.
companies_reporting = (
    pd.concat(frames_fy2022_10k)
    if frames_fy2022_10k
    else pd.DataFrame(columns=['name'])
)



data/2022q1/sub.txt (0, 1)
data/2022q2/sub.txt (28, 1)
data/2022q3/sub.txt (312, 1)
data/2022q4/sub.txt (388, 1)
data/2023q1/sub.txt (4703, 1)
data/2023q2/sub.txt (762, 1)
data/2023q3/sub.txt (71, 1)
data/2023q4/sub.txt (16, 1)


In [24]:
# Number of distinct company names among the collected fiscal-year-2022 10-K filings.
companies_reporting['name'].unique().shape

(6280,)

In [25]:
# Total number of collected rows, for comparison with the distinct-name count.
companies_reporting.shape

(6280, 1)

Ok, great.
1. We have a nice set of companies that we did get a form 10-k from
2. (In this year) we never received more than one form 10-k from the same company name.
3. Most of the reports were filed in 2023 q1. However, there were a fair amount of reports filed both before and after this period.
   - Strangely, this includes many filings in q3 and q4 2022, before the fiscal year was even over. It's possible that these companies are not subject to full reporting.
   - There are ~5996 stocks in the NYSE and Nasdaq, which leaves about 300 companies to spare...

In [26]:
# Reload the 2022 Q2 submissions and keep only 10-K filings with fiscal year 2022.
data = pd.read_csv('data/2022q2/sub.txt', sep='\t')

q222filings = data.loc[data['fy'].eq(2022.) & data['form'].eq('10-K')]

In [27]:
# Display the fiscal-year-2022 10-K filings found in the 2022q2 submissions.
q222filings

Unnamed: 0,adsh,cik,name,sic,countryba,stprba,cityba,zipba,bas1,bas2,...,period,fy,fp,filed,accepted,prevrpt,detail,instance,nciks,aciks
1421,0001193125-22-185257,40704,GENERAL MILLS INC,2040.0,US,MN,MINNEAPOLIS,55426,NUMBER ONE GENERAL MILLS BLVD,,...,20220531.0,2022.0,FY,20220630,2022-06-29 18:56:00.0,0,1,d313744d10k_htm.xml,1,
1615,0000794619-22-000069,794619,AMERICAN WOODMARK CORP,2430.0,US,VA,WINCHESTER,22601,3102 SHAWNEE DRIVE,,...,20220430.0,2022.0,FY,20220629,2022-06-29 16:59:00.0,0,1,amwd-20220430_htm.xml,1,
1623,0000891024-22-000005,891024,"PATTERSON COMPANIES, INC.",5047.0,US,MN,ST PAUL,55120-1401,1031 MENDOTA HEIGHTS RD,,...,20220430.0,2022.0,FY,20220629,2022-06-29 13:29:00.0,0,1,pdco-20220430_htm.xml,1,
1749,0001437749-22-016173,69891,NATIONAL BEVERAGE CORP,2086.0,US,FL,FT. LAUDERDALE,33324,8100 SW 10TH STREET,SUITE 4000,...,20220430.0,2022.0,FY,20220629,2022-06-29 16:29:00.0,0,1,fizz20220623_10k_htm.xml,1,
1790,0001558370-22-010392,1368622,AEROVIRONMENT INC,3721.0,US,VA,ARLINGTON,22202,"241 18TH STREET SOUTH, SUITE 415",,...,20220430.0,2022.0,FY,20220629,2022-06-28 18:10:00.0,0,1,avav-20220430x10k_htm.xml,1,
1814,0001628280-22-018227,713425,AMERICAN SOFTWARE INC,7372.0,US,GA,ATLANTA,30305,470 E PACES FERRY RD NE,,...,20220430.0,2022.0,FY,20220629,2022-06-29 17:17:00.0,0,1,amswa-20220430_htm.xml,1,
1817,0001634117-22-000070,1634117,"BARNES & NOBLE EDUCATION, INC.",5940.0,US,NJ,BASKING RIDGE,07920,120 MOUNTAIN VIEW BOULEVARD,,...,20220430.0,2022.0,FY,20220629,2022-06-29 16:21:00.0,0,1,bned-20220430_htm.xml,1,
1831,0001683168-22-004704,704562,"AVID BIOSERVICES, INC.",2834.0,US,CA,TUSTIN,92780,"2642 MICHELLE DRIVE, SUITE 200",,...,20220430.0,2022.0,FY,20220629,2022-06-29 17:05:00.0,0,1,avid_i10k-043022_htm.xml,1,
1987,0001477932-22-004724,1495648,"BRISSET BEER INTERNATIONAL, INC.",1311.0,CA,QC,MONTREAL,H3J-1S6,370 GUY,SUITE G9,...,20220531.0,2022.0,FY,20220628,2022-06-28 16:37:00.0,0,1,bbii_10k_htm.xml,1,
2019,0001564590-22-024447,56679,KORN FERRY,7361.0,US,CA,LOS ANGELES,90067,1900 AVENUE OF THE STARS,SUITE 1500,...,20220430.0,2022.0,FY,20220628,2022-06-28 14:15:00.0,0,1,kfy-10k_20220430_htm.xml,1,


Many of the q2 filings actually come from large companies...


Aha. The __answer__ is that _different companies use different fiscal years_.  
This is a bit annoying because it introduces an element of time-dependent staggering into any head-to-head comparison between companies.
- Pooling the data over several years should reduce this issue.
- I can compare earnings across time for individual companies without issue.
- The best way around this is actually to use both 10-Q and 10-K forms to get quarterly earnings data that can then be aligned to any given quarter for all companies. This is work for a future time.

In [28]:
# Shape of the de-duplicated (cik, name) pairs in the 2022q2 submissions.
data[['cik', 'name']].drop_duplicates().shape

(7340, 2)

In [29]:
# Shape of the de-duplicated names in the 2022q2 submissions.
data[['name']].drop_duplicates().shape

(7338, 1)

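There are more distinct (cik, name) pairs than distinct names, so some names have multiple cik's. Some cik's may also have multiple names, but confirming that needs a count of distinct cik's.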

Another problem that I will need to solve in the future is to determine the ticker symbol and current price for a given stock offering. Again, that is for the future.

#### For now, let's look into the fields that we get.

What fields do we care about from each table?

In [30]:
# Load the four tables of the 2023 Q1 data set.
# NOTE: this rebinds num/pre/sub/tag, which previously held the 2023 Q4 tables.
num, pre, sub, tag = (
    pd.read_csv(f'data/2023q1/{table}.txt', sep='\t')
    for table in ('num', 'pre', 'sub', 'tag')
)

In [33]:
# Keep only the submissions whose form type is exactly '10-K'.
form10k = sub.loc[sub['form'] == '10-K']

In [44]:
# Column names available on each 10-K submission row.
form10k.columns

Index(['adsh', 'cik', 'name', 'sic', 'countryba', 'stprba', 'cityba', 'zipba',
       'bas1', 'bas2', 'baph', 'countryma', 'stprma', 'cityma', 'zipma',
       'mas1', 'mas2', 'countryinc', 'stprinc', 'ein', 'former', 'changed',
       'afs', 'wksi', 'fye', 'form', 'period', 'fy', 'fp', 'filed', 'accepted',
       'prevrpt', 'detail', 'instance', 'nciks', 'aciks'],
      dtype='object')

In [52]:
# Inspect the first 10-K submission as an example record.
form10k.iloc[0]

adsh            0000109198-23-000004
cik                           109198
name          TJX COMPANIES INC /DE/
sic                           5651.0
countryba                         US
stprba                            MA
cityba                    FRAMINGHAM
zipba                          01701
bas1               770 COCHITUATE RD
bas2                             NaN
baph                    508-390-1000
countryma                         US
stprma                            MA
cityma                    FRAMINGHAM
zipma                          01701
mas1             770 COCHITUATE ROAD
mas2                             NaN
countryinc                        US
stprinc                           DE
ein                         42207613
former                    ZAYRE CORP
changed                   19890625.0
afs                            1-LAF
wksi                               1
fye                            131.0
form                            10-K
period                    20230131.0
f

In [53]:
# `adsh` value of the first 10-K row, used below to select the matching rows of `num`.
adsh = form10k['adsh'].iloc[0]

# Rows of `num` for that filing whose tag is AccountsPayableCurrent.
num.loc[num['adsh'].eq(adsh) & num['tag'].eq('AccountsPayableCurrent')]

Unnamed: 0,adsh,tag,version,coreg,ddate,qtrs,uom,value,footnote
167735,0000109198-23-000004,AccountsPayableCurrent,us-gaap/2022,,20230131,0,USD,3794000000.0,
167736,0000109198-23-000004,AccountsPayableCurrent,us-gaap/2022,,20220131,0,USD,4465000000.0,
