In [1]:
# Install required libraries (run in terminal or command line)
# pip install pyarrow pandas numpy matplotlib seaborn nltk

import pandas as pd
from pyarrow import parquet

# Load the parquet file
A = pd.read_parquet("2022.parquet")

# Filter Tesla's 10-K filing data
Tesla = A[A['cik'] == "1318605"]

# Split the 'item_1A' text into paragraphs
Tesla_risk = pd.DataFrame(Tesla['item_1A'].iloc[0].split("\n"), columns=["text"])

Tesla_risk.head()


Unnamed: 0,text
0,ITEM 1A.\tRISK FACTORS
1,You should carefully consider the risks descri...
2,Risks Related to Our Ability to Grow Our Business
3,We may be impacted by macroeconomic conditions...
4,"Since the first quarter of 2020, there has bee..."


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Ensure 'item_1A' is preprocessed (fill NaN and clean text)
A['item_1A'] = A['item_1A'].fillna("")  # Replace NaN with empty strings

# Calculate TF-IDF
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(A['item_1A'])

# Convert to DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
print(tfidf_df)


           00       000  0001   01  015  016  018   02  021       025  ...  \
0    0.000000  0.002259   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000  ...   
1    0.000000  0.000000   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.005602  ...   
2    0.000000  0.000000   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000  ...   
3    0.000000  0.000000   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000  ...   
4    0.000000  0.000000   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000  ...   
..        ...       ...   ...  ...  ...  ...  ...  ...  ...       ...  ...   
489  0.000000  0.000000   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000  ...   
490  0.000000  0.000000   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000  ...   
491  0.000000  0.006343   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000  ...   
492  0.000000  0.000000   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000  ...   
493  0.003078  0.002059   0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.000000  ...   

     zones    zoning  zscaler  zte  zuckerberg  zuckerman  zyng

In [None]:
# Identify rows where 'item_1A' is missing or empty
mpty_values = A[A['item_1A'].isna() | (A['item_1A'] == "")]
mpty_values.head()
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

# Tokenize 'item_1A' for all companies
A['item_1A'] = A['item_1A'].fillna("")  # Replace NaN with empty strings
A['words'] = A['item_1A'].apply(word_tokenize)

# Convert into long format (one word per row)
stock_tokens = A.explode('words').rename(columns={'words': 'word'}).filter(['cik', 'word', 'company', 'date'])
stock_tokens.to_csv("stock_tokens.csv", index=False)
