In [79]:
import pandas as pd
import numpy as np
# Visualizing pipelines in HTML
# set_config() with no arguments leaves every option untouched; the diagram
# representation has to be requested explicitly via `display`.
from sklearn import set_config; set_config(display='diagram')
# Pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# NLP
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from sklearn.feature_extraction.text import CountVectorizer
# Suppress warning
pd.options.mode.chained_assignment = None

In [19]:
# Load the raw company-information workbook into a DataFrame.
RAW_COMPANY_INFO_PATH = '../raw_data/10k_company_info.xlsx'
df = pd.read_excel(RAW_COMPANY_INFO_PATH)

In [52]:
# Keep only the identifying and descriptive columns.
# .copy() makes this an independent frame, so the columns added to it in later
# cells are never written into a slice of `df`.
selected_cols = df[['companyName','symbol','country', 'industry', 'sector', 'tags']].copy()

In [27]:
# Preview the first two rows of the selected columns.
selected_cols.head(2)

Unnamed: 0,companyName,symbol,country,industry,sector,tags
0,Arlington Asset Investment Corp,AAIC-B,US,Securities and Commodity Exchanges,Finance and Insurance,"['Finance', 'Real Estate Investment Trusts', '..."
1,Arlington Asset Investment Corp,AAIC-C,US,Securities and Commodity Exchanges,Finance and Insurance,"['Finance', 'Real Estate Investment Trusts', '..."


In [58]:
# Summarise column dtypes and non-null counts.
# NOTE(review): the saved output below lists an 'info' column that is only
# created in a later cell, so this cell appears to have been run out of order;
# re-run the notebook top to bottom to refresh it.
selected_cols.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9876 entries, 0 to 9875
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   companyName  9855 non-null   object
 1   symbol       9876 non-null   object
 2   country      5015 non-null   object
 3   industry     5962 non-null   object
 4   sector       5962 non-null   object
 5   tags         9876 non-null   object
 6   info         5962 non-null   object
dtypes: object(7)
memory usage: 540.2+ KB


In [61]:
# Discard rows with any missing value, then collapse exact duplicate rows
# (the first occurrence is the one kept).
selected_cols = (
    selected_cols
    .dropna(axis=0, how='any')
    .drop_duplicates(keep='first')
)

In [76]:
# Translation table that deletes every ASCII punctuation character in one pass.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
# English stop words; filled on first use and then shared by every call.
_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def clean_info(info):
    """Normalise a free-text description for bag-of-words vectorisation.

    Steps, in order:
      1. delete every character in ``string.punctuation``;
      2. lower-case the text;
      3. delete every digit character;
      4. tokenise with ``word_tokenize`` and drop English stop words.

    Parameters
    ----------
    info : str
        Raw text to clean.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # Punctuation is deleted, not replaced by a space, so "e-mail" becomes "email".
    text = info.translate(_PUNCT_TABLE)
    text = text.lower()
    # Iterates over characters, so digits embedded in a word are removed too.
    text = ''.join(ch for ch in text if not ch.isdigit())
    # The stop-word list is cached: this function runs once per row under
    # .apply(), and rebuilding the set for every row is wasted work.
    stop_words = _english_stop_words()
    return ' '.join(tok for tok in word_tokenize(text) if tok not in stop_words)

In [62]:
# Build one space-separated text field per company: "<industry> <sector> <tags>".
industry_and_sector = selected_cols['industry'] + ' ' + selected_cols['sector']
selected_cols['info'] = industry_and_sector + ' ' + selected_cols['tags']

In [63]:
# Summarise column dtypes and non-null counts again, now that 'info' exists.
selected_cols.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4407 entries, 0 to 9856
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   companyName  4407 non-null   object
 1   symbol       4407 non-null   object
 2   country      4407 non-null   object
 3   industry     4407 non-null   object
 4   sector       4407 non-null   object
 5   tags         4407 non-null   object
 6   info         4407 non-null   object
dtypes: object(7)
memory usage: 275.4+ KB


In [78]:
# Run the text normaliser over every combined description.
combined_text = selected_cols['info']
selected_cols['clean_info'] = combined_text.apply(clean_info)

In [87]:
# Count unigrams, bigrams and trigrams; skip terms found in more than 95% or
# in fewer than 5% of the cleaned descriptions.
vectorizer = CountVectorizer(
    min_df=0.05,
    max_df=0.95,
    ngram_range=(1, 3),
)

# Learn the vocabulary and build the sparse document-term count matrix.
X = vectorizer.fit_transform(selected_cols['clean_info'])

# Dense view of the matrix, shown for inspection only.
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [88]:
# Tally how often each count value occurs across the whole dense term matrix.
np.unique(X.toarray(),return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8]),
 array([396702,  26062,   9819,   1576,   1562,    447,    120,      2,
             3]))

In [90]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use its replacement when available and fall back on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# One column per n-gram, indexed like selected_cols so rows line up on merge.
vect_df = pd.DataFrame(X.toarray(), columns=feature_names, index=selected_cols.index)

# Drop the raw text columns before merging. If an n-gram shares a name with
# one of them (e.g. "industry"), merging first adds _x/_y suffixes and the
# later drop fails with a KeyError.
text_cols = ['industry', 'sector', 'tags', 'info', 'clean_info']
merged_df = (
    selected_cols
    .drop(columns=text_cols)
    .merge(vect_df, left_index=True, right_index=True, how='left')
)

merged_df.head()

Unnamed: 0,companyName,symbol,country,banking,banking finance,banking finance insurance,banks,banks finance,banks finance insurance,biotechnology,...,technology,technology biotechnology,technology biotechnology manufacturing,technology services,trusts,trusts finance,trusts finance insurance,vehicles,vehicles finance,vehicles finance insurance
0,Arlington Asset Investment Corp,AAIC-B,US,0,0,0,0,0,0,0,...,0,0,0,0,1,1,1,0,0,0
1,Arlington Asset Investment Corp,AAIC-C,US,0,0,0,0,0,0,0,...,0,0,0,0,1,1,1,0,0,0
2,American Airlines Group Inc,AAL,US,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,Applied Optoelectronics Inc,AAOI,US,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
8,Advance Auto Parts Inc,AAP,US,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
