# Description

We use TF-IDF vectorization to extract the 30 highest-scoring words (ranked by mean TF-IDF weight) for each sector. These words give a more specific description of each sector than its name alone, which increases the accuracy of our zero-shot classification models. To further improve the accuracy of our models, we preprocess the descriptions by treating every verb that appears in the dataset, as well as some additional generic words, as stop words. The results of our analysis are presented below:

# Preprocessing the dataset

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Mount Google Drive at /content/drive so the dataset stored there can be read (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the company dataset from the mounted Drive and preview its first rows.
wrds_csv_path = "/content/drive/MyDrive/companyclassification/datasets/wrds_data.csv"
data = pd.read_csv(wrds_csv_path)
data.head()

Unnamed: 0.1,Unnamed: 0,conm,gind,gsector,naics,busdesc,spcindcd,GICS_Sector,naics_main,NAICS_Sector
0,2,AAI CORP,,,,"AAI Corporation, together with its subsidiarie...",230.0,,No,
1,3,A.A. IMPORTING CO INC,255040.0,25.0,442110.0,"A.A. Importing Company, Inc. designs, manufact...",449.0,Consumer Discretionary,44,Retail Trade
2,4,AAR CORP,201010.0,20.0,423860.0,AAR Corp. provides products and services to co...,110.0,Industrials,42,Wholesale Trade
3,5,A.B.A. INDUSTRIES INC,,,,A.B.A. Industries Inc. was acquired by McSwain...,110.0,,No,
4,6,ABC INDS INC,,,,"ABC Industries, Inc. manufactures and supplies...",415.0,,No,


In [None]:
# Drop, in place, every row without a 'gind' code: the sector of a company is derived from it.
data.dropna(subset=['gind'], how='any', inplace=True)

In [None]:
# 'gind' is read from the CSV as a float (e.g. 255040.0); cast it to int now that missing values are gone.
data['gind'] = data['gind'].astype(int)

In [None]:
import math

# One list of business descriptions per GICS sector.
energy = []
materials = []
industrials = []
consumer_discretionaty = []
consumer_staples = []
health_care = []
financials = []
information_technology = []
communication_services = []
utilities = []
real_estate = []

# Leading two digits of the 'gind' code -> list collecting that sector's descriptions.
_sector_lists_by_code = {
  10: energy,
  15: materials,
  20: industrials,
  25: consumer_discretionaty,
  30: consumer_staples,
  35: health_care,
  40: financials,
  45: information_technology,
  50: communication_services,
  55: utilities,
  60: real_estate,
}

gics = list(data["gind"])
desc = list(data["busdesc"])

# Integer-divide by 10000 to strip the last four digits (e.g. 255040 -> 25);
# codes that match none of the eleven sectors are skipped.
for industry_code, description in zip(gics, desc):
  target = _sector_lists_by_code.get(industry_code // 10000)
  if target is not None:
    target.append(description)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text

In [None]:
# Map each GICS sector name to the list of business descriptions collected for it.
# The key for sector 25 is spelled "Consumer Discretionary" (it was previously
# misspelled); the list variable keeps its existing name `consumer_discretionaty`
# because that is the name under which the list is defined.
sectors = {"Energy" : energy,
           "Materials" : materials,
           "Industrials" : industrials,
           "Consumer Discretionary" : consumer_discretionaty,
           "Consumer Staples" : consumer_staples,
           "Health Care" : health_care,
           "Financials" : financials,
           "Information Technology" : information_technology,
           "Communication Services" : communication_services,
           "Utilities" : utilities,
           "Real Estate" : real_estate}

# Finding all verbs in the dataset

In [None]:
# Install NLTK, which is used below for tokenization and part-of-speech tagging.
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import nltk
# Tokenizer models (downloaded under tokenizers/), needed for nltk.word_tokenize.
nltk.download('punkt')
# Part-of-speech tagger model (downloaded under taggers/), needed for nltk.pos_tag.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
def verbify(sentence):
  """Return the verbs found in ``sentence``, in order of appearance.

  The text is tokenized with ``nltk.word_tokenize`` and tagged with
  ``nltk.pos_tag``; every token whose part-of-speech tag starts with 'V'
  is kept. Tokens are returned exactly as they occur in the text, so their
  original casing is preserved and duplicates are not removed.

  Args:
    sentence: Text to analyse. Any non-string value (``None``, or the float
      NaN pandas uses for a missing cell) yields an empty list.

  Returns:
    A list of the verb tokens (possibly empty).
  """
  # The tokenizer only accepts strings; a missing description has no verbs.
  if not isinstance(sentence, str):
    return []

  tokens = nltk.word_tokenize(sentence)
  tags = nltk.pos_tag(tokens)

  return [word for word, tag in tags if tag.startswith('V')]

In [None]:
# Gather every verb (duplicates included) from all business descriptions, in dataset order.
verbs_in_dataset = [
  verb
  for description in data["busdesc"]
  for verb in verbify(description)
]

# Adding stop words

In [None]:
def finding_top_features(key, dataset, top_n=30):
  """Print and return the highest-scoring TF-IDF terms for one sector.

  A TF-IDF vectorizer is fitted on ``dataset``; each term is scored by its
  mean TF-IDF weight over all documents and the ``top_n`` best terms are kept.
  Stop words are scikit-learn's English list, a hand-picked list of generic
  business/geography words, and every verb in ``verbs_in_dataset``.

  Args:
    key: Sector name used as the key of the resulting dictionary.
    dataset: Sequence of business-description strings for that sector.
    top_n: Number of terms to keep (defaults to 30).

  Returns:
    ``{key: [(term, mean_score), ...]}`` sorted by descending score. The same
    dictionary is printed. An empty ``dataset`` gives ``{key: []}``.
  """
  # Fitting on zero documents raises an "empty vocabulary" error; report an
  # empty result for such a sector instead.
  if len(dataset) == 0:
    dictionary = {key: []}
    print(dictionary)
    return dictionary

  my_stop_words = text.ENGLISH_STOP_WORDS.union([
      "company", "canada", "product", "products", "services", "corp",
      "corporation", "market", "markets", "store", "stores", "provide",
      "provides", "china", "states", "united", "operates", "acquired",
      "produces", "western", "texas", "america", "based", "manufactures",
      "business", "group", "commercial", "international", "worldwide",
      "internationally", "sells", "distributes", "distribute", "engages",
      "engage", "subsidiaries", "owns", "primarily", "industrial", "service",
      "europe", "online", "new", "develops", "management", "limited",
      "development", "llc", "segments", "asia", "subsidiary", "non",
      "residential", "public", "north", "south", "east", "west", "northern",
      "southern", "eastern", "ohio", "africa", "california", "pennsylvania",
      "illinois", "companies", "mexico", "various", "holdings"])
  # The vectorizer lowercases documents before removing stop words, so the
  # verbs must be lowercased too or capitalised ones would never match.
  my_stop_words = my_stop_words.union(verb.lower() for verb in verbs_in_dataset)

  # `stop_words` must be a list: recent scikit-learn versions validate the
  # parameter and reject a set/frozenset. Sorting keeps the order stable.
  vectorizer = TfidfVectorizer(stop_words=sorted(my_stop_words))
  tfidf_matrix = vectorizer.fit_transform(dataset)
  feature_names = vectorizer.get_feature_names_out()

  # Column-wise mean = average weight of each term over all documents.
  # Flatten explicitly so the result is a plain list of Python floats
  # regardless of the container type the sparse mean returns.
  feature_scores = np.asarray(tfidf_matrix.mean(axis=0)).ravel().tolist()

  ranked = sorted(zip(feature_names, feature_scores),
                  key=lambda pair: pair[1], reverse=True)
  # str() turns NumPy string scalars into plain strings for a clean printout.
  top_features = [(str(feature), score) for feature, score in ranked[:top_n]]

  dictionary = {key: top_features}
  print(dictionary)
  return dictionary

# Top 30 TF-IDF words for each sector

In [None]:
# Print the top TF-IDF terms for each sector in turn.
for sector_name, descriptions in sectors.items():
  finding_top_features(sector_name, descriptions)



{'Energy': [('gas', 0.0820210622352674), ('oil', 0.07915686534397126), ('energy', 0.06601487710926955), ('natural', 0.0614006909011839), ('exploration', 0.05658547066702546), ('production', 0.04481296137073517), ('properties', 0.03889681092895325), ('resources', 0.03290898645472864), ('acquisition', 0.03213311150503406), ('independent', 0.01733788012050555), ('alberta', 0.01679084877339075), ('liquids', 0.01500066852161682), ('reserves', 0.014612697939901623), ('interests', 0.01409854929073849), ('basin', 0.014001406091188438), ('transportation', 0.013441510736849748), ('partners', 0.012633860638049563), ('midstream', 0.012234409777889664), ('industry', 0.01155503875020017), ('operations', 0.011177009520133207), ('assets', 0.011002835894567773), ('merger', 0.010588712755502362), ('lp', 0.009924477961909052), ('trust', 0.009807779369447779), ('transaction', 0.009807189674033908), ('mineral', 0.009395150907394937), ('gulf', 0.008495113724332197), ('calgary', 0.008285268011548138), ('equi