In [1]:
%load_ext autoreload
%autoreload 2
#https://www.kaggle.com/stuarthallows/using-xgboost-with-scikit-learn

In [2]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from nltk import download
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import numpy as np
import matplotlib
from matplotlib import pyplot
matplotlib.use('Agg')
download('punkt')

[nltk_data] Downloading package punkt to /Users/mjn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
def nlp_preprocess(doc_string):
    stopwords_english = stopwords.words('english')
    stemmer = PorterStemmer() 
    
    tokens = word_tokenize(doc_string)
    remove_punct = [word.lower() for word in tokens if word.isalnum()]
    remove_stops = [word for word in remove_punct if word not in stopwords_english]
    stemmed = [stemmer.stem(word) for word in remove_stops]

    return stemmed


def vocabulary_dict(tokens_list):
    idx = 0
    vocab = {}
    for word in tokens_list:
        if word not in vocab.keys():
            vocab.update({word: idx})
            idx += 1
            
    return vocab


def tokens_to_vector(tokens, vocabulary):
    vector = np.zeros((1,len(vocabulary.keys())))
    for word in tokens:
        if word in vocabulary.keys():
            vector[0, int(vocabulary[word])] = 1
    
    return vector


def df_to_raw_string(research_df):
    raw_str = ""
    for row in research_df.iterrows():
        row_tup = (row[1]['CBInsightsDescription'], row[1]['QuidDescription'], row[1]['CrunchbaseDescription'])
        raw_str += " ".join(row_tup)
        
    return raw_str


def df_to_array(research_df, vocabulary):
    m = research_df.shape[0]
    n = len(vocabulary.keys())
    data_array = np.zeros((m, n))
    idx = 0
    for row in research_df.iterrows():
        raw_str = ""
        row_tup = (row[1]['CBInsightsDescription'], row[1]['QuidDescription'], row[1]['CrunchbaseDescription'])
        raw_str += " ".join(row_tup)
        tokens = nlp_preprocess(raw_str)
        vect = tokens_to_vector(tokens, vocabulary)
        data_array[idx] = vect
        idx += 1

    return data_array
    

In [4]:
# load data, asign classes, concatenate

addit_df = pd.read_csv('../../data/additive-manufacturing-advanced-materials.csv')
addit_df['class'] = "robotics-vehicles-defense"
agric_df = pd.read_csv('../../data/agricultural-technology.csv')
agric_df['class'] = "agricultural-technology"
artif_df = pd.read_csv('../../data/artificial-intelligence-machine-learning.csv')
artif_df['class'] = "data-ai-ml"
augme_df = pd.read_csv('../../data/augmented-reality-mixed-reality.csv')
augme_df['class'] = "cyber"
auton_df = pd.read_csv('../../data/autonomous-systems.csv')
auton_df['class'] = "robotics-vehicles-defense"
biote_df = pd.read_csv('../../data/biotechnology-pharmaceuticals.csv')
biote_df['class'] = "biotechnology-pharmaceuticals"
consu_df = pd.read_csv('../../data/consulting-market-research.csv')
consu_df['class'] = "consulting-marketing"
consm_df = pd.read_csv('../../data/consumer-products-services.csv')
consm_df['class'] = "consulting-marketing"
cyber_df = pd.read_csv('../../data/cybersecurity.csv')
cyber_df['class'] = "cyber"
analy_df = pd.read_csv('../../data/data-analytics-it-software.csv')
analy_df['class'] = "data-ai-ml"
educa_df = pd.read_csv('../../data/education-training-professional-development.csv')
educa_df['class'] = "consulting-marketing"
elect_df = pd.read_csv('../../data/electronics-microelectronics-hardware.csv')
elect_df['class'] = "electronics-microelectronics"
energ_df = pd.read_csv('../../data/energy-power.csv')
energ_df['class'] = "electronics-microelectronics"
healt_df = pd.read_csv('../../data/healthcare-human-systems.csv')
healt_df['class'] = "biotechnology-pharmaceuticals"
indus_df = pd.read_csv('../../data/industrial-manufacturing.csv')
indus_df['class'] = "manufacturing-logistics"
intel_df = pd.read_csv('../../data/intelligence-surveillance.csv')
intel_df['class'] = "robotics-vehicles-defense"
logis_df = pd.read_csv('../../data/logistics-distribution.csv')
logis_df['class'] = "manufacturing-logistics"
marke_df = pd.read_csv('../../data/marketing-media.csv')
marke_df['class'] = "consulting-marketing"
model_df = pd.read_csv('../../data/modeling-simulation.csv')
model_df['class'] = "data-ai-ml"
posit_df = pd.read_csv('../../data/position-navigation.csv')
posit_df['class'] = "robotics-vehicles-defense"
robot_df = pd.read_csv('../../data/robotics-mechatronics.csv')
robot_df['class'] = "robotics-vehicles-defense"
space_df = pd.read_csv('../../data/space-aerospace-technology.csv')
space_df['class'] = "robotics-vehicles-defense"
telec_df = pd.read_csv('../../data/telecommunication-systems-services.csv')
telec_df['class'] = "cyber"
vehic_df = pd.read_csv('../../data/vehicle-systems.csv')
vehic_df['class'] = "robotics-vehicles-defense"

#all
corpus_df = pd.concat([addit_df, agric_df, artif_df, augme_df, auton_df, 
                       biote_df, consu_df, consm_df, cyber_df, analy_df, 
                       educa_df, elect_df, energ_df, healt_df, indus_df,
                       intel_df, logis_df, marke_df, model_df, posit_df,
                       robot_df, space_df, telec_df, vehic_df], 
                      axis=0, ignore_index=True)
#health, robotics
#corpus_df = pd.concat([health_df, robotics_df], 
#                      axis=0, ignore_index=True)

#ai, modeling
#corpus_df = pd.concat([aiml_df, modeling_df], 
#                      axis=0, ignore_index=True)

print("Train/ Test examples: " + str(corpus_df.shape[0]))

Train/ Test examples: 1479


In [5]:
# process text
raw_str = df_to_raw_string(corpus_df)
tokens = nlp_preprocess(raw_str)
vocab = vocabulary_dict(tokens)
print("Total tokens: " + str(len(tokens)))
print("Unique tokens: " + str(len(vocab.keys())))

Total tokens: 141530
Unique tokens: 9605


In [6]:
# data arrays
X = df_to_array(corpus_df, vocab)
Y = corpus_df['class'].values

In [7]:
# encode Y labels
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(Y)
label_encoded_y = label_encoder.transform(Y)
a = [print(label) for label in label_encoder.classes_]

agricultural-technology
biotechnology-pharmaceuticals
consulting-marketing
cyber
data-ai-ml
electronics-microelectronics
manufacturing-logistics
robotics-vehicles-defense


In [16]:
# split data into train and test sets
test_size = 0.15
X_train, X_test, y_train, y_test = train_test_split(X, label_encoded_y, test_size=test_size)
print("Training examples: " + str(len(y_train)))
print("Testing examples: " + str(len(y_test)))

Training examples: 1257
Testing examples: 222


In [17]:
# fit model to training data
model = xgb.XGBClassifier(objective="multi:softmax", learning_rate=0.1, max_depth=100, n_estimators=100)
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=100,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [18]:
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

In [19]:
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 68.92%


## Second Front Systems

In [20]:
pred_str = "Second Front Systems is a public benefit, venture-backed software company that \
equips defense and national security professionals for long-term, continuous competition \
for access to emerging technologies. To accelerate the delivery of emerging technologies \
to U.S. and Allied warfighters.  To enable enduring strategic advantage for the U.S. and \
its allies through agile, responsive acquisition warfare. While serving in the U.S. Marine \
Corps, our co-founders Peter Dixon and Mark Butler saw firsthand the damages done by an \
outdated acquisition system to those on the frontlines of defending our country. To address \
this critical issue, they formed Second Front Systems as a public benefit corporation with \
the mission of accelerating the transition of technology to U.S. and Allied warfighters. \
Our team is comprised of trailblazers dedicated to bridging the gap between the government and private sector."
tokens = nlp_preprocess(pred_str)
vect = tokens_to_vector(tokens, vocab)
print("Vocabulary matches: " + str(vect.sum()))
pred = model.predict(vect)
pred_class = label_encoder.inverse_transform(pred)
print('Class: ' + pred_class[0])

Vocabulary matches: 55.0
Class: robotics-vehicles-defense


## Primer.ai

In [21]:
pred_str = "Primer enables organizations to quickly explore and utilize the world’s exponentially \
growing sources of text-based information. Our best-in-class natural language processing (NLP) \
engines and applications help you make sense of it all in real-time with human-level precision. \
Request a demo to learn more about: Primer Engines — assemble data processing pipelines with flexible \
building blocks pretrained on domain-specific data. Primer Automate — build your own NLP models, or \
retrain Primer Engines on your own data, with no technical skills required. Primer Analyze — create a \
scalable, self-curating knowledge base that can sift through billions of documents in seconds. \
Primer Extract — explore large caches of data quickly with translation, OCR, and image recognition capabilities. \
UNLOCK MACHINE INTELLIGENCE. Primer provides industrial-grade NLP applications for government agencies, \
financial institutions, Fortune 50 companies, and many other organizations. Organizations collect massive \
amounts of data — far more than human analysts can handle. As a result, much of it remains unexplored or \
underutilized. At Primer, we are dedicated to helping organizations make the best use of their investment in \
data. We do this by using best-in-class machine learning and natural language processing technologies to help \
our customers scale and optimize their intelligence workflows."
tokens = nlp_preprocess(pred_str)
vect = tokens_to_vector(tokens, vocab)
print("Vocabulary matches: " + str(vect.sum()))
pred = model.predict(vect)
pred_class = label_encoder.inverse_transform(pred)
print('Class: ' + pred_class[0])

Vocabulary matches: 82.0
Class: data-ai-ml


## Anduril

In [22]:
pred_str ="Anduril builds cutting-edge hardware and software products that solve complex national \
security challenges for America and its allies. At the core of all our products is Lattice, an AI \
software backbone that uses sensor fusion, machine learning, and mesh networking to integrate \
real-time data from Anduril hardware and third-party systems into a single, autonomous operating picture. \
Software Lattice Platform Artificial intelligence At the core of all our products is Lattice, an AI \
software backbone that uses sensor fusion, machine learning, and mesh networking to integrate real-time \
data from Anduril hardware and third-party systems into a single, autonomous operating picture. Hardware \
Sentry Tower Autonomous awareness Sentry towers are equipped with the latest sensors and work together \
via Lattice to augment awareness in remote locations. With a compact footprint, hardened onboard processing \
and solar power architecture, Sentry can be rapidly deployed and sustained with minimal maintenance in \
austere conditions. Hardware Ghost 4 sUAS Intelligent air support Ghost 4 is an autonomous VTOL sUAS that \
operates on the Lattice AI platform. Ghost 4 is modular, man-portable, waterproof, and combines long \
endurance, high payload capacity and a near-silent acoustic signature for a wide variety of mission \
capabilities. Hardware Anvil sUAS Precision kinetic intercept Anvil is the kinetic element of our \
end-to-end cUAS capability. It uses physical speed and onboard guidance to seek and destroy drone \
threats with positive identification and minimal collateral damage. Mission Effective Anduril is a \
product company developing technology that works from day one. We deploy in hours, not years. Our \
partners start receiving actionable intelligence within minutes of activation."
tokens = nlp_preprocess(pred_str)
vect = tokens_to_vector(tokens, vocab)
print("Vocabulary matches: " + str(vect.sum()))
pred = model.predict(vect)
pred_class = label_encoder.inverse_transform(pred)
print('Class: ' + pred_class[0])

Vocabulary matches: 106.0
Class: robotics-vehicles-defense


## Teton Telecom

In [23]:
pred_str ="Who Is Teton Telecom? What We Offer  Mergers and Acquisitions Fractional Executive Service Mergers \
and Acquisitions Fiber Optics Fractional Executive Service Mergers and Acquisitions Fractional Executive \
Service Fractional Executive Service Fractional Executive Service Comprehensive Business Services Comprehensive \
Business Services Comprehensive Business Services Stakeholder Communications and Investor Relations \
Comprehensive Business Services Comprehensive Business Services Underwriting Adding Value Comprehensive \
Business Services Adding Value Social Contact Us Contact Us Help us help you! Let us know the details on \
your project, or whatever it is you're hoping to accomplish. We'll let you know if it's something we can \
help with. We look forward to hearing from you! Teton Telecom Get in Touch  apply. GoDadd"
tokens = nlp_preprocess(pred_str)
vect = tokens_to_vector(tokens, vocab)
print("Vocabulary matches: " + str(vect.sum()))
pred = model.predict(vect)
pred_class = label_encoder.inverse_transform(pred)
print('Class: ' + pred_class[0])

Vocabulary matches: 36.0
Class: consulting-marketing
