# NLP Initial Set Up

In [1]:
import requests
import pandas as pd

## Import Test Data

In [2]:
symbols = ["AAPL", "AMZN", "FB"]

descriptions = {}

for stock in symbols:
    url = f"http://iex.lewagon.com/stable/stock/{stock}/company/"
    response = requests.get(url).json()
    # Make a list out of industry and sector 
    descriptions[stock] = [response['industry'], response['sector']]
    # Add each tag to the list
    for tag in response['tags']:
        descriptions[stock].append(tag)

In [3]:
descriptions

{'AAPL': ['Electronic Computer Manufacturing ',
  'Manufacturing',
  'Electronic Technology',
  'Telecommunications Equipment',
  'Manufacturing',
  'Electronic Computer Manufacturing '],
 'AMZN': ['Electronic Shopping and Mail-Order Houses ',
  'Retail Trade',
  'Retail Trade',
  'Internet Retail'],
 'FB': ['Data Processing, Hosting, and Related Services',
  'Information',
  'Technology Services',
  'Internet Software/Services',
  'Miscellaneous Commercial Services',
  'Commercial Services',
  'Information',
  'Data Processing, Hosting, and Related Services']}

## Cleaning

In [4]:
for stock in descriptions.keys():
    # Remove duplicated phrases
    descriptions[stock] = list(set(descriptions[stock]))
    # Remove additional spaces
    for i in range(len(descriptions[stock])):
        descriptions[stock][i] = descriptions[stock][i].strip()
    # Combine each phrase into single string
    descriptions[stock] = ', '.join(descriptions[stock])

## Create DataFrame

In [5]:
info_series = pd.Series(descriptions)

info_df = pd.DataFrame(info_series).reset_index()

info_df.rename(columns={0:'info', 'index': 'company'}, inplace=True)

## Vectorize the information

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(ngram_range=(1,3), max_df=0.95, min_df=0.05)

X = vectorizer.fit_transform(info_df['info'])

X.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 2, 1, 1, 0, 0, 1, 1, 1,
        1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0],
       [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0,
        0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [1, 0, 0, 1, 1, 2, 2, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 5, 1, 1, 1, 1, 1, 1,
        1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]])

## Merge vectorization into DataFrame

In [7]:
vect_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names(), index= info_df.index)

merged_df = info_df.merge(vect_df, left_index=True, right_index=True, how='left')

merged_df.drop(columns=['info'], inplace=True)

In [8]:
merged_df

Unnamed: 0,company,and,and mail,and mail order,and related,and related services,commercial,commercial services,commercial services commercial,commercial services data,...,software services technology,technology,technology services,technology services information,technology telecommunications,technology telecommunications equipment,telecommunications,telecommunications equipment,telecommunications equipment electronic,trade
0,AAPL,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,1,1,1,1,0
1,AMZN,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,FB,1,0,0,1,1,2,2,1,1,...,1,1,1,1,0,0,0,0,0,0
