# NLP with spaCy: Stock example

In [None]:
### Example NLP
import spacy
import pandas as pd

# Load the tab-separated stock listing into a DataFrame.
df = pd.read_csv("stock.tsv", sep="\t")

In [None]:
# Preview only the first rows instead of dumping the whole stock table.
df.head()

In [None]:
# Extract the "Symbol" and "CompanyName" columns of the stock table as plain lists.
symbols = df["Symbol"].tolist()
companies = df["CompanyName"].tolist()

In [None]:
# Build a rule-based NER pipeline: a blank English pipeline plus an entity
# ruler whose patterns are generated from the symbol and company lists.
stops = ["two"]  # company names that must NOT become COMPANY patterns
nlp = spacy.blank("en")
ruler = nlp.add_pipe("entity_ruler")
letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
patterns = []

for symbol in symbols:
    # The bare symbol itself ...
    patterns.append({"label": "STOCK", "pattern": symbol})
    # ... plus every "<symbol>.<LETTER>" variant (the form "AAPL.O" is an
    # example of this shape).
    for letter in letters:
        patterns.append({"label": "STOCK", "pattern": symbol+f".{letter}"})

for company in companies:
    if company not in stops:
        patterns.append({"label": "COMPANY", "pattern": company})

# Register all patterns in one call once the list is complete.
ruler.add_patterns(patterns)

In [None]:
# Sample news article used as input for the entity ruler. The text is kept
# exactly as pasted, including page boilerplate such as the "Report ad" and
# "Sponsored Video" lines.
#source: https://www.reuters.com/business/futures-rise-after-biden-xi-call-oil-bounce-2021-09-10/
text = '''
Sept 10 (Reuters) - Wall Street's main indexes were subdued on Friday as signs of higher inflation and a drop in Apple shares following an unfavorable court ruling offset expectations of an easing in U.S.-China tensions.

Data earlier in the day showed U.S. producer prices rose solidly in August, leading to the biggest annual gain in nearly 11 years and indicating that high inflation was likely to persist as the pandemic pressures supply chains. read more .

"Today's data on wholesale prices should be eye-opening for the Federal Reserve, as inflation pressures still don't appear to be easing and will likely continue to be felt by the consumer in the coming months," said Charlie Ripley, senior investment strategist for Allianz Investment Management.

Apple Inc (AAPL.O) fell 2.7% following a U.S. court ruling in "Fortnite" creator Epic Games' antitrust lawsuit that stroke down some of the iPhone maker's restrictions on how developers can collect payments in apps.


Sponsored by Advertising Partner
Sponsored Video
Watch to learn more
Report ad
Apple shares were set for their worst single-day fall since May this year, weighing on the Nasdaq (.IXIC) and the S&P 500 technology sub-index (.SPLRCT), which fell 0.1%.

Sentiment also took a hit from Cleveland Federal Reserve Bank President Loretta Mester's comments that she would still like the central bank to begin tapering asset purchases this year despite the weak August jobs report. read more

Investors have paid keen attention to the labor market and data hinting towards higher inflation recently for hints on a timeline for the Federal Reserve to begin tapering its massive bond-buying program.

The S&P 500 has risen around 19% so far this year on support from dovish central bank policies and re-opening optimism, but concerns over rising coronavirus infections and accelerating inflation have lately stalled its advance.


Report ad
The three main U.S. indexes got some support on Friday from news of a phone call between U.S. President Joe Biden and Chinese leader Xi Jinping that was taken as a positive sign which could bring a thaw in ties between the world's two most important trading partners.

At 1:01 p.m. ET, the Dow Jones Industrial Average (.DJI) was up 12.24 points, or 0.04%, at 34,891.62, the S&P 500 (.SPX) was up 2.83 points, or 0.06%, at 4,496.11, and the Nasdaq Composite (.IXIC) was up 12.85 points, or 0.08%, at 15,261.11.

Six of the eleven S&P 500 sub-indexes gained, with energy (.SPNY), materials (.SPLRCM) and consumer discretionary stocks (.SPLRCD) rising the most.

U.S.-listed Chinese e-commerce companies Alibaba and JD.com , music streaming company Tencent Music (TME.N) and electric car maker Nio Inc (NIO.N) all gained between 0.7% and 1.4%


Report ad
Grocer Kroger Co (KR.N) dropped 7.1% after it said global supply chain disruptions, freight costs, discounts and wastage would hit its profit margins.

Advancing issues outnumbered decliners by a 1.12-to-1 ratio on the NYSE and by a 1.02-to-1 ratio on the Nasdaq.

The S&P index recorded 14 new 52-week highs and three new lows, while the Nasdaq recorded 49 new highs and 38 new lows.
'''

In [None]:
# Run the pipeline over the article and list every entity with its label.
doc = nlp(text)
for entity in doc.ents:
    print(entity.text, entity.label_)

In [None]:
from spacy import displacy

# Re-process the article and print each entity span next to its label.
doc = nlp(text)
for span in doc.ents:
    print(span.text, span.label_)


In [None]:
# Visualize the entities of `doc` with displaCy's "ent" style.
displacy.render(doc, style="ent")

In [None]:
# Tokenize a short sentence and print the text of each token.
doc = nlp("Hello world!")
for tok in doc:
    print(tok.text)


In [None]:
# Load the "en_core_web_md" pipeline. This rebinds `nlp`, so the earlier
# blank pipeline is no longer reachable under that name.
nlp = spacy.load("en_core_web_md")
doc = nlp("Apple is looking at buying Swiss startup for $10 billion.")

# List the entities found in the sentence with their labels.
for entity in doc.ents:
    print(entity.text, entity.label_)


In [None]:
from spacy import displacy
# Visualize the entities of the current `doc` with displaCy's "ent" style.
displacy.render(doc, style="ent")


In [None]:
# Look up spaCy's description of the "NORP" label.
spacy.explain("NORP") 

# spaCy Matcher

In [None]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# Rule 1: a single token flagged as LIKE_EMAIL.
pattern = [{"LIKE_EMAIL": True}]
matcher.add("EMAIL_ADDRESS", [pattern])

# Rule 2: a token whose lowercase form is "email", followed by the exact
# token text "address".
pattern2 = [
    {"LOWER": "email"},
    {"TEXT": "address"},
]
matcher.add("EMAIL_ADDRESS2", [pattern2])

doc = nlp("This is an email address: blupp@aol.com")
matches = matcher(doc)
print(matches)


In [None]:
# Print the text covered by every match returned by the matcher.
for _match_id, first, last in matches:
    print(doc[first:last].text)


In [None]:
# Inspect the components of the current `nlp` pipeline.
nlp.analyze_pipes()

# Digression (ignore this for now)

In [None]:
# Load a second pipeline under its own name and process a sample sentence.
nlp2 = spacy.load("en_core_web_sm")
doc = nlp2("I love coffee")

# Look up the vocabulary entry for "coffee" and show a few of its attributes.
lex = nlp2.vocab["coffee"]
print(f"{lex.text} {lex.orth} {lex.is_alpha}")


In [None]:
# Compare two sentences with the "en_core_web_md" pipeline.
nlp = spacy.load("en_core_web_md")
doc1, doc2 = nlp("This is my green sentence."), nlp("This is a blue sentence.")
print(doc1.similarity(doc2))

In [None]:
# Compare the tokens at index 3 of the two sentences.
token1, token2 = doc1[3], doc2[3]
print(token1.similarity(token2))

In [None]:
# Show the names of the components in the current `nlp` pipeline.
print(nlp.pipe_names)

In [None]:
### curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
### !pip install simpletransformers
!pip install torch

In [None]:
import pandas as pd

# Load the NER dataset; the file is read with latin1 encoding.
data = pd.read_csv("ner_dataset.csv", encoding="latin1")
# Forward-fill missing values from the previous row. DataFrame.ffill() is the
# replacement for fillna(method="ffill"), which is deprecated in pandas 2.1+.
# NOTE(review): presumably only the first row of each sentence carries the
# sentence id - confirm against the CSV.
data = data.ffill()

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Replace the values of the "Sentence #" column with integer codes.
sentence_encoder = LabelEncoder()
data["Sentence #"] = sentence_encoder.fit_transform(data["Sentence #"])

In [None]:
# Rename the columns to sentence_id / words / labels and upper-case the tags.
# rename() returns a new frame, so re-running this cell is harmless.
data = data.rename(columns={"Sentence #":"sentence_id","Word":"words","Tag":"labels"})
data["labels"] = data["labels"].str.upper()
# The preview must be the last expression of the cell, otherwise Jupyter
# discards its value and nothing is displayed.
data.head(30)

In [None]:
X = data[["sentence_id", "words"]]
Y = data["labels"]

# Split on whole sentences, not on individual token rows: a row-level random
# split scatters the words of one sentence across train and test and shuffles
# their order. Selecting rows with a boolean mask keeps the original row order
# inside every sentence. A fixed random_state makes the split reproducible.
sentence_ids = data["sentence_id"].unique()
train_ids, test_ids = train_test_split(sentence_ids, test_size=0.2, random_state=42)
in_test = data["sentence_id"].isin(test_ids)

x_train, x_test = X[~in_test], X[in_test]
y_train, y_test = Y[~in_test], Y[in_test]

#building up train data and test data
train_data = pd.DataFrame({"sentence_id":x_train["sentence_id"],"words":x_train["words"],"labels":y_train})
test_data = pd.DataFrame({"sentence_id":x_test["sentence_id"],"words":x_test["words"],"labels":y_test})

In [None]:
# Preview only the first rows instead of dumping the whole training frame.
train_data.head()

In [None]:
from simpletransformers.ner import NERModel, NERArgs

# Label set: every distinct tag that occurs in the data.
label = data["labels"].unique().tolist()

# Training configuration.
args = NERArgs()
args.overwrite_output_dir = True
args.num_train_epochs = 1
args.learning_rate = 1e-4
args.train_batch_size = 32
args.eval_batch_size = 32

In [None]:
# Create the NER model from the "bert-base-cased" checkpoint; CUDA is disabled.
model = NERModel(
    'bert',
    'bert-base-cased',
    labels=label,
    args=args,
    use_cuda=False,
)

In [None]:
# Train on train_data, passing test_data as the evaluation set.
# NOTE(review): acc=accuracy_score is forwarded as an extra keyword argument;
# confirm that sklearn's accuracy_score accepts the label format the model
# hands to extra metrics.
model.train_model(train_data,eval_data = test_data,acc=accuracy_score)

In [None]:
# Evaluate on test_data; eval_model returns three values, unpacked here.
result, model_outputs, preds_list = model.eval_model(test_data)

In [None]:
# Display the first value returned by model.eval_model.
result

In [None]:
# Run the model on one example sentence; predict takes a list of sentences.
prediction, model_output = model.predict(["What is the new name of Bangalore"])

In [None]:
# Display the predictions for the example sentence.
prediction