## Sentiment analysis of stock news 

### Parsing article data with BeautifulSoup

In [None]:
#parsing is the process of analyzing a string of symbols, either in natural language, computer languages or data structures
#goal: to get headlines of articles and run sentiment analysis on the text of those headlines to understand if everyday is positive or negative news


In [None]:
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup


In [None]:
finviz_url = 'https://finviz.com/quote.ashx?t='
tickers = ['AMZN','AMD','FB']
for ticker in tickers:
    url =finviz_url  +ticker

    #request data from this url
    req=Request(url=url, headers={'user-agent': 'my-app'})
    response= urlopen(req)

    html=BeautifulSoup(response,'html')
    print(html) #pass html from finviz_url  
    break


In [None]:
#see where the data lies, table holds the data and the table has id="news-table"

finviz_url = 'https://finviz.com/quote.ashx?t='
tickers = ['AMZN','AMD','FB']
news_tables={}
for ticker in tickers:
    url =finviz_url  +ticker

    #request data from this url
    req=Request(url=url, headers={'user-agent': 'my-app'})
    response= urlopen(req)

    html=BeautifulSoup(response,'html')
    news_table=html.find(id='news_table') #get html objecet of the entire table
    news_tables[ticker]= news_table #take table object and store it in dictionary

    print(html) #pass html from finviz_url  
    break
    
#the dictionary holds only the table of the results from the webpage: all table rows correspond to those in the webpage
print(news_tables) 


### Manipulatie article data 

In [None]:
#see where the data lies, table holds the data and the table has id="news-table"

finviz_url = 'https://finviz.com/quote.ashx?t='
tickers = ['AMZN','AMD','FB']
news_tables={}
for ticker in tickers:
    url =finviz_url  +ticker

    #request data from this url
    req=Request(url=url, headers={'user-agent': 'my-app'})
    response= urlopen(req)

    html=BeautifulSoup(response,'html')
    news_table=html.find(id='news_table') #get html objecet of the entire table
    news_tables[ticker]= news_table #take table object and store it in dictionary

    print(html) #pass html from finviz_url  
    break
    
#the dictionary holds only the table of the results from the webpage: all table rows correspond to those in the webpage
print(news_tables) 

#parse the data to understandable format to extract the titles and timestamps of the articles and apply on sentiment analysis 
#goal: to iterate all table rows in the dataset and get the values of timestamps and text of articles in the table
#find all table rows that are relevant in the table html object
#give a list of all the different tr elements inside the html object parsed in passed in the table of all the relevant news articles
amzn_data =news_tables['AMZN']
amzn_rows=amzn_data.findAll('tr')
print(amzn_rows) 

#iterate over rows to get values
for index, row in enumerate(amzn_rows):
    title = row.a.text
    print(title) 

#to get timestamps of data
for index, row in enumerate(amzn_rows):
    title = row.a.text
    timestamp=row.td.text
    print(timestamp + " " + title) 

In [None]:
finviz_url = 'https://finviz.com/quote.ashx?t='
tickers = ['AMZN','AMD','FB']
news_tables={}
for ticker in tickers:
    url =finviz_url  +ticker

    #request data from this url
    req=Request(url=url, headers={'user-agent': 'my-app'})
    response= urlopen(req)

    html=BeautifulSoup(response,'html')
    news_table=html.find(id='news_table')
    news_tables[ticker]= news_table 

    print(html)
    break

#list object, create list inside data set that correspond to ticker, date, time and title of article
parsed_data=[]

for ticker, news_table in news_tables.items():
    for row in news_table.findAll('tr'):
        title = row.a.get_text()
        
#split text in sections based on space: if length is only 1, it's just time, if more, there are multiple values- first is date then time
        date_data = row.td.text.split(' ')
        if len(date) == 1:
            time=date_data [0]
        else:
            date = date_data [0]
            time= date_data [1]
        parsed_data.append([ticker,date,time,title])
print(parsed_data)


### Apply sentiment analysis on headlines

In [None]:
#apply sentiment analysis on every title, use nltk vader sentiment
import nltk 
nltk.download('vader_lexicon')

In [None]:
#apply sentiment analysis on any given text
from nltk.sentiment.vader import SentimentIntensityAnalyzer 

In [None]:
import pandas as pd

finviz_url = 'https://finviz.com/quote.ashx?t='
tickers = ['AMZN','AMD','FB']
news_tables={}
for ticker in tickers:
    url =finviz_url  +ticker

    req=Request(url=url, headers={'user-agent': 'my-app'})
    response= urlopen(req)

    html=BeautifulSoup(response,'html')
    news_table=html.find(id='news_table')
    news_tables[ticker]= news_table 

    print(html)
    break

parsed_data=[]

for ticker, news_table in news_tables.items():
    for row in news_table.findAll('tr'):
        title = row.a.get_text()
        
        date_data = row.td.text.split(' ')
        if len(date) == 1:
            time=date_data [0]
        else:
            date = date_data [0]
            time= date_data [1]
parsed_data.append([ticker,date,time,title])

#create dataframe to host our data in
df = pd.DataFrame(parsed_data, columns=['ticker', 'date','time','title'])

print(df.head())
vader = SentimentIntensityAnalyzer()
print(vader.polarity_scores("I don't think the weather is going to be good for hiking today."))

print(df['title'])

In [None]:
print(vader.polarity_scores("I think the weather is going to be good for skiing today."))

In [None]:
print(df['title'])

In [None]:
df = pd.DataFrame(parsed_data, columns=['ticker', 'date','time','title'])
vader = SentimentIntensityAnalyzer()

f = lambda title: vader.polarity_scores(title)['compiund']
#create column 'compound' 
df['compound']=df['title'].apply(f)
print(df.head())

### Visualize sentiment analysis

In [None]:
import matplotlib.pyplot as plt

f = lambda title: vader.polarity_scores(title)['compiund']
df['compound']=df['title'].apply(f)

#to convert date from normal string to date time format
df['date'] = pd.to_datetime(df.date).dt.date

plt.figure(figsize=(10,8))
mean_df=df.groupby(['ticker'],['date']).mean()
print(mean_df) #mean to get the average sentiment

In [None]:
plt.figure(figsize=(10,8))
mean_df=df.groupby(['ticker'],['date']).mean()

#unstack to get date at x axis
mean_df=mean_df.unstack()
mean_df=mean_df.xs('compound', axis="columns").transpose()
mean_df.plot(kind='bar')
print(mean_df)

In [34]:
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
from nltk.sentiment.vader import SentimentIntensityAnalyzer 
import pandas as pd
import re
import matplotlib.pyplot as plt

finviz_url = 'https://finviz.com/quote.ashx?t='
tickers = ['AMZN','GOOG','FB']
news_tables={}
for ticker in tickers:
    url =finviz_url+ticker

    req=Request(url=url, headers={'user-agent': 'my-app'})
    response= urlopen(req)

    html=BeautifulSoup(response,'html')
    news_table=html.find(id='news_table')
    news_tables[ticker]= news_table 

parsed_data =[]

for ticker, news_table in news_tables.items():
    
    for row in news_table.findAll('tr'):
        
        title = row.a.text
        date_data = row.td.text.split(' ')
        
        if len(date_data) == 1:
            time=date_data[0]
        else:
            date = date_data[0]
            time= date_data[1]
        parsed_data.append([ticker,date,time,title])

df = pd.DataFrame(parsed_data, columns=['ticker', 'date','time','title'])

vader = SentimentIntensityAnalyzer()

f = lambda title: vader.polarity_scores(title)['compiund']
df['compound']=df['title'].apply(f)
df['date'] = pd.to_datetime(df.date).dt.date

plt.figure(figsize=(10,8))
mean_df=df.groupby(['ticker'],['date']).mean()

mean_df=mean_df.unstack()
mean_df=mean_df.xs('compound', axis="columns").transpose()
mean_df.plot(kind='bar')
plt.show()

AttributeError: 'NoneType' object has no attribute 'findAll'