# Sentiment Analysis of Tech News Articles

In [3]:
import pandas as pd
import nltk
import math
nltk.download(["names",
     "stopwords",
     "averaged_perceptron_tagger",
     "vader_lexicon",
     "punkt",
])

[nltk_data] Downloading package names to /Users/joey/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/joey/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/joey/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/joey/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /Users/joey/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
train_set = pd.read_csv("training.csv", header=None)
devt_set = pd.read_csv("development.csv", header=None)
test_set = pd.read_csv("test.csv", header=None, encoding= 'unicode_escape')

In [5]:
print(len(train_set))
print(len(devt_set))
print(len(test_set))

2346
1500
774


In [6]:
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

In [7]:
sia.polarity_scores(train_set[1][164])

{'neg': 0.0, 'neu': 0.493, 'pos': 0.507, 'compound': 0.7351}

In [8]:
# Checks the sentiment of a single string of text, ranking it positive, negative, or neutral.
def check_sentiment_for_test(string_to_analyze):
    scores = sia.polarity_scores(string_to_analyze)
    neg_neu_pos = [scores['neg'],scores['neu'],scores['pos']]
    highest_score = neg_neu_pos.index(max(neg_neu_pos))
    return_values = ["negative", "neutral", "positive"]
    return return_values[highest_score]

# Checks the sentiment of a single string of text, giving it a numerical value.
def check_sentiment(string_to_analyze):
    scores = sia.polarity_scores(string_to_analyze)
    return scores["compound"]

In [9]:
check_sentiment_for_test(train_set[1][164])

'positive'

In [10]:
# Scoring each individual string from the corpus
train_list = []
for x in train_set[1]:
    train_list.append(check_sentiment_for_test(x))


In [11]:
# Here, we score the guesses of our sentiment analyzer.
# The score is based on the distance of the guesses, e.g.
# if the answer is "negative" but our system guesses "positive",
# the penalty is larger than, say, if the answer was "neutral"
# but our system guessed "positive".
total_score = len(train_list)
score_dict = {"negative":-0.5, "neutral":0, "positive":0.5}
for index, value in enumerate(train_list):
    total_score -= abs(score_dict[value] - score_dict[train_set[0][index]])

In [12]:
print(total_score/len(train_list))

0.8589087809036658


In [13]:
# Filters out the POS to only have nouns
pos_tags_to_keep = ["NN","NNP","NNS","NNPS"]
noun_list = []
for entry in train_set[1]:
    tokenized = nltk.word_tokenize(entry)
    tagged = nltk.pos_tag(tokenized)
    new_phrase = ""
    for word in tagged:
        if word[1] in pos_tags_to_keep:
            new_phrase += word[0] + " "
    noun_list.append(new_phrase)

#Get sentiment for train set with only nouns:
train_list_noun = []
for x in noun_list:
    train_list_noun.append(check_sentiment_for_test(x))
total_score = len(train_list_noun)
score_dict = {"negative":-0.5, "neutral":0, "positive":0.5}
for index, value in enumerate(train_list_noun):
    
    total_score -= abs(score_dict[value] - score_dict[train_set[0][index]])

#Accuracy score with only nouns
print(total_score/len(train_list_noun))

0.8388746803069054


In [14]:
# Filters out the POS to only have adjectives
pos_tags_to_keep = ["JJS","JJR","JJ"]
adj_list = []
for entry in train_set[1]:
    tokenized = nltk.word_tokenize(entry)
    tagged = nltk.pos_tag(tokenized)
    new_phrase = ""
    for word in tagged:
        if word[1] in pos_tags_to_keep:
            new_phrase += word[0] + " "
    adj_list.append(new_phrase)

#Get sentiment for train set with only adjectives:
train_list_adj = []
for x in adj_list:
    train_list_adj.append(check_sentiment_for_test(x))
total_score = len(train_list_adj)
score_dict = {"negative":-0.5, "neutral":0, "positive":0.5}
for index, value in enumerate(train_list_adj):
    total_score -= abs(score_dict[value] - score_dict[train_set[0][index]])

#Accuracy score with only adjectives
print(total_score/len(train_list_adj))

0.7664109121909634


In [15]:
# Filters out the POS to only have adjectives AND nouns
pos_tags_to_keep = ["JJS","JJR","JJ","NN","NNP","NNS","NNPS"]
adj_noun_list = []
for entry in train_set[1]:
    tokenized = nltk.word_tokenize(entry)
    tagged = nltk.pos_tag(tokenized)
    new_phrase = ""
    for word in tagged:
        if word[1] in pos_tags_to_keep:
            new_phrase += word[0] + " "
    adj_noun_list.append(new_phrase)

#Get sentiment for train set with only adjectives:
train_list_adj_noun = []
for x in adj_noun_list:
    train_list_adj_noun.append(check_sentiment_for_test(x))
total_score = len(train_list_adj_noun)
score_dict = {"negative":-0.5, "neutral":0, "positive":0.5}
for index, value in enumerate(train_list_adj_noun):
    total_score -= abs(score_dict[value] - score_dict[train_set[0][index]])

#Accuracy score with only adjectives
print(total_score/len(train_list_adj_noun))

0.8478260869565217


In [26]:
#percent change of open and close of stock price
#stock_changes =[((357.59-349.3)/349.3),0.0,0.0,((354.9-356.76)/356.76),((352.65-352.62)/352.62),((358.49-355.02)/355.02),((365.97-363.24)/363.24),((352.65-352.62)/352.62),0.0,0.0,0.0,((372.45-368.64)/368.64),((371.78-372)/372),((373.12-371.55)/371.55),((378.8-376.39)/376.39),((373.12-371.55)/371.55),((381.95-377.35)/377.35),0.0,0.0,((380.69-380.89)/380.89),((384.79-382.08)/382.08),((380.69-380.89)/380.89),((380.62-385)/385),((387.09-384.37)/384.37),((386.49-388.04)/388.04),0.0,0.0,((382.84-384.81)/384.81),((382.95-378.13)/378.13),((379.39-382.5)/382.5),((384.92-380.04)/380.04),0.0,0.0,((388.04-385.03)/385.03)]
#stock_changes

In [17]:
#finds sentiment of each article in a given outlet
def article_sentiment(filename, filterPOS):
    list_sentiment=[]
    with open(filename) as file:
        data = file.read()
        articles = data.split("--")
        for article in articles:
            tokenized = nltk.word_tokenize(article)
            tagged = nltk.pos_tag(tokenized)
            if filterPOS:
                new_phrase = ""
                for word in tagged:
                    if filterPOS:
                        if word[1] in pos_tags_to_keep:
                            new_phrase += word[0] + " "
                list_sentiment.append(check_sentiment(new_phrase))
            else:
                list_sentiment.append(check_sentiment(article))
        return list_sentiment

In [18]:
#We will first try without any POS filtering
wsj_sentiment_raw = article_sentiment("WSJ.txt",False)
ap_sentiment_raw = article_sentiment("AP.txt",False)

#gets the average sentiment of the two articles
daily_score_raw=[]
for i in range(len(wsj_sentiment_raw)):
    daily_score_raw.append((wsj_sentiment_raw[i]+ap_sentiment_raw[i])/2) 
    
daily_score_raw

[0.98035,
 0.99725,
 0.97225,
 0.12600000000000003,
 0.3601,
 0.89695,
 0.1442,
 0.48975,
 -0.05075000000000002,
 0.02310000000000001,
 0.46485000000000004,
 0.12140000000000001,
 0.91265,
 0.7720499999999999,
 0.697,
 0.9934499999999999,
 0.9879,
 0.9961,
 -0.15709999999999996,
 0.07244999999999996,
 0.9541999999999999,
 0.9366,
 0.96755,
 0.9672000000000001,
 0.98495,
 -0.01479999999999998,
 0.91455,
 0.9272,
 0.9963,
 0.98285,
 0.9761500000000001,
 0.966]

In [25]:
#Now we try by filtering only nouns

pos_tags_to_keep = ["NN","NNP","NNS","NNPS"]



wsj_sentiment_noun = article_sentiment("WSJ.txt",True)
ap_sentiment_noun = article_sentiment("AP.txt",True)

#gets the average sentiment of the two articles
daily_score_noun=[]
for i in range(len(wsj_sentiment_noun)):
    daily_score_noun.append((wsj_sentiment_noun[i]+ap_sentiment_noun[i])/2) 
    
daily_score_noun



[0.6356499999999999,
 0.1799,
 0.7663,
 -0.477,
 0.7931,
 0.43085,
 0.3286,
 0.4921,
 0.45130000000000003,
 0.48545,
 0.3981,
 -0.19295,
 0.7863,
 -0.0022000000000000353,
 0.29505000000000003,
 0.9682,
 0.9526,
 0.9929,
 -0.5095000000000001,
 0.3347,
 0.5367,
 0.11709999999999998,
 0.71095,
 0.8728,
 0.9493,
 0.47620000000000007,
 0.9079,
 0.7819499999999999,
 0.988,
 0.8301000000000001,
 0.91455,
 0.55795]

In [38]:
#Now we try by filtering only adjectives

pos_tags_to_keep = ["JJS","JJR","JJ"]



wsj_sentiment_adj = article_sentiment("WSJ.txt",True)
ap_sentiment_adj = article_sentiment("AP.txt",True)

#gets the average sentiment of the two articles 
daily_score_adj=[]
for i in range(len(wsj_sentiment_adj)):
    daily_score_adj.append((wsj_sentiment_adj[i]+ap_sentiment_adj[i])/2) 
    
daily_score_adj


[0.17334999999999998,
 0.9867,
 0.69585,
 0.56405,
 0.40475,
 -0.05035,
 0.01050000000000001,
 0.16810000000000003,
 0.27265000000000006,
 0.15440000000000004,
 0.62615,
 0.06530000000000002,
 0.8512,
 0.8791,
 0.5927,
 0.90025,
 0.9457,
 0.75995,
 -0.02930000000000002,
 0.13385,
 0.83805,
 0.7835,
 0.8955,
 0.7988500000000001,
 0.9168000000000001,
 0.1806,
 0.56895,
 0.45895,
 0.9467,
 0.7883,
 0.8766499999999999,
 0.42255]

In [39]:
#Now we try by filtering adjectives and nouns

pos_tags_to_keep = ["JJS","JJR","JJ","NN","NNP","NNS","NNPS"]
#finds sentiment of each article in a given outlet



wsj_sentiment_adj_noun = article_sentiment("WSJ.txt",True)
ap_sentiment_adj_noun = article_sentiment("AP.txt",True)

#gets the average of the two articles based on score_dict
daily_score_adj_noun=[]
for i in range(len(wsj_sentiment_adj_noun)):
    daily_score_adj_noun.append((wsj_sentiment_adj_noun[i]+ap_sentiment_adj_noun[i])/2) 
    
daily_score_adj_noun


[0.9200999999999999,
 0.99295,
 0.8967499999999999,
 0.1528,
 0.7965,
 0.4441,
 0.11154999999999998,
 0.6703,
 0.7333000000000001,
 0.50495,
 0.6822,
 -0.0050000000000000044,
 0.96065,
 0.012899999999999967,
 0.5608500000000001,
 0.9863,
 0.98995,
 0.99455,
 -0.35745000000000005,
 0.22399999999999998,
 0.8626,
 0.5103,
 0.9601999999999999,
 0.96195,
 0.9815,
 0.54895,
 0.9467000000000001,
 0.81555,
 0.9945999999999999,
 0.9611000000000001,
 0.9681500000000001,
 0.6948000000000001]