# Sentiment Analysis of Tech News Articles

In [2]:
import pandas as pd
import nltk

# NLTK resources requested by this notebook; the whole list is handed to a
# single download() call, whose return value is the cell's displayed result.
required_nltk_packages = [
    "names",
    "stopwords",
    "averaged_perceptron_tagger",
    "vader_lexicon",
    "punkt",
]
nltk.download(required_nltk_packages)

[nltk_data] Downloading package names to /Users/joey/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/joey/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/joey/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/joey/nltk_data...
[nltk_data] Downloading package punkt to /Users/joey/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Read the three CSV splits. header=None means no row is consumed as a header,
# so the columns are addressed by position label (0, 1, ...) in later cells.
train_set, devt_set = (
    pd.read_csv(csv_name, header=None)
    for csv_name in ("training.csv", "development.csv")
)
# The test split is additionally decoded with the 'unicode_escape' codec.
test_set = pd.read_csv("test.csv", encoding="unicode_escape", header=None)

In [4]:
# Row count of each split, printed in train / development / test order.
for split in (train_set, devt_set, test_set):
    print(len(split))

2346
1500
774


In [5]:
from nltk.sentiment import SentimentIntensityAnalyzer
# Single analyzer instance for the notebook; check_sentiment() reads it as the
# global name `sia`, so this cell must run before any scoring cell.
sia = SentimentIntensityAnalyzer()

In [6]:
# Raw polarity scores for one sample text: row label 164 of column 1.
sia.polarity_scores(train_set.loc[164, 1])

{'neg': 0.0, 'neu': 0.493, 'pos': 0.507, 'compound': 0.7351}

In [7]:
# Checks the sentiment of a single string of text, ranking it positive, negative, or neutral.
def check_sentiment(string_to_analyze, analyzer=None):
    """Classify one piece of text as "negative", "neutral", or "positive".

    The label is whichever of the analyzer's 'neg', 'neu' and 'pos' scores is
    the largest. On a tie, the label that comes first in
    (negative, neutral, positive) order wins.

    Args:
        string_to_analyze: Text handed to ``polarity_scores``.
        analyzer: Optional object exposing ``polarity_scores(text)`` that
            returns a mapping with 'neg', 'neu' and 'pos' keys. When omitted,
            the notebook-level ``sia`` analyzer is used.

    Returns:
        One of the strings "negative", "neutral", "positive".
    """
    if analyzer is None:
        # Looked up at call time so the function can be defined before `sia`.
        analyzer = sia
    scores = analyzer.polarity_scores(string_to_analyze)
    label_for_key = {"neg": "negative", "neu": "neutral", "pos": "positive"}
    # max() keeps the first maximal key in iteration order, which reproduces
    # the tie-breaking of list.index(max(...)) over [neg, neu, pos].
    best_key = max(label_for_key, key=lambda key: scores[key])
    return label_for_key[best_key]

In [8]:
# Label for the same sample text: row label 164 of column 1.
check_sentiment(train_set.loc[164, 1])

'positive'

In [14]:
# Label every text in column 1 of the training split, preserving row order.
train_list = [check_sentiment(text) for text in train_set[1]]


In [15]:
# Distance-based scoring of the analyzer's guesses in train_list against the
# answers in column 0 of train_set. Each label is placed on a line
# (negative = -0.5, neutral = 0, positive = 0.5) and the penalty for one item
# is the distance between guess and answer: 0 for a match, 0.5 for an
# adjacent label, 1.0 for negative vs. positive.
score_dict = {"negative": -0.5, "neutral": 0, "positive": 0.5}
# Start from a perfect score of one point per item, then deduct penalties.
total_score = len(train_list)
for predicted, expected in zip(train_list, train_set[0]):
    penalty = abs(score_dict[predicted] - score_dict[expected])
    total_score -= penalty

In [66]:
# Average score per item; 1.0 would mean every guess matched its answer.
mean_score = total_score / len(train_list)
print(mean_score)

0.8589087809036658


In [16]:
# Reduce each training text to the tokens whose POS tag is one of the noun
# tags listed in pos_tags_to_keep; every other token is dropped.
pos_tags_to_keep = ["NN","NNP","NNS","NNPS"]
filtered_list = []
for entry in train_set[1]:
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(entry))
    # Each kept token is followed by a single space, so a non-empty phrase
    # ends with a trailing space and a text with no kept tokens becomes "".
    kept_pieces = [
        token + " "
        for token, tag in tagged_tokens
        if tag in pos_tags_to_keep
    ]
    filtered_list.append("".join(kept_pieces))

    


In [None]:
#Here we score each date positive, negative, or neutral. 
#Currently have 2 data sets so we will use the score_dict
#to average out the data sets for each day.
wsj_sentiment=[]
with open("WSJ.txt") as file:
