# Sentiment Analysis of Tech News Articles

In [2]:
import pandas as pd
import nltk

nltk.download(["names",
     "stopwords",
     "averaged_perceptron_tagger",
     "vader_lexicon",
     "punkt",
])

[nltk_data] Downloading package names to /Users/joey/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/joey/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/joey/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/joey/nltk_data...
[nltk_data] Downloading package punkt to /Users/joey/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
train_set = pd.read_csv("training.csv", header=None)
devt_set = pd.read_csv("development.csv", header=None)
test_set = pd.read_csv("test.csv", header=None, encoding= 'unicode_escape')

In [4]:
print(len(train_set))
print(len(devt_set))
print(len(test_set))

2346
1500
774


In [5]:
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

In [6]:
sia.polarity_scores(train_set[1][164])

{'neg': 0.0, 'neu': 0.493, 'pos': 0.507, 'compound': 0.7351}

In [43]:
# Checks the sentiment of a single string of text, ranking it positive, negative, or neutral.
def check_sentiment(string_to_analyze):
    scores = sia.polarity_scores(string_to_analyze)
    print(scores)
    neg_neu_pos = [scores['neg'],scores['neu'],scores['pos']]
    highest_score = neg_neu_pos.index(max(neg_neu_pos))
    return_values = ["negative", "neutral", "positive"]
    return return_values[highest_score]

In [8]:
check_sentiment(train_set[1][164])

'positive'

In [14]:
# Scoring each individual string from the corpus
train_list = []
for x in train_set[1]:
    train_list.append(check_sentiment(x))


In [15]:
# Here, we score the guesses of our sentiment analyzer.
# The score is based on the distance of the guesses, e.g.
# if the answer is "negative" but our system guesses "positive",
# the penalty is larger than, say, if the answer was "neutral"
# but our system guessed "positive".
total_score = len(train_list)
score_dict = {"negative":-0.5, "neutral":0, "positive":0.5}
for index, value in enumerate(train_list):
    total_score -= abs(score_dict[value] - score_dict[train_set[0][index]])

In [66]:
print(total_score/len(train_list))

0.8589087809036658


In [37]:
# Filters out the POS to only have nouns
pos_tags_to_keep = ["NN","NNP","NNS","NNPS"]
noun_list = []
for entry in train_set[1]:
    tokenized = nltk.word_tokenize(entry)
    tagged = nltk.pos_tag(tokenized)
    new_phrase = ""
    for word in tagged:
        if word[1] in pos_tags_to_keep:
            new_phrase += word[0] + " "
    noun_list.append(new_phrase)

#Get sentiment for train set with only nouns:
train_list_noun = []
for x in noun_list:
    train_list_noun.append(check_sentiment(x))
total_score = len(train_list_noun)
score_dict = {"negative":-0.5, "neutral":0, "positive":0.5}
for index, value in enumerate(train_list_noun):
    total_score -= abs(score_dict[value] - score_dict[train_set[0][index]])

#Accuracy score with only nouns
print(total_score/len(train_list_noun))

0.8388746803069054


In [38]:
# Filters out the POS to only have adjectives
pos_tags_to_keep = ["JJS","JJR","JJ"]
adj_list = []
for entry in train_set[1]:
    tokenized = nltk.word_tokenize(entry)
    tagged = nltk.pos_tag(tokenized)
    new_phrase = ""
    for word in tagged:
        if word[1] in pos_tags_to_keep:
            new_phrase += word[0] + " "
    adj_list.append(new_phrase)

#Get sentiment for train set with only adjectives:
train_list_adj = []
for x in adj_list:
    train_list_adj.append(check_sentiment(x))
total_score = len(train_list_adj)
score_dict = {"negative":-0.5, "neutral":0, "positive":0.5}
for index, value in enumerate(train_list_adj):
    total_score -= abs(score_dict[value] - score_dict[train_set[0][index]])

#Accuracy score with only adjectives
print(total_score/len(train_list_adj))

0.7664109121909634


In [42]:
# Filters out the POS to only have adjectives AND nouns
pos_tags_to_keep = ["JJS","JJR","JJ","NN","NNP","NNS","NNPS"]
adj_noun_list = []
for entry in train_set[1]:
    tokenized = nltk.word_tokenize(entry)
    tagged = nltk.pos_tag(tokenized)
    new_phrase = ""
    for word in tagged:
        if word[1] in pos_tags_to_keep:
            new_phrase += word[0] + " "
    adj_noun_list.append(new_phrase)

#Get sentiment for train set with only adjectives:
train_list_adj_noun = []
for x in adj_noun_list:
    train_list_adj_noun.append(check_sentiment(x))
total_score = len(train_list_adj_noun)
score_dict = {"negative":-0.5, "neutral":0, "positive":0.5}
for index, value in enumerate(train_list_adj_noun):
    total_score -= abs(score_dict[value] - score_dict[train_set[0][index]])

#Accuracy score with only adjectives
print(total_score/len(train_list_adj_noun))

0.8478260869565217


In [49]:
#Here we score each date positive, negative, or neutral. 
#Currently have 2 data sets so we will use the score_dict
#to average out the data sets for each day.


#We will first try without any POS filtering
wsj_sentiment=[]
ap_sentiment=[]
#pos_tags_to_keep = ["NN","NNP","NNS","NNPS"]
#finds sentiment of each article in a given outlet
def article_sentiment(filename):
    list_sentiment=[]
    with open(filename) as file:
        data = file.read()
        articles = data.split("--")
        for article in articles:
            list_sentiment.append(check_sentiment(article))
        return list_sentiment


wsj_sentiment = article_sentiment("WSJ.txt")
ap_sentiment = article_sentiment("AP.txt")

#gets the average of the two articles based on score_dict
daily_score=[]
for i in range(len(wsj_sentiment)):
    daily_score.append((score_dict[wsj_sentiment[i]]+score_dict[ap_sentiment[i]])/2) 
    
daily_score

{'neg': 0.025, 'neu': 0.924, 'pos': 0.051, 'compound': 0.4019}
{'neg': 0.148, 'neu': 0.742, 'pos': 0.11, 'compound': -0.802}
{'neg': 0.047, 'neu': 0.888, 'pos': 0.065, 'compound': 0.3182}
{'neg': 0.217, 'neu': 0.749, 'pos': 0.034, 'compound': -0.7964}
{'neg': 0.048, 'neu': 0.778, 'pos': 0.174, 'compound': 0.875}
{'neg': 0.0, 'neu': 0.969, 'pos': 0.031, 'compound': 0.0772}
{'neg': 0.08, 'neu': 0.844, 'pos': 0.075, 'compound': 0.3612}
{'neg': 0.019, 'neu': 0.889, 'pos': 0.092, 'compound': 0.7579}
{'neg': 0.169, 'neu': 0.653, 'pos': 0.178, 'compound': -0.2732}
{'neg': 0.031, 'neu': 0.827, 'pos': 0.142, 'compound': 0.9657}
{'neg': 0.111, 'neu': 0.795, 'pos': 0.094, 'compound': -0.1779}
{'neg': 0.0, 'neu': 0.948, 'pos': 0.052, 'compound': 0.5574}
{'neg': 0.054, 'neu': 0.821, 'pos': 0.126, 'compound': 0.9201}
{'neg': 0.053, 'neu': 0.781, 'pos': 0.166, 'compound': 0.8779}
{'neg': 0.099, 'neu': 0.749, 'pos': 0.152, 'compound': 0.8519}
{'neg': 0.147, 'neu': 0.701, 'pos': 0.153, 'compound': 0.63

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0]

In [50]:
#Now we try by filtering only nouns
wsj_sentiment=[]
ap_sentiment=[]
pos_tags_to_keep = ["NN","NNP","NNS","NNPS"]
#finds sentiment of each article in a given outlet
def article_sentiment(filename):
    list_sentiment=[]
    with open(filename) as file:
        data = file.read()
        articles = data.split("--")
        for article in articles:
            tokenized = nltk.word_tokenize(article)
            tagged = nltk.pos_tag(tokenized)
            new_phrase = ""
            for word in tagged:
             
                if word[1] in pos_tags_to_keep:
                    new_phrase += word[0] + " "
            list_sentiment.append(check_sentiment(new_phrase))
        return list_sentiment


wsj_sentiment = article_sentiment("WSJ.txt")
ap_sentiment = article_sentiment("AP.txt")

#gets the average of the two articles based on score_dict
daily_score=[]
for i in range(len(wsj_sentiment)):
    daily_score.append((score_dict[wsj_sentiment[i]]+score_dict[ap_sentiment[i]])/2) 
    
daily_score



{'neg': 0.019, 'neu': 0.949, 'pos': 0.031, 'compound': 0.296}
{'neg': 0.095, 'neu': 0.825, 'pos': 0.081, 'compound': -0.6249}
{'neg': 0.026, 'neu': 0.872, 'pos': 0.102, 'compound': 0.9607}
{'neg': 0.112, 'neu': 0.838, 'pos': 0.051, 'compound': -0.6808}
{'neg': 0.021, 'neu': 0.865, 'pos': 0.114, 'compound': 0.9493}
{'neg': 0.0, 'neu': 0.989, 'pos': 0.011, 'compound': 0.0772}
{'neg': 0.042, 'neu': 0.919, 'pos': 0.039, 'compound': 0.3612}
{'neg': 0.009, 'neu': 0.948, 'pos': 0.043, 'compound': 0.7579}
{'neg': 0.109, 'neu': 0.772, 'pos': 0.119, 'compound': 0.2023}
{'neg': 0.035, 'neu': 0.868, 'pos': 0.097, 'compound': 0.9709}
{'neg': 0.054, 'neu': 0.901, 'pos': 0.045, 'compound': -0.1779}
{'neg': 0.02, 'neu': 0.94, 'pos': 0.04, 'compound': 0.4215}
{'neg': 0.048, 'neu': 0.862, 'pos': 0.09, 'compound': 0.9477}
{'neg': 0.025, 'neu': 0.816, 'pos': 0.159, 'compound': 0.9782}
{'neg': 0.067, 'neu': 0.721, 'pos': 0.212, 'compound': 0.992}
{'neg': 0.094, 'neu': 0.699, 'pos': 0.207, 'compound': 0.987

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0]

In [51]:
#Now we try by filtering only adjectives
wsj_sentiment=[]
ap_sentiment=[]
pos_tags_to_keep = ["JJS","JJR","JJ"]
#finds sentiment of each article in a given outlet
def article_sentiment(filename):
    list_sentiment=[]
    with open(filename) as file:
        data = file.read()
        articles = data.split("--")
        for article in articles:
            tokenized = nltk.word_tokenize(article)
            tagged = nltk.pos_tag(tokenized)
            new_phrase = ""
            for word in tagged:
             
                if word[1] in pos_tags_to_keep:
                    new_phrase += word[0] + " "
            list_sentiment.append(check_sentiment(new_phrase))
        return list_sentiment


wsj_sentiment = article_sentiment("WSJ.txt")
ap_sentiment = article_sentiment("AP.txt")

#gets the average of the two articles based on score_dict
daily_score=[]
for i in range(len(wsj_sentiment)):
    daily_score.append((score_dict[wsj_sentiment[i]]+score_dict[ap_sentiment[i]])/2) 
    
daily_score


{'neg': 0.054, 'neu': 0.801, 'pos': 0.145, 'compound': 0.8271}
{'neg': 0.026, 'neu': 0.67, 'pos': 0.304, 'compound': 0.9916}
{'neg': 0.0, 'neu': 0.909, 'pos': 0.091, 'compound': 0.7351}
{'neg': 0.145, 'neu': 0.647, 'pos': 0.208, 'compound': 0.323}
{'neg': 0.0, 'neu': 0.805, 'pos': 0.195, 'compound': 0.8555}
{'neg': 0.072, 'neu': 0.803, 'pos': 0.124, 'compound': 0.0772}
{'neg': 0.0, 'neu': 0.826, 'pos': 0.174, 'compound': 0.7479}
{'neg': 0.103, 'neu': 0.897, 'pos': 0.0, 'compound': -0.5719}
{'neg': 0.04, 'neu': 0.809, 'pos': 0.15, 'compound': 0.8555}
{'neg': 0.069, 'neu': 0.653, 'pos': 0.278, 'compound': 0.9896}
{'neg': 0.063, 'neu': 0.745, 'pos': 0.192, 'compound': 0.552}
{'neg': 0.032, 'neu': 0.729, 'pos': 0.239, 'compound': 0.8807}
{'neg': 0.122, 'neu': 0.679, 'pos': 0.198, 'compound': 0.78}
{'neg': 0.0, 'neu': 0.789, 'pos': 0.211, 'compound': 0.8957}
{'neg': 0.119, 'neu': 0.712, 'pos': 0.169, 'compound': 0.6915}
{'neg': 0.032, 'neu': 0.753, 'pos': 0.214, 'compound': 0.9246}
{'neg': 

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0]

In [52]:
#Now we try by filtering adjectives and nouns
wsj_sentiment=[]
ap_sentiment=[]
pos_tags_to_keep = ["JJS","JJR","JJ","NN","NNP","NNS","NNPS"]
#finds sentiment of each article in a given outlet
def article_sentiment(filename):
    list_sentiment=[]
    with open(filename) as file:
        data = file.read()
        articles = data.split("--")
        for article in articles:
            tokenized = nltk.word_tokenize(article)
            tagged = nltk.pos_tag(tokenized)
            new_phrase = ""
            for word in tagged:
             
                if word[1] in pos_tags_to_keep:
                    new_phrase += word[0] + " "
            list_sentiment.append(check_sentiment(new_phrase))
        return list_sentiment


wsj_sentiment = article_sentiment("WSJ.txt")
ap_sentiment = article_sentiment("AP.txt")

#gets the average of the two articles based on score_dict
daily_score=[]
for i in range(len(wsj_sentiment)):
    daily_score.append((score_dict[wsj_sentiment[i]]+score_dict[ap_sentiment[i]])/2) 
    
daily_score

{'neg': 0.027, 'neu': 0.918, 'pos': 0.056, 'compound': 0.872}
{'neg': 0.077, 'neu': 0.784, 'pos': 0.139, 'compound': 0.9901}
{'neg': 0.021, 'neu': 0.88, 'pos': 0.099, 'compound': 0.9759}
{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'compound': -0.4033}
{'neg': 0.017, 'neu': 0.852, 'pos': 0.131, 'compound': 0.9779}
{'neg': 0.013, 'neu': 0.955, 'pos': 0.032, 'compound': 0.1531}
{'neg': 0.034, 'neu': 0.903, 'pos': 0.063, 'compound': 0.8225}
{'neg': 0.025, 'neu': 0.939, 'pos': 0.036, 'compound': 0.4215}
{'neg': 0.097, 'neu': 0.778, 'pos': 0.125, 'compound': 0.8807}
{'neg': 0.045, 'neu': 0.806, 'pos': 0.149, 'compound': 0.996}
{'neg': 0.055, 'neu': 0.874, 'pos': 0.07, 'compound': 0.3818}
{'neg': 0.023, 'neu': 0.89, 'pos': 0.087, 'compound': 0.9186}
{'neg': 0.065, 'neu': 0.821, 'pos': 0.114, 'compound': 0.9712}
{'neg': 0.019, 'neu': 0.81, 'pos': 0.17, 'compound': 0.9891}
{'neg': 0.076, 'neu': 0.719, 'pos': 0.205, 'compound': 0.9937}
{'neg': 0.083, 'neu': 0.708, 'pos': 0.209, 'compound': 0.9936}

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0]