# Baseline sentiment analysis
goal of this nb is to baseline how well we can predict sentiment for news articles using simple scikit learn libraries - this is to make the argument that we should not use neural nets for this purpose

heavily inspired by https://towardsdatascience.com/sentiment-analysis-with-python-part-2-4f71e7bde59a

In [None]:
import nltk
from nltk.corpus import stopwords
import pandas as pd

In [None]:
df = pd.read_csv("simple_twitter_dataset.csv")
df.columns =  columns=["label", "text"]
# df["label"].replace({4: 3}, inplace=True)

In [None]:
df["label"].unique()

array([0, 4])

In [None]:
nltk.download('stopwords')
english_stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
import re

def preprocess_review(review):
  REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]")
  REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")
  clean_review = REPLACE_NO_SPACE.sub("", review.lower())
  clean_review = REPLACE_WITH_SPACE.sub(" ", review)
    
  return clean_review

In [None]:
def remove_stop_words(review):
  return ' '.join([word for word in review.split() if word not in english_stop_words])

In [None]:
df["cleaned_text"] = df["text"].apply(lambda x: preprocess_review(x))
df["cleaned_text"] = df["cleaned_text"].apply(lambda x: remove_stop_words(x))

In [None]:
print(df["cleaned_text"].iloc[1])
print(df["text"].iloc[1])


@Kenichan I dived many times ball. Managed save 50% The rest go bounds
@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
ngram_vectorizer.fit(df["cleaned_text"])
X = ngram_vectorizer.transform(df["cleaned_text"])
X_test = ngram_vectorizer.transform(df["cleaned_text"])
target = df["label"]

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size=0.75
)

# for c in [0.01, 0.05, 0.25, 0.5, 1]:
#   lr = LogisticRegression(C=c)
#   lr.fit(X_train, y_train)
#   acc = accuracy_score(y_val, lr.predict(X_val))
#   print(f"Accuracy for C={c} is {acc}")


In [None]:
final_ngram = LogisticRegression(C=0.5)
final_ngram.fit(X, target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
accuracy_score(y_val, final_ngram.predict(X_val))

0.9169075

In [None]:
import pickle
pickle.dump(final_ngram, open("/content/lr_model.pkl", "wb"))

In [None]:
pickle.dump(ngram_vectorizer, open("/content/ngram_vectorizer.pkl", "wb"))

In [None]:
pickle.dump(english_stop_words, open("/content/english_stop_words.pkl", "wb"))

In [None]:
def predict_sentiment(text, model, vectorizer):
  text = preprocess_review(text)
  text = remove_stop_words(text)
  # import pdb; pdb.set_trace()
  txt_vec = vectorizer.transform([text])
  print("probabilities: ")
  print(model.predict_proba(txt_vec))
  return model.predict(txt_vec)
  

In [None]:
predict_sentiment(" no probs kazz hun,i think theres a new one going on now celebrity twitterer or summat? lol dunno?i just voted misha anyways ", final_ngram, ngram_vectorizer)

probabilities: 
[[0.06373249 0.93626751]]


array([4])

# Let's try the same thing out with some news articles

In [None]:
df_news = pd.read_csv("/content/SampleLabelledArticles_AWS_Comprehend.csv")

In [None]:
df_news["Sentiment"].unique()

array(['NEUTRAL', 'MIXED', 'POSITIVE', 'NEGATIVE'], dtype=object)

In [None]:
df_news = df_news[["Body", "Sentiment"]]
df_news["Sentiment"].replace({'NEUTRAL': 2, 'MIXED': 1, 'POSITIVE': 3, 'NEGATIVE': 0}, inplace=True)

In [None]:
article = df_news["Body"].iloc[0]
print(article)
# predict_sentiment(article, ngram_vectorizer)
# it thinks this is overwhelmingly negative

LEAD: IN the agency world there are the conglomerates, as typified by Saatchi & Saatchi and the Interpublic Group of Companies, and the colonial (or branch) networks, like those run by J. Walter Thompson, Young & Rubicam and Foote, Cone & Belding. A more recent phenomenon is what Peter Scott, chief executive of Britain's WCRS IN the agency world there are the conglomerates, as typified by Saatchi & Saatchi and the Interpublic Group of Companies, and the colonial (or branch) networks, like those run by J. Walter Thompson, Young & Rubicam and Foote, Cone & Belding. A more recent phenomenon is what Peter Scott, chief executive of Britain's WCRS Group, describes as a creative federation. His own publicly held company, which has acquired America's Della Femina, Travisano & Partners and HBM/Creamer, is just such an organization. So are two other British agencies - Boase Massimi, which bought New York's Ammirati & Puris, and Lowe Howard Spink & Bell, which now owns Laurence Charles, Free & La

In [None]:
df_news["clean_text"] = df_news["Body"].apply(lambda x: preprocess_review(x))
df_news["clean_text"] = df_news["clean_text"].apply(lambda x: remove_stop_words(x))

In [None]:
ngram_vectorizer_news = CountVectorizer(binary=True, ngram_range=(1, 5))
ngram_vectorizer_news.fit(df_news["clean_text"])
X = ngram_vectorizer_news.transform(df_news["clean_text"])
X_test = ngram_vectorizer_news.transform(df_news["clean_text"])
target = df_news["Sentiment"]

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size=0.75
)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
  lr = LogisticRegression(C=c)
  lr.fit(X_train, y_train)
  acc = accuracy_score(y_val, lr.predict(X_val))
  print(f"Accuracy for C={c} is {acc}")


Accuracy for C=0.01 is 0.8488372093023255
Accuracy for C=0.05 is 0.8488372093023255
Accuracy for C=0.25 is 0.8488372093023255
Accuracy for C=0.5 is 0.8488372093023255
Accuracy for C=1 is 0.8488372093023255


In [None]:
final_ngram = LogisticRegression(C=0.5)
final_ngram.fit(X, target)

LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
!pip install newspaper3k

In [None]:
from newspaper import Article
url = "https://www.nytimes.com/2021/08/08/opinion/anti-vaccine-america.html"
article = Article(url)
article.download()
article.parse()

In [None]:
article.text

'The vaccine is safe, incredibly safe.\n\nThere are no microchips or magnets in it. It does not cause Covid and it is not more dangerous than Covid.\n\nBelieving all these lies is a luxury of people who have not sat by a hospital bedside, or watched from behind glass, because Covid regulations prevented them from comforting a relative or friend as they drew their last breath, struggling against a virus that choked that breath off.\n\nIt is a luxury to be irresponsible in a society where others will be responsible for you, where you simply assume that you are safer because others take the appropriate precautions to be safe: You do not need to get the shot because others have.\n\nBut the Delta variant is testing that faith.\n\nYou will not be safe as an unvaccinated person riding on the coattails of the vaccinated. Delta is extremely transmissible and unremitting. It is stronger than its progenitor.\n\nAs the Delta variant surges there is an uptick in the pace of vaccinations in the coun

In [None]:
predict_sentiment(article.text, final_ngram, ngram_vectorizer)

probabilities: 
[[0.9981418 0.0018582]]


array([0])

# Try combining the two datasets

In [None]:
df_news.columns = ["body", "label", "text"]
df_news.head()

Unnamed: 0,body,label,text
0,LEAD: IN the agency world there are the conglo...,2,"LEAD: IN agency world conglomerates, typified ..."
1,"LEAD: System Energy Resources Inc., a unit of ...",2,"LEAD: System Energy Resources Inc., unit Middl..."
2,LEAD: *3*** COMPANY REPORTS ** *3*FHP INTERNAT...,2,LEAD: *3*** COMPANY REPORTS ** *3*FHP INTERNAT...
3,LEAD: *3*** COMPANY REPORTS ** *3*GROWTH VENTU...,2,LEAD: *3*** COMPANY REPORTS ** *3*GROWTH VENTU...
4,"LEAD: Claire F. O'Brien, a lawyer, and Timothy...",2,"LEAD: Claire F. O'Brien, lawyer, Timothy Josep..."


In [None]:
tweets_and_news = pd.concat([df, df_news[["label", "text"]]])

In [None]:
ngram_vectorizer_news = CountVectorizer(binary=True, ngram_range=(1, 2))
ngram_vectorizer_news.fit(tweets_and_news["text"])
X = ngram_vectorizer_news.transform(tweets_and_news["text"])
X_test = ngram_vectorizer_news.transform(tweets_and_news["text"])
target = tweets_and_news["label"]

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size=0.75
)


lr = LogisticRegression(C=0.5)
lr.fit(X_train, y_train)
acc = accuracy_score(y_val, lr.predict(X_val))
# print(f"Accuracy for C={c} is {acc}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


NameError: ignored

In [None]:
accuracy_score(y_val, lr.predict(X_val))

0.8163593963786715

In [None]:
predict_sentiment(article.text, lr, ngram_vectorizer_news)

probabilities: 
[[9.99963612e-01 1.29610314e-09 4.07060749e-10 3.63867019e-05]]


array([0])

In [None]:
article = Article("https://www.nytimes.com/2021/08/07/opinion/sunday/covid-unvaccinated-anger.html?action=click&module=RelatedLinks&pgtype=Article")
article.download()
article.parse()

In [None]:
print(article.text)
predict_sentiment(article.text, lr, ngram_vectorizer_news)

This archetypal bumpkin villain of post-Trump America has long received too much credit in a country where Trumpism thrives in affluent, white urban communities bursting with college degrees. In handling the pandemic, such misdirection of attention keeps us from what we should be doing: trying to reach the vast group of people who might choose vaccination if barriers to access and knowledge were removed.

*

One overlooked barrier, as ever in this country, is socioeconomic class. Polls conducted by the Kaiser Family Foundation earlier this year found that working-class people — white, Black, Hispanic, Democrat, Republican — were less likely to be vaccinated. Vaccination rates for Black and white college graduates, meanwhile, were almost identical. The so-called “uneducated” of all races and backgrounds are hampered not by a lack of good sense but by a lack of money and power. Their education status keeps their income low, and income predicts insurance status. When the highly contagious

array([2])

In [None]:
import pickle
pickle.dump(lr, open("/content/lr_model.weights", "wb"))

## Further baseline by just looking for positive/negative words
Here we read in a lexicon of positive/negatively labelled words that are stored in `sentiment_lookup` in order to baseline against using a `LogisticRegression` model. We appear to outperform this baseline by about 3 percentage points.

In [None]:
# subjectivity lexicon taken from http://mpqa.cs.pitt.edu/lexicons/subj_lexicon/
sentiment_lookup = {}
with open("./subjclueslen1-HLTEMNLP05.tff", "r") as f:
  lines = f.readlines()
  for line in lines:
    words = line.split()
    sentiment_word = words[2].replace("word1=", "")
    sentiment_of_word = words[5].replace("priorpolarity=", "")
    sentiment_lookup[sentiment_word] = 1 if sentiment_of_word == "positive" else -1

In [None]:
import numpy as np
doc=['i', 'really', 'dislike', 'this', 'product']
sentiment = np.mean([sentiment_lookup.get(w, 0) for w in doc])

In [None]:
 np.mean([sentiment_lookup.get(w, 0) for w in article.text])

0.0

In [None]:
def score_with_lookup(text):
  text = preprocess_review(text)
  text = remove_stop_words(text)
  score = np.mean([sentiment_lookup.get(w, 0) for w in text.split()])
  # arbitrary way to score pos(3)/neg(0)/neutral(2)
  if score > 0.05:
    return 3
  elif score < -0.05:
    return 0
  else:
    return 2

In [None]:
df_news["score_w_lookup"] = df_news["Body"].apply(lambda x: score_with_lookup(x))

In [None]:
accuracy_score(df_news["Sentiment"], df_news["score_w_lookup"])

0.7426078526417839

In [None]:
# if you just guess NEUTRAL 100% of the time you basically hit 80% acc
# makes these results kind of inconclusive D:
df_news["score_w_lookup"].value_counts()

2    1754
0     251
3      58
Name: score_w_lookup, dtype: int64

In [None]:
df_news["Sentiment"].value_counts()

2    1683
0     189
3     136
1      55
Name: Sentiment, dtype: int64