In [147]:
import gensim.downloader
import numpy as np
import pandas as pd
import spacy
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from scipy.spatial import distance
from sklearn.metrics.pairwise import cosine_similarity
from spacy.lang.en import English
from spacy.pipeline import Sentencizer
from spacy.tokenizer import Tokenizer

from utils import utils

In [61]:
# English stop-word list from NLTK; used during tokenisation to drop stop words from sentences
stop_words = stopwords.words("english")

In [2]:
def clean(cc_list):
    '''
    Cleans the list of climate change words.

    Each entry has its newlines removed, surrounding whitespace (including a
    stray "\\r" from Windows line endings) stripped, inner spaces replaced by
    hyphens, and is lower-cased.

    Parameters
    ----------
    cc_list : iterable of str
        Raw glossary entries, e.g. the lines of a text file.

    Returns
    -------
    list of str
        The normalised entries, in the same order and of the same length.
    '''
    # Strip before hyphenating so that leading/trailing blanks do not turn into
    # leading/trailing hyphens (e.g. "carbon sink \n" -> "carbon-sink").
    return [w.replace("\n", "").strip().replace(" ", "-").lower() for w in cc_list]

def load_cc_words(type="normal"):
    '''
    Loads a glossary of climate change words and normalises it with `clean`.

    Parameters
    ----------
    type : str
        "normal" reads data/resources/CCglossaryWiki.txt line by line;
        "enriched" reads data/resources/CCglossaryComplete.csv as a single
        column named 'word'.

    Returns
    -------
    list of str
        The cleaned glossary entries.

    Raises
    ------
    ValueError
        If `type` is neither "normal" nor "enriched" (previously this case
        silently returned None).
    '''
    # NOTE: `type` shadows the builtin; the name is kept because callers may pass it by keyword.
    if type == "normal":
        # The context manager closes the file handle deterministically.
        with open("data/resources/CCglossaryWiki.txt", "r") as glossary_file:
            cc_words = glossary_file.readlines()
    elif type == "enriched":
        cc_words = pd.read_csv("data/resources/CCglossaryComplete.csv", index_col=False, names=['word'])['word'].values
    else:
        raise ValueError("Unknown glossary type: {!r} (expected 'normal' or 'enriched')".format(type))
    return clean(cc_words)

In [271]:
# Load the article corpus through the project-local utils module, and the default ("normal") glossary
corpus = utils.load_corpus()
cc_words = load_cc_words()

In [25]:
# spaCy stuff
# Blank English pipeline with only a sentencizer added; used below to split article text into sentences.
# NOTE(review): create_pipe() + add_pipe(component) looks like the spaCy 2.x API — confirm the pinned version.
nlp = English()
nlp.add_pipe(nlp.create_pipe('sentencizer'))
# NOTE(review): the tokenizer is built from the vocab alone (no prefix/suffix/infix rules are passed),
# so it presumably splits on whitespace only and leaves punctuation attached to tokens — confirm.
tokenizer = Tokenizer(nlp.vocab)

In [15]:
# Initialize GloVe embeddings
print("Loading GloVe embeddings...")
# Pretrained vectors are obtained through gensim's downloader under the name "glove-wiki-gigaword-300"
glove_embeddings = gensim.downloader.load("glove-wiki-gigaword-300")
# NOTE(review): init_sims(replace=True) presumably L2-normalises the vectors in place and discards
# the raw ones (gensim 3.x behaviour) — confirm against the installed gensim version.
glove_embeddings.init_sims(replace=True)

Loading GloVe embeddings...


### How to assign climate scores to articles? 

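One way is to calculate the cosine distance between the climate change topic vector and every sentence in an article, and then average these distances. Cosine distance is 1 minus cosine similarity, so in general it ranges from 0 (same direction) to 2 (opposite direction), although the scores observed here fall between 0 and 1. The average therefore gives a rating for the climateness of an article, where a lower value means the article is closer to the climate change topic. 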

Another way is to calculate climateness distributions rather than a single score for each article. We can calculate scores for every sentence and assign them to bins (e.g. 0-20, 21-40, 41-60...).

Also check out these discussions:
- should word vectors be normalized? https://stackoverflow.com/questions/41387000/cosine-similarity-of-word2vec-more-than-1

In [272]:
# Example of cosine distance
# distance.cosine returns a distance, not a similarity: smaller values mean more similar vectors
a = [2, 0, 1, 1, 0, 2, 1, 1]
b = [2, 1, 1, 0, 1, 1, 1, 1]

print(distance.cosine(a,b)) 
print(distance.cosine(a,a)) # 0.0 means perfect similarity

0.1784161637422509
0.0


Wrangle the data so it is easier to use:

In [329]:
# Split every article of both journals into a list of sentence strings.
# `articles` ends up as a list (one entry per article) of lists of sentences.
articles = []
journals = ["ScienceOCR", "NatureOCR"]
for journal in journals:
    for article in corpus[journal]:
        # NOTE(review): index 1 is presumably the article body — confirm against utils.load_corpus
        text = article[1]
        sentences = []
        doc = nlp(text)
        for sent in doc.sents:
            s = sent.orth_.replace("\n", "")
            # Keep the sentence only if something other than whitespace is left.
            # Comparing against "\n" alone let sentences such as "\n\n" or " \n"
            # through as empty strings, which later have no tokens to embed.
            if s.strip():
                sentences.append(s)
        articles.append(sentences)

In [330]:
# Tokenise every sentence, lower-case the tokens, and keep only those that have a
# GloVe vector and are not stop words. `tokenized` mirrors the nesting of `articles`:
# articles -> sentences -> tokens.

# Set membership is O(1); the NLTK stop-word list would otherwise be scanned for every token.
stop_word_set = set(stop_words)

tokenized = []
for article in articles:
    tokenized_sents = []
    for sent in article:
        # lower-case each token once instead of three times per token
        lowered = (token.orth_.lower() for token in tokenizer(sent))
        # `word in glove_embeddings` is the version-independent vocabulary test
        # (equivalent to `word in glove_embeddings.vocab` on gensim 3.x)
        tokenized_sents.append(
            [word for word in lowered if word in glove_embeddings and word not in stop_word_set]
        )
    tokenized.append(tokenized_sents)

In [331]:
def get_vec(tokenized_sentence):
    '''
    Returns the average GloVe vector of a tokenized sentence.

    Parameters
    ----------
    tokenized_sentence : sequence of str
        Words to look up in `glove_embeddings`; every word is expected to be
        in the embedding vocabulary.

    Returns
    -------
    numpy.ndarray
        The element-wise mean of the word vectors. For an empty sentence the
        mean is undefined, so a vector of NaN with the embedding dimension is
        returned.
    '''
    if len(tokenized_sentence) == 0:
        # np.mean over an empty array would emit a RuntimeWarning and return a
        # scalar NaN; a NaN vector keeps the shape consistent for callers.
        return np.full(glove_embeddings.vector_size, np.nan)
    return np.mean(np.array([glove_embeddings[word] for word in tokenized_sentence]), axis=0)

In [346]:
def print_article(article, scores):
    '''
    Prints the sentences of an article from most to least climaty.

    Parameters
    ----------
    article : sequence of str
        The sentences of one article.
    scores : sequence of float
        One cosine distance per sentence, aligned with `article`; a lower
        distance means a more climaty sentence. NaN scores are allowed.

    Sentences are printed in ascending order of score, with NaN-scored
    sentences last. Scores without a matching sentence (when `scores` is
    longer than `article`) are skipped.
    '''
    # NaN compares as neither smaller nor larger than any number, so a plain sort
    # on the scores would leave the ranking scrambled; sort NaN entries to the end.
    ranked_indices = sorted(
        range(len(scores)),
        key=lambda idx: (np.isnan(scores[idx]), scores[idx]),
    )
    print("Printing article from most to least climaty sentences:\n")
    for idx in ranked_indices:
        # explicit bounds check instead of a bare `except: pass`, so that real
        # errors are no longer silently swallowed
        if idx < len(article):
            print("{}\n".format(article[idx]))

### Experiment 1: using keywords from Hulme et al. paper.

Hulme et al. created the corpus by combining the articles containing one or more mentions of the following words: ‘climate’, ‘greenhouse’, ‘carbon’, ‘warming’, ‘weather’, ‘atmosphere’, ‘pollution’. We'll use these keywords to create a generic vector representing the topic of climate change. We'll then use this vector to calculate how likely the article is to be about climate change.

In [333]:
# Keywords from Hulme et al.; their averaged GloVe vector serves as the climate change topic vector
cc_keywords = ["climate", "greenhouse", "carbon", "warming", "weather", "atmosphere", "pollution"]
cc_embedding = get_vec(cc_keywords)

Below, we calculate similarity scores for every sentence in every article. `similarity_scores` is a list of lists — its length is 493, one list for each article. The elements of each sublist are cosine distance scores between a sentence and the climate change embedding. Note that, despite the variable's name, these are distances: a lower score means the sentence is closer to the climate change topic.

In [334]:
# Cosine distance between the climate change embedding and every sentence embedding.
# `similarity_scores[i][j]` belongs to sentence j of article i; lower means closer to the topic.
similarity_scores = []
for article in tokenized:
    current_article = []
    for tokenized_sentence in article:
        if tokenized_sentence:
            sentence_embedding = get_vec(tokenized_sentence)
            score = distance.cosine(cc_embedding, sentence_embedding)
        else:
            # No token survived the vocabulary/stop-word filter, so there is nothing
            # to average. Record NaN explicitly to keep scores aligned with sentences.
            score = np.nan
        current_article.append(score)
    similarity_scores.append(current_article)

In [335]:
# sanity checks
# make sure that the number of elements corresponds to the number of articles
# (493 is the expected article count, hard-coded here)
print(len(similarity_scores) == 493)

# make sure that the number of scores for the first article corresponds to the number of sentences
print(len(similarity_scores[0]) == len(articles[0]))

True
True


In [347]:
# Print article 0, sentences ordered by their cosine distance to the climate change embedding
print_article(articles[0], similarity_scores[0])

Printing article from most to least climaty sentences:

One of several ways to attenuate the increase of CO2 in the atmosphere is to sequester it.*

Worldwide emissions of CO2 continue to increase, and prudence dictates that technologies be developed to help limit this trend.

However, if international agreements are implemented to attenuate the buildup of atmospheric CO2, sequestration of it in unused or abandoned fossil hydrocarbon fields is one good step to take.

A vigorous program aimed at cutting the cost of cleaning CO2 emitted by power plants should have a high priority and adequate funding.

The injection of CO2 into oil fields is having economically beneficial effects while at the same time sequestering CO2.

 A major source of CO2 is the combustion of fossil fuel in power plants.

Controversy exists about the possible extent of the contribution of CO2 to present and future global warming.

 The present amount of geological sequestration of CO2 is small in comparison with emi

In [338]:
# Range of sentence scores for article 0. nanmin/nanmax skip NaN scores (sentences with no
# usable tokens); the builtin min/max give order-dependent results when NaN is present.
np.nanmin(similarity_scores[0]), np.nanmax(similarity_scores[0])

(0.3401578664779663, 0.7683388441801071)

### Experiment 2: using Wikipedia's glossary of climate change terms.

In [None]:
# Filtering the glossary so it only contains words that are in the model's vocabulary.
# `w in glove_embeddings` is the version-independent membership test
# (equivalent to `w in glove_embeddings.vocab` on gensim 3.x).
filtered = [w for w in cc_words if w in glove_embeddings]
print("Number of words prior to filtering: ({}) and after filtering: ({})".format(len(cc_words), len(filtered)))