In [1]:
# https://okan.cloud/posts/2021-04-08-text-vectorization-using-python-term-document-matrix/#:~:text=Text%20vectorization%20is%20an%20important,transformed%20into%20a%20numerical%20representation./
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

# Import packages

In [1]:
# Run this if running in Google Collab
# Mount google drive if running from Google Collab
from google.colab import drive
drive.mount('/content/drive')

# Set current directory if running from Google Collab
import os
os.chdir('/content/drive/My Drive/Carbon_price_prediction/Workspace/Data')

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
import scipy.sparse

import pickle

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Custom functions

# Parameters / Constants

In [2]:
ngram = 1 # Be careful, this has to be less than or equal to 3, really computationally costly!

# Data import

In [3]:
text_df = pd.read_csv( "./lemmatized_merged_articles.csv", index_col=0)
text_df.head()

Unnamed: 0,date,lemmatized_text
0,2017-01-01,conceit - generation believe experience unique...
1,2017-01-01,- come close world leader appear eager start f...
2,2017-01-01,process automatic browser redirect request con...
3,2017-01-01,labour - - division immigration broken party -...
4,2017-01-01,establish political order come crash ground - ...


In [4]:
text_df.shape

(18937, 2)

# Term-document matrix generation

In [5]:
%%time
# Count Vectorizer
vect = CountVectorizer(ngram_range=(ngram, ngram))  
vects = vect.fit_transform(text_df.lemmatized_text)

CPU times: user 36.5 s, sys: 2.07 s, total: 38.6 s
Wall time: 39.2 s


In [6]:
tmp = vects[:5,]

In [7]:
# Select the first five rows from the data set
td = pd.DataFrame(tmp.todense()) 
td.columns = vect.get_feature_names()



In [8]:
td.head()

Unnamed: 0,aa dcftas georgia,aaa country add,aaa data euractiv,aaa data journalauto,aaa found gas,aaa northeast say,aaa predicts people,aaa project downturn,aaa rating affected,aaa rating shortly,...,zypries say addition,zypries say possibility,zypries say protectionism,zypries say saturday,zypries say united,zypries state sanction,zypries told deutschlandfunk,zypries told reuters,zyrtec pepcid well,zyuzino municipal deputy
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Check some particular columns (so that ID-s are equivalent)
td['conceit'] # It has a 1 in the first row, as expected

KeyError: ignored

# Export results

In [10]:
type(vects)

scipy.sparse.csr.csr_matrix

In [11]:
scipy.sparse.save_npz(f'merged_articles_keyword_term_document_matrix_ngram_{ngram}.npz', vects)

In [12]:
# # CSV or pickle should be used?? --> depends on the final format, decide once preproc workflow is complete!
# lemmatized_text_df.to_csv(f'./lemmatized_merged_articles.csv')
# # preprocessed_text_df.to_csv(f'./lemmatized_merged_articles.csv')

In [13]:
# Store data (serialize)
with open(f'merged_articles_keyword_term_document_matrix_ngram_{ngram}_colnames', 'wb') as handle:
   pickle.dump(vect.get_feature_names(), handle)



# Extras

## (How to import sparse matrices and convert them to usual numpy array)

In [14]:
sparse_matrix = scipy.sparse.load_npz(f'merged_articles_keyword_term_document_matrix_ngram_{ngram}.npz')

file = open(f"merged_articles_keyword_term_document_matrix_ngram_{ngram}_colnames",'rb')
td_matrix_cols = pickle.load(file)
file.close()

In [15]:
# tmp = sparse_matrix.todense()
tmp = sparse_matrix[:5,]
td_matrix = tmp.toarray()

In [16]:
td_matrix.shape

(5, 5154018)

In [17]:
# Select the first five rows from the data set
td_new = pd.DataFrame(td_matrix).iloc[:5]  
td_new.columns = td_matrix_cols

In [18]:
td_new

Unnamed: 0,aa dcftas georgia,aaa country add,aaa data euractiv,aaa data journalauto,aaa found gas,aaa northeast say,aaa predicts people,aaa project downturn,aaa rating affected,aaa rating shortly,...,zypries say addition,zypries say possibility,zypries say protectionism,zypries say saturday,zypries say united,zypries state sanction,zypries told deutschlandfunk,zypries told reuters,zyrtec pepcid well,zyuzino municipal deputy
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
td_new['conceit']

KeyError: ignored

In [20]:
max(td_new.values.flatten())

3

## Filter for carbon keywords and export dense matrix

In [21]:
# Slicing example taken from here
# https://cmdlinetips.com/2019/07/how-to-slice-rows-and-columns-of-sparse-matrix-in-python/

In [32]:
# Carbon price related keywords
# carbon_keywords = ['solar', 'carbon'] # this is a dummy for now, it will be imported later!
carbon_keywords = pd.read_csv( "./keyword_lists/revised_keyword_list.csv")

In [37]:
carbon_keywords['keywords'] = carbon_keywords['keywords'].apply(lambda x: x.lower())

In [38]:
carbon_keywords.head()

Unnamed: 0,keywords
0,emissions
1,co2
2,carbon dioxide
3,greenhouse gas
4,ghg


In [39]:
carbon_keywords_index = {carbon_keyword: td_matrix_cols.index(carbon_keyword)
                         for carbon_keyword in carbon_keywords.squeeze().values
                         if carbon_keyword in td_matrix_cols}
print(carbon_keywords_index)
np.asarray(td_matrix_cols)[list(carbon_keywords_index.values())]

{}


array([], dtype='<U69')

In [92]:
carbon_keyword_matrix = sparse_matrix.tocsr()[:,list(carbon_keywords_index.values())].todense()

In [93]:
carbon_keyword_matrix.shape

(18937, 15)

In [94]:
carbon_keyword_df = pd.DataFrame(carbon_keyword_matrix, columns=list(carbon_keywords_index.keys()))
print(carbon_keyword_df.head())
print(carbon_keyword_df.describe())

   ghg  climate  sustainability  sustainable  environment  coal  gas  oil  \
0    0        2               0            0            0     0    1    2   
1    0        0               0            0            0     0    0    0   
2    0        0               0            0            0     0    0    0   
3    0        0               0            0            0     0    0    0   
4    0        0               0            0            1     0    0    0   

   crude  gasoline  diesel  petrol  fuel  electricity  renewable  
0      0         0       0       0     0            0          0  
1      0         0       0       0     0            0          0  
2      0         0       0       0     0            0          0  
3      0         0       0       0     0            0          0  
4      0         0       0       0     0            0          0  
                ghg       climate  sustainability   sustainable   environment  \
count  18937.000000  18937.000000    18937.000000  189

In [95]:
# Check the number of articles in which the keywords are mentioned at least once
(carbon_keyword_df > 0).sum(axis=0)

ghg                  3
climate           1343
sustainability     135
sustainable        767
environment       1074
coal               269
gas               1194
oil               1235
crude              185
gasoline            46
diesel              90
petrol              84
fuel              1055
electricity        280
renewable          194
dtype: int64

In [96]:
carbon_keyword_df.to_csv(f'./merged_articles_carbon_keyword_term_document_matrix_ngram_{ngram}.csv')

# Support

In [None]:
A = scipy.sparse.random(5, 5,
                  density=0.5,
                  data_rvs=scipy.stats.poisson(10, loc=10).rvs)

In [None]:
A.todense()

matrix([[23.,  0., 16., 23.,  0.],
        [20., 19.,  0., 22., 18.],
        [ 0.,  0., 24.,  0.,  0.],
        [19., 18.,  0.,  0.,  0.],
        [ 0., 18.,  0., 27.,  0.]])

In [None]:
A.todense()[:, [2, 0]]

matrix([[16., 23.],
        [ 0., 20.],
        [24.,  0.],
        [ 0., 19.],
        [ 0.,  0.]])