# Import packages

In [1]:
# Colab-only setup: mount Google Drive and switch to the project data folder.
# Outside Google Colab the `google.colab` package is not installed, so the
# import is guarded; in that case nothing is mounted and the relative CSV
# paths used below resolve against the current working directory.
import os

try:
    from google.colab import drive
except ImportError:  # not running in Google Colab
    drive = None

if drive is not None:
    # Mount Google Drive so the project folder becomes visible
    drive.mount('/content/drive')

    # Set current directory to the data folder on the mounted drive
    os.chdir('/content/drive/My Drive/Carbon_price_prediction/Workspace/Data')

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import pickle

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Custom functions

# Parameters / Constants

In [3]:
# N-gram orders whose keyword term-document matrices are loaded and merged.
ngrams = list(range(1, 4))
# Single n-gram order; in this notebook only the commented-out "Support"
# snippet at the bottom refers to it.
ngram = ngrams[0]

# Data import

In [4]:
# Read one keyword term-document matrix per n-gram order (first CSV column
# becomes the index), then join them column-wise into a single frame.
matrix_paths = [
    f'./merged_articles_carbon_keyword_term_document_matrix_ngram_{ngram}.csv'
    for ngram in ngrams
]
dfs = []
for matrix_path in matrix_paths:
    dfs.append(pd.read_csv(matrix_path, index_col=0))

df = pd.concat(dfs, axis=1)  # axis=1: rows are matched on the index
print(df.shape)
print(df.head())

(18937, 36)
   ghg  climate  sustainability  sustainable  environment  coal  gas  oil  \
0    0        2               0            0            0     0    1    2   
1    0        0               0            0            0     0    0    0   
2    0        0               0            0            0     0    0    0   
3    0        0               0            0            0     0    0    0   
4    0        0               0            0            1     0    0    0   

   crude  gasoline  ...  gas heating  gas turbine  fossil fuel  nuclear power  \
0      0         0  ...            0            0            0              0   
1      0         0  ...            0            0            0              0   
2      0         0  ...            0            0            0              0   
3      0         0  ...            0            0            0              0   
4      0         0  ...            0            0            0              0   

   nuclear plant  nuclear energy  clea

In [5]:
# Article table whose `date` column is used further down to re-label the rows
# of the keyword matrix (presumably indexed by the same article ids - verify).
articles_csv = './lemmatized_merged_articles.csv'
id_date_map_df = pd.read_csv(articles_csv, index_col=0)
id_date_map_df.shape

(18937, 2)

In [6]:
# Compare the row count with id_date_map_df above before re-indexing by date.
df.shape

(18937, 36)

In [7]:
# Reindex keyword counts by dates: every row label of `df` is looked up in the
# index -> date mapping taken from `id_date_map_df['date']`. Labels missing
# from the mapping are left unchanged by `rename`.
# The column is already a Series, so it is converted directly; calling
# `.squeeze()` first would collapse a one-row Series into a scalar, which has
# no `.to_dict()` and would make this cell fail on single-article input.
df = df.rename(index=id_date_map_df['date'].to_dict())
print(df.head())

            ghg  climate  sustainability  sustainable  environment  coal  gas  \
2017-01-01    0        2               0            0            0     0    1   
2017-01-01    0        0               0            0            0     0    0   
2017-01-01    0        0               0            0            0     0    0   
2017-01-01    0        0               0            0            0     0    0   
2017-01-01    0        0               0            0            1     0    0   

            oil  crude  gasoline  ...  gas heating  gas turbine  fossil fuel  \
2017-01-01    2      0         0  ...            0            0            0   
2017-01-01    0      0         0  ...            0            0            0   
2017-01-01    0      0         0  ...            0            0            0   
2017-01-01    0      0         0  ...            0            0            0   
2017-01-01    0      0         0  ...            0            0            0   

            nuclear power  nucle

In [8]:
# Sanity check: show the smallest and largest index label, one per line
# (assuming ISO-formatted date strings, this is the covered date range).
print(min(df.index), max(df.index), sep='\n')

2017-01-01
2021-12-12


# Data Preprocessing

In [9]:
# Collapse rows that share the same index label (date) into a single row by
# summing their keyword counts.
rows_by_date = df.groupby(level=0)  # level 0 is the (only) index level
agg_df = rows_by_date.sum()

In [10]:
# Display the aggregated counts (one row per distinct date label).
agg_df

Unnamed: 0,ghg,climate,sustainability,sustainable,environment,coal,gas,oil,crude,gasoline,...,gas heating,gas turbine,fossil fuel,nuclear power,nuclear plant,nuclear energy,clean energy,green energy,emission trading system,emission trading scheme
2017-01-01,0,2,0,0,1,0,2,3,0,0,...,0,0,0,0,0,0,0,0,0,0
2017-01-02,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2017-01-03,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2017-01-04,0,0,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2017-01-05,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-08,0,0,0,4,1,0,5,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2021-12-09,0,2,0,0,1,0,2,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2021-12-10,0,17,0,3,1,1,5,6,0,1,...,0,0,1,1,0,3,0,0,0,0
2021-12-11,0,0,0,0,0,0,5,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# TF-IDF generation

In [19]:
# Document frequency: for each keyword, the number of rows (dates) of agg_df
# in which its count is positive.
doc_freq = agg_df.gt(0).sum(axis=0)
# Expand the per-keyword values to the shape of agg_df. The `< 0` mask is all
# False as long as the counts are non-negative, so adding it contributes 0 and
# only serves to repeat doc_freq on every row.
nt = agg_df.lt(0) + doc_freq
print(nt)

            ghg  climate  sustainability  sustainable  environment  coal  gas  \
2017-01-01    3      828             128          596          739   223  721   
2017-01-02    3      828             128          596          739   223  721   
2017-01-03    3      828             128          596          739   223  721   
2017-01-04    3      828             128          596          739   223  721   
2017-01-05    3      828             128          596          739   223  721   
...         ...      ...             ...          ...          ...   ...  ...   
2021-12-08    3      828             128          596          739   223  721   
2021-12-09    3      828             128          596          739   223  721   
2021-12-10    3      828             128          596          739   223  721   
2021-12-11    3      828             128          596          739   223  721   
2021-12-12    3      828             128          596          739   223  721   

            oil  crude  gas

In [20]:
# Implementation based on the 2. recommended option here: https://en.wikipedia.org/wiki/Tf%E2%80%93idf
#   tf  = log(1 + count)
#   idf = log(1 + N / n_t)
# where N is the number of rows of agg_df (documents = dates) and n_t is the
# document frequency held in `nt`.
N = len(agg_df)

tf = np.log(1 + agg_df)
# A keyword that never occurs has n_t == 0, which makes N / n_t infinite; the
# product 0 * inf would then turn that keyword's whole column into NaN. Its tf
# is 0 everywhere, so the idf is zeroed instead and the score stays 0.
idf = np.log(1 + N / nt).replace(np.inf, 0.0)

tf_idf_df = tf * idf

In [21]:
# Display the resulting TF-IDF scores (same rows and columns as agg_df).
tf_idf_df

Unnamed: 0,ghg,climate,sustainability,sustainable,environment,coal,gas,oil,crude,gasoline,...,gas heating,gas turbine,fossil fuel,nuclear power,nuclear plant,nuclear energy,clean energy,green energy,emission trading system,emission trading scheme
2017-01-01,0.0,1.267604,0.0,0.000000,0.854682,0.000000,1.373904,1.697494,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0
2017-01-02,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0
2017-01-03,0.0,0.799769,0.0,0.000000,0.000000,0.000000,0.866837,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,2.588125,0.0,0.000000,0.0,0.0
2017-01-04,0.0,0.000000,0.0,0.000000,1.709364,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0
2017-01-05,0.0,0.000000,0.0,0.963518,0.000000,1.527475,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-08,0.0,0.000000,0.0,2.237219,0.854682,0.000000,2.240740,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,2.825955,0.0,0.0
2021-12-09,0.0,1.267604,0.0,0.000000,0.854682,0.000000,1.373904,0.848747,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0
2021-12-10,0.0,3.334977,0.0,1.927036,0.854682,1.527475,2.240740,2.382734,0.0,2.572924,...,0.0,0.0,1.889075,1.830331,0.0,5.176249,0.0,0.000000,0.0,0.0
2021-12-11,0.0,0.000000,0.0,0.000000,0.000000,0.000000,2.240740,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0


# Export results

In [22]:
# Write the TF-IDF matrix to a CSV file in the current working directory.
tf_idf_path = './tf_idf_gdelt_lemmatized_custom_keywords.csv'
tf_idf_df.to_csv(tf_idf_path)

# Support

In [None]:
# Alternative: load a single n-gram matrix (selected by `ngram`) instead of concatenating all of them
# df = pd.read_csv(f'./merged_articles_carbon_keyword_term_document_matrix_ngram_{ngram}.csv', index_col=0)
# df.head()