# Import packages

In [None]:
# Run this if running in Google Colab
# Mount Google Drive if running from Google Colab
from google.colab import drive
drive.mount('/content/drive')

# Set current directory if running from Google Colab
# (the relative file paths used in later cells are resolved against this folder)
import os
os.chdir('/content/drive/My Drive/Carbon_price_prediction/Workspace/Data')

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

import pickle

# Custom functions

In [None]:
def tf_idf(df: pd.DataFrame) -> pd.DataFrame:
    """
    Creates TF-IDF scores based on a term-document matrix

    Rows are documents (dates in our case) and columns are terms. A pandas
    Series holding the counts of a single term is accepted as well.

    The score is log(1 + tf) * log(1 + N / n_t), where N is the number of
    documents and n_t is the number of documents in which the term occurs.

    Parameters:
        df (pandas DataFrame): term-document dataframe of non-negative counts

    Returns:
        tf_idf_df (pandas DataFrame): Dataframe of TF-IDF scores with the same
            shape and labels as df; terms that never occur get a score of 0

    """

    # Count term occurrences across documents (dates in our case)
    doc_freq = (df > 0).sum(axis=0)

    # A term that never occurs has a document frequency of 0, which would give
    # N / 0 = inf and then 0 * inf = NaN. Its term frequency is 0 everywhere,
    # so any finite IDF yields the correct score of 0: clamp the count to 1.
    doc_freq = np.maximum(doc_freq, 1)

    # Get number of documents
    n_docs = len(df)

    # Implementation based on the 2. recommended option here: https://en.wikipedia.org/wiki/Tf%E2%80%93idf
    # The IDF is a per-term Series (or a scalar for Series input); pandas
    # aligns it with the columns of df in the multiplication below.
    idf = np.log(1 + n_docs / doc_freq)
    tf_idf_df = np.log(1 + df) * idf

    return tf_idf_df

In [None]:
def agg_keyword_index_tf_idf_plot(ma_window, first_date, df, grouping):
    """
    Plots the aggregated keyword index with its moving average and saves the figure as PDF

    Parameters:
        ma_window: window passed to rolling() for the moving average
            (labelled as a number of days in the legend)
        first_date: datetime-like lower bound; only observations with an index
            strictly greater than it are plotted
        df (pandas Series or DataFrame): scores to plot, indexed by values
            comparable with first_date
        grouping (bool): if truthy, '_grouped' is added to the output file name

    Returns:
        None. The figure is written to the ./tf_idf/ folder and then shown.

    """
    import os

    filtered_df = df[df.index > first_date]

    plt.plot(filtered_df, label='Raw scores', alpha=0.2)
    plt.plot(filtered_df.rolling(ma_window).mean(), label=f'{ma_window}-day moving average')
    plt.ylabel("TF-IDF score")
    plt.title("Time series of aggregated keyword index")
    plt.legend()
    plt.tight_layout()
    grouping_flag = '_grouped' if grouping else ''
    # savefig does not create missing folders, so make sure the target exists
    os.makedirs('./tf_idf', exist_ok=True)
    # Save before show(): showing the figure may leave nothing to save afterwards
    plt.savefig(f'./tf_idf/aggregated{grouping_flag}_tf_idf_index_ts_ma_window_{ma_window}_first_date_{first_date.strftime("%Y%m%d")}.pdf')
    plt.show()

# Parameters / Constants

In [None]:
# N-gram sizes; one keyword term-document matrix CSV is loaded per size
ngrams = [1, 2, 3]
# If True, keyword columns are merged into groups and output files get a '_grouped' suffix
grouping = True

In [None]:
# The keyword -> group mapping is only needed when grouping is enabled
if grouping:
    # squeeze("columns") turns a single-column frame into a Series without
    # collapsing it to a scalar when the mapping happens to have only one row
    group_mapping = pd.read_csv('./keyword_lists/group_mapping.csv', index_col=0).squeeze("columns")
    # Print explicitly: a bare expression inside an `if` block is not displayed by the notebook
    print(group_mapping.head())

# Data import

In [None]:
# Load one keyword term-document matrix per n-gram size; the first CSV column becomes the row index
dfs = [pd.read_csv(f'./merged_articles_carbon_keyword_term_document_matrix_ngram_{ngram}.csv', index_col=0)
       for ngram in ngrams]
# Place the matrices side by side (as columns), aligned on their row index
df = pd.concat(dfs, axis=1)
print(df.shape)
print(df.head())

In [None]:
# Load the lemmatized articles table; its 'date' column is used further down to
# relabel the rows of the keyword matrix (index presumably holds article ids - confirm)
id_date_map_df = pd.read_csv('./lemmatized_merged_articles.csv', index_col=0)
id_date_map_df.shape

In [None]:
# Shape of the keyword count matrix, for comparison with the articles table
df.shape

In [None]:
# Reindex keyword counts by dates
# Each row label found in the mapping is replaced by its 'date'; labels missing
# from the mapping are left unchanged by rename().
# No .squeeze() on the column: for a one-row table it would collapse the Series
# to a scalar, which has no .to_dict().
df = df.rename(index=id_date_map_df['date'].to_dict())
print(df.head())

In [None]:
# Sanity check: print the smallest and largest row labels
# (expected to be the first and last date after the reindexing above)
print(min(df.index))
print(max(df.index))

# Data Preprocessing

In [None]:
# Potential grouping
if grouping:
    # Relabel keyword columns with their group; keywords that are missing from
    # the mapping keep their own name
    df = df.rename(columns=group_mapping)
    # Sum the columns that now share a label. The grouping is done on the
    # transposed frame because DataFrame.groupby(axis=1) is deprecated since
    # pandas 2.1; transposing back restores the original orientation.
    df = df.T.groupby(level=0).sum().T

In [None]:
# Preview the keyword counts after the (optional) grouping step
df.head()

In [None]:
# Aggregate by dates
# Rows sharing the same index label are collapsed into one row of summed counts
agg_df = df.groupby(level=0).sum()

In [None]:
# Display the per-date aggregated counts
agg_df

# TF-IDF generation

In [None]:
# Generate TF-IDF scores
# Each row of agg_df (one per date) is treated as a document
tf_idf_df = tf_idf(agg_df)

In [None]:
# Display the TF-IDF scores
tf_idf_df

# Export results

In [None]:
# File name suffix marking outputs produced with keyword grouping enabled
if grouping:
    grouping_flag = '_grouped'
else:
    grouping_flag = ''

In [None]:
# Write the TF-IDF scores to CSV; the file name carries the grouping suffix
tf_idf_df.to_csv(f'./tf_idf_gdelt_lemmatized{grouping_flag}_custom_keywords.csv')

# Aggregated keywords "index"

In [None]:
# Build one number per date: first sum the counts of rows sharing an index
# label, then add up all keyword columns of each resulting row
agg_keyword_index_df = (
    df.groupby(level=0)
    .sum()
    .sum(axis=1)
)
agg_keyword_index_df

In [None]:
# Apply the same TF-IDF weighting to the aggregated index; the input is a Series
# here, so the document frequency inside tf_idf is a single count
agg_keyword_index_tf_idf_df = tf_idf(agg_keyword_index_df)

In [None]:
# Display the TF-IDF weighted aggregated index
agg_keyword_index_tf_idf_df

In [None]:
# Converting the index to date type
# (needed so the index can be compared with the Timestamp used when plotting)
agg_keyword_index_tf_idf_df.index = pd.to_datetime(agg_keyword_index_tf_idf_df.index)

In [None]:
# Visualize index over time
ma_window = 7  # window of the moving average drawn on top of the raw scores
first_date = pd.Timestamp('2021-01-01')  # only later observations are plotted

agg_keyword_index_tf_idf_plot(
    ma_window=ma_window,
    first_date=first_date,
    df=agg_keyword_index_tf_idf_df,
    grouping=grouping,
)

# Support

In [None]:
# Single dataframe
# df = pd.read_csv(f'./merged_articles_carbon_keyword_term_document_matrix_ngram_{ngram}.csv', index_col=0)
# df.head()

In [None]:
# nt = (agg_df > 0).sum(axis=0)
# nt = (agg_df < 0) + nt
# print(nt)

In [None]:
# # Implementation based on the 2. recommended option here: https://en.wikipedia.org/wiki/Tf%E2%80%93idf
# N = len(agg_df)

# tf_idf_df = np.log(1 + agg_df) * np.log(1 + N / nt)

In [None]:
# agg_keyword_index_tf_idf_df[agg_keyword_index_tf_idf_df.index > pd.to_datetime('2021-10-10')]

In [None]:
# plt.plot(agg_keyword_index_tf_idf_df, label='Raw scores', alpha=0.7)
# plt.plot(agg_keyword_index_tf_idf_df.rolling(ma_window).mean(), label=f'{ma_window}-day moving average')
# plt.ylabel("TF-IDF score")
# plt.title("TEST")
# plt.tight_layout()
# plt.savefig(f'./tf_idf/aggregated_tf_idf_index_ts.pdf')
# plt.show()

In [None]:
# df2 = df.rename(columns=group_mapping).transpose()
# df2 = df2.groupby(by=df2.index, axis=0).apply(lambda g: g.sum())
# df2.transpose().groupby(by=df2.transpose().index, axis=0).apply(lambda g: g.sum())

In [None]:
# df3 = df2.groupby(by=df2.columns, axis=1).apply(lambda g: g.sum(axis=1))

# df.reset_index(inplace=True)
# df.groupby('INCIDENTDATE').max()

In [None]:
# ngram = 1
# df.columns

In [None]:
# groups = ['renewables', 'fossil_fuel', 'policy', 'emissions']
# group_mapping = {'ghg': 'emissions',
#                  'climate': 'policy',
#                  'sustainability': 'policy',
#                  'sustainable': 'policy',
#                  'environment': 'policy',
#                  'ets': 'policy',
#                  'coal': 'fossil_fuel',
#                 #  'gas': 'fossil_fuel',
#                  'oil': 'fossil_fuel',
#                  'crude': 'fossil_fuel',
#                  'gasoline': 'fossil_fuel',
#                  'diesel': 'fossil_fuel',
#                  'petrol': 'fossil_fuel',
#                  'fuel': 'fossil_fuel',
#                  'electricity': 'renewables',
#                  'renewable': 'renewables',
#                  'carbon dioxide': 'emissions',
#                  'greenhouse gas': 'emissions',
#                  'green deal': 'policy',
#                  'solar power': 'renewables',
#                  'solar panel': 'renewables',
#                  'solar energy': 'renewables',
#                  'wind power': 'renewables',
#                  'wind turbine': 'renewables',
#                  'wind energy': 'renewables',
#                  'natural gas': 'fossil_fuel',
#                  'gas price': 'fossil_fuel',
#                  'gas boiler': 'fossil_fuel',
#                  'gas heating': 'fossil_fuel',
#                  'gas turbine': 'fossil_fuel',
#                  'fossil fuel': 'fossil_fuel',
#                  'nuclear power': 'renewables',
#                  'nuclear plant': 'renewables',
#                  'nuclear energy': 'renewables',
#                  'clean energy': 'renewables',
#                  'green energy': 'renewables',
#                  'emission trading system': 'policy',
#                  'emission trading scheme': 'policy'}

In [None]:
# pd.Series(group_mapping).to_csv('./keyword_lists/group_mapping.csv')