## Data Processing
This notebook contains notes and documentation on data processing that was used to prepare the data for our testing.

In [None]:
import pandas as pd
import numpy as np
import data_processing_modules as dpm
# for MIND_large data processing change the folder name in all associated functions

### Initial Processing
Initially the data was stored in .tsv format without column headers. Using data_to_csv from data_processing_modules, we converted it to CSV.

In [None]:
# Convert the raw TSV exports to CSV: behaviors first, then news.
# The boolean is presumably an "is this the behaviors file" flag — confirm in data_processing_modules.
for behaviors_flag, tsv_path in (
    (True, '../MIND_large/tsv/behaviors.tsv'),
    (False, '../MIND_large/tsv/news.tsv'),
):
    dpm.data_to_csv(behaviors_flag, tsv_path)

### Processing for popularity counts 
To access popularity counts for both categories and articles, we made create_popularity_dfs and create_popularity_csvs to extract popularity information and output it into a csv for later use in visualizations. 

In [None]:
# Load both CSVs. `behaviors` has to be loaded here: the cells that follow filter it and
# pass it to decompose_interactions, so leaving this read commented out raises a NameError
# on a fresh kernel.
behaviors = pd.read_csv('../MIND_large/csv/behaviors.csv', index_col=0)
news = pd.read_csv('../MIND_large/csv/news.csv', index_col=0)
# Export of the popularity CSVs; left disabled, enable when the files need to be regenerated.
# dpm.create_popularity_csvs(news, behaviors, small=False)

In [None]:
# Keep only the rows that have a click history, then report per column whether any NaN is left.
has_history = behaviors['history'].notna()
behaviors = behaviors[has_history]
behaviors.isna().any()

### Tensorflow compatibility
TensorFlow Recommenders requires the dataset to be in a specific format in order to be compatible with its systems. Using decompose_interactions, we create a dataframe that is TensorFlow-compatible.

In [None]:
# Preview the first rows of the news frame before it is passed to decompose_interactions.
news.head()

In [None]:
# Size parameter handed to decompose_interactions; update it to change how much data is used.
num_rows = 5_000_000
tf_dataset = dpm.decompose_interactions(num_rows, news, behaviors)

# Persist the result so later sessions can reload it instead of recomputing.
tensorflow_csv_path = '../MIND_large/csv/tensorflowDataset.csv'
tf_dataset.to_csv(tensorflow_csv_path)

In [None]:
# Read the exported dataset back from disk and preview it.
# NOTE(review): the export does not pass index=False and this read does not pass index_col,
# so the saved index presumably comes back as an extra unnamed column — confirm.
tf_dataset = pd.read_csv('../MIND_large/csv/tensorflowDataset.csv')
tf_dataset.head()

### Temporal Processing
Because the behaviors data includes the interaction timestamp, we analyzed the popularity of articles at different times of day. To process this data we used create_interaction_counts (its output is loaded below from behaviors_with_individual_counts.csv). Subsequently we used modify_hourly, which extracts the hour from the timestamp.

In [None]:
# A 'popularity_type' label followed by category names.
pop = ['popularity_type'] + [
    'lifestyle', 'health', 'news', 'sports', 'weather', 'entertainment',
    'foodanddrink', 'autos', 'travel', 'video', 'tv', 'finance',
    'movies', 'music', 'kids', 'middleeast', 'games',
]
# Everything after the leading label, i.e. the category names only.
pop[1:]

In [None]:
# Preview of the grouping keys used further down: the user id, the full history, and one
# '<category>_history' name per distinct news category.
['user_id', 'history'] + [category + '_history' for category in news['category'].unique()]

In [None]:
# Regenerate the counts file only when needed:
# dpm.create_interaction_counts()

# Load the behaviors table with per-interaction counts and discard the stray 'Unnamed: 0' column.
counts_csv_path = '../MIND_large/csv/behaviors_with_individual_counts.csv'
behaviors = pd.read_csv(counts_csv_path, index_col=0)
behaviors = behaviors.drop(columns='Unnamed: 0')
behaviors.head()

In [None]:
# Hourly bin edges covering the time range of the behaviors dataset.
cut_points = pd.date_range(start='2019-11-09 00:00:00', end='2019-11-15 00:00:00', freq='h')
# TODO: adjust the cut points so that hours run 1 - 24, with 24 standing for midnight (0).

# One label per bin, formatted as "(left edge, right edge]".
bins_str = cut_points.astype(str).values
labels = ['({}, {}]'.format(left, right) for left, right in zip(bins_str[:-1], bins_str[1:])]

# `behaviors` was loaded with read_csv without date parsing, so 'time' may still hold strings,
# which cannot be reliably binned against datetime edges. Parse it explicitly first;
# pd.to_datetime leaves an already-datetime column unchanged.
interaction_times = pd.to_datetime(behaviors['time'])

# Assign every interaction to its hourly bin.
behaviors['hour'] = pd.cut(interaction_times, cut_points, labels=labels, include_lowest=True)

In [None]:
# Per the notes above, modify_hourly extracts the hour from the timestamp.
# NOTE: this rebinds `behaviors`, so re-running the cell applies modify_hourly to its own output.
behaviors = dpm.modify_hourly(behaviors)
behaviors.head()

In [None]:
def aggregate_lists(df, categories=None):
    """Collect each category's ``<category>_impression`` column of ``df`` into a list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one ``<category>_impression`` column per category.
    categories : iterable of str, optional
        Category names to aggregate. When omitted, the unique values of the
        notebook-level ``news['category']`` column are used, which is the
        original behavior.

    Returns
    -------
    pandas.Series
        Indexed by ``<category>_impression``; each value is the list of that
        column's entries in ``df``.

    Raises
    ------
    KeyError
        If ``df`` lacks the ``<category>_impression`` column for a category.
    """
    if categories is None:
        # Fall back to the notebook-level `news` frame so existing calls keep working.
        categories = news['category'].unique()
    # Only the '_impression' columns are aggregated; '_history' columns are left out.
    return pd.Series({
        f'{category}_impression': list(df[f'{category}_impression'])
        for category in categories
    })

In [None]:
# Reload the per-interaction counts from disk. This rebinds `behaviors`, discarding the
# 'hour' column and the modify_hourly result computed in the cells above.
behaviors = pd.read_csv('../MIND_large/csv/behaviors_with_individual_counts.csv', index_col=0).drop(columns='Unnamed: 0')

In [None]:
# Mark missing histories with the string sentinel '-1'. populate_dictionaries tests for the
# string '-1', and a string keeps the column single-typed; an integer -1 only matched that
# test after a CSV round trip.
behaviors['history'] = behaviors['history'].fillna('-1')

In [None]:
# Group on the user id plus the overall and per-category history columns, gathering the
# 'impressions' entries of each group into one list.
history_keys = ['user_id', 'history'] + [category + '_history' for category in news['category'].unique()]
user_impressions_df = (
    behaviors.groupby(history_keys)['impressions']
    .apply(list)
    .reset_index()
    .set_index('user_id')
)
user_impressions_df.head()

In [None]:
# Sum every '<category>_impression' column per user and index the result by user_id.
impression_columns = [category + '_impression' for category in news['category'].unique()]
user_impression_preference = (
    behaviors.groupby('user_id')[impression_columns]
    .sum()
    .reset_index()
    .set_index('user_id')
)
user_impression_preference.head()

In [None]:
# Sanity check: compare the user ids indexing the two grouped tables.
history_users = set(user_impressions_df.index)
preference_users = set(user_impression_preference.index)

# Ids present in one table but missing from the other, as lists for printing.
only_in_history = list(history_users.difference(preference_users))
only_in_preference = list(preference_users.difference(history_users))

print("Indices in df1 not in df2:", only_in_history)
print("Indices in df2 not in df1:", only_in_preference)

In [None]:
# Write both grouped tables to disk, each under its own file name.
for grouped_frame, csv_path in (
    (user_impressions_df, '../MIND_large/csv/behaviors_grouped_with_history.csv'),
    (user_impression_preference, '../MIND_large/csv/behaviors_grouped_with_impression.csv'),
):
    grouped_frame.to_csv(csv_path)

In [None]:
# Move user_id out of the index and back into a regular column on both tables.
user_impressions_df = user_impressions_df.reset_index()
user_impression_preference = user_impression_preference.reset_index()

In [None]:
# user_impressions_df.drop(columns='index',inplace=True)
# user_impression_preference.drop(columns='index',inplace=True)

In [None]:
import pandas as pd
import numpy as np
import data_processing_modules as dpm
# Reload the two grouped per-user tables from disk.
user_impressions_df = pd.read_csv('../MIND_large/csv/behaviors_grouped_with_history.csv')
# Read the impression table from its own file; this previously re-read the history file,
# so the impression-preference columns never reached the merge.
user_impression_preference = pd.read_csv('../MIND_large/csv/behaviors_grouped_with_impression.csv')

# Join explicitly on user_id instead of relying on whichever columns the frames share.
feature_matrix = user_impressions_df.merge(user_impression_preference, on='user_id')
user_interacted = feature_matrix[['user_id', 'history', 'impressions']]

# Free the wide intermediates; only the three columns above are needed from here on.
del feature_matrix
del user_impressions_df
del user_impression_preference

# NOTE: this rebinds `news` from the full news frame to just its 'news_id' column.
news = pd.read_csv('../MIND_large/csv/news.csv')['news_id']
# One int8 vector per article, initialised to -1.
# 255990 is presumably the number of rows in user_interacted — TODO confirm.
news_data = {news_id : np.full(255990, -1, dtype='int8') for news_id in news}
def populate_dictionaries(behaviors_frame):
    """
    Populates the notebook-level ``news_data`` dictionary with user preferences.

    Row ``i`` of ``behaviors_frame`` is written to position ``i`` of every article
    vector it touches: each article in the row's history is set to 1, and each
    impression is set to 1 when its score is '1' and to 0 otherwise.

    Parameters
    ----------
    behaviors_frame : pandas.DataFrame
        Must provide a 'history' column (whitespace-separated news ids, or the
        '-1' sentinel / a missing value) and an 'impressions' column (the string
        form of a list of whitespace-separated impression tokens, or a missing
        value).

    Returns
    -------
    None
        ``news_data`` is mutated in place. Later writes win, so an impression can
        overwrite a value set by the history or by an earlier impression.

    Raises
    ------
    KeyError
        If a referenced article id is not a key of ``news_data``.
    """
    # TODO: consider reusing the popularity counts already stored with the behaviors data.
    for row, (history, impressions) in enumerate(
        zip(behaviors_frame['history'], behaviors_frame['impressions'])
    ):
        # Only real strings carry a history. This skips the '-1' sentinel as well as a NaN
        # or numeric -1 value, which would otherwise fail on .split().
        if isinstance(history, str) and history != '-1':
            for news_id in history.split():
                news_data[news_id][row] = 1

        # Non-string values (e.g. a float NaN for a missing entry) have nothing to parse.
        if isinstance(impressions, str):
            # Strip the list punctuation so only whitespace-separated tokens remain.
            flattened = (
                impressions.replace('[', '')
                .replace(']', '')
                .replace("'", '')
                .replace(',', ' ')
            )
            for impression in flattened.split():
                impression_info = dpm.clean_impression(impression)
                # A score of '1' is stored as 1; any other score is stored as 0.
                news_data[impression_info['article_ID']][row] = int(impression_info['score'] == '1')

# Fill news_data from every row of user_interacted, then drop the frame to free memory.
# NOTE(review): a later cell still reads user_interacted['user_id'] and will raise a
# NameError once this del has run.
populate_dictionaries(user_interacted)
del user_interacted

In [None]:
# Column boundaries that split 72023 columns into 7 equal chunks (72023 / 7 = 10289),
# starting at 0 and ending at 72023.
matrix_separators = list(range(0, 72023 + 1, 10289))
matrix_separators

In [None]:
# Write the user-item matrix to disk in column chunks so it never has to be held in memory whole.
for chunk_number in range(len(matrix_separators) - 1):
    start, end = matrix_separators[chunk_number], matrix_separators[chunk_number + 1]
    # Exactly one column per article position in [start, end); a wider buffer would leave
    # uninitialised np.empty columns in the saved file.
    user_item_chunk = np.empty((255990, end - start), dtype='int8')
    for article_position in range(start, end):
        key = news[article_position]
        # Columns are addressed relative to the chunk; the absolute position runs past the
        # chunk width for every chunk after the first.
        user_item_chunk[:, article_position - start] = news_data[key]
        # Drop the source vector as soon as it is copied to keep peak memory down.
        del news_data[key]
    # Files are named after the chunk's exclusive end position (10289, 20578, ...), which is
    # what the loading cells look for.
    np.save(f'../MIND_large/{end}user_item_mat.npy', user_item_chunk)
    del user_item_chunk



In [1]:
# pandas must be bound to `pd`; aliasing it as `np` was immediately overwritten by numpy,
# leaving `pd` undefined on a fresh kernel.
import pandas as pd
import numpy as np

In [2]:
# File-name prefixes of the saved column chunks.
indices = [10289, 20578, 30867, 41156, 51445, 61734, 72023]

# Load the first four chunks and join them side by side (column-wise).
first_chunks = [np.load(f'../MIND_large/{chunk_end}user_item_mat.npy') for chunk_end in indices[:4]]
matrix = np.concatenate(first_chunks, axis=1)
# Release the individual chunks so only the combined matrix stays in memory.
del first_chunks

matrix


array([[-1, -1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1, -1],
       ...,
       [-1, -1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1, -1]], dtype=int8)

In [3]:
# Load the remaining chunks, printing each prefix as a progress marker, and attach them
# to the right-hand side of the matrix.
remaining_chunks = []
for chunk_end in indices[4:]:
    print(chunk_end)
    remaining_chunks.append(np.load(f'../MIND_large/{chunk_end}user_item_mat.npy'))
matrix = np.concatenate([matrix, *remaining_chunks], axis=1)
# Release the individual chunks so only the combined matrix stays in memory.
del remaining_chunks

matrix

51445
61734


: 

In [None]:
# Alternative to the chunked export: build the whole user-item matrix in one allocation.
# NOTE: this needs about 18 GB (255990 x 72023 int8 values) and a `news_data` that still
# holds every article vector; the chunked export deletes them as it goes.
user_item_matrix = np.empty((255990, 72023), dtype='int8')

for column in range(72023):
    key = news[column]
    user_item_matrix[:, column] = news_data[key]
    # Drop each source vector once copied to keep peak memory down.
    del news_data[key]


In [None]:
# Stack every article vector as one column, in dictionary order, and save the result.
article_vectors = list(news_data.values())
user_item_matrix = np.column_stack(article_vectors)
np.save('../MIND_large/user_item_mat.npy', user_item_matrix)

In [None]:
# `user_interacted` is deleted right after populate_dictionaries runs, so the user ids are
# re-read from the grouped-history CSV instead.
# NOTE(review): assumes that file lists the same users, in the same order, as the rows
# that were used to fill news_data — confirm.
user_ids = pd.read_csv(
    '../MIND_large/csv/behaviors_grouped_with_history.csv', usecols=['user_id']
)['user_id'].unique()
user_item_matrix = pd.DataFrame(data=news_data, index=user_ids)

In [None]:
# Show how many article vectors remain rather than echoing the whole dictionary, which
# holds one 255990-element array per article.
len(news_data)

### Clustering Processing
With the goal of minimizing search spaces and making our recommenders more efficient, we utilized clustering. To cluster news articles, we used a pre-trained BERT model to extract embeddings for the abstracts and titles present in the dataset with create_text_embeddings. In addition to BERT embeddings, we used scikit-learn's bag-of-words and tf-idf vectorizers. Because the scikit-learn vectorizers require only a few lines of code, that preprocessing is done during clustering, whereas the BERT embeddings are precomputed below.

In [None]:
# Precompute BERT embeddings for the news articles.
# NOTE(review): an earlier cell rebinds `news` to only the 'news_id' column, and small=True
# differs from the MIND_large paths and the small=False used elsewhere — confirm both are intended.
dpm.preprocess_BERT_embeddings(news, small=True)

In [None]:
# Might want to consider using UMAP union on the title and abstract embeddings since currently UMAP is
# reducing all of them together which could cause a loss of data quality