# Purpose

### 2022-03-28
After using ANNOY to calculate the nearest-neighbor distances, we need to upload them to BigQuery so that we can share this data and use it outside of Python/notebooks.

# Imports & notebook setup

In [None]:
# Colab extension for DataFrame display (presumably renders frames as interactive tables — confirm)
%load_ext google.colab.data_table
# autoreload mode 2: re-import modules before each cell executes, so edits to
# imported code are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [None]:
# colab auth for BigQuery & google drive
from google.colab import auth, files, drive
import sys  # need sys for mounting gdrive path

# Authenticate the current Colab user; the confirmation below only prints
# if this call returns without raising.
auth.authenticate_user()
print('Authenticated')

Authenticated


### Install libraries

These might be necessary to read from GCS

In [None]:
# # install subclu & libraries needed to read parquet files from GCS & spreadsheets
# #  make sure to use the [colab] `extra` because it includes colab-specific libraries
# module_path = f"{g_drive_root}/MyDrive/Colab Notebooks/subreddit_clustering_i18n/[colab]"

!pip install fsspec gcsfs



## General imports

In [None]:
# Regular Imports
import os
from datetime import datetime, timezone

from google.cloud import bigquery

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib_venn import venn2_unweighted, venn3_unweighted


# Some libraries read this env variable to know which project to use when
# getting data from BigQuery.
# Alternate project (disabled): 'data-science-prod-218515'
os.environ.update(GOOGLE_CLOUD_PROJECT='data-prod-165221')

# Load df top nearest neighbors


In [None]:
%%time
# GCS location of the parquet file with the top nearest neighbors
gs_df_nn = 'gs://i18n-subreddit-clustering/data/models/nearest_neighbors/manual_model_2022-03-28_191331/df_nearest_neighbors_top-4906242_by_7.parquet'
# pandas reads the `gs://` path directly (relies on the fsspec/gcsfs install earlier in this notebook)
df_nn_top = pd.read_parquet(gs_df_nn)
# Sanity check: (rows, columns) of the loaded frame
print(df_nn_top.shape)

(4906242, 7)
CPU times: user 2.71 s, sys: 904 ms, total: 3.62 s
Wall time: 3.54 s


In [None]:
# Summarize index range, column dtypes, and memory usage of the loaded frame
df_nn_top.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4906242 entries, 1 to 4955799
Data columns (total 7 columns):
 #   Column             Dtype  
---  ------             -----  
 0   subreddit_id_a     object 
 1   subreddit_name_a   object 
 2   distance_rank      int64  
 3   subreddit_id_b     object 
 4   subreddit_name_b   object 
 5   distance           float64
 6   cosine_similarity  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 299.5+ MB


In [None]:
# Preview the first 5 rows with every column. The previous `.iloc[:5, :9]` used a
# column bound larger than the frame's 7 columns, so it already showed all of them.
df_nn_top.head(5)

Unnamed: 0,subreddit_id_a,subreddit_name_a,distance_rank,subreddit_id_b,subreddit_name_b,distance,cosine_similarity
1,t5_46wt4h,0hthaatsjaay,1,t5_5fweuy,bestpyt,0.505327,0.872322
2,t5_46wt4h,0hthaatsjaay,2,t5_4ph6vm,babyfacejassbest,0.559713,0.843361
3,t5_46wt4h,0hthaatsjaay,3,t5_4p3c20,officialtootie,0.561146,0.842558
4,t5_46wt4h,0hthaatsjaay,4,t5_2kxm87,honeybthatsme,0.569104,0.83806
5,t5_46wt4h,0hthaatsjaay,5,t5_3ng2du,yungblasian,0.578581,0.832622


In [None]:
# Column names as a plain Python list (handy for copy/pasting into a column selection)
list(df_nn_top.columns)

['subreddit_id_a',
 'subreddit_name_a',
 'distance_rank',
 'subreddit_id_b',
 'subreddit_name_b',
 'distance',
 'cosine_similarity']

# Save table to BigQuery

NOTE: Sorting is not guaranteed in the final BigQuery table.

We can "force" sorting if we set `chunksize` to a number smaller than the full df size.

ETA with `chunksize` = 10k rows per batch:
- 490k subreddit pairs = 3:30 minutes (3.5)
- 4.9 million subreddit pairs = ~37 minutes

In [None]:
# Columns to publish to BigQuery. `distance` is not in this list, so
# `cosine_similarity` is the only similarity measure that gets uploaded.
l_cols_for_bq = [
    'subreddit_id_a',
    'subreddit_name_a',
    'distance_rank',
    'subreddit_id_b',
    'subreddit_name_b',
    'cosine_similarity'
]

# Add a `table_creation_date` column holding today's UTC date (as a midnight timestamp),
# then upload the selected columns.
# `datetime.now(timezone.utc)` replaces `datetime.utcnow()`, which is deprecated since
# Python 3.12; the `.date()` taken from either one is the same UTC calendar date.
(
    df_nn_top[l_cols_for_bq]
    .assign(table_creation_date=pd.to_datetime(datetime.now(timezone.utc).date()))
    .to_gbq(
        destination_table='david_bermejo.subclu_v0041_subreddit_distances_c_top_100',
        project_id='reddit-employee-datasets',
        # Upload in batches of 10k rows (see the ETA notes above this cell)
        chunksize=10000,
        # Overwrite the destination table if it already exists
        if_exists='replace'
    )
)

491it [36:27,  4.46s/it]
