# Purpose

### 2021-10-15
In this notebook I calculate the primary language for each subreddit in v0.4.0 so we can create a table to re-use in other places.


---


May need to review and/or delete temp files in these folders:
- `/home/jupyter/`
- `/home/jupyter/subreddit_clustering_i18n`


# Imports & notebook setup

In [1]:
%load_ext google.colab.data_table

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# colab auth for BigQuery
# Authenticates the current Colab user; must run before the cells that talk to
# Google Cloud (e.g. the `to_gbq` upload at the end of this notebook).
from google.colab import auth
auth.authenticate_user()
# Only reached when `authenticate_user()` returned without raising
print('Authenticated')

Authenticated


### Attach my drive + install my code

In [4]:
# Mount Google Drive and make the project code stored there importable
import sys

from google.colab import drive

# force_remount=True: re-mount even if the drive is already attached
drive.mount('/content/gdrive', force_remount=True)

# Folders that need to be on the import path
l_paths_to_append = [
    '/content/gdrive/MyDrive/Colab Notebooks',
    # the repo folder itself is needed so that colab can import `subclu` properly
    '/content/gdrive/MyDrive/Colab Notebooks/subreddit_clustering_i18n',
]
for path_ in l_paths_to_append:
    # already registered (e.g. the cell was re-run): don't add a duplicate entry
    if path_ in sys.path:
        continue
    sys.path.append(path_)


# from eda import (
#     setup_logging, counts_describe, value_counts_and_pcts,
#     style_df_numeric,
# )

Mounted at /content/gdrive


In [55]:
# Regular Imports
from datetime import datetime

from google.cloud import bigquery

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib_venn import venn2_unweighted, venn3_unweighted

In [6]:
## install libraries needed to read parquet files from GCS

# !pip install -e "/content/gdrive/MyDrive/Colab Notebooks/subreddit_clustering_i18n/"

In [7]:
# !pip list

In [32]:
# subclu imports

# For reloading, need to force-delete some imported items before importing them
# again below.
# On a fresh kernel none of these names exist yet, so `del` raises NameError.
# That is the only failure we expect here, so it is the only one we silence
# (the previous `except Exception` would also have hidden unrelated errors).
try:
    del LoadPosts, LoadSubreddits
    del (
        L_CLD3_CODES_FOR_TOP_LANGUAGES_USED_AT_REDDIT,
        L_CLD3_CODES_FOR_TOP_LANGUAGES_AND_USE_MULTILINGUAL,
        D_CLD3_CODE_TO_LANGUAGE_NAME,
    )
except NameError:
    pass

from subclu.utils.hydra_config_loader import LoadHydraConfig
from subclu.data.data_loaders import LoadPosts, LoadSubreddits
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric, reorder_array,
)
from subclu.utils.language_code_mapping import (
    L_CLD3_CODES_FOR_TOP_LANGUAGES_USED_AT_REDDIT,
    L_CLD3_CODES_FOR_TOP_LANGUAGES_AND_USE_MULTILINGUAL,
    D_CLD3_CODE_TO_LANGUAGE_NAME,
)

# Configure logging for the rest of the notebook (the timestamped INFO lines in
# the loader outputs below)
setup_logging()

In [33]:
# Check that we're pulling the latest codes.
# The expected counts are hard-coded on purpose: when the lists in
# `language_code_mapping` change, these assertions fail and must be updated.

n_codes_top_languages = len(L_CLD3_CODES_FOR_TOP_LANGUAGES_USED_AT_REDDIT)
print(n_codes_top_languages)
assert 41 == n_codes_top_languages, (
    "Check updated codes: expected 41 codes in "
    f"L_CLD3_CODES_FOR_TOP_LANGUAGES_USED_AT_REDDIT, got {n_codes_top_languages}"
)

n_codes_use_multilingual = len(L_CLD3_CODES_FOR_TOP_LANGUAGES_AND_USE_MULTILINGUAL)
print(n_codes_use_multilingual)
assert 44 == n_codes_use_multilingual, (
    "Check updated codes: expected 44 codes in "
    f"L_CLD3_CODES_FOR_TOP_LANGUAGES_AND_USE_MULTILINGUAL, got {n_codes_use_multilingual}"
)

41
44


In [34]:
# Show every CLD3 code whose mapped language name contains 'Chinese'
{
    code: language_name
    for code, language_name in D_CLD3_CODE_TO_LANGUAGE_NAME.items()
    if 'Chinese' in language_name
}

{'zh': 'Chinese', 'zh-Latn': 'Chinese', 'zh-cn': 'Chinese', 'zh-tw': 'Chinese'}

In [35]:
# set OS environment to use data-prod by default
# NOTE(review): presumably the Google Cloud clients used later pick this up as
# their default project when none is passed explicitly — confirm.
import os

os.environ['GOOGLE_CLOUD_PROJECT'] = 'data-prod-165221'

# Load subreddit data using data class

Ideally we could just pull the configuration data from github... for now I'm syncing manually via google drive.

`subclu/config/data_embeddings_to_cluster/v0.4.0_2021_10_14-use_multi_lower_case_false_00.yaml`

[Github URL](https://github.snooguts.net/david-bermejo/subreddit_clustering_i18n/blob/djb_v040_expand_and_refactor/subclu/config/data_embeddings_to_cluster/v0.4.0_2021_10_14-use_multi_lower_case_false_00.yaml)

---

Temp files location:
- `/home/jupyter/`
- `/home/jupyter/subreddit_clustering_i18n`


In [12]:
# Load the v0.4.0 text & metadata config. Later cells read the bucket name and
# the posts/subreddits folders from `config_v040.config_dict`.
# NOTE(review): `config_path` is relative — presumably resolved against the
# location of the subclu config-loader module, not this notebook; confirm.
config_v040 = LoadHydraConfig(
    config_name='v0.4.0_19k_subreddits_2021_09_27',
    config_path="../config/data_text_and_metadata",
)

In [13]:
config_v040.config_dict

{'bucket_name': 'i18n-subreddit-clustering',
 'comments_vectorized_gcs': ['gs://i18n-subreddit-clustering/mlflow/mlruns/14/5f10cd75334142168a6ebb787e477c1f/artifacts/df_vect_comments/*.parquet',
  'gs://i18n-subreddit-clustering/mlflow/mlruns/14/2fcfefc3d5af43328168d3478b4fdeb6/artifacts/df_vect_comments/*.parquet'],
 'comments_vectorized_mlflow_uuids': ['5f10cd75334142168a6ebb787e477c1f',
  '2fcfefc3d5af43328168d3478b4fdeb6'],
 'comments_vectorized_mlflow_uuids_lowercase': None,
 'dataset_name': 'v0.4.0 inputs - Top Subreddits (no Geo) + Geo-relevant subs, comments: TBD',
 'folder_comments_text_and_meta': 'comments/top/2021-10-04',
 'folder_posts_text_and_meta': 'posts/top/2021-09-27',
 'folder_subreddits_text_and_meta': 'subreddits/top/2021-09-24',
 'posts_vectorized_mlflow_uuids': ['8eef951842a34a6e81d176b15ae74afd'],
 'posts_vectorized_mlflow_uuids_lowercase': ['537514ab3c724b10903000501802de0e'],
 'subreddit_meta_vectorized_mlflow_uuids': ['8eef951842a34a6e81d176b15ae74afd'],
 'su

## Load posts

And get language counts (by raw language code).

ETA: 
- ~ 4 minutes (downloading + loading data to memory)
- ~ 1 minute (after data is downloaded locally)


In [14]:
%%time

col_manual_labels = 'manual_topic_and_rating'

# Drop the posts frame left over from a previous run of this cell, if any
# (presumably so the old ~8M-row frame isn't kept alive while the new one loads).
# `del` on an unbound name raises NameError, which is expected on a fresh kernel;
# anything else is a real problem and should surface.
try:
    del df_posts
except NameError:
    pass

# Read the post-level data from the bucket/folder given by the v0.4.0 config.
# NOTE(review): `columns='aggregate_embeddings_'` looks like the name of a
# predefined column set understood by `LoadPosts` — confirm in data_loaders.
df_posts = LoadPosts(
    bucket_name=config_v040.config_dict['bucket_name'],
    folder_path=config_v040.config_dict['folder_posts_text_and_meta'],
    columns='aggregate_embeddings_',
    col_new_manual_topic=col_manual_labels,
).read_and_apply_transformations()

print(df_posts.shape)

21:23:20 | INFO | "Reading raw data..."
21:23:20 | INFO | "Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/posts/top/2021-09-27"
100%|##############################| 27/27 [00:00<00:00, 30549.29it/s]
21:23:39 | INFO | "  Applying transformations..."


(8439672, 15)
CPU times: user 1min 2s, sys: 4.97 s, total: 1min 7s
Wall time: 1min 5s


### Get list of most common languages

Compare with previous list of most common languages & update it so that we have a better representation in primary & secondary languages.

In [15]:
value_counts_and_pcts(df_posts['weighted_language'], top_n=20)

Unnamed: 0,weighted_language-count,weighted_language-percent,weighted_language-pct_cumulative_sum
en,7297230,86.5%,86.5%
de,135996,1.6%,88.1%
es,90288,1.1%,89.1%
pt,83717,1.0%,90.1%
fr,54513,0.6%,90.8%
it,53512,0.6%,91.4%
UNKNOWN,47593,0.6%,92.0%
af,44132,0.5%,92.5%
nl,42053,0.5%,93.0%
no,37185,0.4%,93.4%


Check Chinese & other non-latin script languages

These may need to be re-mapped in the `language_code_mapping.py` module.

In [16]:
# Language codes that contain a hyphen (e.g. `zh-cn`, `hi-Latn`): how often does
# each one appear at the post level?
value_counts_and_pcts(
    df_posts.loc[
        df_posts['weighted_language'].str.contains('-'),
        'weighted_language',
    ],
    top_n=20,
)

Unnamed: 0,weighted_language-count,weighted_language-percent,weighted_language-pct_cumulative_sum
zh-cn,3951,23.6%,23.6%
hi-Latn,3640,21.8%,45.4%
ru-Latn,2839,17.0%,62.4%
el-Latn,2258,13.5%,75.8%
ja-Latn,1680,10.0%,85.9%
bg-Latn,1198,7.2%,93.1%
zh-Latn,1113,6.7%,99.7%
zh-tw,49,0.3%,100.0%


In [17]:
df_posts[df_posts['weighted_language'].str.contains('hi')].shape

(4848, 15)

In [18]:
# Break down the posts whose language code contains 'hi'.
# `str.contains` matches anywhere in the code, so this covers `hi` and `hi-Latn`.
value_counts_and_pcts(
    df_posts.loc[
        df_posts['weighted_language'].str.contains('hi'),
        'weighted_language',
    ],
    top_n=20,
)

Unnamed: 0,weighted_language-count,weighted_language-percent,weighted_language-pct_cumulative_sum
hi-Latn,3640,75.1%,75.1%
hi,1208,24.9%,100.0%


### Check language _names_

In [19]:
df_posts['weighted_language_top'].nunique()

40

In [20]:
value_counts_and_pcts(df_posts['weighted_language_top'], top_n=43)

Unnamed: 0,weighted_language_top-count,weighted_language_top-percent,weighted_language_top-pct_cumulative_sum
English,7297230,86.5%,86.5%
Other_language,137428,1.6%,88.1%
German,135996,1.6%,89.7%
Spanish,90288,1.1%,90.8%
Portuguese,83717,1.0%,91.8%
French,54513,0.6%,92.4%
Italian,53512,0.6%,93.0%
UNKNOWN,47593,0.6%,93.6%
Afrikaans,44132,0.5%,94.1%
Dutch,42053,0.5%,94.6%


## Load subreddits (and implicitly posts)

In [36]:
%%time

col_manual_labels = 'manual_topic_and_rating'

# Drop the subreddit frame left over from a previous run of this cell, if any.
# Only NameError (name not bound yet) is expected from `del`; don't hide other errors.
try:
    del df_subs
except NameError:
    pass

# Read subreddit-level metadata and merge it with post-level aggregates.
# `df_posts` (loaded earlier in this notebook) is passed in explicitly.
# NOTE(review): the method can apparently also be called without arguments, in
# which case it presumably loads the posts itself from `folder_posts` — confirm.
df_subs = LoadSubreddits(
    bucket_name=config_v040.config_dict['bucket_name'],
    folder_path=config_v040.config_dict['folder_subreddits_text_and_meta'],
    folder_posts=config_v040.config_dict['folder_posts_text_and_meta'],
    columns=None,
    col_new_manual_topic=col_manual_labels,
).read_apply_transformations_and_merge_post_aggs(df_posts)

print(df_subs.shape)

21:25:58 | INFO | "  reading sub-level data & merging with aggregates..."
21:25:58 | INFO | "Reading raw data..."
21:25:58 | INFO | "Local folder to download artifact(s):
  /home/jupyter/subreddit_clustering_i18n/data/local_cache/subreddits/top/2021-09-24"
100%|#################################| 1/1 [00:00<00:00, 1207.34it/s]
21:25:59 | INFO | "  Applying transformations..."


(19262, 98)
CPU times: user 18.8 s, sys: 659 ms, total: 19.5 s
Wall time: 19.8 s


In [37]:
df_subs.iloc[:5, :10]

Unnamed: 0,pt_date,subreddit_name,subreddit_id,geo_relevant_country_codes,geo_relevant_countries,geo_relevant_country_count,geo_relevant_subreddit,ambassador_subreddit,combined_topic,combined_topic_and_rating
0,2021-09-21,askreddit,t5_2qh1i,,,,False,False,uncategorized,uncategorized
1,2021-09-21,pics,t5_2qh0u,,,,False,False,art,art
2,2021-09-21,funny,t5_2qh33,,,,False,False,uncategorized,uncategorized
3,2021-09-21,memes,t5_2qjpg,,,,False,False,uncategorized,uncategorized
4,2021-09-21,interestingasfuck,t5_2qhsa,,,,False,False,uncategorized,uncategorized


In [38]:
df_subs.iloc[:10, -20:]

Unnamed: 0,primary_post_language_percent,primary_post_language_in_use_multilingual,secondary_post_language,secondary_post_language_percent,crosspost_post_type_percent,gallery_post_type_percent,gif_post_type_percent,image_post_type_percent,link_post_type_percent,liveaudio_post_type_percent,multi_media_post_type_percent,poll_post_type_percent,rpan_post_type_percent,text_post_type_percent,video_post_type_percent,primary_post_type,primary_post_type_percent,posts_for_modeling_count,post_median_word_count,post_median_text_len
0,0.9975,True,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,text,1.0,1200.0,11.0,58.0
1,0.935,True,,,0.003333,0.0,0.0,0.915,0.081667,0.0,0.0,0.0,0.0,0.0,0.0,image,0.915,1200.0,10.0,57.0
2,0.861667,True,German,0.015833,0.0,0.0,0.0225,0.625,0.071667,0.0,0.0,0.0,0.0,0.000833,0.28,image,0.625,1200.0,6.0,33.0
3,0.805833,True,Danish,0.015,0.0,0.0,0.085,0.8925,0.0225,0.0,0.0,0.0,0.0,0.0,0.0,image,0.8925,1200.0,4.0,23.0
4,0.9375,True,,,0.0,0.0,0.0,0.688333,0.311667,0.0,0.0,0.0,0.0,0.0,0.0,image,0.688333,1200.0,11.0,60.0
5,0.635833,True,Turkish,0.029167,0.011667,0.004167,0.143333,0.795833,0.036667,0.0,0.0,0.0,0.0,0.000833,0.0075,image,0.795833,1200.0,3.0,17.0
6,0.934167,True,,,0.125833,0.001667,0.006667,0.0,0.005,0.0,0.0,0.0,0.0,0.0,0.860833,video,0.860833,1200.0,11.0,60.0
7,0.856667,True,Afrikaans,0.01,0.001667,0.016667,0.01,0.724167,0.055833,0.0,0.0,0.0,0.0,0.0,0.191667,image,0.724167,1200.0,6.0,33.0
8,0.770833,True,Norwegian,0.02,0.0,0.0,0.050833,0.014167,0.021667,0.0,0.0,0.0,0.0,0.0,0.913333,video,0.913333,1200.0,4.0,23.0
9,0.996667,True,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,text,1.0,1200.0,35.5,181.0


In [39]:
[c for c in df_subs.columns if 'text' in c]

['text_post_type_percent', 'post_median_text_len']

# Extract only language-related columns

In [40]:
l_post_percent = [c for c in df_subs.columns if '_percent' in c]
print(len(l_post_percent))

54


In [41]:
# the last column with language-related data is expected to be `primary_post_language_percent`
# Its position in `l_post_percent` is used below as the upper bound when slicing
# out the per-language percent columns.
# `list.index` raises ValueError if the column is missing.
ix_language_limit = l_post_percent.index('primary_post_language_percent')
ix_language_limit

40

In [66]:
# Subreddit identifiers
l_ix_sub = ['subreddit_id', 'subreddit_name']

# Primary & secondary post language: name + share of posts
l_cols_primary_language = [
    'primary_post_language',
    'primary_post_language_percent',
    'secondary_post_language',
    'secondary_post_language_percent',
]

# Geo-relevance and ambassador flags
l_cols_geo_and_ambassador = [
    'geo_relevant_countries',
    'geo_relevant_country_count',
    'geo_relevant_country_codes',
    'geo_relevant_subreddit',
    'ambassador_subreddit',
]

# Stats about the text that was used for modeling
l_cols_text_used = [
    'posts_for_modeling_count',
    'post_median_text_len',
    'post_median_word_count',
    'primary_post_language_in_use_multilingual',
]

# Per-language percent columns: every `_percent` column up to (and including)
# the language limit, minus the primary/secondary columns already listed above
l_language_cols = list(
    filter(
        lambda col_: col_ not in l_cols_primary_language,
        l_post_percent[:ix_language_limit + 1],
    )
)

# Final column order for the exported table
l_cols_to_save = [
    *l_ix_sub,
    *l_cols_primary_language,
    *l_cols_geo_and_ambassador,
    *l_cols_text_used,
    *l_language_cols,
]

print(len(l_cols_to_save))

55


In [67]:
df_subs[l_cols_to_save].head()

Unnamed: 0,subreddit_id,subreddit_name,primary_post_language,primary_post_language_percent,secondary_post_language,secondary_post_language_percent,geo_relevant_countries,geo_relevant_country_count,geo_relevant_country_codes,geo_relevant_subreddit,ambassador_subreddit,posts_for_modeling_count,post_median_text_len,post_median_word_count,primary_post_language_in_use_multilingual,Afrikaans_posts_percent,Albanian_posts_percent,Arabic_posts_percent,Catalan_posts_percent,Chinese_posts_percent,Croatian_posts_percent,Danish_posts_percent,Dutch_posts_percent,English_posts_percent,Estonian_posts_percent,Finnish_posts_percent,French_posts_percent,German_posts_percent,Hindi_posts_percent,Hungarian_posts_percent,Indonesian_posts_percent,Italian_posts_percent,Japanese_posts_percent,Korean_posts_percent,Malayalam_posts_percent,Marathi_posts_percent,Norwegian_posts_percent,Other_language_posts_percent,Polish_posts_percent,Portuguese_posts_percent,Romanian_posts_percent,Russian_posts_percent,Slovenian_posts_percent,Somali_posts_percent,Spanish_posts_percent,Swahili_posts_percent,Swedish_posts_percent,Tagalog_posts_percent,Tamil_posts_percent,Telugu_posts_percent,Thai_posts_percent,Turkish_posts_percent,UNKNOWN_posts_percent,Vietnamese_posts_percent,Welsh_posts_percent
0,t5_2qh1i,askreddit,English,0.9975,,,,,,False,False,1200.0,58.0,11.0,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9975,0.0,0.0,0.000833,0.0,0.0,0.0,0.000833,0.0,0.0,0.0,0.0,0.0,0.0,0.000833,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,t5_2qh0u,pics,English,0.935,,,,,,False,False,1200.0,57.0,10.0,True,0.005833,0.0,0.0,0.000833,0.000833,0.0,0.003333,0.003333,0.935,0.001667,0.000833,0.001667,0.004167,0.0,0.0,0.005,0.001667,0.0,0.0,0.0,0.0,0.001667,0.013333,0.004167,0.001667,0.003333,0.0,0.0,0.000833,0.001667,0.000833,0.0,0.005,0.0,0.0,0.0,0.001667,0.0,0.0,0.001667
2,t5_2qh33,funny,English,0.861667,German,0.015833,,,,False,False,1200.0,33.0,6.0,True,0.010833,0.0,0.0,0.005833,0.0,0.0,0.01,0.014167,0.861667,0.004167,0.000833,0.0,0.015833,0.0,0.0,0.008333,0.0025,0.0,0.0,0.0,0.0,0.005,0.0225,0.000833,0.0025,0.003333,0.0,0.001667,0.005833,0.001667,0.0,0.0025,0.010833,0.0,0.0,0.0,0.0,0.000833,0.000833,0.0075
3,t5_2qjpg,memes,English,0.805833,Danish,0.015,,,,False,False,1200.0,23.0,4.0,True,0.011667,0.0025,0.0,0.005833,0.0,0.0,0.015,0.015,0.805833,0.003333,0.001667,0.013333,0.013333,0.0,0.0,0.006667,0.008333,0.001667,0.000833,0.0,0.0,0.014167,0.03,0.000833,0.004167,0.005,0.0,0.000833,0.0075,0.0025,0.003333,0.008333,0.009167,0.0,0.0,0.0,0.0,0.0,0.0025,0.006667
4,t5_2qhsa,interestingasfuck,English,0.9375,,,,,,False,False,1200.0,60.0,11.0,True,0.005,0.0,0.0,0.000833,0.000833,0.0,0.003333,0.006667,0.9375,0.0,0.0,0.004167,0.006667,0.0,0.0,0.004167,0.000833,0.0,0.0,0.0,0.0,0.004167,0.010833,0.0,0.0,0.004167,0.000833,0.0,0.000833,0.001667,0.0,0.001667,0.003333,0.0,0.0,0.0,0.0,0.0,0.0,0.0025


In [68]:
df_subs[l_cols_to_save].tail()

Unnamed: 0,subreddit_id,subreddit_name,primary_post_language,primary_post_language_percent,secondary_post_language,secondary_post_language_percent,geo_relevant_countries,geo_relevant_country_count,geo_relevant_country_codes,geo_relevant_subreddit,ambassador_subreddit,posts_for_modeling_count,post_median_text_len,post_median_word_count,primary_post_language_in_use_multilingual,Afrikaans_posts_percent,Albanian_posts_percent,Arabic_posts_percent,Catalan_posts_percent,Chinese_posts_percent,Croatian_posts_percent,Danish_posts_percent,Dutch_posts_percent,English_posts_percent,Estonian_posts_percent,Finnish_posts_percent,French_posts_percent,German_posts_percent,Hindi_posts_percent,Hungarian_posts_percent,Indonesian_posts_percent,Italian_posts_percent,Japanese_posts_percent,Korean_posts_percent,Malayalam_posts_percent,Marathi_posts_percent,Norwegian_posts_percent,Other_language_posts_percent,Polish_posts_percent,Portuguese_posts_percent,Romanian_posts_percent,Russian_posts_percent,Slovenian_posts_percent,Somali_posts_percent,Spanish_posts_percent,Swahili_posts_percent,Swedish_posts_percent,Tagalog_posts_percent,Tamil_posts_percent,Telugu_posts_percent,Thai_posts_percent,Turkish_posts_percent,UNKNOWN_posts_percent,Vietnamese_posts_percent,Welsh_posts_percent
19257,t5_byuju,kryptowaehrungen,,,,,,,,False,True,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
19258,t5_4azjpj,mediende,,,,,,,,False,True,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
19259,t5_4ogb6k,formula_student,,,,,,,,False,True,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
19260,t5_4oh99n,vansofgermany,,,,,,,,False,True,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
19261,t5_2yu7j,unterhaltung,,,,,,,,False,True,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [74]:
[c for c in df_subs.columns if 've' in c]

['over_18', 'Slovenian_posts_percent', 'liveaudio_post_type_percent']

In [73]:
df_subs.head()

Unnamed: 0,pt_date,subreddit_name,subreddit_id,geo_relevant_country_codes,geo_relevant_countries,geo_relevant_country_count,geo_relevant_subreddit,ambassador_subreddit,combined_topic,combined_topic_and_rating,rating_short,rating_name,primary_topic,secondary_topics,mature_themes_list,over_18,allow_top,video_whitelisted,subreddit_language,whitelist_status,subscribers,first_screenview_date,last_screenview_date,users_l7,users_l28,posts_l7,posts_l28,comments_l7,comments_l28,pt,subreddit_clean_description_word_count,subreddit_name_title_and_clean_descriptions_word_count,subreddit_title,subreddit_public_description,subreddit_description,subreddit_name_title_and_clean_descriptions,manual_topic_and_rating,Afrikaans_posts_percent,Albanian_posts_percent,Arabic_posts_percent,...,Norwegian_posts_percent,Other_language_posts_percent,Polish_posts_percent,Portuguese_posts_percent,Romanian_posts_percent,Russian_posts_percent,Slovenian_posts_percent,Somali_posts_percent,Spanish_posts_percent,Swahili_posts_percent,Swedish_posts_percent,Tagalog_posts_percent,Tamil_posts_percent,Telugu_posts_percent,Thai_posts_percent,Turkish_posts_percent,UNKNOWN_posts_percent,Vietnamese_posts_percent,Welsh_posts_percent,primary_post_language,primary_post_language_percent,primary_post_language_in_use_multilingual,secondary_post_language,secondary_post_language_percent,crosspost_post_type_percent,gallery_post_type_percent,gif_post_type_percent,image_post_type_percent,link_post_type_percent,liveaudio_post_type_percent,multi_media_post_type_percent,poll_post_type_percent,rpan_post_type_percent,text_post_type_percent,video_post_type_percent,primary_post_type,primary_post_type_percent,posts_for_modeling_count,post_median_word_count,post_median_text_len
0,2021-09-21,askreddit,t5_2qh1i,,,,False,False,uncategorized,uncategorized,E,Everyone,Learning and Education,,"profanity_occasional, profanity",f,t,,es,all_ads,33604689,2020-08-24,2021-09-21,12563532,31513185,71934,296017,1525489,6194629,2021-09-24,405,420,Ask Reddit...,r/AskReddit is the place to ask and answer tho...,###### [ [ SERIOUS ] ](http://www.reddit.com/r...,AskReddit. \nAsk Reddit.... \nr AskReddit is t...,uncategorized,0.0,0.0,0.0,...,0.0,0.000833,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,English,0.9975,True,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,text,1.0,1200.0,11.0,58.0
1,2021-09-21,pics,t5_2qh0u,,,,False,False,art,art,E,Everyone,Art,,,f,t,,en,all_ads,28014622,2020-08-24,2021-09-21,6062041,12928114,6101,24428,163585,742511,2021-09-24,347,356,Reddit Pics,A place for pictures and photographs.,[Rules](https://www.reddit.com/r/pics/wiki/ind...,pics. \nReddit Pics. \nA place for pictures an...,art,0.005833,0.0,0.0,...,0.001667,0.013333,0.004167,0.001667,0.003333,0.0,0.0,0.000833,0.001667,0.000833,0.0,0.005,0.0,0.0,0.0,0.001667,0.0,0.0,0.001667,English,0.935,True,,,0.003333,0.0,0.0,0.915,0.081667,0.0,0.0,0.0,0.0,0.0,0.0,image,0.915,1200.0,10.0,57.0
2,2021-09-21,funny,t5_2qh33,,,,False,False,uncategorized,uncategorized,E,Everyone,,,,f,t,f,en,all_ads,37367466,2020-08-24,2021-09-21,5767977,12250775,6892,28839,114801,463485,2021-09-24,839,850,funny,"Welcome to r/Funny, Reddit's largest humour de...",**Welcome to r/Funny:**\n\n\nYou may only post...,"funny. \nfunny. \nWelcome to r Funny, Reddit's...",uncategorized,0.010833,0.0,0.0,...,0.005,0.0225,0.000833,0.0025,0.003333,0.0,0.001667,0.005833,0.001667,0.0,0.0025,0.010833,0.0,0.0,0.0,0.0,0.000833,0.000833,0.0075,English,0.861667,True,German,0.015833,0.0,0.0,0.0225,0.625,0.071667,0.0,0.0,0.0,0.0,0.000833,0.28,image,0.625,1200.0,6.0,33.0
3,2021-09-21,memes,t5_2qjpg,,,,False,False,uncategorized,uncategorized,E,Everyone,Funny/Humor,,"profanity, profanity_occasional",f,t,f,en,all_ads,16335892,2020-08-24,2021-09-21,3969463,10101856,27518,118705,430622,1900286,2021-09-24,831,873,/r/Memes the original since 2008,Memes!\n\nA way of describing cultural informa...,###### \n#**Welcome to /r/Memes**\n\nCommunity...,memes. \n/r/Memes the original since 2008. \nM...,uncategorized,0.011667,0.0025,0.0,...,0.014167,0.03,0.000833,0.004167,0.005,0.0,0.000833,0.0075,0.0025,0.003333,0.008333,0.009167,0.0,0.0,0.0,0.0,0.0,0.0025,0.006667,English,0.805833,True,Danish,0.015,0.0,0.0,0.085,0.8925,0.0225,0.0,0.0,0.0,0.0,0.0,0.0,image,0.8925,1200.0,4.0,23.0
4,2021-09-21,interestingasfuck,t5_2qhsa,,,,False,False,uncategorized,uncategorized,E,Everyone,,,"profanity, profanity_sr_name",f,t,f,en,all_ads,8638369,2020-08-24,2021-09-21,5197231,10071629,1955,7784,132845,522494,2021-09-24,304,313,Interesting As Fuck,For anything that is InterestingAsFuck,---\n\nA place to share (almost) anything and ...,interestingasfuck. \nInteresting As Fuck. \nFo...,uncategorized,0.005,0.0,0.0,...,0.004167,0.010833,0.0,0.0,0.004167,0.000833,0.0,0.000833,0.001667,0.0,0.001667,0.003333,0.0,0.0,0.0,0.0,0.0,0.0,0.0025,English,0.9375,True,,,0.0,0.0,0.0,0.688333,0.311667,0.0,0.0,0.0,0.0,0.0,0.0,image,0.688333,1200.0,11.0,60.0


# Save table to BigQuery

Sorting is not guaranteed in the final BigQuery table.

In [71]:
(
    df_subs[l_cols_to_save]
    # drop subreddits where ALL primary/secondary language columns are null
    # (the tail of `df_subs` shows such rows: no post-level language data)
    .dropna(subset=l_cols_primary_language, how='all')
    # .sort_values(by=['subreddit_name', ], ascending=True)
    # stamp every row with the UTC date on which this table was created
    .assign(table_creation_date=datetime.utcnow().date())
    .to_gbq(
        destination_table='david_bermejo.subclu_v0040_subreddit_languages',
        project_id='reddit-employee-datasets',
        chunksize=None,
        # NOTE: 'replace' overwrites the destination table if it already exists
        if_exists='replace'
    )
)

19192 out of 19192 rows loaded."
1it [00:11, 11.39s/it]


# Experiment/ Demo getting secondary language with some sample subreddits

This logic now lives in this function:

`subclu/data/data_loaders.py > get_subreddit_secondary_language`

In [48]:
# Names used throughout the demo
l_ix_sub_2ndary_lang = ['subreddit_name']
col_2nd_language = 'secondary_language'
col_2nd_lang_pct = 'secondary_post_language_percent'

(
    df_subs
    # sample: (up to) the first 100 subreddits with 'mex' in their name
    .loc[lambda df_: df_['subreddit_name'].str.contains('mex')]
    .head(100)
    .loc[:, [*l_ix_sub_2ndary_lang, *l_language_cols]]
    .set_index(l_ix_sub_2ndary_lang)
    # wide -> long: one row per (subreddit, language percent column)
    .stack()
    .reset_index()
    .rename(
        columns={
            'level_1': col_2nd_language,
            0: col_2nd_lang_pct,
        }
    )
    # turn the column name into a language name, e.g. `Spanish_posts_percent` -> `Spanish`
    .assign(
        **{
            col_2nd_language: lambda df_: df_[col_2nd_language].str.replace('_posts_percent', '')
        }
    )
    # ignore the catch-all buckets and languages with (almost) no posts
    .query(f"{col_2nd_language} != 'Other_language' & {col_2nd_language} != 'UNKNOWN' & {col_2nd_lang_pct} > 0.0001")
    # rank languages within each subreddit, highest share first; ties keep row order
    .assign(
        language_rank=lambda df_: (
            df_.groupby(l_ix_sub_2ndary_lang)[col_2nd_lang_pct]
            .rank(method='first', ascending=False)
        ),
    )
    # keep only the 2nd-ranked language, and only when its share is above 0.008
    .query(f"language_rank == 2 & {col_2nd_lang_pct} > 0.008")
    .sort_values(by=['subreddit_name', 'language_rank'], ascending=[True, True])
    .set_index(l_ix_sub_2ndary_lang)
    .drop(columns='language_rank')
)

Unnamed: 0_level_0,secondary_language,secondary_post_language_percent
subreddit_name,Unnamed: 1_level_1,Unnamed: 2_level_1
argomexperiencia,English,0.040936
askmexico,Spanish,0.166667
ayudamexico,Somali,0.016949
belgamexicana,English,0.206897
bitcoinmexico,English,0.04
buildapcsalesmexico,English,0.0625
cryptomexico,English,0.116071
derechomexicano,English,0.031746
foro_mexico,English,0.179487
futbolfemenilmexico,Portuguese,0.068627
