# Methods for Finding Related Reddit Subreddits with Simple Set Theory

by Max Woolf (@minimaxir)

*This notebook is licensed under the MIT License. If you use the code or data visualization designs contained within this notebook, it would be greatly appreciated if proper attribution is given back to this notebook and/or myself. Thanks! :)*

In [1]:
source("Rstart.R")

library(methods)
library(bigrquery)
library(viridis)

sessionInfo()


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

Registering fonts with R

Attaching package: ‘scales’

The following objects are masked from ‘package:readr’:

    col_factor, col_numeric



R version 3.3.0 (2016-05-03)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: OS X 10.11.4 (El Capitan)

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] grid      stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] viridis_0.3.4      bigrquery_0.2.0    stringr_1.0.0      digest_0.6.9      
 [5] RColorBrewer_1.1-2 scales_0.4.0       extrafont_0.17     ggplot2_2.1.0     
 [9] dplyr_0.4.3        readr_0.2.2       

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.4      Rttf2pt1_1.3.3   magrittr_1.5     munsell_0.4.3   
 [5] uuid_0.1-2       colorspace_1.2-6 R6_2.1.2         httr_1.1.0      
 [9] plyr_1.8.3       tools_3.3.0      parallel_3.3.0   gtable_0.2.0    
[13] DBI_0.4          extrafontdb_1.0  assertthat_0.1   gridExtra_2.2.1 
[17] IRdisplay_0.3    repr_0.4         base64enc_0.1-3  IRkernel_0.5    
[21] evaluate_0.9     rzmq_0.7.7       stringi_1.0-1    j

## Get ALL Edge Data from Reddit

The R client is used to access BigQuery because there is too much data to retrieve otherwise. If you don't have access to BigQuery, skip the next two cells and use the CSVs included with the repo.

In [2]:
# Google Cloud project passed to query_exec() when running the BigQuery job.
project_name <- "<FILL IN>"   # DO NOT SHARE!

# Edge-list query. Each subquery keeps (author, lower-cased subreddit) pairs
# where the author commented in at least 5 distinct threads (link_id). Joining
# the subquery to itself on author and counting rows gives, for every pair of
# subreddits, the number of authors who qualify in both. `Source < Target`
# keeps one row per unordered pair (and drops self-pairs); pairs with fewer
# than 10 shared authors are discarded.
sql <- "SELECT a.l_subreddit as Source, b.l_subreddit as Target, COUNT(*) as Weight
FROM (
  SELECT author, LOWER(subreddit) as l_subreddit, COUNT(DISTINCT(link_id)) as unique_threads
  FROM [pushshift:rt_reddit.comments]
  GROUP BY author, l_subreddit
  HAVING unique_threads >= 5) a JOIN (
  SELECT author, LOWER(subreddit) as l_subreddit, COUNT(DISTINCT(link_id)) as unique_threads
  FROM [pushshift:rt_reddit.comments]
  GROUP BY author, l_subreddit
  HAVING unique_threads >= 5) b ON a.author = b.author
GROUP BY Source, Target
HAVING Source < Target AND Weight >= 10
ORDER BY Weight DESC"

# max_pages=Inf asks bigrquery for every page of results.
df <- tbl_df(query_exec(sql, project=project_name, max_pages=Inf))

# Cache the result as CSV; the "Load Data" cell reads this file back.
# row.names is spelled FALSE because `F` is an ordinary, reassignable variable.
write.csv(df, "reddit-edgelist-061816.csv", row.names=FALSE)

Auto-refreshing stale OAuth token.


Running query:   RUNNING  2.5sRunning query:   RUNNING  3.1sRunning query:   RUNNING  3.7sRunning query:   RUNNING  4.3sRunning query:   RUNNING  4.9sRunning query:   RUNNING  5.5sRunning query:   RUNNING  6.1sRunning query:   RUNNING  6.7sRunning query:   RUNNING  7.3sRunning query:   RUNNING  8.0sRunning query:   RUNNING  8.6sRunning query:   RUNNING  9.2sRunning query:   RUNNING  9.8sRunning query:   RUNNING 10.8sRunning query:   RUNNING 11.4sRunning query:   RUNNING 12.0sRunning query:   RUNNING 12.6sRunning query:   RUNNING 13.3sRunning query:   RUNNING 13.9sRunning query:   RUNNING 14.5sRunning query:   RUNNING 15.1sRunning query:   RUNNING 15.7sRunning query:   RUNNING 16.3sRunning query:   RUNNING 16.9sRunning query:   RUNNING 17.9sRunning query:   RUNNING 18.5sRunning query:   RUNNING 19.1sRunning query:   RUNNING 19.7sRunning query:   RUNNING 20.3sRunning query:   RUNNING 20.9sRunning query:   RUNNING 21.5sRunning query:   RUNNING 22.1sRunning

8.7 gigabytes processed


Retrieving data:  2.0sRetrieving data:  2.9sRetrieving data:  3.9sRetrieving data:  4.8sRetrieving data:  5.6sRetrieving data:  6.6sRetrieving data:  7.5sRetrieving data:  8.4sRetrieving data:  9.4sRetrieving data: 10.3sRetrieving data: 11.3sRetrieving data: 12.5sRetrieving data: 13.5sRetrieving data: 14.4sRetrieving data: 15.4sRetrieving data: 16.5sRetrieving data: 17.7sRetrieving data: 18.6sRetrieving data: 19.5sRetrieving data: 20.5sRetrieving data: 21.3sRetrieving data: 22.2sRetrieving data: 23.2sRetrieving data: 24.0sRetrieving data: 25.0sRetrieving data: 26.0sRetrieving data: 26.9sRetrieving data: 27.8sRetrieving data: 28.8sRetrieving data: 29.7sRetrieving data: 30.9sRetrieving data: 31.8s


## Get Active Users

Get Active Users for all subreddits with at least 10 active users.

In [3]:
# Active-user query. The subquery keeps (author, lower-cased subreddit) pairs
# where the author commented in at least 5 distinct threads (link_id); the
# outer query counts those authors per subreddit and keeps subreddits with at
# least 10 of them, largest first.
sql <- "SELECT l_subreddit as subreddit, COUNT(*) as Weight
FROM (
SELECT author, LOWER(subreddit) as l_subreddit, COUNT(DISTINCT(link_id)) as unique_threads
  FROM [pushshift:rt_reddit.comments]
  GROUP BY author, l_subreddit
  HAVING unique_threads >= 5
  )
GROUP BY subreddit
HAVING Weight >= 10
ORDER BY Weight DESC"

# max_pages=Inf asks bigrquery for every page of results.
df_active_users <- tbl_df(query_exec(sql, project=project_name, max_pages=Inf))

# Cache the result as CSV; the "Load Data" cell reads this file back.
# row.names is spelled FALSE because `F` is an ordinary, reassignable variable.
write.csv(df_active_users, "reddit-active-users-061816.csv", row.names=FALSE)

Running query:   RUNNING  2.5sRunning query:   RUNNING  3.2sRunning query:   RUNNING  3.8sRunning query:   RUNNING  4.4sRunning query:   RUNNING  5.0sRunning query:   RUNNING  5.6sRunning query:   RUNNING  6.2sRunning query:   RUNNING  6.8sRunning query:   RUNNING  7.4sRunning query:   RUNNING  8.0sRunning query:   RUNNING  8.6sRunning query:   RUNNING  9.2sRunning query:   RUNNING  9.8sRunning query:   RUNNING 10.3sRunning query:   RUNNING 10.9sRunning query:   RUNNING 11.5sRunning query:   RUNNING 12.2sRunning query:   RUNNING 12.8sRunning query:   RUNNING 13.4sRunning query:   RUNNING 13.9sRunning query:   RUNNING 14.6sRunning query:   RUNNING 15.2sRunning query:   RUNNING 15.8sRunning query:   RUNNING 16.4sRunning query:   RUNNING 17.0sRunning query:   RUNNING 17.6sRunning query:   RUNNING 18.2sRunning query:   RUNNING 18.8sRunning query:   RUNNING 19.4sRunning query:   RUNNING 20.0sRunning query:   RUNNING 20.6sRunning query:   RUNNING 21.2sRunning

8.7 gigabytes processed


## Load Data

In [4]:
# Read the cached edge list (sorted by Source, then Target) and the cached
# per-subreddit active-user counts.
df <- arrange(read_csv("reddit-edgelist-061816.csv"), Source, Target)
df_active_users <- read_csv("reddit-active-users-061816.csv")

# Preview the first rows of each table.
df %>% head() %>% print()
df_active_users %>% head() %>% print()

Source: local data frame [6 x 3]

         Source           Target Weight
          (chr)            (chr)  (int)
1 100daysofketo 1200isplentyketo     11
2 100daysofketo        askreddit     52
3 100daysofketo         askwomen     10
4 100daysofketo         fatlogic     12
5 100daysofketo            funny     11
6 100daysofketo             keto    126
Source: local data frame [6 x 2]

        subreddit Weight
            (chr)  (int)
1       askreddit 419024
2           funny  89687
3            pics  83221
4          videos  71996
5        politics  71252
6 leagueoflegends  66975


Run data processing on one subreddit for testing.

Filter only on subreddits linking directly to specified subreddit.

In [32]:
# Subreddit used for the worked example in the following cells.
subreddit <- "aww"

# Keep only the edges that have this subreddit at either end.
df_subreddit <- filter(df, Source == subreddit | Target == subreddit)

df_subreddit %>% head() %>% print()

Source: local data frame [6 x 3]

             Source Target Weight
              (chr)  (chr)  (int)
1 10cloverfieldlane    aww     27
2        112263hulu    aww     14
3      1200isplenty    aww     51
4         2007scape    aww    110
5            30rock    aww     18
6          3amjokes    aww     12


Clean up data by explicitly identifying Neighbor.

In [33]:
# Neighbor is whichever end of the edge is not the selected subreddit.
df_subreddit <- mutate(
    df_subreddit,
    Neighbor = ifelse(Source == subreddit, Target, Source)
)

df_subreddit %>% head() %>% print()

Source: local data frame [6 x 4]

             Source Target Weight          Neighbor
              (chr)  (chr)  (int)             (chr)
1 10cloverfieldlane    aww     27 10cloverfieldlane
2        112263hulu    aww     14        112263hulu
3      1200isplenty    aww     51      1200isplenty
4         2007scape    aww    110         2007scape
5            30rock    aww     18            30rock
6          3amjokes    aww     12          3amjokes


Get the Active Users for the Neighbor via Merging.

In [34]:
# Attach each Neighbor's active-user count. Both tables have a Weight column,
# so the join output carries Weight.x (edge weight) and Weight.y (Neighbor's
# active users).
df_subreddit <- left_join(
    df_subreddit,
    df_active_users,
    by = c("Neighbor" = "subreddit")
)

df_subreddit %>% head() %>% print()

Source: local data frame [6 x 5]

             Source Target Weight.x          Neighbor Weight.y
              (chr)  (chr)    (int)             (chr)    (int)
1 10cloverfieldlane    aww       27 10cloverfieldlane     1126
2        112263hulu    aww       14        112263hulu      214
3      1200isplenty    aww       51      1200isplenty      872
4         2007scape    aww      110         2007scape    12413
5            30rock    aww       18            30rock      130
6          3amjokes    aww       12          3amjokes       61


Calculate |A ∩ B| / |B| — the share of each Neighbor's active users (B) who are also active in the selected subreddit (A) — then select the Top 15 values.

In [35]:
# norm = shared active users (Weight.x) divided by the Neighbor's active users
# (Weight.y). Keep the 15 largest values.
df_subreddit <- df_subreddit %>%
                    mutate(norm = Weight.x/Weight.y) %>%
                    arrange(desc(norm)) %>%
                    head(15) %>%
                    # Factor levels are the reverse of the sorted order, which
                    # fixes the bar order when Neighbor is used as a plot axis.
                    # `levels` is spelled in full rather than relying on
                    # partial matching of `level`.
                    mutate(Neighbor=factor(Neighbor, levels=rev(Neighbor)))

print(df_subreddit %>% select(Neighbor, norm))

Source: local data frame [15 x 2]

               Neighbor      norm
                 (fctr)     (dbl)
1       tuckedinkitties 0.8500000
2        babybigcatgifs 0.7857143
3           puppysmiles 0.7727273
4               teefies 0.7307692
5           hardcoreaww 0.6875000
6              catslaps 0.6585366
7               puppies 0.6451613
8             catpranks 0.6341463
9         jellybeantoes 0.6296296
10    animalsbeingderps 0.6250000
11     animalsbeingbros 0.6246499
12 beforenafteradoption 0.6000000
13                 blep 0.5903614
14              catgifs 0.5797101
15           tuxedocats 0.5789474


Create plotting function to plot data.

In [36]:
# Draw a flipped (horizontal) bar chart with one bar per Neighbor, where bar
# length, fill and outline colour all come from `norm`, and hand the chart to
# max_save().
#
# df_plot:   data frame with a `Neighbor` column and a numeric `norm` column.
# subreddit: subreddit name interpolated into the axis label, the title and
#            the name passed to max_save().
#
# Returns whatever max_save() returns (max_save and fte_theme are defined
# outside this section).
plot_related_subreddits <- function(df_plot, subreddit) {

    # Each bar gets exactly one percentage label: values below the cutoff use
    # the hjust=-0.25 text layer, values at or above it use the hjust=1.2
    # layer drawn in dark grey.
    cutoff <- 0.95
    pct_text <- sprintf("%0.2f%%", df_plot$norm*100)
    labels_right <- ifelse(df_plot$norm < cutoff, pct_text, "")
    labels_left <- ifelse(df_plot$norm >= cutoff, pct_text, "")

    # The palette is restricted to the slice matching the observed norm range.
    norm_lo <- min(df_plot$norm)
    norm_hi <- max(df_plot$norm)

    y_title <- sprintf("%% of Active Users in Subreddit Also Active in /r/%s", subreddit)
    main_title <- sprintf("Top Subreddits Related to /r/%s", subreddit)

    chart <- ggplot(df_plot, aes(x=Neighbor, y=norm, fill=norm, color=norm)) +
        geom_bar(stat="identity", size=0.9) +
        geom_text(label=labels_right, hjust=-0.25, size=2, family="Open Sans Condensed Bold") +
        geom_text(label=labels_left, hjust=1.2, size=2, color="#1a1a1a", family="Open Sans Condensed Bold") +
        coord_flip() +
        fte_theme() +
        # Axis-text margin tweak: http://stackoverflow.com/a/14487264
        theme(panel.grid.major.y=element_blank(),
              axis.text.y=element_text(margin=margin(0,-5,0,0))) +
        scale_y_continuous(labels = percent, limits=c(0, 1)) +
        scale_fill_viridis(values=c(0, 1), begin = norm_lo, end = norm_hi) +
        scale_color_viridis(values=c(0, 1), begin = norm_lo, end = norm_hi) +
        labs(x = "Subreddit", y = y_title, title = main_title)

    max_save(chart, sprintf("%s-related", subreddit), "Reddit/BigQuery")
}

plot_related_subreddits(df_subreddit, subreddit)

![](aww-related.png)

## Putting it all together

In [37]:
# Build and save the "related subreddits" chart for one subreddit.
#
# subreddit: lower-cased subreddit name as it appears in the Source/Target
#            columns of the global `df`.
#
# Reads the globals `df` (edge list) and `df_active_users`, ranks the
# subreddit's neighbours by norm = shared active users / neighbour's active
# users, keeps the top 15 and passes them to plot_related_subreddits().
# Returns that function's result.
get_related_subreddits <- function(subreddit) {
    df_subreddit <- df %>% filter(Source == subreddit | Target == subreddit) %>%
                        mutate(Neighbor = ifelse(Source == subreddit, Target, Source)) %>%
                        left_join(df_active_users, by=c("Neighbor"="subreddit")) %>%
                        mutate(norm = Weight.x/Weight.y) %>%
                        arrange(desc(norm)) %>%
                        head(15) %>%
                        # `levels` spelled in full (was `level`, which only
                        # worked through partial argument matching).
                        mutate(Neighbor=factor(Neighbor, levels=rev(Neighbor)))

    plot_related_subreddits(df_subreddit, subreddit)
}

# The recorded output of this cell is the_donald-related.png, so the call
# uses "the_donald".
get_related_subreddits("the_donald")

![](the_donald-related.png)

## Jaccard Index

Find similarity between subreddits using the [Jaccard Index](https://en.wikipedia.org/wiki/Jaccard_index), with each subreddit's active users as the two sets (obtained via BigQuery using the [methodology described here](http://minimaxir.com/2016/05/reddit-graph/)).

For example, here's the formula for /r/the_donald's Jaccard Index with /r/politics:

(# of Users who are active in both /r/the_donald and /r/politics) / [(# of Users who are active in /r/the_donald) + (# of Users who are active in /r/politics) - (# of Users who are active in both /r/the_donald and /r/politics)]

Calculate the Index between the selected subreddit (the `subreddit` variable set earlier) and all other subreddits, then select the Top 15. Due to the nature of default subreddits, they will almost always be present in this list; the Jaccard Nondefaults section below excludes them.

In [38]:
# Active-user count of the selected subreddit (the global `subreddit`).
Weight.subreddit <- df_active_users$Weight[which(df_active_users$subreddit==subreddit)]

# Jaccard index per neighbour: shared users / (users of the selected subreddit
# + users of the neighbour - shared users). Keep the 15 largest values.
df_subreddit_jaccard <- df %>% filter(Source == subreddit | Target == subreddit) %>%
                    mutate(Neighbor = ifelse(Source == subreddit, Target, Source)) %>%
                    left_join(df_active_users, by=c("Neighbor"="subreddit")) %>%
                    # After the join, Weight.x is the edge weight (shared
                    # users) and Weight.y the neighbour's active-user count.
                    mutate(Weight.intersection=Weight.x, Weight.Neighbor=Weight.y) %>%
                    mutate(jaccard = Weight.intersection/(Weight.subreddit + Weight.Neighbor - Weight.intersection)) %>%
                    arrange(desc(jaccard)) %>%
                    head(15) %>%
                    # `levels` spelled in full (was `level`, which only worked
                    # through partial argument matching).
                    mutate(Neighbor=factor(Neighbor, levels=rev(Neighbor)))

print(df_subreddit_jaccard %>% select(Neighbor, jaccard))

Source: local data frame [15 x 2]

            Neighbor    jaccard
              (fctr)      (dbl)
1               gifs 0.16959673
2  mildlyinteresting 0.16275875
3               pics 0.14494161
4              funny 0.13278921
5      todayilearned 0.11765992
6     showerthoughts 0.11602923
7                wtf 0.10279884
8      adviceanimals 0.09762829
9             videos 0.09616911
10            gaming 0.09596455
11              news 0.09032371
12       nottheonion 0.08927536
13         worldnews 0.08076352
14            movies 0.07680374
15     oldschoolcool 0.07481190


In [39]:
# Draw a flipped (horizontal) bar chart of Jaccard indices, one bar per
# Neighbor, on a fixed 0-0.25 axis, and hand the chart to max_save().
#
# df_plot:   data frame with a `Neighbor` column and a numeric `jaccard`
#            column.
# subreddit: subreddit name interpolated into the axis label, the title and
#            the name passed to max_save().
#
# Returns whatever max_save() returns (max_save and fte_theme are defined
# outside this section).
plot_jaccard_subreddits <- function(df_plot, subreddit) {

    # Each bar gets exactly one label: values below the threshold use the
    # hjust=-0.25 text layer, values at or above it use the hjust=1.2 layer.
    threshold <- 0.22
    labels_right <- ifelse(df_plot$jaccard < threshold, sprintf("%0.3f", df_plot$jaccard), "")
    labels_left <- ifelse(df_plot$jaccard >= threshold, sprintf("%0.3f", df_plot$jaccard), "")

    # Multiplying by 4 maps the 0-0.25 axis range onto the palette's 0-1
    # range. viridis only accepts begin/end within [0, 1], so the values are
    # capped at 1; without the cap any jaccard above 0.25 made the scale fail.
    # NOTE(review): bars above 0.25 still fall outside the y-axis limits below.
    palette_begin <- min(min(df_plot$jaccard)*4, 1)
    palette_end <- min(max(df_plot$jaccard)*4, 1)

    plot <- ggplot(df_plot, aes(x=Neighbor, y=jaccard, fill=jaccard, color=jaccard)) +
        geom_bar(stat="identity", size=0.9) +
        geom_text(label=labels_right, hjust=-0.25, size=2, family="Open Sans Condensed Bold") +
        geom_text(label=labels_left, hjust=1.2, size=2, color="#1a1a1a", family="Open Sans Condensed Bold") +
        coord_flip() +
        fte_theme() +
        theme(panel.grid.major.y=element_blank(), axis.text.y=element_text(margin=margin(0,-5,0,0))) +   # http://stackoverflow.com/a/14487264
        scale_y_continuous(limits=c(0, 0.25)) +
        scale_fill_viridis(values=c(0, 1), begin = palette_begin, end = palette_end) +
        scale_color_viridis(values=c(0, 1), begin = palette_begin, end = palette_end) +
        labs(x = "Subreddit", y=sprintf("Jaccard Index Between Subreddit and /r/%s", subreddit),
             title= sprintf("Subreddits Most Similar to /r/%s", subreddit))

    max_save(plot, sprintf("%s-jaccard", subreddit), "Reddit/BigQuery")
}

plot_jaccard_subreddits(df_subreddit_jaccard, subreddit)

![](aww-jaccard.png)

In [40]:
# Build and save the Jaccard-similarity chart for one subreddit.
#
# subreddit: lower-cased subreddit name as it appears in the global
#            `df_active_users` and in the Source/Target columns of `df`.
#
# Reads the globals `df` and `df_active_users`, computes the Jaccard index
# between the subreddit and each neighbour, keeps the top 15 and passes them
# to plot_jaccard_subreddits(). Returns that function's result. Stops with an
# error if the subreddit does not have exactly one row in `df_active_users`.
get_jaccard_subreddits <- function(subreddit) {
    # This local must carry the same name the pipeline below reads. It was
    # previously assigned as `Weight_subreddit`, so the pipeline silently
    # picked up the global `Weight.subreddit` left over from an earlier cell
    # (the count for a different subreddit).
    Weight.subreddit <- df_active_users$Weight[which(df_active_users$subreddit==subreddit)]
    if (length(Weight.subreddit) != 1) {
        stop(sprintf("Expected exactly one active-user count for '%s', found %d",
                     subreddit, length(Weight.subreddit)), call. = FALSE)
    }

    df_subreddit_jaccard <- df %>% filter(Source == subreddit | Target == subreddit) %>%
                                mutate(Neighbor = ifelse(Source == subreddit, Target, Source)) %>%
                                left_join(df_active_users, by=c("Neighbor"="subreddit")) %>%
                                # Weight.x = shared users (edge weight),
                                # Weight.y = neighbour's active users.
                                mutate(Weight.intersection=Weight.x, Weight.Neighbor=Weight.y) %>%
                                mutate(jaccard = Weight.intersection/(Weight.subreddit + Weight.Neighbor - Weight.intersection)) %>%
                                arrange(desc(jaccard)) %>%
                                head(15) %>%
                                # `levels` spelled in full (was `level`).
                                mutate(Neighbor=factor(Neighbor, levels=rev(Neighbor)))

    plot_jaccard_subreddits(df_subreddit_jaccard, subreddit)
}

get_jaccard_subreddits("the_donald")

![](the_donald-jaccard.png)

## Jaccard Nondefaults

In [41]:
# Subreddit names that get_jaccard_nondefault_subreddits() drops from the
# neighbour list (via `Neighbor %in% defaults`). Names are lower-case, matching
# the lower-cased subreddit names produced by the BigQuery queries.
# NOTE(review): presumably Reddit's default subreddits at the time of writing.
defaults <- c("announcements","art","askreddit","askscience","aww","blog",
             "books","creepy","dataisbeautiful","diy","documentaries","earthporn",
             "explainlikeimfive","food","funny","futurology","gadgets",
             "gaming","getmotivated","gifs","history","iama","internetisbeautiful",
             "jokes","lifeprotips","listentothis","mildlyinteresting","movies","music",
             "news","nosleep","nottheonion","oldschoolcool","personalfinance",
             "philosophy","photoshopbattles","pics","science","showerthoughts",
             "space","sports","television","tifu","todayilearned","twoxchromosomes","upliftingnews",
             "videos","worldnews","writingprompts")

# Draw a flipped (horizontal) bar chart of Jaccard indices, one bar per
# Neighbor, using the default y-axis range and default viridis scales, and
# hand the chart to max_save().
#
# df_plot:   data frame with a `Neighbor` column and a numeric `jaccard`
#            column.
# subreddit: subreddit name interpolated into the axis label, the title and
#            the name passed to max_save().
#
# Returns whatever max_save() returns (max_save and fte_theme are defined
# outside this section).
plot_jaccard_nondefault_subreddits <- function(df_plot, subreddit) {

    # The cutoff is relative to the largest value: bars within 5% of the
    # maximum use the hjust=1.2 text layer, all others the hjust=-0.25 layer.
    cutoff <- 0.95 * max(df_plot$jaccard)
    value_text <- sprintf("%0.3f", df_plot$jaccard)
    labels_right <- ifelse(df_plot$jaccard < cutoff, value_text, "")
    labels_left <- ifelse(df_plot$jaccard >= cutoff, value_text, "")

    y_title <- sprintf("Jaccard Index Between Subreddit and /r/%s", subreddit)
    main_title <- sprintf("Subreddits Most Similar to /r/%s", subreddit)

    chart <- ggplot(df_plot, aes(x=Neighbor, y=jaccard, fill=jaccard, color=jaccard)) +
        geom_bar(stat="identity", size=0.9) +
        geom_text(label=labels_right, hjust=-0.25, size=2, family="Open Sans Condensed Bold") +
        geom_text(label=labels_left, hjust=1.2, size=2, color="#1a1a1a", family="Open Sans Condensed Bold") +
        coord_flip() +
        fte_theme() +
        # Axis-text margin tweak: http://stackoverflow.com/a/14487264
        theme(panel.grid.major.y=element_blank(),
              axis.text.y=element_text(margin=margin(0,-5,0,0))) +
        # Disabled alternatives kept from the original for reference:
        #scale_y_continuous(limits=c(0, 0.50)) +
        #scale_fill_viridis(values=c(0, 1), begin = min(df_plot$jaccard)*4, end = min(max(df_plot$jaccard)*4,1)) +
        #scale_color_viridis(values=c(0, 1), begin = min(df_plot$jaccard)*4, end = min(max(df_plot$jaccard)*4,1)) +
        scale_fill_viridis() +
        scale_color_viridis() +
        labs(x = "Subreddit", y = y_title, title = main_title)

    max_save(chart, sprintf("%s-jaccard-nondefault", subreddit), "Reddit/BigQuery")
}

# Build and save the Jaccard-similarity chart for one subreddit, ignoring any
# neighbour listed in the global `defaults` vector.
#
# subreddit: lower-cased subreddit name as it appears in the global
#            `df_active_users` and in the Source/Target columns of `df`.
#
# Reads the globals `df`, `df_active_users` and `defaults`, computes the
# Jaccard index between the subreddit and each remaining neighbour, keeps the
# top 15 and passes them to plot_jaccard_nondefault_subreddits(). Returns that
# function's result. Stops with an error if the subreddit does not have
# exactly one row in `df_active_users`.
get_jaccard_nondefault_subreddits <- function(subreddit) {
    Weight.subreddit <- df_active_users$Weight[which(df_active_users$subreddit==subreddit)]
    if (length(Weight.subreddit) != 1) {
        stop(sprintf("Expected exactly one active-user count for '%s', found %d",
                     subreddit, length(Weight.subreddit)), call. = FALSE)
    }

    df_subreddit_jaccard <- df %>% filter(Source == subreddit | Target == subreddit) %>%
                                mutate(Neighbor = ifelse(Source == subreddit, Target, Source)) %>%
                                filter(!(Neighbor %in% defaults)) %>%
                                left_join(df_active_users, by=c("Neighbor"="subreddit")) %>%
                                # Weight.x = shared users (edge weight),
                                # Weight.y = neighbour's active users.
                                mutate(Weight.intersection=Weight.x, Weight.Neighbor=Weight.y) %>%
                                mutate(jaccard = Weight.intersection/(Weight.subreddit + Weight.Neighbor - Weight.intersection)) %>%
                                arrange(desc(jaccard)) %>%
                                head(15) %>%
                                # `levels` spelled in full (was `level`, which
                                # only worked through partial argument matching).
                                mutate(Neighbor=factor(Neighbor, levels=rev(Neighbor)))

    #print(df_subreddit_jaccard %>% select(Neighbor, jaccard))

    plot_jaccard_nondefault_subreddits(df_subreddit_jaccard, subreddit)
}

get_jaccard_nondefault_subreddits("aww")

![](aww-jaccard-nondefault.png)

In [None]:
# Subreddit names from the first 200 rows of df_active_users
# (presumably the most active ones, since the query orders by Weight DESC).
top_subreddits <- df_active_users %>%
    head(200) %>%
    select(subreddit) %>%
    unlist()

# Generate both charts for every selected subreddit. `x` only captures the
# return values; the second assignment overwrites the first.
x <- lapply(top_subreddits, get_related_subreddits)
x <- lapply(top_subreddits, get_jaccard_nondefault_subreddits)

# The MIT License (MIT)

Copyright (c) 2016 Max Woolf

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.