In [1]:
#import necessary libraries
import random
import feather
import os
import numpy as np
import pandas as pd
from collections import Counter
import subprocess
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import fontTools

In [None]:
#create the color function later used by the wordcloud
def red_color_func(word, font_size, position, orientation, random_state=None,
                   **kwargs):
    """Return a yellow-to-red colour string for a word, based on its font size.

    The signature matches the ``color_func`` callback this notebook passes to
    ``WordCloud``. Only ``font_size`` influences the result; every other
    argument is accepted and ignored.

    The red channel is fixed at 255 and the blue channel at 0. The green
    channel is ``255 - int(font_size * 1.5)`` clamped to the range 0..255, so
    larger words are drawn in a deeper red.

    Parameters
    ----------
    word, position, orientation, random_state, **kwargs
        Unused; present only to satisfy the callback signature.
    font_size : int or float
        Font size of the word being coloured.

    Returns
    -------
    str
        A colour string of the form ``"RGB(255,<green>,0)"``.
    """
    green = 255 - int(font_size * 1.5)
    # Clamp to a valid channel value: without this, font sizes above 170
    # would yield a negative number and therefore an invalid colour string.
    green = max(0, min(255, green))
    return "RGB(255," + str(green) + ",0)"

In [None]:
# Read the tab-separated table "WGS_ribo_trna.csv" into the "wgs_theta" DataFrame
wgs_theta = pd.read_csv("../tables_and_results/WGS_ribo_trna.csv", sep="\t")

# Load the Microbe Atlas sample annotation table to get the sample names and keywords.
# The file is tab-separated and read without a header row, so columns are labelled 0, 1, 2, ...
# "sep" is passed by keyword: passing it positionally is deprecated since pandas 1.3
# and raises a TypeError from pandas 2.0 on.
keyw = pd.read_csv(
    "/mnt/mnemo4/microbeatlas/mapdata/20210104map1/samples.env.simple",
    sep="\t",
    header=None,
)

# Background distribution of the environments in all samples:
# print how many rows of "keyw" carry each environment label in column 1
for x in ["animal", "soil", "aquatic", "plant"]:
    # Summing the boolean mask counts the matches without building a filtered copy
    print(x + " found in " + str(int((keyw[1] == x).sum())) + " samples")

In [None]:
# Sequencing run ID of every theta ribozyme hit: the part of each "ID" value before the first "."
samples = [ribo_id.split(".")[0] for ribo_id in wgs_theta["ID"]]

# Work on a deep copy so that re-indexing does not modify the original "keyw" DataFrame
keyw_2 = keyw.copy(deep=True)

# Index "keyw_2" by the part of the sample name (column 0) before the first ".",
# so that rows can be looked up with the IDs stored in "samples"
allspls = keyw_2.loc[:, 0].str.split(".")
ind = [parts[0] for parts in allspls]
keyw_2.index = ind

# Environment annotation (column 1) for each sequencing run ID in "samples"
envs = []
for sa in samples:
    try:
        envs.append(keyw_2.loc[sa, 1])
    except KeyError:
        # Only a missing run ID is expected here; any other error should surface
        # instead of being silently reported as "not found"
        print(sa + " not found")

# Remove any NaN values from the "envs" list
envs2 = [x for x in envs if str(x) != 'nan']

# Count the theta ribozymes per environment type. The checks are substring tests and
# are independent of each other, so one annotation can count towards several types.
animal = sum("animal" in env for env in envs2)
soil = sum("soil" in env for env in envs2)
plant = sum("plant" in env for env in envs2)
# Exclude wastewater from the aquatic count due to "human" contamination;
# those samples are tallied separately in "ww" (not part of the printed summary)
aquatic = sum("aquatic" in env and "waste water" not in env for env in envs2)
ww = sum("aquatic" in env and "waste water" in env for env in envs2)

# Print the number of theta ribozymes found in each environment type
print("animal: "+str(animal)+" \nsoil: "+str(soil)+" \naquatic: "+str(aquatic)+" \nplant: "+str(plant))

In [None]:
# Extend the default word-cloud stopwords ("STOPWORDS" is imported in the first cell)
# with custom tokens and word fragments that should not show up in the clouds
stopwords = set(STOPWORDS)
stopwords.update(["nan", "raw", "association", "associ", "pl"])
stopwords.update(["wat", "sourcetracker", "altamaha", "itasca", "keywords", "wate", "ti", "gsc"])

# Keyword annotation (column 4 of "keyw_2") for each sequencing run ID in "samples"
words = []
for sa in samples:
    try:
        words.append(keyw_2.loc[sa, 4])
    except KeyError:
        # Only a missing run ID is expected here; any other error should surface
        print(sa + " not found")

# Flatten the comma-separated keyword strings into one list of individual keywords
allwords_theta = []
for lin in words:
    if isinstance(lin, str):
        allwords_theta.extend(lin.split(","))
    else:
        # Entries that are not strings (e.g. NaN for a missing annotation) cannot be
        # split; print them so they stay visible instead of being silently dropped
        print(lin)

# Count the keywords of the theta ribozyme samples and show the 15 most common ones
count = Counter(allwords_theta)
print(count.most_common(15))

# Create the word cloud. The text is the string form of the whole "words" list with the
# quote characters removed, so missing annotations appear as the token "nan", which is
# listed in the custom stopwords above.
wc = WordCloud(
    color_func=red_color_func,
    stopwords=stopwords,
    prefer_horizontal=1,
    min_font_size=10,
    max_font_size=150,
    relative_scaling=.4,
    width=1000,
    collocations=False,
    height=400,
    max_words=15,
    random_state=1,
    background_color="white",
).generate(str(words).replace("'", ""))

# Save the word cloud as an SVG with the font embedded. The "with" block closes the
# file even if the write fails.
wordcloud_svg = wc.to_svg(embed_font=True)
with open("wordcloud_theta_red.svg", "w", encoding="utf-8") as f:
    f.write(wordcloud_svg)


In [None]:
# Background comparison: word cloud built from 10,000 randomly drawn samples, to compare against the theta ribozyme word cloud

In [None]:
# Keep only the rows of "keyw_2" that have a keyword annotation (column 4 is not NaN)
nonakeyw = keyw_2[keyw_2[4].notna()]

# Draw 10,000 random rows; the fixed seed 106 (analysis performed on 10.6) keeps the draw reproducible
rnd = nonakeyw.sample(10000, random_state=106)

# Flatten the comma-separated keyword strings of the drawn rows into one list of keywords
allwords = [word for lin in rnd[4] for word in lin.split(",")]

# Count the frequency of each keyword and print the 15 most common ones.
# The result has to be printed explicitly: a bare expression in the middle of a
# cell is evaluated but never displayed.
count = Counter(allwords)
print(count.most_common(15))

# Create a word cloud from column 4 of "rnd", using the same settings as the theta ribozyme word cloud
wc2 = WordCloud(
    color_func=red_color_func,
    stopwords=stopwords,
    prefer_horizontal=1,
    min_font_size=10,
    max_font_size=150,
    relative_scaling=.4,
    width=1000,
    collocations=False,
    height=400,
    max_words=15,
    random_state=1,
    background_color="white",
).generate(",".join(rnd[4].values))

# Convert the word cloud to an SVG image with the font embedded and write it to
# "wordcloud_random.svg". The "with" block closes the file even if the write fails.
wordcloud_svg2 = wc2.to_svg(embed_font=True)
with open("wordcloud_random.svg", "w", encoding="utf-8") as f:
    f.write(wordcloud_svg2)