# UMAP clustering for the methylation signature Position Weight Matrices
## Wastewater data

In [None]:
import pandas as pd
import numpy as np
import warnings

from matplotlib.colors import ListedColormap

import os 
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import umap
from sklearn.metrics import silhouette_score
from sklearn.cluster import DBSCAN
import seaborn as sns
import os
from PIL import Image, ImageFont
from sklearn.cluster import KMeans

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from sklearn.cluster import KMeans
import matplotlib.lines as mlines


seed = 98
import matplotlib.pyplot as plt
import seaborn as sns

# EFF1
## Import data

In [None]:
# Absolute location of the figure output folder.
# NOTE(review): the cells visible here never read this variable; they write to
# the relative 'UMAP_WW/' directory instead -- confirm which one is intended.
path_to_images = '/scratch/project_2006608/Methylation/notebooks/UMAP_WW/'

## FILTERING
### > 100 lines in .gff

In [None]:
# Read the tab-separated EFF1 matrix table; the first column becomes the row index.
file_path = '/scratch/project_2006608/Methylation/WW_data/EFF1_matrices_top100/flattened/EFF1_concat_matrices_top100.tsv'
EFF1_matrices = pd.read_csv(file_path, sep='\t', index_col=0, low_memory=False)

In [None]:
# Number of rows in the raw EFF1 table.
print(EFF1_matrices.shape[0])

In [None]:
# (rows, columns) of the raw EFF1 table, followed by a preview of its first rows.
print(EFF1_matrices.shape)
EFF1_matrices.head()

In [None]:
# Keep only the rows that have at least one non-zero entry among the
# first 492 columns; all-zero rows are dropped.
has_signal = EFF1_matrices.iloc[:, :492].ne(0).any(axis=1)
EFF1_df = EFF1_matrices.loc[has_signal]
EFF1_df.head()

In [None]:
# Rows per value of the 'sample' column.
# NOTE(review): the result is neither printed nor the last expression of the
# cell, so a notebook will not display it.
EFF1_df['sample'].value_counts()
# All columns except the last one -- the same slice that is fed to UMAP below
# (presumably the last column is 'sample'; confirm against the table layout).
print(EFF1_df.iloc[:, :-1])

In [None]:
# UMAP projection of the filtered EFF1 matrices, k-means on the embedding
# (cluster labels are only exposed through the hover text), and export of the
# figure as PNG + HTML.
#
# Parameter grid explored earlier:
#   n_neighbors = [20, 30]
#   min_dist = [0.01, 0.1, 0.2]
# Only the best-looking combination is kept:
n_neighbors = [20]
min_dist = [0.1]

color_dict = ['#05b5bb']

# Shared styling: boxed axes with a light grid, horizontal legend below the plot.
boxed_axis = dict(
    showgrid=True,
    gridcolor='lightgray',
    zeroline=False,
    showline=True,
    linecolor='black',
    linewidth=1,
    mirror=True,
)
legend_below = dict(
    x=0.5,
    y=-0.1,
    traceorder="normal",
    xanchor='center',
    yanchor='top',
    orientation='h',
)

for n, m in [(nn, md) for nn in n_neighbors for md in min_dist]:
    title = f' Wastewater - UMAP with n_neighbors={n}, min_dist={m}'

    # Embed every column except the last one.
    reducer = umap.UMAP(n_neighbors=n, min_dist=m, random_state=seed)
    embedding = reducer.fit_transform(EFF1_df.iloc[:, :-1])

    UMAP_EFF1_df = pd.DataFrame({
        'UMAP1': embedding[:, 0],
        'UMAP2': embedding[:, 1],
        'contig': EFF1_df.index
    })

    # k-means with 11 clusters, fitted on the 2-D embedding.
    kmeans = KMeans(n_clusters=11, random_state=seed)
    k_labels = kmeans.fit_predict(embedding)
    UMAP_EFF1_df['cluster'] = k_labels

    fig = px.scatter(
        UMAP_EFF1_df,
        x='UMAP1',
        y='UMAP2',
        title=title,
        color_discrete_sequence=color_dict,
        hover_data={'cluster': True, 'contig': True},
    )
    fig.update_layout(
        height=1500,
        width=1200,
        title_text=title,
        showlegend=True,
        legend=legend_below,
        template='simple_white',
        xaxis=dict(boxed_axis),
        yaxis=dict(boxed_axis),
    )
    fig.show()
    fig.write_image(f'UMAP_WW/EFF1_UMAP_{n}_{m}_above100.png')
    fig.write_html(f'UMAP_WW/EFF1_UMAP_{n}_{m}_above100.html')

## Attach mod count data

In [None]:
## Bring the modification counts from Puhti:
# Tab-separated table with a header row; the first column becomes the index.
file_path = '/scratch/project_2006608/Methylation/WW_data/EFF1_contigs/EFF1_mod_counts.txt'

EFF1_df_mod_counts = pd.read_csv(file_path, sep='\t', index_col=0, header=0, low_memory=False)
print(EFF1_df_mod_counts.shape[0])
EFF1_df_mod_counts.head()

# Replace the raw counts by their natural logarithm (applied to every column).
# NOTE(review): a count of 0 would become -inf -- confirm the table has none.
EFF1_df_mod_counts = np.log(EFF1_df_mod_counts)

In [None]:
## Join the log-transformed modification counts onto the filtered matrices
EFF1_df_ext = EFF1_df.copy()

# Select the count rows by the matrix index so both tables share the same
# contig order before they are placed side by side.
EFF1_df_ordered = EFF1_df_mod_counts.loc[EFF1_df_ext.index]

# NOTE: EFF1_df_mod_counts is rebound here from the counts table to the merged
# table, so this cell must not be re-run without re-running the loading cell.
EFF1_df_mod_counts = pd.concat([EFF1_df_ext, EFF1_df_ordered], axis='columns')

# Everything except the last two columns (the slice used for UMAP below).
print(EFF1_df_mod_counts.iloc[:, :-2])

In [None]:
# UMAP of the merged EFF1 table (all columns but the last two), coloured by the
# 'mod_count' column on a continuous rainbow scale.
n_neighbors = [20]
min_dist = [0.1]

# Shared styling: boxed axes with a light grid, horizontal legend below the plot.
boxed_axis = dict(
    showgrid=True,
    gridcolor='lightgray',
    zeroline=False,
    showline=True,
    linecolor='black',
    linewidth=1,
    mirror=True,
)
legend_below = dict(
    x=0.5,
    y=-0.05,
    traceorder="normal",
    xanchor='center',
    yanchor='top',
    orientation='h',
)

for n, m in [(nn, md) for nn in n_neighbors for md in min_dist]:
    title = f' Wastewater EFF1 - UMAP with n_neighbors={n}, min_dist={m}'

    reducer = umap.UMAP(n_neighbors=n, min_dist=m, random_state=seed)
    embedding = reducer.fit_transform(EFF1_df_mod_counts.iloc[:, :-2])

    EFF1_mod_counts_UMAP_df = pd.DataFrame({
        'UMAP1': embedding[:, 0],
        'UMAP2': embedding[:, 1],
        'contig': EFF1_df_mod_counts.index,
        'mod_count': EFF1_df_mod_counts['mod_count']
    })

    # k was picked by eye from the UMAP; labels only appear in the hover text.
    kmeans = KMeans(n_clusters=11, random_state=seed)
    k_labels = kmeans.fit_predict(embedding)
    EFF1_mod_counts_UMAP_df['cluster'] = k_labels

    fig = px.scatter(
        EFF1_mod_counts_UMAP_df,
        x='UMAP1',
        y='UMAP2',
        color='mod_count',
        title=title,
        color_continuous_scale=px.colors.sequential.Rainbow,
        hover_data={'cluster': True, 'contig': True},
    )
    fig.update_layout(
        height=1700,
        width=1200,
        title_text=title,
        showlegend=True,
        legend=legend_below,
        template='simple_white',
        xaxis=dict(boxed_axis),
        yaxis=dict(boxed_axis),
    )
    fig.show()
    # Export currently disabled:
    #fig.write_image(f'UMAP_WW/EFF1_UMAP_{n}_{m}_contig_mod_counts_log_above100.png')
    #fig.write_html(f'UMAP_WW/EFF1_UMAP_{n}_{m}_contig_mod_counts_log_above100.html')

## Attach ARG data
### Counts
### Lengths
### Contig lengths
### fARGene results

### ARG Counts

In [None]:
# Read the ARG counts table (tab-separated, header row, first column as index),
# print its number of rows and preview it.
file_path = '/scratch/project_2006608/Methylation/WW_data/EFF1_contigs/EFF1_ARG_counts.txt'

EFF1_df_ARG_counts = pd.read_csv(file_path, sep='\t', index_col=0, header=0, low_memory=False)
print(EFF1_df_ARG_counts.shape[0])
EFF1_df_ARG_counts.head()

In [None]:
## Join the ARG counts onto the filtered matrices
EFF1_df_ext = EFF1_df.copy()

# Select the ARG-count rows by the matrix index so both tables share the same
# contig order before they are placed side by side.
EFF1_df_ordered = EFF1_df_ARG_counts.loc[EFF1_df_ext.index]

# NOTE: EFF1_df_ARG_counts is rebound here from the counts table to the merged
# table, so this cell must not be re-run without re-running the loading cell.
EFF1_df_ARG_counts = pd.concat([EFF1_df_ext, EFF1_df_ordered], axis='columns')
EFF1_df_ARG_counts.tail()
#print(EFF1_df_ARG_counts.dtypes)

### ARG Names

In [None]:
# Read the ARG names table (tab-separated, header row, first column as index),
# print its number of rows and preview it.
file_path = '/scratch/project_2006608/Methylation/WW_data/EFF1_contigs/EFF1_ARG_names.txt'

EFF1_df_ARG_names = pd.read_csv(file_path, sep='\t', index_col=0, header=0, low_memory=False)
print(EFF1_df_ARG_names.shape[0])
EFF1_df_ARG_names.head()

In [None]:
## Join the ARG names onto the table that already carries the ARG counts
EFF1_df_ext = EFF1_df_ARG_counts.copy()

# Select the ARG-name rows by the merged table's index so both share the same
# contig order before they are placed side by side.
EFF1_df_ordered = EFF1_df_ARG_names.loc[EFF1_df_ext.index]

# NOTE: EFF1_df_ARG_names is rebound here from the names table to the merged
# table, so this cell must not be re-run without re-running the loading cell.
EFF1_df_ARG_names = pd.concat([EFF1_df_ext, EFF1_df_ordered], axis='columns')
EFF1_df_ARG_names.tail()

In [None]:
# Rows whose ARG_name contains the literal text 'erm(F)_3'
# (case-insensitive, plain substring match, missing names count as no match).
is_erm_f3 = EFF1_df_ARG_names['ARG_name'].str.contains(
    'erm(F)_3', case=False, na=False, regex=False
)
erm_F = EFF1_df_ARG_names[is_erm_f3]
print(erm_F)

### Contig lengths

In [None]:
## Bring the contig lengths from Puhti:
# Tab-separated table with a header row; the first column becomes the index.
file_path = '/scratch/project_2006608/Methylation/WW_data/EFF1_contigs/EFF1_contigs_lengths.txt'

EFF1_df_contigs_lengths = pd.read_csv(file_path, sep='\t', index_col=0, header=0, low_memory=False)
# Number of rows, then a preview of the first rows.
print(EFF1_df_contigs_lengths.shape[0])
EFF1_df_contigs_lengths.head()

In [None]:
## Join the contig lengths onto the table carrying ARG counts and names
EFF1_df_ext = EFF1_df_ARG_names.copy()

# Select the length rows by the merged table's index so both share the same
# contig order before they are placed side by side.
EFF1_df_ordered = EFF1_df_contigs_lengths.loc[EFF1_df_ext.index]

EFF1_data = pd.concat([EFF1_df_ext, EFF1_df_ordered], axis='columns')

# Everything except the last four columns (the slice used for UMAP below).
print(EFF1_data.iloc[:, :-4])

In [None]:
n_neighbors = [20]
min_dist = [0.1]

colors = [0, 1, 2, 3]
color_map = {0: '#8ce6e9', 1: '#fa7a31', 2: '#eb340f', 3: '#d01af5'}

# Map colors to each data point
custom_colors = [color_map[val] for val in colors]

for n in n_neighbors:
    for m in min_dist:
        reducer = umap.UMAP(n_neighbors=n, min_dist=m, random_state=seed)
        embedding = reducer.fit_transform(EFF1_data.iloc[:, :-4])
        EFF1_UMAP_data  = pd.DataFrame({
            'UMAP1': embedding[:, 0],
            'UMAP2': embedding[:, 1],
            'contig': EFF1_data.index,
            'ARG_count':EFF1_data['ARG_count'],
            'ARG_name':EFF1_data['ARG_name'],
            'contig_length':EFF1_data['length'],
        })

        EFF1_UMAP_data['ARG_count'] = EFF1_UMAP_data['ARG_count'].astype(str)
        
        fig = px.scatter(EFF1_UMAP_data, 
                            x='UMAP1', 
                            y='UMAP2', 
                            color='ARG_count',
                            title=f' Wastewater EFF1 - UMAP with n_neighbors={n}, min_dist={m}', 
                            color_discrete_sequence=custom_colors,
                            hover_data={'contig': True, 'ARG_count': True, 'ARG_name': True, 'contig_length': True} )
        title = f' Wastewater EFF1 - UMAP with n_neighbors={n}, min_dist={m}'
        fig.update_layout(
            height=1700,
            width=1200,
            title_text=title,
            showlegend=True,
            legend=dict(
                x=0.5,
                y=-0.05,
                traceorder="normal",
                xanchor='center',
                yanchor='top',
                orientation='h'
            ),
            template='simple_white',
            xaxis=dict(
                showgrid=True,
                gridcolor='lightgray',
                zeroline=False,
                showline=True,  
                linecolor='black', 
                linewidth=1,
                mirror=True
            ),
            yaxis=dict(
                showgrid=True,
                gridcolor='lightgray',
                zeroline=False,
                showline=True,
                linecolor='black',
                linewidth=1,
                mirror=True
            )
        )
        fig.show()
        fig.write_image(f'UMAP_WW/EFF1_UMAP_{n}_{m}_contig_ARG_counts_above100.png')
        fig.write_html(f'UMAP_WW/EFF1_UMAP_{n}_{m}_contig_ARG_counts_above100.html')

### fARGene results

In [69]:
## Bring the fARGene annotations from Puhti:
# Tab-separated table with a header row; the first column becomes the index.
file_path = '/scratch/project_2006608/Methylation/WW_data/EFF1_contigs/EFF1_fARGene_names.txt'

EFF1_fargene = pd.read_csv(file_path, sep='\t', index_col=0, header=0, low_memory=False)
# Number of rows, then a preview of the first rows.
print(EFF1_fargene.shape[0])
EFF1_fargene.head()

60374


Unnamed: 0_level_0,fARGene
contig,Unnamed: 1_level_1
s10237.ctg011722l,beta_lactamase_d_2
s10431.ctg011956l,beta_lactamase_b_3
s10664.ctg012232l,beta_lactamase_b_3
s10681.ctg012254l,beta_lactamase_a;beta_lactamase_b_3
s10783.ctg012374l,beta_lactamase_d_2


In [70]:
# Select the fARGene rows by EFF1_data's index so both tables share the same
# contig order, then place them side by side.
EFF1_fargene_ordered = EFF1_fargene.loc[EFF1_data.index]
EFF1_data_fargene = pd.concat([EFF1_data, EFF1_fargene_ordered], axis='columns')

# First the slice used for UMAP (all but the last five columns), then the
# complete merged table.
print(EFF1_data_fargene.iloc[:, :-5])
print(EFF1_data_fargene)

                   -20_A_m4C  -19_A_m4C  -18_A_m4C  -17_A_m4C  -16_A_m4C  \
0                                                                          
s20578.ctg024035l   0.207234  -0.074724  -0.284370   0.062514  -0.162525   
s10547.ctg012093l   0.000000   0.000000   0.000000   0.000000   0.000000   
s15252.ctg017667l   0.000000   0.000000   0.000000   0.000000   0.000000   
s29924.ctg035497l   0.000000   0.000000   0.000000   0.000000   0.000000   
s4110.ctg004596l   -0.211630  -0.257595  -0.409714  -0.409714  -0.356395   
...                      ...        ...        ...        ...        ...   
s7415.ctg018875l    0.000000   0.000000   0.000000   0.000000   0.000000   
s18782.ctg021879l  -0.598791  -0.162676  -0.401027  -0.315108   0.239343   
s2287.ctg028073l    0.000000   0.000000   0.000000   0.000000   0.000000   
s13637.ctg015736l   0.000000   0.000000   0.000000   0.000000   0.000000   
s182.ctg006099l     0.000000   0.000000   0.000000   0.000000   0.000000   

           

### Plot fARGene

In [71]:
# UMAP of EFF1_data_fargene (all columns but the last five), coloured by the
# 'fARGene' column using a 22-colour discrete palette.
n_neighbors = [20]
min_dist = [0.1]

colors = list(range(22))

color_map = {
    0: "#fabefa", 1: "#d86950", 2: "#ef360c", 3: "#a27faf",
    4: "#c308a4", 5: "#f7b2a5", 6: "#c62204", 7: "#04c60a",
    8: "#1e7e21", 9: "#779e78", 10: "black", 11: "#25a5a5",
    12: "#e8db16", 13: "#1656e8", 14: "#0f378e", 15: "#86a4eb",
    16: "#5b48d8", 17: "#146eb4", 18: "#6f87f3", 19: "#85baec",
    20: "#04bdfe", 21: "#0d68c2",
}

# Palette as an ordered list, in key order 0..21.
custom_colors = [color_map[val] for val in colors]

# Shared styling: boxed axes with a light grid.
boxed_axis = dict(
    showgrid=True,
    gridcolor='lightgray',
    zeroline=False,
    showline=True,
    linecolor='black',
    linewidth=1,
    mirror=True,
)

for n, m in [(nn, md) for nn in n_neighbors for md in min_dist]:
    title = f' Wastewater EFF1 - UMAP with n_neighbors={n}, min_dist={m}'

    reducer = umap.UMAP(n_neighbors=n, min_dist=m, random_state=seed)
    embedding = reducer.fit_transform(EFF1_data_fargene.iloc[:, :-5])
    EFF1_UMAP_data = pd.DataFrame({
        'UMAP1': embedding[:, 0],
        'UMAP2': embedding[:, 1],
        'contig': EFF1_data_fargene.index,
        'ARG_count': EFF1_data_fargene['ARG_count'],
        'ARG_name': EFF1_data_fargene['ARG_name'],
        'contig_length': EFF1_data_fargene['length'],
        'fARGene': EFF1_data_fargene['fARGene'],
    })

    EFF1_UMAP_data['ARG_count'] = EFF1_UMAP_data['ARG_count'].astype(str)

    fig = px.scatter(
        EFF1_UMAP_data,
        x='UMAP1',
        y='UMAP2',
        color='fARGene',
        title=title,
        color_discrete_sequence=custom_colors,
        hover_data={'contig': True, 'ARG_count': True, 'ARG_name': True, 'contig_length': True},
    )
    fig.update_layout(
        height=1700,
        width=1200,
        title_text=title,
        showlegend=True,
        legend=dict(
            x=0.5,
            y=-0.05,
            traceorder="normal",
            xanchor='center',
            yanchor='top',
            orientation='h'
        ),
        template='simple_white',
        xaxis=dict(boxed_axis),
        yaxis=dict(boxed_axis),
    )
    fig.show()
    # Export currently disabled:
    #fig.write_image(f'UMAP_WW/EFF1_UMAP_{n}_{m}_contig_fARGene_counts_above100.png')
    #fig.write_html(f'UMAP_WW/EFF1_UMAP_{n}_{m}_contig_fARGene_counts_above100.html')


n_jobs value -1 overridden to 1 by setting random_state. Use no seed for parallelism.



ValueError: 
    Invalid value of type 'builtins.str' received for the 'color' property of scattergl.marker
        Received value: 'navyblue'

    The 'color' property is a color and may be specified as:
      - A hex string (e.g. '#ff0000')
      - An rgb/rgba string (e.g. 'rgb(255,0,0)')
      - An hsl/hsla string (e.g. 'hsl(0,100%,50%)')
      - An hsv/hsva string (e.g. 'hsv(0,100%,100%)')
      - A named CSS color:
            aliceblue, antiquewhite, aqua, aquamarine, azure,
            beige, bisque, black, blanchedalmond, blue,
            blueviolet, brown, burlywood, cadetblue,
            chartreuse, chocolate, coral, cornflowerblue,
            cornsilk, crimson, cyan, darkblue, darkcyan,
            darkgoldenrod, darkgray, darkgrey, darkgreen,
            darkkhaki, darkmagenta, darkolivegreen, darkorange,
            darkorchid, darkred, darksalmon, darkseagreen,
            darkslateblue, darkslategray, darkslategrey,
            darkturquoise, darkviolet, deeppink, deepskyblue,
            dimgray, dimgrey, dodgerblue, firebrick,
            floralwhite, forestgreen, fuchsia, gainsboro,
            ghostwhite, gold, goldenrod, gray, grey, green,
            greenyellow, honeydew, hotpink, indianred, indigo,
            ivory, khaki, lavender, lavenderblush, lawngreen,
            lemonchiffon, lightblue, lightcoral, lightcyan,
            lightgoldenrodyellow, lightgray, lightgrey,
            lightgreen, lightpink, lightsalmon, lightseagreen,
            lightskyblue, lightslategray, lightslategrey,
            lightsteelblue, lightyellow, lime, limegreen,
            linen, magenta, maroon, mediumaquamarine,
            mediumblue, mediumorchid, mediumpurple,
            mediumseagreen, mediumslateblue, mediumspringgreen,
            mediumturquoise, mediumvioletred, midnightblue,
            mintcream, mistyrose, moccasin, navajowhite, navy,
            oldlace, olive, olivedrab, orange, orangered,
            orchid, palegoldenrod, palegreen, paleturquoise,
            palevioletred, papayawhip, peachpuff, peru, pink,
            plum, powderblue, purple, red, rosybrown,
            royalblue, rebeccapurple, saddlebrown, salmon,
            sandybrown, seagreen, seashell, sienna, silver,
            skyblue, slateblue, slategray, slategrey, snow,
            springgreen, steelblue, tan, teal, thistle, tomato,
            turquoise, violet, wheat, white, whitesmoke,
            yellow, yellowgreen
      - A number that will be interpreted as a color
        according to scattergl.marker.colorscale
      - A list or array of any of the above

## Focus on distinct clusters
### Leave out the most miscellaneous cluster and redraw

In [None]:
# Keep the points whose embedding falls inside the window
# -7 <= UMAP1 <= 6 and -1 <= UMAP2 <= 19 (bounds inclusive).
inside_window = (
    EFF1_UMAP_data['UMAP1'].between(-7, 6)
    & EFF1_UMAP_data['UMAP2'].between(-1, 19)
)
EFF1_UMAP_data_focused = EFF1_UMAP_data.loc[inside_window]

# Pull the matching rows of the full data table via their contig IDs.
EFF1_data_focused = EFF1_data[EFF1_data.index.isin(EFF1_UMAP_data_focused['contig'])]
print(EFF1_data_focused)

In [None]:
# Write the contig IDs of the focused subset to a text file, one ID per line.
EFF1_focused_contigs = EFF1_data_focused.index.to_list()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'EFF1_focused_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{item}\n" for item in EFF1_focused_contigs)

In [None]:
# Focused subset without its last four columns -- the slice passed to UMAP below.
print(EFF1_data_focused.iloc[:, :-4])

In [None]:
# UMAP re-computed on the focused subset only (all columns but the last four).
# Points are coloured by ARG count and sized by the square root of the contig
# length; the figure is exported as PNG + HTML.
n_neighbors = [20]
min_dist = [0.1]

colors = [0, 1, 2, 3]
color_map = {0: '#b4f3f5', 1: '#fa7a31', 2: '#eb340f', 3: '#d01af5'}

# Palette in ARG-count order; kept as the fallback sequence for any category
# that is not covered by the explicit mapping below.
custom_colors = [color_map[val] for val in colors]

# ARG_count is converted to str further down so Plotly treats it as a
# category. A bare colour sequence is handed out in order of first appearance
# in the data, so the colour of a given count depended on row order. Pinning
# label -> colour and the category order makes colours and legend stable.
arg_count_colors = {str(count): shade for count, shade in color_map.items()}
arg_count_order = [str(count) for count in colors]

# Shared styling: boxed axes with a light grid.
boxed_axis = dict(
    showgrid=True,
    gridcolor='lightgray',
    zeroline=False,
    showline=True,
    linecolor='black',
    linewidth=1,
    mirror=True,
)

for n in n_neighbors:
    for m in min_dist:
        title = f' Wastewater EFF1 - UMAP with n_neighbors={n}, min_dist={m}'

        reducer = umap.UMAP(n_neighbors=n, min_dist=m, random_state=seed)
        embedding = reducer.fit_transform(EFF1_data_focused.iloc[:, :-4])
        EFF1_UMAP_data_focused = pd.DataFrame({
            'UMAP1': embedding[:, 0],
            'UMAP2': embedding[:, 1],
            'contig': EFF1_data_focused.index,
            'ARG_count': EFF1_data_focused['ARG_count'],
            'ARG_name': EFF1_data_focused['ARG_name'],
            'contig_length': EFF1_data_focused['length'],
        })

        # String labels -> discrete colours instead of a continuous colour bar.
        EFF1_UMAP_data_focused['ARG_count'] = EFF1_UMAP_data_focused['ARG_count'].astype(str)
        # Two rescalings of the contig length; only the sqrt one drives marker size.
        EFF1_UMAP_data_focused['log_contig_length'] = np.log(EFF1_UMAP_data_focused['contig_length'])
        EFF1_UMAP_data_focused['sqrt_contig_length'] = np.sqrt(EFF1_UMAP_data_focused['contig_length'])

        fig = px.scatter(
            EFF1_UMAP_data_focused,
            x='UMAP1',
            y='UMAP2',
            color='ARG_count',
            title=title,
            color_discrete_map=arg_count_colors,
            color_discrete_sequence=custom_colors,
            category_orders={'ARG_count': arg_count_order},
            hover_data={'contig': True, 'ARG_count': True, 'ARG_name': True, 'contig_length': True},
            size='sqrt_contig_length',
        )
        fig.update_layout(
            height=1700,
            width=1200,
            title_text=title,
            showlegend=True,
            legend=dict(
                x=0.5,
                y=-0.05,
                traceorder="normal",
                xanchor='center',
                yanchor='top',
                orientation='h'
            ),
            template='simple_white',
            xaxis=dict(boxed_axis),
            yaxis=dict(boxed_axis),
        )
        fig.show()
        # Export of the variant without size encoding is disabled:
        #fig.write_image(f'UMAP_WW/EFF1_UMAP_{n}_{m}_ARG_counts_focused_above100.png')
        #fig.write_html(f'UMAP_WW/EFF1_UMAP_{n}_{m}_ARG_counts_focused_above100.html')
        fig.write_image(f'UMAP_WW/EFF1_UMAP_{n}_{m}_ARG_counts_lengths_sqrt_focused_above100.png')
        fig.write_html(f'UMAP_WW/EFF1_UMAP_{n}_{m}_ARG_counts_lengths_sqrt_focused_above100.html')

### Extract clusters
#### C1

In [None]:
# Cluster C1: points with 2.2 <= UMAP1 <= 2.4 and 3 <= UMAP2 <= 3.15
# (bounds inclusive) in the focused embedding.
in_c1_box = (
    EFF1_UMAP_data_focused['UMAP1'].between(2.2, 2.4)
    & EFF1_UMAP_data_focused['UMAP2'].between(3, 3.15)
)
EFF1_data_focused_C1 = EFF1_UMAP_data_focused.loc[in_c1_box]

# Matching rows of the focused data table, looked up by contig ID.
EFF1_data_C1 = EFF1_data_focused[EFF1_data_focused.index.isin(EFF1_data_focused_C1['contig'])]
print(EFF1_data_C1)

In [None]:
# Write the contig IDs of cluster C1 to a text file, one ID per line.
EFF1_C1_contigs = EFF1_data_C1.index.to_list()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'EFF1_C1_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{item}\n" for item in EFF1_C1_contigs)

### Extract clusters
#### C2

In [None]:
# Cluster C2: points with 2.6 <= UMAP1 <= 2.8 and 3.2 <= UMAP2 <= 3.3
# (bounds inclusive) in the focused embedding.
in_c2_box = (
    EFF1_UMAP_data_focused['UMAP1'].between(2.6, 2.8)
    & EFF1_UMAP_data_focused['UMAP2'].between(3.2, 3.3)
)
EFF1_data_focused_C2 = EFF1_UMAP_data_focused.loc[in_c2_box]

# Matching rows of the focused data table, looked up by contig ID.
EFF1_data_C2 = EFF1_data_focused[EFF1_data_focused.index.isin(EFF1_data_focused_C2['contig'])]
print(EFF1_data_C2)

In [None]:
# Write the contig IDs of cluster C2 to a text file, one ID per line.
EFF1_C2_contigs = EFF1_data_C2.index.to_list()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'EFF1_C2_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{item}\n" for item in EFF1_C2_contigs)

### Extract clusters
#### C3

In [None]:
# Cluster C3: points with -6 <= UMAP1 <= -5.8 and 10.6 <= UMAP2 <= 10.8
# (bounds inclusive) in the focused embedding.
in_c3_box = (
    EFF1_UMAP_data_focused['UMAP1'].between(-6, -5.8)
    & EFF1_UMAP_data_focused['UMAP2'].between(10.6, 10.8)
)
EFF1_data_focused_C3 = EFF1_UMAP_data_focused.loc[in_c3_box]

# Tighter window inside C3: -5.88 <= UMAP1 <= -5.865, 10.704 <= UMAP2 <= 10.716.
in_c3_core = (
    EFF1_UMAP_data_focused['UMAP1'].between(-5.88, -5.865)
    & EFF1_UMAP_data_focused['UMAP2'].between(10.704, 10.716)
)
EFF1_data_focused_C3_f = EFF1_UMAP_data_focused.loc[in_c3_core]

# Matching rows of the focused data table, looked up by contig ID.
EFF1_data_C3 = EFF1_data_focused[EFF1_data_focused.index.isin(EFF1_data_focused_C3['contig'])]
print(EFF1_data_C3)

EFF1_data_C3_f = EFF1_data_focused[EFF1_data_focused.index.isin(EFF1_data_focused_C3_f['contig'])]
print(EFF1_data_C3_f)

In [None]:
# Write the contig IDs of cluster C3 and of its tighter sub-selection (C3_f)
# to two separate text files, one ID per line.
EFF1_C3_contigs = EFF1_data_C3.index.to_list()
EFF1_C3_f_contigs = EFF1_data_C3_f.index.to_list()

directory = 'UMAP_WW'
os.makedirs(directory, exist_ok=True)

# Each list gets its own path. Assigning file_path twice before writing made
# both writes target the C3_f file: the C3 list was overwritten and
# EFF1_C3_contigs_above100.txt was never created.
contig_lists = {
    'EFF1_C3_contigs_above100.txt': EFF1_C3_contigs,
    'EFF1_C3_f_contigs_above100.txt': EFF1_C3_f_contigs,
}

for file_name, contigs in contig_lists.items():
    file_path = os.path.join(directory, file_name)
    with open(file_path, 'w') as file:
        file.writelines(f"{item}\n" for item in contigs)

# INF1
## Import data

In [None]:
# Absolute location of the figure output folder.
# NOTE(review): the cells visible here never read this variable; they write to
# the relative 'UMAP_WW/' directory instead -- confirm which one is intended.
path_to_images = '/scratch/project_2006608/Methylation/notebooks/UMAP_WW/'

## FILTERING
### > 100 lines in .gff

In [None]:
# Read the tab-separated INF1 matrix table; the first column becomes the row index.
file_path = '/scratch/project_2006608/Methylation/WW_data/INF1_matrices_top100/flattened/INF1_concat_matrices_top100.tsv'
INF1_matrices = pd.read_csv(file_path, sep='\t', index_col=0, low_memory=False)

In [None]:
# Number of rows in the raw INF1 table.
print(INF1_matrices.shape[0])

In [None]:
# (rows, columns) of the raw INF1 table, followed by a preview of its first rows.
print(INF1_matrices.shape)
INF1_matrices.head()

In [None]:
# Keep only the rows that have at least one non-zero entry among the
# first 492 columns; all-zero rows are dropped.
has_signal = INF1_matrices.iloc[:, :492].ne(0).any(axis=1)
INF1_df = INF1_matrices.loc[has_signal]
INF1_df.head()

In [None]:
# Rows per value of the 'sample' column.
# NOTE(review): the result is neither printed nor the last expression of the
# cell, so a notebook will not display it.
INF1_df['sample'].value_counts()
# All columns except the last one -- the same slice that is fed to UMAP below
# (presumably the last column is 'sample'; confirm against the table layout).
print(INF1_df.iloc[:, :-1])

In [None]:
# UMAP projection of the filtered INF1 matrices, drawn in a single colour and
# exported as PNG + HTML.
#
# Parameter grid explored earlier:
#   n_neighbors = [20, 30]
#   min_dist = [0.01, 0.1, 0.2]
# Only the best-looking combination is kept:
n_neighbors = [20]
min_dist = [0.1]

color_dict = ['#f9c82e']

# Shared styling: boxed axes with a light grid, horizontal legend below the plot.
boxed_axis = dict(
    showgrid=True,
    gridcolor='lightgray',
    zeroline=False,
    showline=True,
    linecolor='black',
    linewidth=1,
    mirror=True,
)
legend_below = dict(
    x=0.5,
    y=-0.1,
    traceorder="normal",
    xanchor='center',
    yanchor='top',
    orientation='h',
)

for n, m in [(nn, md) for nn in n_neighbors for md in min_dist]:
    title = f' Wastewater INF1 - UMAP with n_neighbors={n}, min_dist={m}'

    # Embed every column except the last one.
    reducer = umap.UMAP(n_neighbors=n, min_dist=m, random_state=seed)
    embedding = reducer.fit_transform(INF1_df.iloc[:, :-1])

    UMAP_INF1_df = pd.DataFrame({
        'UMAP1': embedding[:, 0],
        'UMAP2': embedding[:, 1],
        'contig': INF1_df.index
    })

    fig = px.scatter(
        UMAP_INF1_df,
        x='UMAP1',
        y='UMAP2',
        title=title,
        color_discrete_sequence=color_dict,
        hover_data={'contig': True},
    )
    fig.update_layout(
        height=1500,
        width=1200,
        title_text=title,
        showlegend=True,
        legend=legend_below,
        template='simple_white',
        xaxis=dict(boxed_axis),
        yaxis=dict(boxed_axis),
    )
    fig.show()
    fig.write_image(f'UMAP_WW/INF1_UMAP_{n}_{m}_above100.png')
    fig.write_html(f'UMAP_WW/INF1_UMAP_{n}_{m}_above100.html')

## Attach mod count data

In [None]:
## Bring the per-contig modification counts from Puhti:
file_path = '/scratch/project_2006608/Methylation/WW_data/INF1_contigs/INF1_mod_counts.txt'

INF1_df_mod_counts = pd.read_csv(
    file_path,
    sep='\t',
    index_col=0,
    header=0,
    low_memory=False,
)
print(len(INF1_df_mod_counts))
INF1_df_mod_counts.head()

# Replace every value of the table by its natural logarithm
INF1_df_mod_counts = np.log(INF1_df_mod_counts)

In [None]:
## Append the log-transformed mod counts to the matrix table
INF1_df_ext = INF1_df.copy()
INF1_df_ext.head()
INF1_df_mod_counts.head()

# Reorder the mod-count rows so that they follow the matrix table's index
INF1_df_ordered = INF1_df_mod_counts.loc[INF1_df_ext.index]

# NOTE: from here on INF1_df_mod_counts is the merged table, not the raw counts
INF1_df_mod_counts = pd.concat(
    [INF1_df_ext, INF1_df_ordered],
    axis='columns',
)
INF1_df_mod_counts.tail()

# Everything except the last two columns is what goes into UMAP
print(INF1_df_mod_counts.iloc[:, :-2])

In [None]:
n_neighbors = [20]
min_dist = [0.1]

# Styling shared by the x and the y axis: light grid, black mirrored frame
axis_style = dict(
    showgrid=True,
    gridcolor='lightgray',
    zeroline=False,
    showline=True,
    linecolor='black',
    linewidth=1,
    mirror=True,
)

# Horizontal legend, centred below the plot area
legend_style = dict(
    x=0.5,
    y=-0.05,
    traceorder="normal",
    xanchor='center',
    yanchor='top',
    orientation='h',
)

for n in n_neighbors:
    for m in min_dist:
        title = f' Wastewater INF1 - UMAP with n_neighbors={n}, min_dist={m}'

        # Embed all columns except the last two; mod_count is only used for colouring
        reducer = umap.UMAP(n_neighbors=n, min_dist=m, random_state=seed)
        embedding = reducer.fit_transform(INF1_df_mod_counts.iloc[:, :-2])
        INF1_mod_counts_UMAP_df = pd.DataFrame({
            'UMAP1': embedding[:, 0],
            'UMAP2': embedding[:, 1],
            'contig': INF1_df_mod_counts.index,
            'mod_count': INF1_df_mod_counts['mod_count'],
        })

        fig = px.scatter(
            INF1_mod_counts_UMAP_df,
            x='UMAP1',
            y='UMAP2',
            color='mod_count',
            title=title,
            color_continuous_scale=px.colors.sequential.Rainbow,
            hover_data={'contig': True},
        )
        fig.update_layout(
            height=1700,
            width=1200,
            title_text=title,
            showlegend=True,
            legend=legend_style,
            template='simple_white',
            xaxis=dict(axis_style),
            yaxis=dict(axis_style),
        )
        fig.show()

        # Static and interactive copies of the figure
        fig.write_image(f'UMAP_WW/INF1_UMAP_{n}_{m}_contig_mod_counts_log_above100.png')
        fig.write_html(f'UMAP_WW/INF1_UMAP_{n}_{m}_contig_mod_counts_log_above100.html')

## Attach ARG data
### Counts
### Names
### Contig lengths

### ARG Counts

In [None]:
# Load the ARG count table for INF1 (tab-separated, first column is the index)
file_path = '/scratch/project_2006608/Methylation/WW_data/INF1_contigs/INF1_ARG_counts.txt'

INF1_df_ARG_counts = pd.read_csv(
    file_path,
    sep='\t',
    index_col=0,
    header=0,
    low_memory=False,
)
print(len(INF1_df_ARG_counts))
INF1_df_ARG_counts.head()

In [None]:
## Append the ARG counts to the matrix table
INF1_df_ext = INF1_df.copy()
INF1_df_ext.head()
INF1_df_ARG_counts.head()

# Reorder the ARG-count rows so that they follow the matrix table's index
INF1_df_ordered = INF1_df_ARG_counts.loc[INF1_df_ext.index]

# NOTE: from here on INF1_df_ARG_counts is the merged table
INF1_df_ARG_counts = pd.concat(
    [INF1_df_ext, INF1_df_ordered],
    axis='columns',
)
INF1_df_ARG_counts.tail()

### ARG Names

In [None]:
# Load the ARG name table for INF1 (tab-separated, first column is the index)
file_path = '/scratch/project_2006608/Methylation/WW_data/INF1_contigs/INF1_ARG_names.txt'

INF1_df_ARG_names = pd.read_csv(
    file_path,
    sep='\t',
    index_col=0,
    header=0,
    low_memory=False,
)
print(len(INF1_df_ARG_names))
INF1_df_ARG_names.head()

In [None]:
## Append the ARG names to the table that already carries the ARG counts
INF1_df_ext = INF1_df_ARG_counts.copy()
INF1_df_ext.head()
INF1_df_ARG_names.head()

# Reorder the ARG-name rows so that they follow the merged table's index
INF1_df_ordered = INF1_df_ARG_names.loc[INF1_df_ext.index]

# NOTE: from here on INF1_df_ARG_names is the merged table
INF1_df_ARG_names = pd.concat(
    [INF1_df_ext, INF1_df_ordered],
    axis='columns',
)
INF1_df_ARG_names.tail()

In [None]:
# Print the rows whose ARG_name contains erm(F)_3
# (literal, case-insensitive match; missing names count as no match)
has_erm_F = INF1_df_ARG_names['ARG_name'].str.contains(
    'erm(F)_3',
    case=False,
    na=False,
    regex=False,
)
erm_F = INF1_df_ARG_names[has_erm_F]
print(erm_F)

### Contig lengths

In [None]:
## Bring the contig lengths from Puhti (tab-separated, first column is the index):
file_path = '/scratch/project_2006608/Methylation/WW_data/INF1_contigs/INF1_contigs_lengths.txt'

INF1_df_contigs_lengths = pd.read_csv(
    file_path,
    sep='\t',
    index_col=0,
    header=0,
    low_memory=False,
)
print(len(INF1_df_contigs_lengths))
INF1_df_contigs_lengths.head()

In [None]:
## Append the contig lengths to the table that carries ARG counts and names
INF1_df_ext = INF1_df_ARG_names.copy()
INF1_df_ext.head()
INF1_df_contigs_lengths.head()

# Reorder the length rows so that they follow the merged table's index
INF1_df_ordered = INF1_df_contigs_lengths.loc[INF1_df_ext.index]

# Final INF1 table: matrix columns plus the appended annotation columns
INF1_data = pd.concat(
    [INF1_df_ext, INF1_df_ordered],
    axis='columns',
)
INF1_data.tail()

# Everything except the last four columns is what goes into UMAP
print(INF1_data.iloc[:, :-4])

In [None]:
n_neighbors = [20]
min_dist = [0.1]

# Colour assigned to each ARG_count value
colors = [0, 1, 2, 3, 4, 5]
color_map = {0: '#f0da95', 1: '#fa7a31', 2: '#eb340f', 3: '#d01af5', 4: '#2a2df6', 5: '#940785'}

# Ordered colour list; kept as the fallback sequence for values that are not
# covered by the explicit mapping below
custom_colors = [color_map[val] for val in colors]

# ARG_count is cast to str before plotting, so the explicit value -> colour
# mapping and the legend order need string keys. A bare colour sequence is
# handed out in order of first appearance in the data, which would not tie
# a colour to a specific count.
# NOTE(review): assumes str(ARG_count) yields '0', '1', ... (integer column);
# any other value falls back to custom_colors.
arg_count_colors = {str(count): shade for count, shade in color_map.items()}
arg_count_order = [str(count) for count in colors]

# Styling shared by the x and the y axis: light grid, black mirrored frame
axis_style = dict(
    showgrid=True,
    gridcolor='lightgray',
    zeroline=False,
    showline=True,
    linecolor='black',
    linewidth=1,
    mirror=True,
)

for n in n_neighbors:
    for m in min_dist:
        title = f' Wastewater INF1 - UMAP with n_neighbors={n}, min_dist={m}'

        # Embed all columns except the last four, which are only used for
        # colouring and hover information
        reducer = umap.UMAP(n_neighbors=n, min_dist=m, random_state=seed)
        embedding = reducer.fit_transform(INF1_data.iloc[:, :-4])
        INF1_UMAP_data = pd.DataFrame({
            'UMAP1': embedding[:, 0],
            'UMAP2': embedding[:, 1],
            'contig': INF1_data.index,
            'ARG_count': INF1_data['ARG_count'],
            'ARG_name': INF1_data['ARG_name'],
            'contig_length': INF1_data['length'],
        })

        # String labels make plotly treat the counts as discrete categories
        INF1_UMAP_data['ARG_count'] = INF1_UMAP_data['ARG_count'].astype(str)

        fig = px.scatter(
            INF1_UMAP_data,
            x='UMAP1',
            y='UMAP2',
            color='ARG_count',
            title=title,
            category_orders={'ARG_count': arg_count_order},
            color_discrete_map=arg_count_colors,
            color_discrete_sequence=custom_colors,
            hover_data={'contig': True, 'ARG_count': True, 'ARG_name': True, 'contig_length': True},
        )
        fig.update_layout(
            height=1700,
            width=1200,
            title_text=title,
            showlegend=True,
            # Horizontal legend, centred below the plot area
            legend=dict(
                x=0.5,
                y=-0.05,
                traceorder="normal",
                xanchor='center',
                yanchor='top',
                orientation='h',
            ),
            template='simple_white',
            xaxis=dict(axis_style),
            yaxis=dict(axis_style),
        )
        fig.show()

        # Static and interactive copies of the figure
        fig.write_image(f'UMAP_WW/INF1_UMAP_{n}_{m}_contig_ARG_counts_above100.png')
        fig.write_html(f'UMAP_WW/INF1_UMAP_{n}_{m}_contig_ARG_counts_above100.html')

## Focus on distinct clusters
### Leave out the most misc cluster and draw again

In [None]:
# Extract based on UMAP1 (x axis) & UMAP2 (y axis) values (both bounds inclusive):
in_x_range = INF1_UMAP_data['UMAP1'].between(-5, 4)
in_y_range = INF1_UMAP_data['UMAP2'].between(-10, 13)
INF1_UMAP_data_focused = INF1_UMAP_data.loc[in_x_range & in_y_range]

# Check
INF1_UMAP_data_focused.head()

# Rows of the full data table whose index is among the selected contigs
is_focused = INF1_data.index.isin(INF1_UMAP_data_focused['contig'])
INF1_data_focused = INF1_data[is_focused]
print(INF1_data_focused)

In [None]:
# Save contig IDs of the focused selection, one ID per line
INF1_focused_contigs = INF1_data_focused.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'INF1_focused_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{item}\n" for item in INF1_focused_contigs)

In [None]:
# The UMAP input of the focused subset (all but the last four columns) ...
focused_features = INF1_data_focused.iloc[:, :-4]
print(focused_features)

# ... and the distinct ARG_count values that occur in it
focused_arg_counts = INF1_UMAP_data_focused['ARG_count'].unique()
print(focused_arg_counts)

In [None]:
# Distinct ARG_count values in the focused subset
focused_arg_counts = INF1_UMAP_data_focused['ARG_count'].unique()
print(focused_arg_counts)

In [None]:
n_neighbors = [20]
min_dist = [0.1]

# Colour assigned to each ARG_count value present in the focused subset
colors = [0, 1, 2]
color_map = {0: '#f0da95', 1: '#fa7a31', 2: '#eb340f'}

# Ordered colour list handed to plotly
custom_colors = [color_map[val] for val in colors]

# Styling shared by the x and the y axis: light grid, black mirrored frame
axis_style = dict(
    showgrid=True,
    gridcolor='lightgray',
    zeroline=False,
    showline=True,
    linecolor='black',
    linewidth=1,
    mirror=True,
)

# Horizontal legend, centred below the plot area
legend_style = dict(
    x=0.5,
    y=-0.05,
    traceorder="normal",
    xanchor='center',
    yanchor='top',
    orientation='h',
)

for n in n_neighbors:
    for m in min_dist:
        title = f' Wastewater INF1 - UMAP with n_neighbors={n}, min_dist={m}'

        # Re-embed only the focused contigs (all columns except the last four)
        reducer = umap.UMAP(n_neighbors=n, min_dist=m, random_state=seed)
        embedding = reducer.fit_transform(INF1_data_focused.iloc[:, :-4])
        INF1_UMAP_data_focused = pd.DataFrame({
            'UMAP1': embedding[:, 0],
            'UMAP2': embedding[:, 1],
            'contig': INF1_data_focused.index,
            'ARG_count': INF1_data_focused['ARG_count'],
            'ARG_name': INF1_data_focused['ARG_name'],
            'contig_length': INF1_data_focused['length'],
        })

        # Counts become string labels stored as a categorical column
        INF1_UMAP_data_focused['ARG_count'] = (
            INF1_UMAP_data_focused['ARG_count'].astype(str).astype('category')
        )
        category_order = ["0", "1", "2"]

        # Transformed contig lengths; the square root drives the marker size
        INF1_UMAP_data_focused['log_contig_length'] = np.log(INF1_UMAP_data_focused['contig_length'])
        INF1_UMAP_data_focused['sqrt_contig_length'] = np.sqrt(INF1_UMAP_data_focused['contig_length'])

        fig = px.scatter(
            INF1_UMAP_data_focused,
            x='UMAP1',
            y='UMAP2',
            color='ARG_count',
            title=title,
            category_orders={'ARG_count': category_order},
            color_discrete_sequence=custom_colors,
            hover_data={'contig': True, 'ARG_count': True, 'ARG_name': True, 'contig_length': True},
            size='sqrt_contig_length',
        )
        fig.update_layout(
            height=1700,
            width=1200,
            title_text=title,
            showlegend=True,
            legend=legend_style,
            template='simple_white',
            xaxis=dict(axis_style),
            yaxis=dict(axis_style),
        )
        fig.show()

        # Output names for the variant without marker sizes (disabled):
        #fig.write_image(f'UMAP_WW/INF1_UMAP_{n}_{m}_ARG_counts_focused_above100.png')
        #fig.write_html(f'UMAP_WW/INF1_UMAP_{n}_{m}_ARG_counts_focused_above100.html')
        fig.write_image(f'UMAP_WW/INF1_UMAP_{n}_{m}_ARG_counts_lengths_sqrt_focused_above100.png')
        fig.write_html(f'UMAP_WW/INF1_UMAP_{n}_{m}_ARG_counts_lengths_sqrt_focused_above100.html')

### Extract clusters
#### C1

In [None]:
# Extract based on UMAP1 (x axis) & UMAP2 (y axis) values (both bounds inclusive):
in_x_range = INF1_UMAP_data_focused['UMAP1'].between(-1.8, -1.6)
in_y_range = INF1_UMAP_data_focused['UMAP2'].between(8.45, 8.55)
INF1_data_focused_C1 = INF1_UMAP_data_focused.loc[in_x_range & in_y_range]

# Check
INF1_data_focused_C1.head()

# Rows of the focused data table whose index is among the C1 contigs
in_C1 = INF1_data_focused.index.isin(INF1_data_focused_C1['contig'])
INF1_data_C1 = INF1_data_focused[in_C1]
print(INF1_data_C1)

In [None]:
# Save contig IDs of cluster C1, one ID per line
INF1_C1_contigs = INF1_data_C1.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'INF1_C1_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{item}\n" for item in INF1_C1_contigs)

### Extract clusters
#### C2

In [None]:
# Extract based on UMAP1 (x axis) & UMAP2 (y axis) values (both bounds inclusive):
in_x_range = INF1_UMAP_data_focused['UMAP1'].between(-4.05, -3.7)
in_y_range = INF1_UMAP_data_focused['UMAP2'].between(0.7, 8.1)
INF1_data_focused_C2 = INF1_UMAP_data_focused.loc[in_x_range & in_y_range]

# Check
INF1_data_focused_C2.head()

# Rows of the focused data table whose index is among the C2 contigs
in_C2 = INF1_data_focused.index.isin(INF1_data_focused_C2['contig'])
INF1_data_C2 = INF1_data_focused[in_C2]
print(INF1_data_C2)

In [None]:
# Save contig IDs of cluster C2, one ID per line
INF1_C2_contigs = INF1_data_C2.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'INF1_C2_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{item}\n" for item in INF1_C2_contigs)

### Extract clusters
#### C3

In [None]:
# Extract based on UMAP1 (x axis) & UMAP2 (y axis) values (both bounds inclusive):
in_x_range = INF1_UMAP_data_focused['UMAP1'].between(1.8, 2.0)
in_y_range = INF1_UMAP_data_focused['UMAP2'].between(-2.1, -1.9)
INF1_data_focused_C3 = INF1_UMAP_data_focused.loc[in_x_range & in_y_range]

# Check
INF1_data_focused_C3.head()

# Rows of the focused data table whose index is among the C3 contigs
in_C3 = INF1_data_focused.index.isin(INF1_data_focused_C3['contig'])
INF1_data_C3 = INF1_data_focused[in_C3]
print(INF1_data_C3)

In [None]:
# Save contig IDs of cluster C3, one ID per line
INF1_C3_contigs = INF1_data_C3.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'INF1_C3_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{item}\n" for item in INF1_C3_contigs)

### Extract clusters
#### C2_e

In [None]:
# Extract based on UMAP1 (x axis) & UMAP2 (y axis) values (both bounds inclusive):
in_x_range = INF1_UMAP_data_focused['UMAP1'].between(-4, -3.5)
in_y_range = INF1_UMAP_data_focused['UMAP2'].between(0.85, 0.92)
INF1_data_focused_C2_e = INF1_UMAP_data_focused.loc[in_x_range & in_y_range]

# Check
INF1_data_focused_C2_e.head()

# Rows of the focused data table whose index is among the C2_e contigs
in_C2_e = INF1_data_focused.index.isin(INF1_data_focused_C2_e['contig'])
INF1_data_C2_e = INF1_data_focused[in_C2_e]
print(INF1_data_C2_e)

In [None]:
# Save contig IDs of cluster C2_e, one ID per line
INF1_C2_e_contigs = INF1_data_C2_e.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'INF1_C2_e_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{item}\n" for item in INF1_C2_e_contigs)

# SLU1
## Import data

In [None]:
# Absolute path of the UMAP_WW output directory.
# NOTE(review): the cells visible in this section write to the relative
# 'UMAP_WW/' path and never read this variable — confirm it is still needed.
path_to_images = '/scratch/project_2006608/Methylation/notebooks/UMAP_WW/'

## FILTERING
### > 100 lines in .gff

In [None]:
# Load the concatenated, flattened SLU1 matrices (tab-separated; the first
# column becomes the row index)
file_path = '/scratch/project_2006608/Methylation/WW_data/SLU1_matrices_top100/flattened/SLU1_concat_matrices_top100.tsv'
SLU1_matrices = pd.read_csv(
    file_path,
    sep='\t',
    index_col=0,
    low_memory=False,
)

In [None]:
# Number of rows in the loaded table
print(len(SLU1_matrices))

In [None]:
# (rows, columns) of the table, followed by a preview of its first five rows
print(SLU1_matrices.shape)
SLU1_matrices.head(n=5)

In [None]:
# Keep only the rows that have at least one non-zero value among the first
# 492 columns (presumably the flattened matrix features — TODO confirm)
nonzero_rows = SLU1_matrices.iloc[:, :492].ne(0).any(axis=1)
SLU1_df = SLU1_matrices.loc[nonzero_rows]
SLU1_df.shape
SLU1_df.head()

In [None]:
# Tally the rows per 'sample' label, then print the table without its last
# column (the same slice that is passed to UMAP)
sample_counts = SLU1_df['sample'].value_counts()
features_only = SLU1_df.iloc[:, :-1]
print(features_only)

In [None]:
# Full parameter grid (disabled):
#n_neighbors = [20, 30 ]
#min_dist = [0.01, 0.1, 0.2]
# The best combination:
n_neighbors = [20]
min_dist = [0.1]

# Single marker colour for the SLU1 points
color_dict = ['#df8275']

# Styling shared by the x and the y axis: light grid, black mirrored frame
axis_style = dict(
    showgrid=True,
    gridcolor='lightgray',
    zeroline=False,
    showline=True,
    linecolor='black',
    linewidth=1,
    mirror=True,
)

# Horizontal legend, centred below the plot area
legend_style = dict(
    x=0.5,
    y=-0.1,
    traceorder="normal",
    xanchor='center',
    yanchor='top',
    orientation='h',
)

for n in n_neighbors:
    for m in min_dist:
        title = f' Wastewater SLU1 - UMAP with n_neighbors={n}, min_dist={m}'

        # 2-D embedding of every column except the last one
        reducer = umap.UMAP(n_neighbors=n, min_dist=m, random_state=seed)
        embedding = reducer.fit_transform(SLU1_df.iloc[:, :-1])
        UMAP_SLU1_df = pd.DataFrame({
            'UMAP1': embedding[:, 0],
            'UMAP2': embedding[:, 1],
            'contig': SLU1_df.index,
        })

        fig = px.scatter(
            UMAP_SLU1_df,
            x='UMAP1',
            y='UMAP2',
            title=title,
            color_discrete_sequence=color_dict,
            hover_data={'contig': True},
        )
        fig.update_layout(
            height=1500,
            width=1200,
            title_text=title,
            showlegend=True,
            legend=legend_style,
            template='simple_white',
            xaxis=dict(axis_style),
            yaxis=dict(axis_style),
        )
        fig.show()

        # Saving is currently disabled for this figure:
        #fig.write_image(f'UMAP_WW/SLU1_UMAP_{n}_{m}_above100.png')
        #fig.write_html(f'UMAP_WW/SLU1_UMAP_{n}_{m}_above100.html')

## Attach mod count data

In [None]:
## Bring the per-contig modification counts from Puhti:
file_path = '/scratch/project_2006608/Methylation/WW_data/SLU1_contigs/SLU1_mod_counts.txt'

SLU1_df_mod_counts = pd.read_csv(
    file_path,
    sep='\t',
    index_col=0,
    header=0,
    low_memory=False,
)
print(len(SLU1_df_mod_counts))
SLU1_df_mod_counts.head()

# Replace every value of the table by its natural logarithm
SLU1_df_mod_counts = np.log(SLU1_df_mod_counts)

In [None]:
## Append the log-transformed mod counts to the matrix table
SLU1_df_ext = SLU1_df.copy()
SLU1_df_ext.head()
SLU1_df_mod_counts.head()

# Reorder the mod-count rows so that they follow the matrix table's index
SLU1_df_ordered = SLU1_df_mod_counts.loc[SLU1_df_ext.index]

# NOTE: from here on SLU1_df_mod_counts is the merged table, not the raw counts
SLU1_df_mod_counts = pd.concat(
    [SLU1_df_ext, SLU1_df_ordered],
    axis='columns',
)
SLU1_df_mod_counts.tail()

# Everything except the last two columns is what goes into UMAP
print(SLU1_df_mod_counts.iloc[:, :-2])

In [None]:
n_neighbors = [20]
min_dist = [0.1]

# Styling shared by the x and the y axis: light grid, black mirrored frame
axis_style = dict(
    showgrid=True,
    gridcolor='lightgray',
    zeroline=False,
    showline=True,
    linecolor='black',
    linewidth=1,
    mirror=True,
)

# Horizontal legend, centred below the plot area
legend_style = dict(
    x=0.5,
    y=-0.05,
    traceorder="normal",
    xanchor='center',
    yanchor='top',
    orientation='h',
)

for n in n_neighbors:
    for m in min_dist:
        title = f' Wastewater SLU1 - UMAP with n_neighbors={n}, min_dist={m}'

        # Embed all columns except the last two; mod_count is only used for colouring
        reducer = umap.UMAP(n_neighbors=n, min_dist=m, random_state=seed)
        embedding = reducer.fit_transform(SLU1_df_mod_counts.iloc[:, :-2])
        SLU1_mod_counts_UMAP_df = pd.DataFrame({
            'UMAP1': embedding[:, 0],
            'UMAP2': embedding[:, 1],
            'contig': SLU1_df_mod_counts.index,
            'mod_count': SLU1_df_mod_counts['mod_count'],
        })

        fig = px.scatter(
            SLU1_mod_counts_UMAP_df,
            x='UMAP1',
            y='UMAP2',
            color='mod_count',
            title=title,
            color_continuous_scale=px.colors.sequential.Rainbow,
            hover_data={'contig': True},
        )
        fig.update_layout(
            height=1700,
            width=1200,
            title_text=title,
            showlegend=True,
            legend=legend_style,
            template='simple_white',
            xaxis=dict(axis_style),
            yaxis=dict(axis_style),
        )
        fig.show()

        # Static and interactive copies of the figure
        fig.write_image(f'UMAP_WW/SLU1_UMAP_{n}_{m}_contig_mod_counts_log_above100.png')
        fig.write_html(f'UMAP_WW/SLU1_UMAP_{n}_{m}_contig_mod_counts_log_above100.html')

## Attach ARG data
### Counts
### Names
### Contig lengths

### ARG Counts

In [None]:
# Load the ARG count table for SLU1 (tab-separated, first column is the index)
file_path = '/scratch/project_2006608/Methylation/WW_data/SLU1_contigs/SLU1_ARG_counts.txt'

SLU1_df_ARG_counts = pd.read_csv(
    file_path,
    sep='\t',
    index_col=0,
    header=0,
    low_memory=False,
)
print(len(SLU1_df_ARG_counts))
SLU1_df_ARG_counts.head()

In [None]:
## Append the ARG counts to the matrix table
SLU1_df_ext = SLU1_df.copy()
SLU1_df_ext.head()
SLU1_df_ARG_counts.head()

# Reorder the ARG-count rows so that they follow the matrix table's index
SLU1_df_ordered = SLU1_df_ARG_counts.loc[SLU1_df_ext.index]

# NOTE: from here on SLU1_df_ARG_counts is the merged table
SLU1_df_ARG_counts = pd.concat(
    [SLU1_df_ext, SLU1_df_ordered],
    axis='columns',
)
SLU1_df_ARG_counts.tail()

### ARG Names

In [None]:
# Load the ARG name table for SLU1 (tab-separated, first column is the index)
file_path = '/scratch/project_2006608/Methylation/WW_data/SLU1_contigs/SLU1_ARG_names.txt'

SLU1_df_ARG_names = pd.read_csv(
    file_path,
    sep='\t',
    index_col=0,
    header=0,
    low_memory=False,
)
print(len(SLU1_df_ARG_names))
SLU1_df_ARG_names.head()

In [None]:
## Append the ARG names to the table that already carries the ARG counts
SLU1_df_ext = SLU1_df_ARG_counts.copy()
SLU1_df_ext.head()
SLU1_df_ARG_names.head()

# Reorder the ARG-name rows so that they follow the merged table's index
SLU1_df_ordered = SLU1_df_ARG_names.loc[SLU1_df_ext.index]

# NOTE: from here on SLU1_df_ARG_names is the merged table
SLU1_df_ARG_names = pd.concat(
    [SLU1_df_ext, SLU1_df_ordered],
    axis='columns',
)
SLU1_df_ARG_names.tail()

In [None]:
# Print the row names (index values) of the entries whose ARG_name contains
# erm(F)_3 (literal, case-insensitive match; missing names count as no match)
has_erm_F = SLU1_df_ARG_names['ARG_name'].str.contains(
    'erm(F)_3',
    case=False,
    na=False,
    regex=False,
)
erm_F = SLU1_df_ARG_names[has_erm_F]

# One-column table holding only the matching index values
row_names_df = pd.DataFrame(erm_F.index, columns=['0'])
print(row_names_df)

### Contig lengths

In [None]:
## Bring the contig lengths from Puhti (tab-separated, first column is the index):
file_path = '/scratch/project_2006608/Methylation/WW_data/SLU1_contigs/SLU1_contigs_lengths.txt'

SLU1_df_contigs_lengths = pd.read_csv(
    file_path,
    sep='\t',
    index_col=0,
    header=0,
    low_memory=False,
)
print(len(SLU1_df_contigs_lengths))
SLU1_df_contigs_lengths.head()

In [None]:
## Append the contig lengths to the table that carries ARG counts and names
SLU1_df_ext = SLU1_df_ARG_names.copy()
SLU1_df_ext.head()
SLU1_df_contigs_lengths.head()

# Reorder the length rows so that they follow the merged table's index
SLU1_df_ordered = SLU1_df_contigs_lengths.loc[SLU1_df_ext.index]

# Final SLU1 table: matrix columns plus the appended annotation columns
SLU1_data = pd.concat(
    [SLU1_df_ext, SLU1_df_ordered],
    axis='columns',
)
SLU1_data.tail()

# Everything except the last four columns is what goes into UMAP
print(SLU1_data.iloc[:, :-4])

In [None]:
n_neighbors = [20]
min_dist = [0.1]

# Colour assigned to each ARG_count value
colors = [0, 1]
color_map = {0: '#f3d3ce', 1: '#fa7a31'}

# Ordered colour list; kept as the fallback sequence for values that are not
# covered by the explicit mapping below
custom_colors = [color_map[val] for val in colors]

# ARG_count is cast to str before plotting, so the explicit value -> colour
# mapping and the legend order need string keys. A bare colour sequence is
# handed out in order of first appearance in the data, which would not tie
# a colour to a specific count.
# NOTE(review): assumes str(ARG_count) yields '0', '1', ... (integer column);
# any other value falls back to custom_colors.
arg_count_colors = {str(count): shade for count, shade in color_map.items()}
arg_count_order = [str(count) for count in colors]

# Styling shared by the x and the y axis: light grid, black mirrored frame
axis_style = dict(
    showgrid=True,
    gridcolor='lightgray',
    zeroline=False,
    showline=True,
    linecolor='black',
    linewidth=1,
    mirror=True,
)

for n in n_neighbors:
    for m in min_dist:
        title = f' Wastewater SLU1 - UMAP with n_neighbors={n}, min_dist={m}'

        # Embed all columns except the last four, which are only used for
        # colouring and hover information
        reducer = umap.UMAP(n_neighbors=n, min_dist=m, random_state=seed)
        embedding = reducer.fit_transform(SLU1_data.iloc[:, :-4])
        SLU1_UMAP_data = pd.DataFrame({
            'UMAP1': embedding[:, 0],
            'UMAP2': embedding[:, 1],
            'contig': SLU1_data.index,
            'ARG_count': SLU1_data['ARG_count'],
            'ARG_name': SLU1_data['ARG_name'],
            'contig_length': SLU1_data['length'],
        })

        # String labels make plotly treat the counts as discrete categories
        SLU1_UMAP_data['ARG_count'] = SLU1_UMAP_data['ARG_count'].astype(str)

        fig = px.scatter(
            SLU1_UMAP_data,
            x='UMAP1',
            y='UMAP2',
            color='ARG_count',
            title=title,
            category_orders={'ARG_count': arg_count_order},
            color_discrete_map=arg_count_colors,
            color_discrete_sequence=custom_colors,
            hover_data={'contig': True, 'ARG_count': True, 'ARG_name': True, 'contig_length': True},
        )
        fig.update_layout(
            height=1700,
            width=1200,
            title_text=title,
            showlegend=True,
            # Horizontal legend, centred below the plot area
            legend=dict(
                x=0.5,
                y=-0.05,
                traceorder="normal",
                xanchor='center',
                yanchor='top',
                orientation='h',
            ),
            template='simple_white',
            xaxis=dict(axis_style),
            yaxis=dict(axis_style),
        )
        fig.show()

        # Saving is currently disabled for this figure:
        #fig.write_image(f'UMAP_WW/SLU1_UMAP_{n}_{m}_contig_ARG_counts_above100.png')
        #fig.write_html(f'UMAP_WW/SLU1_UMAP_{n}_{m}_contig_ARG_counts_above100.html')

## Focus on distinct clusters
### Leave out the most misc cluster and draw again

In [None]:
# Extract based on UMAP1 (x axis) & UMAP2 (y axis) values (both bounds inclusive).
# The kept region is the union of two rectangles that sit side by side on the
# x axis and have different upper y limits.
x_coord = SLU1_UMAP_data['UMAP1']
y_coord = SLU1_UMAP_data['UMAP2']

SLU1_UMAP_data_focused1 = SLU1_UMAP_data.loc[
    x_coord.between(-6, 9.7) & y_coord.between(-10, 18)
]

SLU1_UMAP_data_focused2 = SLU1_UMAP_data.loc[
    x_coord.between(9.71, 22) & y_coord.between(-10, 10.1)
]

# Stack both selections, keeping the original row labels
SLU1_UMAP_data_focused = pd.concat(
    [SLU1_UMAP_data_focused1, SLU1_UMAP_data_focused2],
    ignore_index=False,
)

# Check
SLU1_UMAP_data_focused.tail()

# Rows of the full data table whose index is among the selected contigs
is_focused = SLU1_data.index.isin(SLU1_UMAP_data_focused['contig'])
SLU1_data_focused = SLU1_data[is_focused]
print(SLU1_data_focused)

In [None]:
# Save contig IDs of the focused selection, one ID per line
SLU1_focused_contigs = SLU1_data_focused.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'SLU1_focused_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{item}\n" for item in SLU1_focused_contigs)

In [None]:
# The UMAP input of the focused subset (all but the last four columns) ...
focused_features = SLU1_data_focused.iloc[:, :-4]
print(focused_features)

# ... and the distinct ARG_count values that occur in it
focused_arg_counts = SLU1_UMAP_data_focused['ARG_count'].unique()
print(focused_arg_counts)

In [None]:
# Distinct ARG_count values in the focused subset
focused_arg_counts = SLU1_UMAP_data_focused['ARG_count'].unique()
print(focused_arg_counts)

In [None]:
n_neighbors = [20]
min_dist = [0.1]

# Colour assigned to each ARG_count value present in the focused subset
colors = [0, 1]
color_map = {0: '#f7c0b7', 1: '#f35e47'}

# Ordered colour list handed to plotly
custom_colors = [color_map[val] for val in colors]

# Styling shared by the x and the y axis: light grid, black mirrored frame
axis_style = dict(
    showgrid=True,
    gridcolor='lightgray',
    zeroline=False,
    showline=True,
    linecolor='black',
    linewidth=1,
    mirror=True,
)

# Horizontal legend, centred below the plot area
legend_style = dict(
    x=0.5,
    y=-0.05,
    traceorder="normal",
    xanchor='center',
    yanchor='top',
    orientation='h',
)

for n in n_neighbors:
    for m in min_dist:
        title = f' Wastewater SLU1 - UMAP with n_neighbors={n}, min_dist={m}'

        # Re-embed only the focused contigs (all columns except the last four)
        reducer = umap.UMAP(n_neighbors=n, min_dist=m, random_state=seed)
        embedding = reducer.fit_transform(SLU1_data_focused.iloc[:, :-4])
        SLU1_UMAP_data_focused = pd.DataFrame({
            'UMAP1': embedding[:, 0],
            'UMAP2': embedding[:, 1],
            'contig': SLU1_data_focused.index,
            'ARG_count': SLU1_data_focused['ARG_count'],
            'ARG_name': SLU1_data_focused['ARG_name'],
            'contig_length': SLU1_data_focused['length'],
        })

        # Counts become string labels stored as a categorical column
        SLU1_UMAP_data_focused['ARG_count'] = (
            SLU1_UMAP_data_focused['ARG_count'].astype(str).astype('category')
        )
        category_order = ["0", "1"]

        # Transformed contig lengths (not used by the plot while `size` is disabled)
        SLU1_UMAP_data_focused['log_contig_length'] = np.log(SLU1_UMAP_data_focused['contig_length'])
        SLU1_UMAP_data_focused['sqrt_contig_length'] = np.sqrt(SLU1_UMAP_data_focused['contig_length'])

        fig = px.scatter(
            SLU1_UMAP_data_focused,
            x='UMAP1',
            y='UMAP2',
            color='ARG_count',
            title=title,
            category_orders={'ARG_count': category_order},
            color_discrete_sequence=custom_colors,
            hover_data={'contig': True, 'ARG_count': True, 'ARG_name': True, 'contig_length': True},
            #size='sqrt_contig_length'
        )
        fig.update_layout(
            height=1700,
            width=1200,
            title_text=title,
            showlegend=True,
            legend=legend_style,
            template='simple_white',
            xaxis=dict(axis_style),
            yaxis=dict(axis_style),
        )
        fig.show()

        # NOTE(review): marker sizing is disabled above, yet the figure is
        # saved under the "lengths_sqrt" name while the plain names are
        # commented out — confirm which output name is intended.
        #fig.write_image(f'UMAP_WW/SLU1_UMAP_{n}_{m}_ARG_counts_focused_above100.png')
        #fig.write_html(f'UMAP_WW/SLU1_UMAP_{n}_{m}_ARG_counts_focused_above100.html')
        fig.write_image(f'UMAP_WW/SLU1_UMAP_{n}_{m}_ARG_counts_lengths_sqrt_focused_above100.png')
        fig.write_html(f'UMAP_WW/SLU1_UMAP_{n}_{m}_ARG_counts_lengths_sqrt_focused_above100.html')

### Extract clusters
#### C1

In [None]:
# Extract based on UMAP1 (x axis) & UMAP2 (y axis) values (both bounds inclusive):
SLU1_data_focused_C1 = SLU1_UMAP_data_focused.loc[(SLU1_UMAP_data_focused['UMAP1']>= 15.284) & (SLU1_UMAP_data_focused['UMAP1']<= 15.296)
    & (SLU1_UMAP_data_focused['UMAP2']>= 10.7) & (SLU1_UMAP_data_focused['UMAP2']<= 10.712)]

# Check
SLU1_data_focused_C1.head()

# Rows of the focused data table whose index is among the C1 contigs
SLU1_data_C1 = SLU1_data_focused[SLU1_data_focused.index.isin(SLU1_data_focused_C1['contig'])]
# Print the table that was just built; no "_f" variant exists for SLU1 C1
print(SLU1_data_C1)

In [None]:
# Save contig IDs of cluster C1, one ID per line
SLU1_C1_contigs = SLU1_data_C1.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'SLU1_C1_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{item}\n" for item in SLU1_C1_contigs)

### Extract clusters
#### C2

In [None]:
# Extract based on UMAP1 (x axis) & UMAP2 (y axis) values (both bounds inclusive):
in_x_range = SLU1_UMAP_data_focused['UMAP1'].between(15.14, 15.3)
in_y_range = SLU1_UMAP_data_focused['UMAP2'].between(10.55, 10.66)
SLU1_data_focused_C2 = SLU1_UMAP_data_focused.loc[in_x_range & in_y_range]

# Check
SLU1_data_focused_C2.head()

# Rows of the focused data table whose index is among the C2 contigs
in_C2 = SLU1_data_focused.index.isin(SLU1_data_focused_C2['contig'])
SLU1_data_C2 = SLU1_data_focused[in_C2]
print(SLU1_data_C2)

In [None]:
# Write the C2 contig IDs (the index of SLU1_data_C2) to a text file, one per line
SLU1_C2_contigs = SLU1_data_C2.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'SLU1_C2_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{contig_id}\n" for contig_id in SLU1_C2_contigs)

### Extract clusters
#### C5_e

In [None]:
# Cluster C5_e: points of the focused SLU1 embedding inside a fixed
# UMAP1 (x axis) / UMAP2 (y axis) box, bounds inclusive.
SLU1_data_focused_C5_e = SLU1_UMAP_data_focused.loc[
    SLU1_UMAP_data_focused['UMAP1'].between(16, 16.5)
    & SLU1_UMAP_data_focused['UMAP2'].between(-2.8, -2.6)
]

# Check
SLU1_data_focused_C5_e.head()

# Matching rows of SLU1_data_focused, looked up by contig ID in the index
SLU1_data_C5_e = SLU1_data_focused.loc[SLU1_data_focused.index.isin(SLU1_data_focused_C5_e['contig'])]
print(SLU1_data_C5_e)

In [None]:
# Write the C5_e contig IDs (the index of SLU1_data_C5_e) to a text file, one per line
SLU1_C5_e_contigs = SLU1_data_C5_e.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'SLU1_C5_e_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{contig_id}\n" for contig_id in SLU1_C5_e_contigs)

### Extract clusters
#### C4_e

In [None]:
# Cluster C4_e: points of the focused SLU1 embedding inside a fixed
# UMAP1 (x axis) / UMAP2 (y axis) box, bounds inclusive.
SLU1_data_focused_C4_e = SLU1_UMAP_data_focused.loc[
    SLU1_UMAP_data_focused['UMAP1'].between(20, 21.5)
    & SLU1_UMAP_data_focused['UMAP2'].between(16.8, 17.2)
]

# Check
SLU1_data_focused_C4_e.head()

# Matching rows of SLU1_data_focused, looked up by contig ID in the index
SLU1_data_C4_e = SLU1_data_focused.loc[SLU1_data_focused.index.isin(SLU1_data_focused_C4_e['contig'])]
print(SLU1_data_C4_e)

In [None]:
# Save the contig IDs of cluster C4_e (the index of SLU1_data_C4_e), one per line.
# The list is named SLU1_C4_e_contigs: the original assigned it to
# SLU1_C4_contigs and then looped over the undefined SLU1_C4_e_contigs, which
# raised NameError after the output file had already been opened for writing.
SLU1_C4_e_contigs = SLU1_data_C4_e.index.to_list()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'SLU1_C4_e_contigs_above100.txt')

with open(file_path, 'w') as file:
    for item in SLU1_C4_e_contigs:
        file.write(f"{item}\n")

### Extract clusters
#### C3

In [None]:
# Cluster C3: points of the focused SLU1 embedding inside a fixed
# UMAP1 (x axis) / UMAP2 (y axis) box, bounds inclusive.
SLU1_data_focused_C3 = SLU1_UMAP_data_focused.loc[
    SLU1_UMAP_data_focused['UMAP1'].between(0.06, 0.14)
    & SLU1_UMAP_data_focused['UMAP2'].between(15.36, 15.39)
]

# Check
SLU1_data_focused_C3.head()

# Matching rows of SLU1_data_focused, looked up by contig ID in the index
SLU1_data_C3 = SLU1_data_focused.loc[SLU1_data_focused.index.isin(SLU1_data_focused_C3['contig'])]
print(SLU1_data_C3)

In [None]:
# Write the C3 contig IDs (the index of SLU1_data_C3) to a text file, one per line
SLU1_C3_contigs = SLU1_data_C3.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'SLU1_C3_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{contig_id}\n" for contig_id in SLU1_C3_contigs)

### Extract clusters
#### C4

In [None]:
# Cluster C4: points of the focused SLU1 embedding inside a fixed
# UMAP1 (x axis) / UMAP2 (y axis) box, bounds inclusive.
SLU1_data_focused_C4 = SLU1_UMAP_data_focused.loc[
    SLU1_UMAP_data_focused['UMAP1'].between(8.08, 8.15)
    & SLU1_UMAP_data_focused['UMAP2'].between(-0.982, -0.96)
]

# Check
SLU1_data_focused_C4.head()

# Matching rows of SLU1_data_focused, looked up by contig ID in the index
SLU1_data_C4 = SLU1_data_focused.loc[SLU1_data_focused.index.isin(SLU1_data_focused_C4['contig'])]
print(SLU1_data_C4)

In [None]:
# Write the C4 contig IDs (the index of SLU1_data_C4) to a text file, one per line
SLU1_C4_contigs = SLU1_data_C4.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'SLU1_C4_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{contig_id}\n" for contig_id in SLU1_C4_contigs)

### Extract clusters
#### C6_e

In [None]:
# Extract cluster C6_e: rows of the focused SLU1 embedding whose UMAP1 (x axis)
# and UMAP2 (y axis) fall inside a fixed bounding box (bounds inclusive).
SLU1_data_focused_C6_e = SLU1_UMAP_data_focused.loc[
    SLU1_UMAP_data_focused['UMAP1'].between(-2, -1.5)
    & SLU1_UMAP_data_focused['UMAP2'].between(-1, -0.8)
]

# Check
SLU1_data_focused_C6_e.head()

# Rows of SLU1_data_focused whose index (contig ID) is among the C6_e contigs
SLU1_data_C6_e = SLU1_data_focused[SLU1_data_focused.index.isin(SLU1_data_focused_C6_e['contig'])]
# Print the table that was just built (the original referenced the undefined
# name SLU1_data_C6, which raised NameError).
print(SLU1_data_C6_e)

In [None]:
# Write the C6_e contig IDs (the index of SLU1_data_C6_e) to a text file, one per line
SLU1_C6_e_contigs = SLU1_data_C6_e.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'SLU1_C6_e_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{contig_id}\n" for contig_id in SLU1_C6_e_contigs)

# EFF2
## Import data

In [None]:
# Absolute path of the UMAP_WW image directory.
# NOTE(review): the EFF2 cells visible here write to the relative 'UMAP_WW/'
# path instead of using this variable — confirm whether it is still needed.
path_to_images = '/scratch/project_2006608/Methylation/notebooks/UMAP_WW/'

## FILTERING
### > 100 lines in .gff

In [None]:
# Load the flattened EFF2 matrices (tab-separated; first column becomes the index)
file_path = '/scratch/project_2006608/Methylation/WW_data/EFF2_matrices_top100/flattened/EFF2_concat_matrices_top100.tsv'
EFF2_matrices = pd.read_csv(
    file_path,
    sep='\t',
    index_col=0,
    low_memory=False,
)

In [None]:
# Number of rows in the raw EFF2 table
print(len(EFF2_matrices))

In [None]:
# Dimensions (rows, columns) of the raw EFF2 table, then a preview of its first rows
print(EFF2_matrices.shape)
EFF2_matrices.head()

In [None]:
# Keep only rows with at least one non-zero value among the first 492 columns.
# NOTE(review): 492 is presumably the number of feature columns (everything
# before the trailing 'sample' column) — confirm against the table layout.
EFF2_df = EFF2_matrices.loc[EFF2_matrices.iloc[:, :492].ne(0).any(axis=1)]
EFF2_df.shape
EFF2_df.head()

In [None]:
# Row counts per value of the 'sample' column.
# NOTE: this is not the cell's last expression and is not printed, so a
# notebook will not display it.
EFF2_df['sample'].value_counts()
# Show the table without its last column
print(EFF2_df.iloc[:, :-1])

In [None]:
# UMAP of the EFF2 table (all columns except the last), drawn in a single colour.
# All
#n_neighbors = [20, 30 ]
#min_dist = [0.01, 0.1, 0.2]
# The best
n_neighbors = [20]
min_dist = [0.1]

color_dict = ['#05b5bb']

for n in n_neighbors:
    for m in min_dist:
        # 2-D embedding; random_state is fixed to the notebook-level seed
        reducer = umap.UMAP(n_neighbors=n, min_dist=m, random_state=seed)
        embedding = reducer.fit_transform(EFF2_df.iloc[:, :-1])
        UMAP_EFF2_df = pd.DataFrame({
            'UMAP1': embedding[:, 0],
            'UMAP2': embedding[:, 1],
            'contig': EFF2_df.index,
        })

        title = f' Wastewater EFF2 - UMAP with n_neighbors={n}, min_dist={m}'
        fig = px.scatter(
            UMAP_EFF2_df,
            x='UMAP1',
            y='UMAP2',
            title=title,
            color_discrete_sequence=color_dict,
            hover_data={'contig': True},
        )

        # Both axes use the same grid and frame styling
        axis_style = dict(
            showgrid=True,
            gridcolor='lightgray',
            zeroline=False,
            showline=True,
            linecolor='black',
            linewidth=1,
            mirror=True,
        )
        fig.update_layout(
            height=1500,
            width=1200,
            title_text=title,
            showlegend=True,
            legend=dict(
                x=0.5,
                y=-0.1,
                traceorder="normal",
                xanchor='center',
                yanchor='top',
                orientation='h',
            ),
            template='simple_white',
            xaxis=dict(axis_style),
            yaxis=dict(axis_style),
        )
        fig.show()
        fig.write_image(f'UMAP_WW/EFF2_UMAP_{n}_{m}_above100.png')
        fig.write_html(f'UMAP_WW/EFF2_UMAP_{n}_{m}_above100.html')

## Attach mod count data

In [None]:
## Load the EFF2 modification counts (EFF2_mod_counts.txt).
## The earlier comment said "contig lengths", but this cell reads the mod-count file.
file_path = '/scratch/project_2006608/Methylation/WW_data/EFF2_contigs/EFF2_mod_counts.txt'

EFF2_df_mod_counts = pd.read_csv(file_path, sep='\t', index_col=0, header=0, low_memory=False)
print(EFF2_df_mod_counts.shape[0])
EFF2_df_mod_counts.head()

# Natural-log transform of every value in the table.
# NOTE(review): np.log gives -inf for 0 and NaN for negative values — confirm
# that all counts are strictly positive.
EFF2_df_mod_counts = np.log(EFF2_df_mod_counts)

In [None]:
## Attach the mod-count columns to a copy of the EFF2 table (nothing is written to disk here)
EFF2_df_ext = EFF2_df.copy()
EFF2_df_ext.head()
EFF2_df_mod_counts.head()

# Reorder the mod-count rows to the row order of EFF2_df_ext (label lookup;
# raises KeyError if an index label is missing from the mod-count table)
EFF2_df_ordered = EFF2_df_mod_counts.loc[EFF2_df_ext.index]

# Column-wise join. NOTE: this rebinds EFF2_df_mod_counts to the merged table, so
# re-running the cell without reloading would concatenate the merged table again.
EFF2_df_mod_counts = pd.concat([EFF2_df_ext, EFF2_df_ordered], axis=1)
EFF2_df_mod_counts.tail()
# Show the merged table without its last two columns
print(EFF2_df_mod_counts.iloc[:, :-2])

In [None]:
# UMAP of the merged EFF2 table (all but its last two columns), coloured by 'mod_count'
n_neighbors = [20]
min_dist = [0.1]

for n in n_neighbors:
    for m in min_dist:
        reducer = umap.UMAP(n_neighbors=n, min_dist=m, random_state=seed)
        embedding = reducer.fit_transform(EFF2_df_mod_counts.iloc[:, :-2])
        EFF2_mod_counts_UMAP_df = pd.DataFrame({
            'UMAP1': embedding[:, 0],
            'UMAP2': embedding[:, 1],
            'contig': EFF2_df_mod_counts.index,
            'mod_count': EFF2_df_mod_counts['mod_count'],
        })

        title = f' Wastewater EFF2 - UMAP with n_neighbors={n}, min_dist={m}'
        fig = px.scatter(
            EFF2_mod_counts_UMAP_df,
            x='UMAP1',
            y='UMAP2',
            color='mod_count',
            title=title,
            color_continuous_scale=px.colors.sequential.Rainbow,
            hover_data={'contig': True},
        )

        # Both axes use the same grid and frame styling
        axis_style = dict(
            showgrid=True,
            gridcolor='lightgray',
            zeroline=False,
            showline=True,
            linecolor='black',
            linewidth=1,
            mirror=True,
        )
        fig.update_layout(
            height=1700,
            width=1200,
            title_text=title,
            showlegend=True,
            legend=dict(
                x=0.5,
                y=-0.05,
                traceorder="normal",
                xanchor='center',
                yanchor='top',
                orientation='h',
            ),
            template='simple_white',
            xaxis=dict(axis_style),
            yaxis=dict(axis_style),
        )
        fig.show()
        fig.write_image(f'UMAP_WW/EFF2_UMAP_{n}_{m}_contig_mod_counts_log_above100.png')
        fig.write_html(f'UMAP_WW/EFF2_UMAP_{n}_{m}_contig_mod_counts_log_above100.html')

## Attach ARG data
### Counts
### Names
### Contig lengths

### ARG Counts

In [None]:
# Load the EFF2 ARG-count table (tab-separated, header row, first column as index)
file_path = '/scratch/project_2006608/Methylation/WW_data/EFF2_contigs/EFF2_ARG_counts.txt'

EFF2_df_ARG_counts = pd.read_csv(
    file_path,
    sep='\t',
    index_col=0,
    header=0,
    low_memory=False,
)
print(len(EFF2_df_ARG_counts))
EFF2_df_ARG_counts.head()

In [None]:
## Attach the ARG-count columns to a copy of the EFF2 table (nothing is written to disk here)
EFF2_df_ext = EFF2_df.copy()
EFF2_df_ext.head()
EFF2_df_ARG_counts.head()

# Reorder the ARG-count rows to the row order of EFF2_df_ext (label lookup;
# raises KeyError if an index label is missing from the ARG-count table)
EFF2_df_ordered = EFF2_df_ARG_counts.loc[EFF2_df_ext.index]

# Column-wise join. NOTE: this rebinds EFF2_df_ARG_counts to the merged table, so
# re-running the cell without reloading would concatenate the merged table again.
EFF2_df_ARG_counts = pd.concat([EFF2_df_ext, EFF2_df_ordered], axis=1)
EFF2_df_ARG_counts.tail()
#print(EFF2_df_ARG_counts.dtypes)

### ARG Names

In [None]:
# Load the EFF2 ARG-name table (tab-separated, header row, first column as index)
file_path = '/scratch/project_2006608/Methylation/WW_data/EFF2_contigs/EFF2_ARG_names.txt'

EFF2_df_ARG_names = pd.read_csv(
    file_path,
    sep='\t',
    index_col=0,
    header=0,
    low_memory=False,
)
print(len(EFF2_df_ARG_names))
EFF2_df_ARG_names.head()

In [None]:
## Attach the ARG-name columns to the table that already carries the ARG counts
## (nothing is written to disk here)
EFF2_df_ext = EFF2_df_ARG_counts.copy()
EFF2_df_ext.head()
EFF2_df_ARG_names.head()

# Reorder the ARG-name rows to the row order of EFF2_df_ext (label lookup;
# raises KeyError if an index label is missing from the ARG-name table)
EFF2_df_ordered = EFF2_df_ARG_names.loc[EFF2_df_ext.index]

# Column-wise join. NOTE: this rebinds EFF2_df_ARG_names to the merged table, so
# re-running the cell without reloading would concatenate the merged table again.
EFF2_df_ARG_names = pd.concat([EFF2_df_ext, EFF2_df_ordered], axis=1)
EFF2_df_ARG_names.tail()

In [None]:
# Rows whose ARG_name contains the literal text 'erm(F)_3'
# (case-insensitive, not a regex; missing names count as no match)
has_erm_F = EFF2_df_ARG_names['ARG_name'].str.contains('erm(F)_3', case=False, na=False, regex=False)
erm_F = EFF2_df_ARG_names.loc[has_erm_F]
print(erm_F)

### Contig lengths

In [None]:
## Load the EFF2 contig-length table brought over from Puhti
## (tab-separated, header row, first column as index)
file_path = '/scratch/project_2006608/Methylation/WW_data/EFF2_contigs/EFF2_contigs_lengths.txt'

EFF2_df_contigs_lengths = pd.read_csv(
    file_path,
    sep='\t',
    index_col=0,
    header=0,
    low_memory=False,
)
print(len(EFF2_df_contigs_lengths))
EFF2_df_contigs_lengths.head()

In [None]:
## Attach the contig-length columns to the table that already carries ARG counts and names
## (nothing is written to disk here)
EFF2_df_ext = EFF2_df_ARG_names.copy()
EFF2_df_ext.head()
EFF2_df_contigs_lengths.head()

# Reorder the length rows to the row order of EFF2_df_ext (label lookup;
# raises KeyError if an index label is missing from the length table)
EFF2_df_ordered = EFF2_df_contigs_lengths.loc[EFF2_df_ext.index]

# Column-wise join into the combined EFF2_data table
EFF2_data = pd.concat([EFF2_df_ext, EFF2_df_ordered], axis=1)
EFF2_data.tail()
# Show the combined table without its last four columns
print(EFF2_data.iloc[:, :-4])

In [None]:
# UMAP of the combined EFF2 table (all but its last four columns), coloured by ARG count.
n_neighbors = [20]
min_dist = [0.1]

colors = [0, 1, 2, 3]
color_map = {0: '#8ce6e9', 1: '#fa7a31', 2: '#eb340f', 3: '#d01af5'}

# Colour sequence in ARG-count order (kept as the fallback palette)
custom_colors = [color_map[val] for val in colors]

# ARG_count is cast to str before plotting, so the explicit value -> colour map
# is keyed by the string labels. With color_discrete_sequence alone, plotly
# assigns colours by order of first appearance in the data, so a given count
# was not guaranteed its colour from color_map.
# NOTE(review): assumes ARG_count holds integers so the labels are '0'..'3';
# any other label falls back to color_discrete_sequence as before.
arg_count_labels = [str(val) for val in colors]
arg_color_map = {str(val): color_map[val] for val in colors}

for n in n_neighbors:
    for m in min_dist:
        reducer = umap.UMAP(n_neighbors=n, min_dist=m, random_state=seed)
        embedding = reducer.fit_transform(EFF2_data.iloc[:, :-4])
        EFF2_UMAP_data = pd.DataFrame({
            'UMAP1': embedding[:, 0],
            'UMAP2': embedding[:, 1],
            'contig': EFF2_data.index,
            'ARG_count': EFF2_data['ARG_count'],
            'ARG_name': EFF2_data['ARG_name'],
            'contig_length': EFF2_data['length'],
        })

        # Treat the count as a category so plotly uses discrete colours
        EFF2_UMAP_data['ARG_count'] = EFF2_UMAP_data['ARG_count'].astype(str)

        title = f' Wastewater EFF2 - UMAP with n_neighbors={n}, min_dist={m}'
        fig = px.scatter(
            EFF2_UMAP_data,
            x='UMAP1',
            y='UMAP2',
            color='ARG_count',
            title=title,
            color_discrete_sequence=custom_colors,
            color_discrete_map=arg_color_map,
            # Fixed legend/trace order: '0' first, higher counts drawn later
            category_orders={'ARG_count': arg_count_labels},
            hover_data={'contig': True, 'ARG_count': True, 'ARG_name': True, 'contig_length': True},
        )

        # Both axes use the same grid and frame styling
        axis_style = dict(
            showgrid=True,
            gridcolor='lightgray',
            zeroline=False,
            showline=True,
            linecolor='black',
            linewidth=1,
            mirror=True,
        )
        fig.update_layout(
            height=1700,
            width=1200,
            title_text=title,
            showlegend=True,
            legend=dict(
                x=0.5,
                y=-0.05,
                traceorder="normal",
                xanchor='center',
                yanchor='top',
                orientation='h',
            ),
            template='simple_white',
            xaxis=dict(axis_style),
            yaxis=dict(axis_style),
        )
        fig.show()
        fig.write_image(f'UMAP_WW/EFF2_UMAP_{n}_{m}_contig_ARG_counts_above100.png')
        fig.write_html(f'UMAP_WW/EFF2_UMAP_{n}_{m}_contig_ARG_counts_above100.html')

## Focus on distinct clusters
### Leave out the most misc cluster and draw again

In [None]:
# Keep only the points of the EFF2 embedding inside a fixed
# UMAP1 (x axis) / UMAP2 (y axis) box, bounds inclusive.
EFF2_UMAP_data_focused = EFF2_UMAP_data.loc[
    EFF2_UMAP_data['UMAP1'].between(2.5, 13)
    & EFF2_UMAP_data['UMAP2'].between(-1, 16)
]

# Check
EFF2_UMAP_data_focused.head()

# Matching rows of EFF2_data, looked up by contig ID in the index
EFF2_data_focused = EFF2_data.loc[EFF2_data.index.isin(EFF2_UMAP_data_focused['contig'])]
print(EFF2_data_focused)

In [None]:
# Write the focused contig IDs (the index of EFF2_data_focused) to a text file, one per line
EFF2_focused_contigs = EFF2_data_focused.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'EFF2_focused_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{contig_id}\n" for contig_id in EFF2_focused_contigs)

In [None]:
# Show the focused table without its last four columns
print(EFF2_data_focused.iloc[:, :-4])

In [None]:
# Re-run UMAP on the focused EFF2 subset (all but its last four columns);
# points are coloured by ARG count and sized by sqrt(contig length).
n_neighbors = [20]
min_dist = [0.1]

colors = [0, 1, 2, 3]
color_map = {0: '#b4f3f5', 1: '#fa7a31', 2: '#eb340f', 3: '#d01af5'}

# Colour sequence in ARG-count order (kept as the fallback palette)
custom_colors = [color_map[val] for val in colors]

# ARG_count is cast to str before plotting, so the explicit value -> colour map
# is keyed by the string labels. With color_discrete_sequence alone, plotly
# assigns colours by order of first appearance in the data, so a given count
# was not guaranteed its colour from color_map.
# NOTE(review): assumes ARG_count holds integers so the labels are '0'..'3';
# any other label falls back to color_discrete_sequence as before.
arg_count_labels = [str(val) for val in colors]
arg_color_map = {str(val): color_map[val] for val in colors}

for n in n_neighbors:
    for m in min_dist:
        reducer = umap.UMAP(n_neighbors=n, min_dist=m, random_state=seed)
        embedding = reducer.fit_transform(EFF2_data_focused.iloc[:, :-4])
        EFF2_UMAP_data_focused = pd.DataFrame({
            'UMAP1': embedding[:, 0],
            'UMAP2': embedding[:, 1],
            'contig': EFF2_data_focused.index,
            'ARG_count': EFF2_data_focused['ARG_count'],
            'ARG_name': EFF2_data_focused['ARG_name'],
            'contig_length': EFF2_data_focused['length'],
        })

        # Treat the count as a category so plotly uses discrete colours
        EFF2_UMAP_data_focused['ARG_count'] = EFF2_UMAP_data_focused['ARG_count'].astype(str)
        # Transformed lengths; only the square root is used (as marker size) below
        EFF2_UMAP_data_focused['log_contig_length'] = np.log(EFF2_UMAP_data_focused['contig_length'])
        EFF2_UMAP_data_focused['sqrt_contig_length'] = np.sqrt(EFF2_UMAP_data_focused['contig_length'])

        title = f' Wastewater EFF2 - UMAP with n_neighbors={n}, min_dist={m}'
        fig = px.scatter(
            EFF2_UMAP_data_focused,
            x='UMAP1',
            y='UMAP2',
            color='ARG_count',
            title=title,
            color_discrete_sequence=custom_colors,
            color_discrete_map=arg_color_map,
            # Fixed legend/trace order: '0' first, higher counts drawn later
            category_orders={'ARG_count': arg_count_labels},
            hover_data={'contig': True, 'ARG_count': True, 'ARG_name': True, 'contig_length': True},
            size='sqrt_contig_length',
        )

        # Both axes use the same grid and frame styling
        axis_style = dict(
            showgrid=True,
            gridcolor='lightgray',
            zeroline=False,
            showline=True,
            linecolor='black',
            linewidth=1,
            mirror=True,
        )
        fig.update_layout(
            height=1700,
            width=1200,
            title_text=title,
            showlegend=True,
            legend=dict(
                x=0.5,
                y=-0.05,
                traceorder="normal",
                xanchor='center',
                yanchor='top',
                orientation='h',
            ),
            template='simple_white',
            xaxis=dict(axis_style),
            yaxis=dict(axis_style),
        )
        fig.show()
        #fig.write_image(f'UMAP_WW/EFF2_UMAP_{n}_{m}_ARG_counts_focused_above100.png')
        #fig.write_html(f'UMAP_WW/EFF2_UMAP_{n}_{m}_ARG_counts_focused_above100.html')
        fig.write_image(f'UMAP_WW/EFF2_UMAP_{n}_{m}_ARG_counts_lengths_sqrt_focused_above100.png')
        fig.write_html(f'UMAP_WW/EFF2_UMAP_{n}_{m}_ARG_counts_lengths_sqrt_focused_above100.html')

### Extract clusters
#### C1

In [None]:
# Cluster C1: points of the focused EFF2 embedding inside a fixed
# UMAP1 (x axis) / UMAP2 (y axis) box, bounds inclusive.
EFF2_data_focused_C1 = EFF2_UMAP_data_focused.loc[
    EFF2_UMAP_data_focused['UMAP1'].between(1.1, 1.2)
    & EFF2_UMAP_data_focused['UMAP2'].between(-1.9, -1.8)
]

# Check
EFF2_data_focused_C1.head()

# Matching rows of EFF2_data_focused, looked up by contig ID in the index
EFF2_data_C1 = EFF2_data_focused.loc[EFF2_data_focused.index.isin(EFF2_data_focused_C1['contig'])]
print(EFF2_data_C1)

In [None]:
# Write the C1 contig IDs (the index of EFF2_data_C1) to a text file, one per line
EFF2_C1_contigs = EFF2_data_C1.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'EFF2_C1_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{contig_id}\n" for contig_id in EFF2_C1_contigs)

# EFF3
## Import data

In [None]:
# Absolute path of the UMAP_WW image directory.
# NOTE(review): the EFF3 cells visible here write to the relative 'UMAP_WW/'
# path instead of using this variable — confirm whether it is still needed.
path_to_images = '/scratch/project_2006608/Methylation/notebooks/UMAP_WW/'

## FILTERING
### > 100 lines in .gff

In [None]:
# Load the flattened EFF3 matrices (tab-separated; first column becomes the index)
file_path = '/scratch/project_2006608/Methylation/WW_data/EFF3_matrices_top100/flattened/EFF3_concat_matrices_top100.tsv'
EFF3_matrices = pd.read_csv(
    file_path,
    sep='\t',
    index_col=0,
    low_memory=False,
)

In [None]:
# Number of rows in the raw EFF3 table
print(len(EFF3_matrices))

In [None]:
# Dimensions (rows, columns) of the raw EFF3 table, then a preview of its first rows
print(EFF3_matrices.shape)
EFF3_matrices.head()

In [None]:
# Keep only rows with at least one non-zero value among the first 492 columns.
# NOTE(review): 492 is presumably the number of feature columns (everything
# before the trailing 'sample' column) — confirm against the table layout.
EFF3_df = EFF3_matrices.loc[EFF3_matrices.iloc[:, :492].ne(0).any(axis=1)]
EFF3_df.shape
EFF3_df.head()

In [None]:
# Row counts per value of the 'sample' column.
# NOTE: this is not the cell's last expression and is not printed, so a
# notebook will not display it.
EFF3_df['sample'].value_counts()
# Show the table without its last column
print(EFF3_df.iloc[:, :-1])

In [None]:
# UMAP of the EFF3 table (all columns except the last) with KMeans labels on the
# embedding; the labels appear in the hover text, points are drawn in one colour.
# All
#n_neighbors = [20, 30 ]
#min_dist = [0.01, 0.1, 0.2]
# The best
n_neighbors = [20]
min_dist = [0.1]

color_dict = ['#05b5bb']

for n in n_neighbors:
    for m in min_dist:
        # 2-D embedding; random_state is fixed to the notebook-level seed
        reducer = umap.UMAP(n_neighbors=n, min_dist=m, random_state=seed)
        embedding = reducer.fit_transform(EFF3_df.iloc[:, :-1])

        # clustering: 11 KMeans clusters fitted on the 2-D embedding
        kmeans = KMeans(n_clusters=11, random_state=seed)
        k_labels = kmeans.fit_predict(embedding)

        UMAP_EFF3_df = pd.DataFrame({
            'UMAP1': embedding[:, 0],
            'UMAP2': embedding[:, 1],
            'contig': EFF3_df.index,
            'cluster': k_labels,
        })

        title = f' Wastewater EFF3 - UMAP with n_neighbors={n}, min_dist={m}'
        fig = px.scatter(
            UMAP_EFF3_df,
            x='UMAP1',
            y='UMAP2',
            title=title,
            color_discrete_sequence=color_dict,
            hover_data={'cluster': True, 'contig': True},
        )

        # Both axes use the same grid and frame styling
        axis_style = dict(
            showgrid=True,
            gridcolor='lightgray',
            zeroline=False,
            showline=True,
            linecolor='black',
            linewidth=1,
            mirror=True,
        )
        fig.update_layout(
            height=1500,
            width=1200,
            title_text=title,
            showlegend=True,
            legend=dict(
                x=0.5,
                y=-0.1,
                traceorder="normal",
                xanchor='center',
                yanchor='top',
                orientation='h',
            ),
            template='simple_white',
            xaxis=dict(axis_style),
            yaxis=dict(axis_style),
        )
        fig.show()
        fig.write_image(f'UMAP_WW/EFF3_UMAP_{n}_{m}_above100.png')
        fig.write_html(f'UMAP_WW/EFF3_UMAP_{n}_{m}_above100.html')

## Attach mod count data

In [None]:
## Load the EFF3 modification counts (EFF3_mod_counts.txt).
## The earlier comment said "contig lengths", but this cell reads the mod-count file.
file_path = '/scratch/project_2006608/Methylation/WW_data/EFF3_contigs/EFF3_mod_counts.txt'

EFF3_df_mod_counts = pd.read_csv(file_path, sep='\t', index_col=0, header=0, low_memory=False)
print(EFF3_df_mod_counts.shape[0])
EFF3_df_mod_counts.head()

# Natural-log transform of every value in the table.
# NOTE(review): np.log gives -inf for 0 and NaN for negative values — confirm
# that all counts are strictly positive.
EFF3_df_mod_counts = np.log(EFF3_df_mod_counts)

In [None]:
## Attach the mod-count columns to a copy of the EFF3 table (nothing is written to disk here)
EFF3_df_ext = EFF3_df.copy()
EFF3_df_ext.head()
EFF3_df_mod_counts.head()

# Reorder the mod-count rows to the row order of EFF3_df_ext (label lookup;
# raises KeyError if an index label is missing from the mod-count table)
EFF3_df_ordered = EFF3_df_mod_counts.loc[EFF3_df_ext.index]

# Column-wise join. NOTE: this rebinds EFF3_df_mod_counts to the merged table, so
# re-running the cell without reloading would concatenate the merged table again.
EFF3_df_mod_counts = pd.concat([EFF3_df_ext, EFF3_df_ordered], axis=1)
EFF3_df_mod_counts.tail()
# Show the merged table without its last two columns
print(EFF3_df_mod_counts.iloc[:, :-2])

In [None]:
# UMAP of the merged EFF3 table (all but its last two columns), coloured by 'mod_count'
n_neighbors = [20]
min_dist = [0.1]

for n in n_neighbors:
    for m in min_dist:
        reducer = umap.UMAP(n_neighbors=n, min_dist=m, random_state=seed)
        embedding = reducer.fit_transform(EFF3_df_mod_counts.iloc[:, :-2])
        EFF3_mod_counts_UMAP_df = pd.DataFrame({
            'UMAP1': embedding[:, 0],
            'UMAP2': embedding[:, 1],
            'contig': EFF3_df_mod_counts.index,
            'mod_count': EFF3_df_mod_counts['mod_count'],
        })

        title = f' Wastewater EFF3 - UMAP with n_neighbors={n}, min_dist={m}'
        fig = px.scatter(
            EFF3_mod_counts_UMAP_df,
            x='UMAP1',
            y='UMAP2',
            color='mod_count',
            title=title,
            color_continuous_scale=px.colors.sequential.Rainbow,
            hover_data={'contig': True},
        )

        # Both axes use the same grid and frame styling
        axis_style = dict(
            showgrid=True,
            gridcolor='lightgray',
            zeroline=False,
            showline=True,
            linecolor='black',
            linewidth=1,
            mirror=True,
        )
        fig.update_layout(
            height=1700,
            width=1200,
            title_text=title,
            showlegend=True,
            legend=dict(
                x=0.5,
                y=-0.05,
                traceorder="normal",
                xanchor='center',
                yanchor='top',
                orientation='h',
            ),
            template='simple_white',
            xaxis=dict(axis_style),
            yaxis=dict(axis_style),
        )
        fig.show()
        fig.write_image(f'UMAP_WW/EFF3_UMAP_{n}_{m}_contig_mod_counts_log_above100.png')
        fig.write_html(f'UMAP_WW/EFF3_UMAP_{n}_{m}_contig_mod_counts_log_above100.html')

## Attach ARG data
### Counts
### Names
### Contig lengths

### ARG Counts

In [None]:
# Load the EFF3 ARG-count table (tab-separated, header row, first column as index)
file_path = '/scratch/project_2006608/Methylation/WW_data/EFF3_contigs/EFF3_ARG_counts.txt'

EFF3_df_ARG_counts = pd.read_csv(
    file_path,
    sep='\t',
    index_col=0,
    header=0,
    low_memory=False,
)
print(len(EFF3_df_ARG_counts))
EFF3_df_ARG_counts.head()

In [None]:
## Attach the ARG-count columns to a copy of the EFF3 table (nothing is written to disk here)
EFF3_df_ext = EFF3_df.copy()
EFF3_df_ext.head()
EFF3_df_ARG_counts.head()

# Reorder the ARG-count rows to the row order of EFF3_df_ext (label lookup;
# raises KeyError if an index label is missing from the ARG-count table)
EFF3_df_ordered = EFF3_df_ARG_counts.loc[EFF3_df_ext.index]

# Column-wise join. NOTE: this rebinds EFF3_df_ARG_counts to the merged table, so
# re-running the cell without reloading would concatenate the merged table again.
EFF3_df_ARG_counts = pd.concat([EFF3_df_ext, EFF3_df_ordered], axis=1)
EFF3_df_ARG_counts.tail()
#print(EFF3_df_ARG_counts.dtypes)

### ARG Names

In [None]:
# Load the EFF3 ARG-name table (tab-separated, header row, first column as index)
file_path = '/scratch/project_2006608/Methylation/WW_data/EFF3_contigs/EFF3_ARG_names.txt'

EFF3_df_ARG_names = pd.read_csv(
    file_path,
    sep='\t',
    index_col=0,
    header=0,
    low_memory=False,
)
print(len(EFF3_df_ARG_names))
EFF3_df_ARG_names.head()

In [None]:
## Attach the ARG-name columns to the table that already carries the ARG counts
## (nothing is written to disk here)
EFF3_df_ext = EFF3_df_ARG_counts.copy()
EFF3_df_ext.head()
EFF3_df_ARG_names.head()

# Reorder the ARG-name rows to the row order of EFF3_df_ext (label lookup;
# raises KeyError if an index label is missing from the ARG-name table)
EFF3_df_ordered = EFF3_df_ARG_names.loc[EFF3_df_ext.index]

# Column-wise join. NOTE: this rebinds EFF3_df_ARG_names to the merged table, so
# re-running the cell without reloading would concatenate the merged table again.
EFF3_df_ARG_names = pd.concat([EFF3_df_ext, EFF3_df_ordered], axis=1)
EFF3_df_ARG_names.tail()

In [None]:
# Rows whose ARG_name contains the literal text 'erm(F)_3'
# (case-insensitive, not a regex; missing names count as no match)
has_erm_F = EFF3_df_ARG_names['ARG_name'].str.contains('erm(F)_3', case=False, na=False, regex=False)
erm_F = EFF3_df_ARG_names.loc[has_erm_F]
# Print just the matching index labels as a one-column table named '0'
row_names_df = pd.DataFrame(erm_F.index, columns=['0'])
print(row_names_df)

### Contig lengths

In [None]:
## Load the EFF3 contig-length table brought over from Puhti
## (tab-separated, header row, first column as index)
file_path = '/scratch/project_2006608/Methylation/WW_data/EFF3_contigs/EFF3_contigs_lengths.txt'

EFF3_df_contigs_lengths = pd.read_csv(
    file_path,
    sep='\t',
    index_col=0,
    header=0,
    low_memory=False,
)
print(len(EFF3_df_contigs_lengths))
EFF3_df_contigs_lengths.head()

In [None]:
## Attach the contig-length columns to the table that already carries ARG counts and names
## (nothing is written to disk here)
EFF3_df_ext = EFF3_df_ARG_names.copy()
EFF3_df_ext.head()
EFF3_df_contigs_lengths.head()

# Reorder the length rows to the row order of EFF3_df_ext (label lookup;
# raises KeyError if an index label is missing from the length table)
EFF3_df_ordered = EFF3_df_contigs_lengths.loc[EFF3_df_ext.index]

# Column-wise join into the combined EFF3_data table
EFF3_data = pd.concat([EFF3_df_ext, EFF3_df_ordered], axis=1)
EFF3_data.tail()
# Show the combined table without its last four columns
print(EFF3_data.iloc[:, :-4])

In [None]:
# UMAP of the combined EFF3 table (all but its last four columns), coloured by ARG count.
n_neighbors = [20]
min_dist = [0.1]

colors = [0, 1, 2, 3]
color_map = {0: '#8ce6e9', 1: '#fa7a31', 2: '#eb340f', 3: '#d01af5'}

# Colour sequence in ARG-count order (kept as the fallback palette)
custom_colors = [color_map[val] for val in colors]

# ARG_count is cast to str before plotting, so the explicit value -> colour map
# is keyed by the string labels. With color_discrete_sequence alone, plotly
# assigns colours by order of first appearance in the data, so a given count
# was not guaranteed its colour from color_map.
# NOTE(review): assumes ARG_count holds integers so the labels are '0'..'3';
# any other label falls back to color_discrete_sequence as before.
arg_count_labels = [str(val) for val in colors]
arg_color_map = {str(val): color_map[val] for val in colors}

for n in n_neighbors:
    for m in min_dist:
        reducer = umap.UMAP(n_neighbors=n, min_dist=m, random_state=seed)
        embedding = reducer.fit_transform(EFF3_data.iloc[:, :-4])
        EFF3_UMAP_data = pd.DataFrame({
            'UMAP1': embedding[:, 0],
            'UMAP2': embedding[:, 1],
            'contig': EFF3_data.index,
            'ARG_count': EFF3_data['ARG_count'],
            'ARG_name': EFF3_data['ARG_name'],
            'contig_length': EFF3_data['length'],
        })

        # Treat the count as a category so plotly uses discrete colours
        EFF3_UMAP_data['ARG_count'] = EFF3_UMAP_data['ARG_count'].astype(str)

        title = f' Wastewater EFF3 - UMAP with n_neighbors={n}, min_dist={m}'
        fig = px.scatter(
            EFF3_UMAP_data,
            x='UMAP1',
            y='UMAP2',
            color='ARG_count',
            title=title,
            color_discrete_sequence=custom_colors,
            color_discrete_map=arg_color_map,
            # Fixed legend/trace order: '0' first, higher counts drawn later
            category_orders={'ARG_count': arg_count_labels},
            hover_data={'contig': True, 'ARG_count': True, 'ARG_name': True, 'contig_length': True},
        )

        # Both axes use the same grid and frame styling
        axis_style = dict(
            showgrid=True,
            gridcolor='lightgray',
            zeroline=False,
            showline=True,
            linecolor='black',
            linewidth=1,
            mirror=True,
        )
        fig.update_layout(
            height=1700,
            width=1200,
            title_text=title,
            showlegend=True,
            legend=dict(
                x=0.5,
                y=-0.05,
                traceorder="normal",
                xanchor='center',
                yanchor='top',
                orientation='h',
            ),
            template='simple_white',
            xaxis=dict(axis_style),
            yaxis=dict(axis_style),
        )
        fig.show()
        fig.write_image(f'UMAP_WW/EFF3_UMAP_{n}_{m}_contig_ARG_counts_above100.png')
        fig.write_html(f'UMAP_WW/EFF3_UMAP_{n}_{m}_contig_ARG_counts_above100.html')

## Focus on distinct clusters
### Leave out the most misc cluster and draw again

In [None]:
# Keep only the points of the EFF3 embedding inside a fixed
# UMAP1 (x axis) / UMAP2 (y axis) box, bounds inclusive.
EFF3_UMAP_data_focused = EFF3_UMAP_data.loc[
    EFF3_UMAP_data['UMAP1'].between(-2, 6.5)
    & EFF3_UMAP_data['UMAP2'].between(-6, 13)
]

# Check
EFF3_UMAP_data_focused.head()

# Matching rows of EFF3_data, looked up by contig ID in the index
EFF3_data_focused = EFF3_data.loc[EFF3_data.index.isin(EFF3_UMAP_data_focused['contig'])]
print(EFF3_data_focused)

In [None]:
# Write the focused contig IDs (the index of EFF3_data_focused) to a text file, one per line
EFF3_focused_contigs = EFF3_data_focused.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'EFF3_focused_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{contig_id}\n" for contig_id in EFF3_focused_contigs)

In [None]:
# Show the focused table without its last four columns
print(EFF3_data_focused.iloc[:, :-4])

In [None]:
# Re-run UMAP on the focused EFF3 subset (all but its last four columns);
# points are coloured by ARG count and sized by sqrt(contig length).
n_neighbors = [20]
min_dist = [0.1]

colors = [0, 1, 2, 3]
color_map = {0: '#b4f3f5', 1: '#fa7a31', 2: '#eb340f', 3: '#d01af5'}

# Colour sequence in ARG-count order (kept as the fallback palette)
custom_colors = [color_map[val] for val in colors]

# ARG_count is cast to str before plotting, so the explicit value -> colour map
# is keyed by the string labels. With color_discrete_sequence alone, plotly
# assigns colours by order of first appearance in the data, so a given count
# was not guaranteed its colour from color_map.
# NOTE(review): assumes ARG_count holds integers so the labels are '0'..'3';
# any other label falls back to color_discrete_sequence as before.
arg_count_labels = [str(val) for val in colors]
arg_color_map = {str(val): color_map[val] for val in colors}

for n in n_neighbors:
    for m in min_dist:
        reducer = umap.UMAP(n_neighbors=n, min_dist=m, random_state=seed)
        embedding = reducer.fit_transform(EFF3_data_focused.iloc[:, :-4])
        EFF3_UMAP_data_focused = pd.DataFrame({
            'UMAP1': embedding[:, 0],
            'UMAP2': embedding[:, 1],
            'contig': EFF3_data_focused.index,
            'ARG_count': EFF3_data_focused['ARG_count'],
            'ARG_name': EFF3_data_focused['ARG_name'],
            'contig_length': EFF3_data_focused['length'],
        })

        # Treat the count as a category so plotly uses discrete colours
        EFF3_UMAP_data_focused['ARG_count'] = EFF3_UMAP_data_focused['ARG_count'].astype(str)
        # Transformed lengths; only the square root is used (as marker size) below
        EFF3_UMAP_data_focused['log_contig_length'] = np.log(EFF3_UMAP_data_focused['contig_length'])
        EFF3_UMAP_data_focused['sqrt_contig_length'] = np.sqrt(EFF3_UMAP_data_focused['contig_length'])

        title = f' Wastewater EFF3 - UMAP with n_neighbors={n}, min_dist={m}'
        fig = px.scatter(
            EFF3_UMAP_data_focused,
            x='UMAP1',
            y='UMAP2',
            color='ARG_count',
            title=title,
            color_discrete_sequence=custom_colors,
            color_discrete_map=arg_color_map,
            # Fixed legend/trace order: '0' first, higher counts drawn later
            category_orders={'ARG_count': arg_count_labels},
            hover_data={'contig': True, 'ARG_count': True, 'ARG_name': True, 'contig_length': True},
            size='sqrt_contig_length',
        )

        # Both axes use the same grid and frame styling
        axis_style = dict(
            showgrid=True,
            gridcolor='lightgray',
            zeroline=False,
            showline=True,
            linecolor='black',
            linewidth=1,
            mirror=True,
        )
        fig.update_layout(
            height=1700,
            width=1200,
            title_text=title,
            showlegend=True,
            legend=dict(
                x=0.5,
                y=-0.05,
                traceorder="normal",
                xanchor='center',
                yanchor='top',
                orientation='h',
            ),
            template='simple_white',
            xaxis=dict(axis_style),
            yaxis=dict(axis_style),
        )
        fig.show()
        #fig.write_image(f'UMAP_WW/EFF3_UMAP_{n}_{m}_ARG_counts_focused_above100.png')
        #fig.write_html(f'UMAP_WW/EFF3_UMAP_{n}_{m}_ARG_counts_focused_above100.html')
        fig.write_image(f'UMAP_WW/EFF3_UMAP_{n}_{m}_ARG_counts_lengths_sqrt_focused_above100.png')
        fig.write_html(f'UMAP_WW/EFF3_UMAP_{n}_{m}_ARG_counts_lengths_sqrt_focused_above100.html')

# INF2
## Import data

In [None]:
# Absolute path of the UMAP_WW image directory.
# NOTE(review): the INF2 cells visible here write to the relative 'UMAP_WW/'
# path instead of using this variable — confirm whether it is still needed.
path_to_images = '/scratch/project_2006608/Methylation/notebooks/UMAP_WW/'

## FILTERING
### > 100 lines in .gff

In [None]:
# Load the flattened INF2 matrices (tab-separated; first column becomes the index)
file_path = '/scratch/project_2006608/Methylation/WW_data/INF2_matrices_top100/flattened/INF2_concat_matrices_top100.tsv'
INF2_matrices = pd.read_csv(
    file_path,
    sep='\t',
    index_col=0,
    low_memory=False,
)

In [None]:
# Number of rows in the raw INF2 table
print(len(INF2_matrices))

In [None]:
# Dimensions (rows, columns) of the raw INF2 table, then a preview of its first rows
print(INF2_matrices.shape)
INF2_matrices.head()

In [None]:
# Keep only rows with at least one non-zero value among the first 492 columns.
# NOTE(review): 492 is presumably the number of feature columns (everything
# before the trailing 'sample' column) — confirm against the table layout.
INF2_df = INF2_matrices.loc[INF2_matrices.iloc[:, :492].ne(0).any(axis=1)]
INF2_df.shape
INF2_df.head()

In [None]:
# Row counts per value of the 'sample' column.
# NOTE: this is not the cell's last expression and is not printed, so a
# notebook will not display it.
INF2_df['sample'].value_counts()
# Show the table without its last column
print(INF2_df.iloc[:, :-1])

In [None]:
# UMAP of the INF2 table (all columns except the last), drawn in a single colour.
# All
#n_neighbors = [20, 30 ]
#min_dist = [0.01, 0.1, 0.2]
# The best
n_neighbors = [20]
min_dist = [0.1]

color_dict = ['#f9c82e']

for n in n_neighbors:
    for m in min_dist:
        # 2-D embedding; random_state is fixed to the notebook-level seed
        reducer = umap.UMAP(n_neighbors=n, min_dist=m, random_state=seed)
        embedding = reducer.fit_transform(INF2_df.iloc[:, :-1])
        UMAP_INF2_df = pd.DataFrame({
            'UMAP1': embedding[:, 0],
            'UMAP2': embedding[:, 1],
            'contig': INF2_df.index,
        })

        title = f' Wastewater INF2 - UMAP with n_neighbors={n}, min_dist={m}'
        fig = px.scatter(
            UMAP_INF2_df,
            x='UMAP1',
            y='UMAP2',
            title=title,
            color_discrete_sequence=color_dict,
            hover_data={'contig': True},
        )

        # Both axes use the same grid and frame styling
        axis_style = dict(
            showgrid=True,
            gridcolor='lightgray',
            zeroline=False,
            showline=True,
            linecolor='black',
            linewidth=1,
            mirror=True,
        )
        fig.update_layout(
            height=1500,
            width=1200,
            title_text=title,
            showlegend=True,
            legend=dict(
                x=0.5,
                y=-0.1,
                traceorder="normal",
                xanchor='center',
                yanchor='top',
                orientation='h',
            ),
            template='simple_white',
            xaxis=dict(axis_style),
            yaxis=dict(axis_style),
        )
        fig.show()
        fig.write_image(f'UMAP_WW/INF2_UMAP_{n}_{m}_above100.png')
        fig.write_html(f'UMAP_WW/INF2_UMAP_{n}_{m}_above100.html')

## Attach mod count data

In [None]:
## Bring the mod counts from Puhti:
file_path = '/scratch/project_2006608/Methylation/WW_data/INF2_contigs/INF2_mod_counts.txt'

# Tab-separated table; first column is used as the row index, first line as header.
INF2_df_mod_counts = pd.read_csv(file_path, sep='\t', index_col=0, header=0, low_memory=False)
print(INF2_df_mod_counts.shape[0])
INF2_df_mod_counts.head()  # value is discarded (not the last expression of the cell)

# Natural-log transform of every value in the table.
# NOTE(review): np.log gives -inf for 0 and NaN for negative values - confirm
# that all counts are positive.
INF2_df_mod_counts = np.log(INF2_df_mod_counts)

In [None]:
## Attach the log-transformed mod counts to the matrix table
## (done in memory; nothing is written to disk in this cell)
INF2_df_ext = INF2_df.copy()
INF2_df_ext.head()  # value is discarded (not the last expression of the cell)
INF2_df_mod_counts.head()  # value is discarded as well

# Reorder to match: pick the count rows in the row order of INF2_df_ext
# (.loc raises KeyError if a row label is missing from the counts table)
INF2_df_ordered = INF2_df_mod_counts.loc[INF2_df_ext.index]

# NOTE: the name of the loaded counts table is rebound to the merged table here,
# so re-running this cell without re-running the loading cell operates on the
# already merged table.
INF2_df_mod_counts = pd.concat([INF2_df_ext, INF2_df_ordered], axis=1)
INF2_df_mod_counts.tail()  # value is discarded (not the last expression of the cell)
# Every column except the last two - the slice that is passed to UMAP below.
print(INF2_df_mod_counts.iloc[:, :-2])

In [None]:
# UMAP of the merged INF2 table (all columns except the last two), coloured on
# a continuous scale by the 'mod_count' column; exported as PNG and HTML.
n_neighbors = [20]
min_dist = [0.1]

# Frame and grid styling shared by the x and y axes.
axis_style = dict(
    showgrid=True,
    gridcolor='lightgray',
    zeroline=False,
    showline=True,
    linecolor='black',
    linewidth=1,
    mirror=True,
)
# Horizontal legend centred below the plot area.
legend_style = dict(
    x=0.5,
    y=-0.05,
    traceorder="normal",
    xanchor='center',
    yanchor='top',
    orientation='h',
)

for n in n_neighbors:
    for m in min_dist:
        title = f' Wastewater INF2 - UMAP with n_neighbors={n}, min_dist={m}'
        out_stem = f'UMAP_WW/INF2_UMAP_{n}_{m}_contig_mod_counts_log_above100'

        reducer = umap.UMAP(n_neighbors=n, min_dist=m, random_state=seed)
        embedding = reducer.fit_transform(INF2_df_mod_counts.iloc[:, :-2])
        INF2_mod_counts_UMAP_df = pd.DataFrame({
            'UMAP1': embedding[:, 0],
            'UMAP2': embedding[:, 1],
            'contig': INF2_df_mod_counts.index,
            'mod_count': INF2_df_mod_counts['mod_count'],
        })

        fig = px.scatter(
            INF2_mod_counts_UMAP_df,
            x='UMAP1',
            y='UMAP2',
            color='mod_count',
            title=title,
            color_continuous_scale=px.colors.sequential.Rainbow,
            hover_data={'contig': True},
        )
        fig.update_layout(
            height=1700,
            width=1200,
            title_text=title,
            showlegend=True,
            legend=legend_style,
            template='simple_white',
            xaxis=dict(axis_style),
            yaxis=dict(axis_style),
        )
        fig.show()
        fig.write_image(f'{out_stem}.png')
        fig.write_html(f'{out_stem}.html')

## Attach ARG data
### Counts
### Names
### Contig lengths

### ARG Counts

In [None]:
# Load the INF2 ARG counts table (tab-separated; first column becomes the row index).
file_path = '/scratch/project_2006608/Methylation/WW_data/INF2_contigs/INF2_ARG_counts.txt'

INF2_df_ARG_counts = pd.read_csv(file_path, sep='\t', index_col=0, header=0, low_memory=False)
print(INF2_df_ARG_counts.shape[0])  # number of rows loaded
INF2_df_ARG_counts.head()

In [None]:
## Attach the ARG counts to the matrix table
## (done in memory; nothing is written to disk in this cell)
INF2_df_ext = INF2_df.copy()
INF2_df_ext.head()  # value is discarded (not the last expression of the cell)
INF2_df_ARG_counts.head()  # value is discarded as well

# Reorder to match: pick the count rows in the row order of INF2_df_ext
# (.loc raises KeyError if a row label is missing from the counts table)
INF2_df_ordered = INF2_df_ARG_counts.loc[INF2_df_ext.index]

# NOTE: the name of the loaded counts table is rebound to the merged table here.
INF2_df_ARG_counts = pd.concat([INF2_df_ext, INF2_df_ordered], axis=1)
INF2_df_ARG_counts.tail()

### ARG Names

In [None]:
# Load the INF2 ARG names table (tab-separated; first column becomes the row index).
file_path = '/scratch/project_2006608/Methylation/WW_data/INF2_contigs/INF2_ARG_names.txt'

INF2_df_ARG_names = pd.read_csv(file_path, sep='\t', index_col=0, header=0, low_memory=False)
print(INF2_df_ARG_names.shape[0])  # number of rows loaded
INF2_df_ARG_names.head()

In [None]:
## Attach the ARG names to the table that already carries the ARG counts
## (done in memory; nothing is written to disk in this cell)
INF2_df_ext = INF2_df_ARG_counts.copy()
INF2_df_ext.head()  # value is discarded (not the last expression of the cell)
INF2_df_ARG_names.head()  # value is discarded as well

# Reorder to match: pick the name rows in the row order of INF2_df_ext
# (.loc raises KeyError if a row label is missing from the names table)
INF2_df_ordered = INF2_df_ARG_names.loc[INF2_df_ext.index]

# NOTE: the name of the loaded names table is rebound to the merged table here.
INF2_df_ARG_names = pd.concat([INF2_df_ext, INF2_df_ordered], axis=1)
INF2_df_ARG_names.tail()

In [None]:
# List the row labels whose ARG_name contains the literal text 'erm(F)_3'
# (case-insensitive, no regex; missing names count as "no match").
erm_F_mask = INF2_df_ARG_names['ARG_name'].str.contains(
    'erm(F)_3', case=False, na=False, regex=False
)
erm_F = INF2_df_ARG_names.loc[erm_F_mask]

# One-column table holding the matching row labels.
row_names_df = pd.DataFrame(erm_F.index, columns=['0'])
print(row_names_df)

### Contig lengths

In [None]:
## Bring the contig lengths from Puhti:
file_path = '/scratch/project_2006608/Methylation/WW_data/INF2_contigs/INF2_contigs_lengths.txt'

# Tab-separated table; first column is used as the row index, first line as header.
INF2_df_contigs_lengths = pd.read_csv(file_path, sep='\t', index_col=0, header=0, low_memory=False)
print(INF2_df_contigs_lengths.shape[0])  # number of rows loaded
INF2_df_contigs_lengths.head()

In [None]:
## Attach the contig lengths to the table that already carries ARG counts and names
## (done in memory; nothing is written to disk in this cell)
INF2_df_ext = INF2_df_ARG_names.copy()
INF2_df_ext.head()  # value is discarded (not the last expression of the cell)
INF2_df_contigs_lengths.head()  # value is discarded as well

# Reorder to match: pick the length rows in the row order of INF2_df_ext
# (.loc raises KeyError if a row label is missing from the lengths table)
INF2_df_ordered = INF2_df_contigs_lengths.loc[INF2_df_ext.index]

INF2_data = pd.concat([INF2_df_ext, INF2_df_ordered], axis=1)
INF2_data.tail()  # value is discarded (not the last expression of the cell)
# Every column except the last four - the slice that is passed to UMAP below.
print(INF2_data.iloc[:, :-4])

In [None]:
# UMAP of the merged INF2 table (all columns except the last four), coloured
# by the 'ARG_count' column; exported as PNG and HTML.
n_neighbors = [20]
min_dist = [0.1]

colors = [0, 1, 2, 3, 4, 5]
color_map = {0: '#f0da95', 1: '#fa7a31', 2: '#eb340f', 3: '#d01af5', 4: '#2a2df6', 5: '#940785'}

# Palette in ascending ARG_count order (one colour per count value, not per point).
custom_colors = [color_map[val] for val in colors]

# ARG_count is converted to str before plotting, so the legend order and the
# value -> colour lookup are keyed by the string form of each count.
# Every count in `colors` is listed: plotly express appends unlisted categories
# in order of first appearance, which would let counts 4 and 5 swap colours
# depending on the row order of the data.
category_order = [str(val) for val in colors]
arg_count_color_map = {str(val): color_map[val] for val in colors}

# Frame and grid styling shared by the x and y axes.
axis_style = dict(
    showgrid=True,
    gridcolor='lightgray',
    zeroline=False,
    showline=True,
    linecolor='black',
    linewidth=1,
    mirror=True,
)

for n in n_neighbors:
    for m in min_dist:
        title = f' Wastewater INF2 - UMAP with n_neighbors={n}, min_dist={m}'

        reducer = umap.UMAP(n_neighbors=n, min_dist=m, random_state=seed)
        embedding = reducer.fit_transform(INF2_data.iloc[:, :-4])
        INF2_UMAP_data  = pd.DataFrame({
            'UMAP1': embedding[:, 0],
            'UMAP2': embedding[:, 1],
            'contig': INF2_data.index,
            'ARG_count':INF2_data['ARG_count'],
            'ARG_name':INF2_data['ARG_name'],
            'contig_length':INF2_data['length'],
        })

        # String values make plotly express treat the counts as discrete categories.
        # NOTE(review): assumes ARG_count holds integers, so that str() yields
        # "0", "1", ... and matches the keys above - confirm.
        INF2_UMAP_data['ARG_count'] = INF2_UMAP_data['ARG_count'].astype(str)

        fig = px.scatter(INF2_UMAP_data,
                            x='UMAP1',
                            y='UMAP2',
                            color='ARG_count',
                            title=title,
                            # explicit value -> colour mapping; the sequence is
                            # only a fallback for values missing from the map
                            color_discrete_map=arg_count_color_map,
                            color_discrete_sequence=custom_colors,
                            category_orders={'ARG_count': category_order},
                            hover_data={'contig': True, 'ARG_count': True, 'ARG_name': True, 'contig_length': True} )
        fig.update_layout(
            height=1700,
            width=1200,
            title_text=title,
            showlegend=True,
            # horizontal legend centred below the plot area
            legend=dict(
                x=0.5,
                y=-0.05,
                traceorder="normal",
                xanchor='center',
                yanchor='top',
                orientation='h'
            ),
            template='simple_white',
            xaxis=dict(axis_style),
            yaxis=dict(axis_style)
        )
        fig.show()
        fig.write_image(f'UMAP_WW/INF2_UMAP_{n}_{m}_contig_ARG_counts_above100.png')
        fig.write_html(f'UMAP_WW/INF2_UMAP_{n}_{m}_contig_ARG_counts_above100.html')

## Focus on distinct clusters
### Leave out the most misc cluster and draw again

In [None]:
# Keep the points that fall inside two rectangular regions of the UMAP plane
# (all bounds inclusive); every other point is left out of the focused set.
umap1 = INF2_UMAP_data['UMAP1']
umap2 = INF2_UMAP_data['UMAP2']

# Region 1: 5 <= UMAP1 <= 15 and -5 <= UMAP2 <= 15
INF2_UMAP_data_focused1 = INF2_UMAP_data.loc[umap1.between(5, 15) & umap2.between(-5, 15)]

# Region 2: -5 <= UMAP1 <= 5 and -10 <= UMAP2 <= 10
INF2_UMAP_data_focused2 = INF2_UMAP_data.loc[umap1.between(-5, 5) & umap2.between(-10, 10)]

# Stack both selections, keeping the original row labels.
INF2_UMAP_data_focused = pd.concat(
    [INF2_UMAP_data_focused1, INF2_UMAP_data_focused2],
    ignore_index=False,
)

# Rows of the full table whose label appears in the 'contig' column of the selection.
focused_ids = INF2_UMAP_data_focused['contig']
INF2_data_focused = INF2_data[INF2_data.index.isin(focused_ids)]
print(INF2_data_focused)

In [None]:
# Write the row labels of the focused table to a text file, one per line.
INF2_focused_contigs = INF2_data_focused.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'INF2_focused_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{item}\n" for item in INF2_focused_contigs)

In [None]:
# Every column of the focused table except the last four - the slice that is
# passed to UMAP below.
print(INF2_data_focused.iloc[:, :-4])
# Distinct ARG_count values present in the focused selection.
print(INF2_UMAP_data_focused['ARG_count'].unique())

In [None]:
# Distinct ARG_count values in the focused selection
# (repeats the final print of the preceding cell).
print(INF2_UMAP_data_focused['ARG_count'].unique())

In [None]:
# Re-embed only the focused INF2 rows (all columns except the last four) and
# plot them coloured by ARG_count, with marker size taken from the square root
# of the contig length. This rebinds INF2_UMAP_data_focused to the new embedding.
n_neighbors = [20]
min_dist = [0.1]

colors = [0, 1, 2, 3]
color_map = {0: '#f0da95', 1: '#fa7a31', 2: '#eb340f', 3: '#901de5'}

# Palette in ascending ARG_count order (one colour per count value, not per point).
custom_colors = [color_map[key] for key in colors]
category_order = ["0", "1", "2", "3"]

# Frame and grid styling shared by the x and y axes.
axis_style = dict(
    showgrid=True,
    gridcolor='lightgray',
    zeroline=False,
    showline=True,
    linecolor='black',
    linewidth=1,
    mirror=True,
)
# Horizontal legend centred below the plot area.
legend_style = dict(
    x=0.5,
    y=-0.05,
    traceorder="normal",
    xanchor='center',
    yanchor='top',
    orientation='h',
)

for n in n_neighbors:
    for m in min_dist:
        title = f' Wastewater INF2 - UMAP with n_neighbors={n}, min_dist={m}'
        out_stem = f'UMAP_WW/INF2_UMAP_{n}_{m}_ARG_counts_lengths_sqrt_focused_above100'

        reducer = umap.UMAP(n_neighbors=n, min_dist=m, random_state=seed)
        embedding = reducer.fit_transform(INF2_data_focused.iloc[:, :-4])
        INF2_UMAP_data_focused = pd.DataFrame({
            'UMAP1': embedding[:, 0],
            'UMAP2': embedding[:, 1],
            'contig': INF2_data_focused.index,
            'ARG_count': INF2_data_focused['ARG_count'],
            'ARG_name': INF2_data_focused['ARG_name'],
            'contig_length': INF2_data_focused['length'],
        })

        # Counts become string categories so they are plotted as discrete colours.
        INF2_UMAP_data_focused['ARG_count'] = (
            INF2_UMAP_data_focused['ARG_count'].astype(str).astype('category')
        )
        # Transformed lengths; only the square root is used by the plot (marker size).
        contig_length = INF2_UMAP_data_focused['contig_length']
        INF2_UMAP_data_focused['log_contig_length'] = np.log(contig_length)
        INF2_UMAP_data_focused['sqrt_contig_length'] = np.sqrt(contig_length)

        fig = px.scatter(
            INF2_UMAP_data_focused,
            x='UMAP1',
            y='UMAP2',
            color='ARG_count',
            title=title,
            category_orders={'ARG_count': category_order},
            color_discrete_sequence=custom_colors,
            hover_data={'contig': True, 'ARG_count': True, 'ARG_name': True, 'contig_length': True},
            size='sqrt_contig_length',
        )
        fig.update_layout(
            height=1700,
            width=1200,
            title_text=title,
            showlegend=True,
            legend=legend_style,
            template='simple_white',
            xaxis=dict(axis_style),
            yaxis=dict(axis_style),
        )
        fig.show()
        #fig.write_image(f'UMAP_WW/INF2_UMAP_{n}_{m}_ARG_counts_focused_above100.png')
        #fig.write_html(f'UMAP_WW/INF2_UMAP_{n}_{m}_ARG_counts_focused_above100.html')
        fig.write_image(f'{out_stem}.png')
        fig.write_html(f'{out_stem}.html')

### Extract clusters
#### C1

In [None]:
# Cluster C1: points of the focused embedding inside the box
# -6.5 <= UMAP1 <= -6.4 and -2.47 <= UMAP2 <= -2.452 (bounds inclusive).
c1_in_x = INF2_UMAP_data_focused['UMAP1'].between(-6.5, -6.4)
c1_in_y = INF2_UMAP_data_focused['UMAP2'].between(-2.47, -2.452)
INF2_data_focused_C1 = INF2_UMAP_data_focused.loc[c1_in_x & c1_in_y]

# Rows of the focused table whose label appears in the 'contig' column of the box.
c1_ids = INF2_data_focused_C1['contig']
INF2_data_C1 = INF2_data_focused[INF2_data_focused.index.isin(c1_ids)]
print(INF2_data_C1)

In [None]:
# Write the row labels of cluster C1 to a text file, one per line.
INF2_C1_contigs = INF2_data_C1.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'INF2_C1_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{item}\n" for item in INF2_C1_contigs)

### Extract clusters
#### C2

In [None]:
# Cluster C2: points of the focused embedding inside the box
# 7.27 <= UMAP1 <= 7.45 and 5.525 <= UMAP2 <= 5.66 (bounds inclusive).
c2_in_x = INF2_UMAP_data_focused['UMAP1'].between(7.27, 7.45)
c2_in_y = INF2_UMAP_data_focused['UMAP2'].between(5.525, 5.66)
INF2_data_focused_C2 = INF2_UMAP_data_focused.loc[c2_in_x & c2_in_y]

# Rows of the focused table whose label appears in the 'contig' column of the box.
c2_ids = INF2_data_focused_C2['contig']
INF2_data_C2 = INF2_data_focused[INF2_data_focused.index.isin(c2_ids)]
print(INF2_data_C2)

In [None]:
# Write the row labels of cluster C2 to a text file, one per line.
INF2_C2_contigs = INF2_data_C2.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'INF2_C2_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{item}\n" for item in INF2_C2_contigs)

# INF3
## Import data

In [None]:
# Absolute path to the UMAP_WW directory.
# NOTE(review): no cell in this INF3 section reads this variable; the plotting
# cells write to the relative 'UMAP_WW/' directory instead.
path_to_images = '/scratch/project_2006608/Methylation/notebooks/UMAP_WW/'

## FILTERING
### > 100 lines in .gff

In [None]:
# Load the flattened, concatenated INF3 matrices (tab-separated);
# the first column of the file becomes the row index.
file_path = '/scratch/project_2006608/Methylation/WW_data/INF3_matrices_top100/flattened/INF3_concat_matrices_top100.tsv'
INF3_matrices = pd.read_csv(file_path, sep='\t', index_col=0, low_memory=False)

In [None]:
# Number of rows in the loaded table.
print(INF3_matrices.shape[0])

In [None]:
# Table dimensions as (rows, columns), followed by a preview of the first rows.
print(INF3_matrices.shape)
INF3_matrices.head()

In [None]:
# Keep only the rows that have at least one non-zero value among the first 492 columns.
# NOTE(review): 492 is presumably the number of matrix feature columns - confirm.
INF3_df = INF3_matrices.loc[(INF3_matrices.iloc[:, :492] != 0).any(axis=1)]
INF3_df.shape  # value is discarded (not the last expression of the cell)
INF3_df.head()

In [None]:
# Row counts per 'sample' value; the result is discarded because it is not the
# last expression of the cell.
INF3_df['sample'].value_counts()
# Every column except the last one - the same slice that is passed to UMAP below.
print(INF3_df.iloc[:, :-1])

In [None]:
# Baseline UMAP of the INF3 table (every column except the last), drawn in a
# single colour and exported as PNG and HTML.
# Grid explored earlier: n_neighbors = [20, 30], min_dist = [0.01, 0.1, 0.2];
# only the retained combination is run here.
n_neighbors = [20]
min_dist = [0.1]

color_dict = ['#f9c82e']  # one-colour palette (a list, despite the name)

# Frame and grid styling shared by the x and y axes.
axis_style = dict(
    showgrid=True,
    gridcolor='lightgray',
    zeroline=False,
    showline=True,
    linecolor='black',
    linewidth=1,
    mirror=True,
)
# Horizontal legend centred below the plot area.
legend_style = dict(
    x=0.5,
    y=-0.1,
    traceorder="normal",
    xanchor='center',
    yanchor='top',
    orientation='h',
)

for n in n_neighbors:
    for m in min_dist:
        title = f' Wastewater INF3 - UMAP with n_neighbors={n}, min_dist={m}'
        out_stem = f'UMAP_WW/INF3_UMAP_{n}_{m}_above100'

        reducer = umap.UMAP(n_neighbors=n, min_dist=m, random_state=seed)
        embedding = reducer.fit_transform(INF3_df.iloc[:, :-1])
        UMAP_INF3_df = pd.DataFrame({
            'UMAP1': embedding[:, 0],
            'UMAP2': embedding[:, 1],
            'contig': INF3_df.index,
        })

        fig = px.scatter(
            UMAP_INF3_df,
            x='UMAP1',
            y='UMAP2',
            title=title,
            color_discrete_sequence=color_dict,
            hover_data={'contig': True},
        )
        fig.update_layout(
            height=1500,
            width=1200,
            title_text=title,
            showlegend=True,
            legend=legend_style,
            template='simple_white',
            xaxis=dict(axis_style),
            yaxis=dict(axis_style),
        )
        fig.show()
        fig.write_image(f'{out_stem}.png')
        fig.write_html(f'{out_stem}.html')

## Attach mod count data

In [None]:
## Bring the mod counts from Puhti:
file_path = '/scratch/project_2006608/Methylation/WW_data/INF3_contigs/INF3_mod_counts.txt'

# Tab-separated table; first column is used as the row index, first line as header.
INF3_df_mod_counts = pd.read_csv(file_path, sep='\t', index_col=0, header=0, low_memory=False)
print(INF3_df_mod_counts.shape[0])
INF3_df_mod_counts.head()  # value is discarded (not the last expression of the cell)

# Natural-log transform of every value in the table.
# NOTE(review): np.log gives -inf for 0 and NaN for negative values - confirm
# that all counts are positive.
INF3_df_mod_counts = np.log(INF3_df_mod_counts)

In [None]:
## Attach the log-transformed mod counts to the matrix table
## (done in memory; nothing is written to disk in this cell)
INF3_df_ext = INF3_df.copy()
INF3_df_ext.head()  # value is discarded (not the last expression of the cell)
INF3_df_mod_counts.head()  # value is discarded as well

# Reorder to match: pick the count rows in the row order of INF3_df_ext
# (.loc raises KeyError if a row label is missing from the counts table)
INF3_df_ordered = INF3_df_mod_counts.loc[INF3_df_ext.index]

# NOTE: the name of the loaded counts table is rebound to the merged table here,
# so re-running this cell without re-running the loading cell operates on the
# already merged table.
INF3_df_mod_counts = pd.concat([INF3_df_ext, INF3_df_ordered], axis=1)
INF3_df_mod_counts.tail()  # value is discarded (not the last expression of the cell)
# Every column except the last two - the slice that is passed to UMAP below.
print(INF3_df_mod_counts.iloc[:, :-2])

In [None]:
# UMAP of the merged INF3 table (all columns except the last two), coloured on
# a continuous scale by the 'mod_count' column; exported as PNG and HTML.
n_neighbors = [20]
min_dist = [0.1]

# Frame and grid styling shared by the x and y axes.
axis_style = dict(
    showgrid=True,
    gridcolor='lightgray',
    zeroline=False,
    showline=True,
    linecolor='black',
    linewidth=1,
    mirror=True,
)
# Horizontal legend centred below the plot area.
legend_style = dict(
    x=0.5,
    y=-0.05,
    traceorder="normal",
    xanchor='center',
    yanchor='top',
    orientation='h',
)

for n in n_neighbors:
    for m in min_dist:
        title = f' Wastewater INF3 - UMAP with n_neighbors={n}, min_dist={m}'
        out_stem = f'UMAP_WW/INF3_UMAP_{n}_{m}_contig_mod_counts_log_above100'

        reducer = umap.UMAP(n_neighbors=n, min_dist=m, random_state=seed)
        embedding = reducer.fit_transform(INF3_df_mod_counts.iloc[:, :-2])
        INF3_mod_counts_UMAP_df = pd.DataFrame({
            'UMAP1': embedding[:, 0],
            'UMAP2': embedding[:, 1],
            'contig': INF3_df_mod_counts.index,
            'mod_count': INF3_df_mod_counts['mod_count'],
        })

        fig = px.scatter(
            INF3_mod_counts_UMAP_df,
            x='UMAP1',
            y='UMAP2',
            color='mod_count',
            title=title,
            color_continuous_scale=px.colors.sequential.Rainbow,
            hover_data={'contig': True},
        )
        fig.update_layout(
            height=1700,
            width=1200,
            title_text=title,
            showlegend=True,
            legend=legend_style,
            template='simple_white',
            xaxis=dict(axis_style),
            yaxis=dict(axis_style),
        )
        fig.show()
        fig.write_image(f'{out_stem}.png')
        fig.write_html(f'{out_stem}.html')

## Attach ARG data
### Counts
### Names
### Contig lengths

### ARG Counts

In [None]:
# Load the INF3 ARG counts table (tab-separated; first column becomes the row index).
file_path = '/scratch/project_2006608/Methylation/WW_data/INF3_contigs/INF3_ARG_counts.txt'

INF3_df_ARG_counts = pd.read_csv(file_path, sep='\t', index_col=0, header=0, low_memory=False)
print(INF3_df_ARG_counts.shape[0])  # number of rows loaded
INF3_df_ARG_counts.head()

In [None]:
## Attach the ARG counts to the matrix table
## (done in memory; nothing is written to disk in this cell)
INF3_df_ext = INF3_df.copy()
INF3_df_ext.head()  # value is discarded (not the last expression of the cell)
INF3_df_ARG_counts.head()  # value is discarded as well

# Reorder to match: pick the count rows in the row order of INF3_df_ext
# (.loc raises KeyError if a row label is missing from the counts table)
INF3_df_ordered = INF3_df_ARG_counts.loc[INF3_df_ext.index]

# NOTE: the name of the loaded counts table is rebound to the merged table here.
INF3_df_ARG_counts = pd.concat([INF3_df_ext, INF3_df_ordered], axis=1)
INF3_df_ARG_counts.tail()

### ARG Names

In [None]:
# Load the INF3 ARG names table (tab-separated; first column becomes the row index).
file_path = '/scratch/project_2006608/Methylation/WW_data/INF3_contigs/INF3_ARG_names.txt'

INF3_df_ARG_names = pd.read_csv(file_path, sep='\t', index_col=0, header=0, low_memory=False)
print(INF3_df_ARG_names.shape[0])  # number of rows loaded
INF3_df_ARG_names.head()

In [None]:
## Attach the ARG names to the table that already carries the ARG counts
## (done in memory; nothing is written to disk in this cell)
INF3_df_ext = INF3_df_ARG_counts.copy()
INF3_df_ext.head()  # value is discarded (not the last expression of the cell)
INF3_df_ARG_names.head()  # value is discarded as well

# Reorder to match: pick the name rows in the row order of INF3_df_ext
# (.loc raises KeyError if a row label is missing from the names table)
INF3_df_ordered = INF3_df_ARG_names.loc[INF3_df_ext.index]

# NOTE: the name of the loaded names table is rebound to the merged table here.
INF3_df_ARG_names = pd.concat([INF3_df_ext, INF3_df_ordered], axis=1)
INF3_df_ARG_names.tail()

In [None]:
# Show the rows whose ARG_name contains the literal text 'erm(F)_3'
# (case-insensitive, no regex; missing names count as "no match").
erm_F_mask = INF3_df_ARG_names['ARG_name'].str.contains(
    'erm(F)_3', case=False, na=False, regex=False
)
erm_F = INF3_df_ARG_names.loc[erm_F_mask]
print(erm_F)

### Contig lengths

In [None]:
## Bring the contig lengths from Puhti:
file_path = '/scratch/project_2006608/Methylation/WW_data/INF3_contigs/INF3_contigs_lengths.txt'

# Tab-separated table; first column is used as the row index, first line as header.
INF3_df_contigs_lengths = pd.read_csv(file_path, sep='\t', index_col=0, header=0, low_memory=False)
print(INF3_df_contigs_lengths.shape[0])  # number of rows loaded
INF3_df_contigs_lengths.head()

In [None]:
## Attach the contig lengths to the table that already carries ARG counts and names
## (done in memory; nothing is written to disk in this cell)
INF3_df_ext = INF3_df_ARG_names.copy()
INF3_df_ext.head()  # value is discarded (not the last expression of the cell)
INF3_df_contigs_lengths.head()  # value is discarded as well

# Reorder to match: pick the length rows in the row order of INF3_df_ext
# (.loc raises KeyError if a row label is missing from the lengths table)
INF3_df_ordered = INF3_df_contigs_lengths.loc[INF3_df_ext.index]

INF3_data = pd.concat([INF3_df_ext, INF3_df_ordered], axis=1)
INF3_data.tail()  # value is discarded (not the last expression of the cell)
# Every column except the last four - the slice that is passed to UMAP below.
print(INF3_data.iloc[:, :-4])

In [None]:
# UMAP of the merged INF3 table (all columns except the last four), coloured
# by the 'ARG_count' column; exported as PNG and HTML.
n_neighbors = [20]
min_dist = [0.1]

colors = [0, 1, 2, 3, 4]
color_map = {0: '#f0da95', 1: '#fa7a31', 2: '#eb340f', 3: '#d01af5', 4: '#2a2df6'}

# Palette in ascending ARG_count order (one colour per count value, not per point).
custom_colors = [color_map[key] for key in colors]
category_order = ["0", "1", "2", "3", "4"]

# Frame and grid styling shared by the x and y axes.
axis_style = dict(
    showgrid=True,
    gridcolor='lightgray',
    zeroline=False,
    showline=True,
    linecolor='black',
    linewidth=1,
    mirror=True,
)
# Horizontal legend centred below the plot area.
legend_style = dict(
    x=0.5,
    y=-0.05,
    traceorder="normal",
    xanchor='center',
    yanchor='top',
    orientation='h',
)

for n in n_neighbors:
    for m in min_dist:
        title = f' Wastewater INF3 - UMAP with n_neighbors={n}, min_dist={m}'
        out_stem = f'UMAP_WW/INF3_UMAP_{n}_{m}_contig_ARG_counts_above100'

        reducer = umap.UMAP(n_neighbors=n, min_dist=m, random_state=seed)
        embedding = reducer.fit_transform(INF3_data.iloc[:, :-4])
        INF3_UMAP_data = pd.DataFrame({
            'UMAP1': embedding[:, 0],
            'UMAP2': embedding[:, 1],
            'contig': INF3_data.index,
            'ARG_count': INF3_data['ARG_count'],
            'ARG_name': INF3_data['ARG_name'],
            'contig_length': INF3_data['length'],
        })

        # String values make plotly express treat the counts as discrete categories.
        INF3_UMAP_data['ARG_count'] = INF3_UMAP_data['ARG_count'].astype(str)

        fig = px.scatter(
            INF3_UMAP_data,
            x='UMAP1',
            y='UMAP2',
            color='ARG_count',
            title=title,
            color_discrete_sequence=custom_colors,
            category_orders={'ARG_count': category_order},
            hover_data={'contig': True, 'ARG_count': True, 'ARG_name': True, 'contig_length': True},
        )
        fig.update_layout(
            height=1700,
            width=1200,
            title_text=title,
            showlegend=True,
            legend=legend_style,
            template='simple_white',
            xaxis=dict(axis_style),
            yaxis=dict(axis_style),
        )
        fig.show()
        fig.write_image(f'{out_stem}.png')
        fig.write_html(f'{out_stem}.html')

## Focus on distinct clusters
### Leave out the most misc cluster and draw again

In [None]:
# Keep the points that fall inside two rectangular regions of the UMAP plane
# (all bounds inclusive); every other point is left out of the focused set.
umap1 = INF3_UMAP_data['UMAP1']
umap2 = INF3_UMAP_data['UMAP2']

# Region 1: 4 <= UMAP1 <= 15 and -10 <= UMAP2 <= 15
INF3_UMAP_data_focused1 = INF3_UMAP_data.loc[umap1.between(4, 15) & umap2.between(-10, 15)]

# Region 2: -6 <= UMAP1 <= 4 and -10 <= UMAP2 <= -1.4
INF3_UMAP_data_focused2 = INF3_UMAP_data.loc[umap1.between(-6, 4) & umap2.between(-10, -1.4)]

# Stack both selections, keeping the original row labels.
INF3_UMAP_data_focused = pd.concat(
    [INF3_UMAP_data_focused1, INF3_UMAP_data_focused2],
    ignore_index=False,
)

# Rows of the full table whose label appears in the 'contig' column of the selection.
focused_ids = INF3_UMAP_data_focused['contig']
INF3_data_focused = INF3_data[INF3_data.index.isin(focused_ids)]
print(INF3_data_focused)

In [None]:
# Write the row labels of the focused table to a text file, one per line.
INF3_focused_contigs = INF3_data_focused.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'INF3_focused_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{item}\n" for item in INF3_focused_contigs)

In [None]:
# Every column of the focused table except the last four - the slice that is
# passed to UMAP below.
print(INF3_data_focused.iloc[:, :-4])
# Distinct ARG_count values present in the focused selection.
print(INF3_UMAP_data_focused['ARG_count'].unique())

In [None]:
# Distinct ARG_count values in the focused selection
# (repeats the final print of the preceding cell).
print(INF3_UMAP_data_focused['ARG_count'].unique())

In [None]:
# Re-embed only the focused INF3 rows (all columns except the last four) and
# plot them coloured by ARG_count, with marker size taken from the square root
# of the contig length. This rebinds INF3_UMAP_data_focused to the new embedding.
n_neighbors = [20]
min_dist = [0.1]

colors = [0, 1, 2, 3, 4, 5]
color_map = {0: '#f0da95', 1: '#fa7a31', 2: '#eb340f', 3: '#d01af5', 4: '#2a2df6', 5: '#940785'}

# Palette in ascending ARG_count order (one colour per count value, not per point).
custom_colors = [color_map[key] for key in colors]
category_order = ["0", "1", "2", "3", "4", "5"]

# Frame and grid styling shared by the x and y axes.
axis_style = dict(
    showgrid=True,
    gridcolor='lightgray',
    zeroline=False,
    showline=True,
    linecolor='black',
    linewidth=1,
    mirror=True,
)
# Horizontal legend centred below the plot area.
legend_style = dict(
    x=0.5,
    y=-0.05,
    traceorder="normal",
    xanchor='center',
    yanchor='top',
    orientation='h',
)

for n in n_neighbors:
    for m in min_dist:
        title = f' Wastewater INF3 - UMAP with n_neighbors={n}, min_dist={m}'
        out_stem = f'UMAP_WW/INF3_UMAP_{n}_{m}_ARG_counts_lengths_sqrt_focused_above100'

        reducer = umap.UMAP(n_neighbors=n, min_dist=m, random_state=seed)
        embedding = reducer.fit_transform(INF3_data_focused.iloc[:, :-4])
        INF3_UMAP_data_focused = pd.DataFrame({
            'UMAP1': embedding[:, 0],
            'UMAP2': embedding[:, 1],
            'contig': INF3_data_focused.index,
            'ARG_count': INF3_data_focused['ARG_count'],
            'ARG_name': INF3_data_focused['ARG_name'],
            'contig_length': INF3_data_focused['length'],
        })

        # Counts become string categories so they are plotted as discrete colours.
        INF3_UMAP_data_focused['ARG_count'] = (
            INF3_UMAP_data_focused['ARG_count'].astype(str).astype('category')
        )
        # Transformed lengths; only the square root is used by the plot (marker size).
        contig_length = INF3_UMAP_data_focused['contig_length']
        INF3_UMAP_data_focused['log_contig_length'] = np.log(contig_length)
        INF3_UMAP_data_focused['sqrt_contig_length'] = np.sqrt(contig_length)

        fig = px.scatter(
            INF3_UMAP_data_focused,
            x='UMAP1',
            y='UMAP2',
            color='ARG_count',
            title=title,
            category_orders={'ARG_count': category_order},
            color_discrete_sequence=custom_colors,
            hover_data={'contig': True, 'ARG_count': True, 'ARG_name': True, 'contig_length': True},
            size='sqrt_contig_length',
        )
        fig.update_layout(
            height=1700,
            width=1200,
            title_text=title,
            showlegend=True,
            legend=legend_style,
            template='simple_white',
            xaxis=dict(axis_style),
            yaxis=dict(axis_style),
        )
        fig.show()
        #fig.write_image(f'UMAP_WW/INF3_UMAP_{n}_{m}_ARG_counts_focused_above100.png')
        #fig.write_html(f'UMAP_WW/INF3_UMAP_{n}_{m}_ARG_counts_focused_above100.html')
        fig.write_image(f'{out_stem}.png')
        fig.write_html(f'{out_stem}.html')

### Extract clusters
#### C1

In [None]:
# Cluster C1: points of the focused embedding inside the box
# 9.4 <= UMAP1 <= 9.5 and 12.62 <= UMAP2 <= 12.65 (bounds inclusive).
c1_in_x = INF3_UMAP_data_focused['UMAP1'].between(9.4, 9.5)
c1_in_y = INF3_UMAP_data_focused['UMAP2'].between(12.62, 12.65)
INF3_data_focused_C1 = INF3_UMAP_data_focused.loc[c1_in_x & c1_in_y]

# Rows of the focused table whose label appears in the 'contig' column of the box.
c1_ids = INF3_data_focused_C1['contig']
INF3_data_C1 = INF3_data_focused[INF3_data_focused.index.isin(c1_ids)]
print(INF3_data_C1)

In [None]:
# Write the row labels of cluster C1 to a text file, one per line.
INF3_C1_contigs = INF3_data_C1.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'INF3_C1_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{item}\n" for item in INF3_C1_contigs)

### Extract clusters
#### C2

In [None]:
# Cluster C2: points of the focused embedding inside the box
# 16.4 <= UMAP1 <= 16.6 and 8.8 <= UMAP2 <= 8.9 (bounds inclusive).
c2_in_x = INF3_UMAP_data_focused['UMAP1'].between(16.4, 16.6)
c2_in_y = INF3_UMAP_data_focused['UMAP2'].between(8.8, 8.9)
INF3_data_focused_C2 = INF3_UMAP_data_focused.loc[c2_in_x & c2_in_y]

# Rows of the focused table whose label appears in the 'contig' column of the box.
c2_ids = INF3_data_focused_C2['contig']
INF3_data_C2 = INF3_data_focused[INF3_data_focused.index.isin(c2_ids)]
print(INF3_data_C2)

In [None]:
# Write the row labels of cluster C2 to a text file, one per line.
INF3_C2_contigs = INF3_data_C2.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'INF3_C2_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{item}\n" for item in INF3_C2_contigs)

# SLU2
## Import data

In [None]:
# Absolute path to the UMAP_WW directory.
# NOTE(review): not read by the SLU2 cells that follow directly below - confirm
# whether it is still needed.
path_to_images = '/scratch/project_2006608/Methylation/notebooks/UMAP_WW/'

## FILTERING
### > 100 lines in .gff

In [None]:
# Load the flattened, concatenated SLU2 matrices (tab-separated);
# the first column of the file becomes the row index.
file_path = '/scratch/project_2006608/Methylation/WW_data/SLU2_matrices_top100/flattened/SLU2_concat_matrices_top100.tsv'
SLU2_matrices = pd.read_csv(file_path, sep='\t', index_col=0, low_memory=False)

In [None]:
# Number of rows in the loaded table.
print(SLU2_matrices.shape[0])

In [None]:
# Table dimensions as (rows, columns), followed by a preview of the first rows.
print(SLU2_matrices.shape)
SLU2_matrices.head()

In [None]:
# Keep only the rows that have at least one non-zero value among the first 492 columns.
# NOTE(review): 492 is presumably the number of matrix feature columns - confirm.
SLU2_df = SLU2_matrices.loc[(SLU2_matrices.iloc[:, :492] != 0).any(axis=1)]
SLU2_df.shape  # value is discarded (not the last expression of the cell)
SLU2_df.head()

In [None]:
# Row counts per 'sample' value; the result is discarded because it is not the
# last expression of the cell.
SLU2_df['sample'].value_counts()
# Every column except the last one - the same slice that is passed to UMAP below.
print(SLU2_df.iloc[:, :-1])

In [None]:
# Baseline UMAP of the SLU2 table (every column except the last), drawn in a
# single colour. The figure is only shown; the export calls are commented out.
# Grid explored earlier: n_neighbors = [20, 30], min_dist = [0.01, 0.1, 0.2];
# only the retained combination is run here.
n_neighbors = [20]
min_dist = [0.1]

color_dict = ['#df8275']  # one-colour palette (a list, despite the name)

# Frame and grid styling shared by the x and y axes.
axis_style = dict(
    showgrid=True,
    gridcolor='lightgray',
    zeroline=False,
    showline=True,
    linecolor='black',
    linewidth=1,
    mirror=True,
)
# Horizontal legend centred below the plot area.
legend_style = dict(
    x=0.5,
    y=-0.1,
    traceorder="normal",
    xanchor='center',
    yanchor='top',
    orientation='h',
)

for n in n_neighbors:
    for m in min_dist:
        title = f' Wastewater SLU2 - UMAP with n_neighbors={n}, min_dist={m}'

        reducer = umap.UMAP(n_neighbors=n, min_dist=m, random_state=seed)
        embedding = reducer.fit_transform(SLU2_df.iloc[:, :-1])
        UMAP_SLU2_df = pd.DataFrame({
            'UMAP1': embedding[:, 0],
            'UMAP2': embedding[:, 1],
            'contig': SLU2_df.index,
        })

        fig = px.scatter(
            UMAP_SLU2_df,
            x='UMAP1',
            y='UMAP2',
            title=title,
            color_discrete_sequence=color_dict,
            hover_data={'contig': True},
        )
        fig.update_layout(
            height=1500,
            width=1200,
            title_text=title,
            showlegend=True,
            legend=legend_style,
            template='simple_white',
            xaxis=dict(axis_style),
            yaxis=dict(axis_style),
        )
        fig.show()
        #fig.write_image(f'UMAP_WW/SLU2_UMAP_{n}_{m}_above100.png')
        #fig.write_html(f'UMAP_WW/SLU2_UMAP_{n}_{m}_above100.html')

## Attach mod count data

In [None]:
## Load the SLU2 mod counts (tab-separated; first column is the row index)
file_path = '/scratch/project_2006608/Methylation/WW_data/SLU2_contigs/SLU2_mod_counts.txt'

SLU2_df_mod_counts = pd.read_csv(file_path, sep='\t', index_col=0, header=0, low_memory=False)
print(SLU2_df_mod_counts.shape[0])
# Preview of the raw counts, printed explicitly: a bare expression followed by
# further statements is evaluated but never displayed
print(SLU2_df_mod_counts.head())

# Natural-log transform of the whole table.
# NOTE(review): np.log maps a count of 0 to -inf -- confirm all counts are > 0.
SLU2_df_mod_counts = np.log(SLU2_df_mod_counts)

In [None]:
## Attach the log-transformed mod counts to the SLU2 table
SLU2_df_ext = SLU2_df.copy()

# Put the mod-count rows into the same row order as SLU2_df
SLU2_df_ordered = SLU2_df_mod_counts.loc[SLU2_df_ext.index]

# NOTE: from here on SLU2_df_mod_counts refers to the merged table
SLU2_df_mod_counts = pd.concat([SLU2_df_ext, SLU2_df_ordered], axis='columns')

# Merged table without its last two columns
print(SLU2_df_mod_counts.iloc[:, :-2])

In [None]:
# UMAP of SLU2 (merged table without its last two columns), coloured by mod_count
n_neighbors = [20]
min_dist = [0.1]

# Frame and grid styling shared by the x and y axes
axis_style = dict(
    showgrid=True,
    gridcolor='lightgray',
    zeroline=False,
    showline=True,
    linecolor='black',
    linewidth=1,
    mirror=True,
)

for n in n_neighbors:
    for m in min_dist:
        title = f' Wastewater SLU2 - UMAP with n_neighbors={n}, min_dist={m}'

        reducer = umap.UMAP(n_neighbors=n, min_dist=m, random_state=seed)
        embedding = reducer.fit_transform(SLU2_df_mod_counts.iloc[:, :-2])
        SLU2_mod_counts_UMAP_df = pd.DataFrame({
            'UMAP1': embedding[:, 0],
            'UMAP2': embedding[:, 1],
            'contig': SLU2_df_mod_counts.index,
            'mod_count': SLU2_df_mod_counts['mod_count'],
        })

        fig = px.scatter(
            SLU2_mod_counts_UMAP_df,
            x='UMAP1',
            y='UMAP2',
            color='mod_count',
            title=title,
            color_continuous_scale=px.colors.sequential.Rainbow,
            hover_data={'contig': True},
        )
        fig.update_layout(
            height=1700,
            width=1200,
            title_text=title,
            showlegend=True,
            # Horizontal legend centred below the plot area
            legend={
                'x': 0.5,
                'y': -0.05,
                'traceorder': "normal",
                'xanchor': 'center',
                'yanchor': 'top',
                'orientation': 'h',
            },
            template='simple_white',
            xaxis=axis_style,
            yaxis=axis_style,
        )
        fig.show()
        fig.write_image(f'UMAP_WW/SLU2_UMAP_{n}_{m}_contig_mod_counts_log_above100.png')
        fig.write_html(f'UMAP_WW/SLU2_UMAP_{n}_{m}_contig_mod_counts_log_above100.html')

## Attach ARG data
### Counts
### Names
### Contig lengths

### ARG Counts

In [None]:
# Load the SLU2 ARG counts (tab-separated; header on the first line,
# first column used as the row index)
file_path = '/scratch/project_2006608/Methylation/WW_data/SLU2_contigs/SLU2_ARG_counts.txt'

SLU2_df_ARG_counts = pd.read_csv(
    file_path,
    sep='\t',
    header=0,
    index_col=0,
    low_memory=False,
)
print(len(SLU2_df_ARG_counts))
SLU2_df_ARG_counts.head()

In [None]:
## Attach the ARG counts to the SLU2 table
SLU2_df_ext = SLU2_df.copy()

# Put the ARG-count rows into the same row order as SLU2_df
SLU2_df_ordered = SLU2_df_ARG_counts.loc[SLU2_df_ext.index]

# NOTE: from here on SLU2_df_ARG_counts refers to the merged table
SLU2_df_ARG_counts = pd.concat([SLU2_df_ext, SLU2_df_ordered], axis='columns')
SLU2_df_ARG_counts.tail()

### ARG Names

In [None]:
# Load the SLU2 ARG names (tab-separated; header on the first line,
# first column used as the row index)
file_path = '/scratch/project_2006608/Methylation/WW_data/SLU2_contigs/SLU2_ARG_names.txt'

SLU2_df_ARG_names = pd.read_csv(
    file_path,
    sep='\t',
    header=0,
    index_col=0,
    low_memory=False,
)
print(len(SLU2_df_ARG_names))
SLU2_df_ARG_names.head()

In [None]:
## Attach the ARG names to the table that already carries the ARG counts
SLU2_df_ext = SLU2_df_ARG_counts.copy()

# Put the ARG-name rows into the same row order as the merged table
SLU2_df_ordered = SLU2_df_ARG_names.loc[SLU2_df_ext.index]

# NOTE: from here on SLU2_df_ARG_names refers to the merged table
SLU2_df_ARG_names = pd.concat([SLU2_df_ext, SLU2_df_ordered], axis='columns')
SLU2_df_ARG_names.tail()

In [None]:
# Rows whose ARG_name contains the literal text 'erm(F)_3'
# (case-insensitive, plain substring match; missing names count as no match)
has_erm_F = SLU2_df_ARG_names['ARG_name'].str.contains('erm(F)_3', case=False, na=False, regex=False)
erm_F = SLU2_df_ARG_names.loc[has_erm_F]
print(erm_F)

# Row labels of the matches as a one-column table
row_names_df = pd.DataFrame(erm_F.index, columns=['0'])
print(row_names_df)

### Contig lengths

In [None]:
## Load the SLU2 contig lengths brought from Puhti
## (tab-separated; header on the first line, first column used as the row index)
file_path = '/scratch/project_2006608/Methylation/WW_data/SLU2_contigs/SLU2_contigs_lengths.txt'

SLU2_df_contigs_lengths = pd.read_csv(
    file_path,
    sep='\t',
    header=0,
    index_col=0,
    low_memory=False,
)
print(len(SLU2_df_contigs_lengths))
SLU2_df_contigs_lengths.head()

In [None]:
## Attach the contig lengths to the table that already carries ARG counts and names
SLU2_df_ext = SLU2_df_ARG_names.copy()

# Put the contig-length rows into the same row order as the merged table
SLU2_df_ordered = SLU2_df_contigs_lengths.loc[SLU2_df_ext.index]

SLU2_data = pd.concat([SLU2_df_ext, SLU2_df_ordered], axis='columns')

# Merged table without its last four columns
print(SLU2_data.iloc[:, :-4])

In [None]:
# UMAP of SLU2 (merged table without its last four columns), coloured by ARG count
n_neighbors = [20]
min_dist = [0.1]

colors = [0, 1, 2, 3, 4, 5]
color_map = {0: '#f7c0b7', 1: '#fa7a31', 2: '#eb340f', 3: '#d01af5', 4: '#2a2df6', 5: '#940785'}

# Ordered colour sequence handed to plotly
custom_colors = [color_map[key] for key in colors]

# Frame and grid styling shared by the x and y axes
axis_style = dict(
    showgrid=True,
    gridcolor='lightgray',
    zeroline=False,
    showline=True,
    linecolor='black',
    linewidth=1,
    mirror=True,
)

for n in n_neighbors:
    for m in min_dist:
        title = f' Wastewater SLU2 - UMAP with n_neighbors={n}, min_dist={m}'

        reducer = umap.UMAP(n_neighbors=n, min_dist=m, random_state=seed)
        embedding = reducer.fit_transform(SLU2_data.iloc[:, :-4])
        SLU2_UMAP_data = pd.DataFrame({
            'UMAP1': embedding[:, 0],
            'UMAP2': embedding[:, 1],
            'contig': SLU2_data.index,
            # Stored as text so that plotly colours it as discrete categories
            'ARG_count': SLU2_data['ARG_count'].astype(str),
            'ARG_name': SLU2_data['ARG_name'],
            'contig_length': SLU2_data['length'],
        })

        category_order = ["0", "1", "2"]

        fig = px.scatter(
            SLU2_UMAP_data,
            x='UMAP1',
            y='UMAP2',
            color='ARG_count',
            category_orders={'ARG_count': category_order},
            title=title,
            color_discrete_sequence=custom_colors,
            hover_data={'contig': True, 'ARG_count': True, 'ARG_name': True, 'contig_length': True},
        )
        fig.update_layout(
            height=1700,
            width=1200,
            title_text=title,
            showlegend=True,
            # Horizontal legend centred below the plot area
            legend={
                'x': 0.5,
                'y': -0.05,
                'traceorder': "normal",
                'xanchor': 'center',
                'yanchor': 'top',
                'orientation': 'h',
            },
            template='simple_white',
            xaxis=axis_style,
            yaxis=axis_style,
        )
        fig.show()
        #fig.write_image(f'UMAP_WW/SLU2_UMAP_{n}_{m}_contig_ARG_counts_above100.png')
        #fig.write_html(f'UMAP_WW/SLU2_UMAP_{n}_{m}_contig_ARG_counts_above100.html')

## Focus on distinct clusters
### Leave out the most miscellaneous cluster and draw again

In [None]:
# Keep only the points inside a bounding box on the UMAP coordinates
# (UMAP1 = x axis, UMAP2 = y axis; both bounds of each range are inclusive).
# NOTE(review): the bounds are hard-coded coordinates of the embedding computed
# above; confirm they still fit if that embedding is recomputed.
SLU2_UMAP_data_focused = SLU2_UMAP_data.loc[
    SLU2_UMAP_data['UMAP1'].between(-15, 15)
    & SLU2_UMAP_data['UMAP2'].between(-15, 10)
]

# Check: printed explicitly, because a bare expression that is not the last
# line of a cell is evaluated but never displayed
print(SLU2_UMAP_data_focused.tail())

# Pull the matching rows out of the full data table via their contig IDs
SLU2_data_focused = SLU2_data[SLU2_data.index.isin(SLU2_UMAP_data_focused['contig'])]
print(SLU2_data_focused)

In [None]:
# Export the contig IDs of the focused SLU2 subset as plain text, one ID per line
SLU2_focused_contigs = SLU2_data_focused.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'SLU2_focused_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{item}\n" for item in SLU2_focused_contigs)

In [None]:
# Focused subset without its last four columns
print(SLU2_data_focused.iloc[:, :-4])
# Distinct ARG_count values among the focused points
print(SLU2_UMAP_data_focused['ARG_count'].unique())

In [None]:
# Distinct ARG_count values among the focused points (same output as the previous cell)
print(SLU2_UMAP_data_focused['ARG_count'].unique())

In [None]:
# UMAP recomputed on the focused SLU2 subset (without its last four columns);
# points are coloured by ARG count and sized by sqrt(contig length).
# NOTE: this rebinds SLU2_UMAP_data_focused to the new embedding.
n_neighbors = [20]
min_dist = [0.1]

colors = [0, 1, 2, 3, 4, 5]
color_map = {0: '#f7c0b7', 1: '#fa7a31', 2: '#eb340f', 3: '#d01af5', 4: '#2a2df6', 5: '#940785'}

# Ordered colour sequence handed to plotly
custom_colors = [color_map[key] for key in colors]

# Frame and grid styling shared by the x and y axes
axis_style = dict(
    showgrid=True,
    gridcolor='lightgray',
    zeroline=False,
    showline=True,
    linecolor='black',
    linewidth=1,
    mirror=True,
)

for n in n_neighbors:
    for m in min_dist:
        title = f' Wastewater SLU2 - UMAP with n_neighbors={n}, min_dist={m}'

        reducer = umap.UMAP(n_neighbors=n, min_dist=m, random_state=seed)
        embedding = reducer.fit_transform(SLU2_data_focused.iloc[:, :-4])
        SLU2_UMAP_data_focused = pd.DataFrame({
            'UMAP1': embedding[:, 0],
            'UMAP2': embedding[:, 1],
            'contig': SLU2_data_focused.index,
            'ARG_count': SLU2_data_focused['ARG_count'],
            'ARG_name': SLU2_data_focused['ARG_name'],
            'contig_length': SLU2_data_focused['length'],
        })

        # Transformed contig lengths; the square root drives the marker size
        SLU2_UMAP_data_focused['log_contig_length'] = np.log(SLU2_UMAP_data_focused['contig_length'])
        SLU2_UMAP_data_focused['sqrt_contig_length'] = np.sqrt(SLU2_UMAP_data_focused['contig_length'])

        # Text first, then categorical, so plotly colours it as discrete categories
        SLU2_UMAP_data_focused['ARG_count'] = (
            SLU2_UMAP_data_focused['ARG_count'].astype(str).astype('category')
        )
        category_order = ["0", "1", "2", "3"]

        fig = px.scatter(
            SLU2_UMAP_data_focused,
            x='UMAP1',
            y='UMAP2',
            color='ARG_count',
            title=title,
            category_orders={'ARG_count': category_order},
            color_discrete_sequence=custom_colors,
            hover_data={'contig': True, 'ARG_count': True, 'ARG_name': True, 'contig_length': True},
            size='sqrt_contig_length',
        )
        fig.update_layout(
            height=1700,
            width=1200,
            title_text=title,
            showlegend=True,
            # Horizontal legend centred below the plot area
            legend={
                'x': 0.5,
                'y': -0.05,
                'traceorder': "normal",
                'xanchor': 'center',
                'yanchor': 'top',
                'orientation': 'h',
            },
            template='simple_white',
            xaxis=axis_style,
            yaxis=axis_style,
        )
        fig.show()
        #fig.write_image(f'UMAP_WW/SLU2_UMAP_{n}_{m}_ARG_counts_focused_above100.png')
        #fig.write_html(f'UMAP_WW/SLU2_UMAP_{n}_{m}_ARG_counts_focused_above100.html')
        #fig.write_image(f'UMAP_WW/SLU2_UMAP_{n}_{m}_ARG_counts_lengths_sqrt_focused_above100.png')
        #fig.write_html(f'UMAP_WW/SLU2_UMAP_{n}_{m}_ARG_counts_lengths_sqrt_focused_above100.html')

### Extract clusters
#### C1

In [None]:
# Select cluster C1 with a bounding box on the focused UMAP coordinates
# (UMAP1 = x axis, UMAP2 = y axis; both bounds of each range are inclusive).
# NOTE(review): the bounds are hard-coded coordinates; confirm they still
# enclose the cluster if the embedding is ever recomputed.
SLU2_data_focused_C1 = SLU2_UMAP_data_focused.loc[
    SLU2_UMAP_data_focused['UMAP1'].between(10.5, 10.6)
    & SLU2_UMAP_data_focused['UMAP2'].between(1.8, 1.9)
]

# Check: printed explicitly, because a bare expression that is not the last
# line of a cell is evaluated but never displayed
print(SLU2_data_focused_C1.head())

# Pull the matching rows out of the full data table via their contig IDs
SLU2_data_C1 = SLU2_data_focused[SLU2_data_focused.index.isin(SLU2_data_focused_C1['contig'])]
print(SLU2_data_C1)

In [None]:
# Export the contig IDs of SLU2 cluster C1 as plain text, one ID per line
SLU2_C1_contigs = SLU2_data_C1.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'SLU2_C1_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{item}\n" for item in SLU2_C1_contigs)

### Extract clusters
#### C2

In [None]:
# Select cluster C2 with a bounding box on the focused UMAP coordinates
# (UMAP1 = x axis, UMAP2 = y axis; both bounds of each range are inclusive).
# NOTE(review): the bounds are hard-coded coordinates; confirm they still
# enclose the cluster if the embedding is ever recomputed.
SLU2_data_focused_C2 = SLU2_UMAP_data_focused.loc[
    SLU2_UMAP_data_focused['UMAP1'].between(0.6, 0.8)
    & SLU2_UMAP_data_focused['UMAP2'].between(10.9, 11)
]

# Check: printed explicitly, because a bare expression that is not the last
# line of a cell is evaluated but never displayed
print(SLU2_data_focused_C2.head())

# Pull the matching rows out of the full data table via their contig IDs
SLU2_data_C2 = SLU2_data_focused[SLU2_data_focused.index.isin(SLU2_data_focused_C2['contig'])]
print(SLU2_data_C2)

In [None]:
# Export the contig IDs of SLU2 cluster C2 as plain text, one ID per line
SLU2_C2_contigs = SLU2_data_C2.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'SLU2_C2_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{item}\n" for item in SLU2_C2_contigs)

### Extract clusters
#### C3

In [None]:
# Select cluster C3 with a bounding box on the focused UMAP coordinates
# (UMAP1 = x axis, UMAP2 = y axis; both bounds of each range are inclusive).
# NOTE(review): the bounds are hard-coded coordinates; confirm they still
# enclose the cluster if the embedding is ever recomputed.
SLU2_data_focused_C3 = SLU2_UMAP_data_focused.loc[
    SLU2_UMAP_data_focused['UMAP1'].between(0.62, 0.64)
    & SLU2_UMAP_data_focused['UMAP2'].between(10.78, 10.79)
]

# Check: printed explicitly, because a bare expression that is not the last
# line of a cell is evaluated but never displayed
print(SLU2_data_focused_C3.head())

# Pull the matching rows out of the full data table via their contig IDs
SLU2_data_C3 = SLU2_data_focused[SLU2_data_focused.index.isin(SLU2_data_focused_C3['contig'])]
print(SLU2_data_C3)

In [None]:
# Export the contig IDs of SLU2 cluster C3 as plain text, one ID per line
SLU2_C3_contigs = SLU2_data_C3.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'SLU2_C3_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{item}\n" for item in SLU2_C3_contigs)

### Extract clusters
#### C4

In [None]:
# Select cluster C4 with a bounding box on the focused UMAP coordinates
# (UMAP1 = x axis, UMAP2 = y axis; both bounds of each range are inclusive).
# NOTE(review): the bounds are hard-coded coordinates; confirm they still
# enclose the cluster if the embedding is ever recomputed.
SLU2_data_focused_C4 = SLU2_UMAP_data_focused.loc[
    SLU2_UMAP_data_focused['UMAP1'].between(1.35, 1.45)
    & SLU2_UMAP_data_focused['UMAP2'].between(2.66, 2.76)
]

# Check: printed explicitly, because a bare expression that is not the last
# line of a cell is evaluated but never displayed
print(SLU2_data_focused_C4.head())

# Pull the matching rows out of the full data table via their contig IDs
SLU2_data_C4 = SLU2_data_focused[SLU2_data_focused.index.isin(SLU2_data_focused_C4['contig'])]
print(SLU2_data_C4)

In [None]:
# Export the contig IDs of SLU2 cluster C4 as plain text, one ID per line
SLU2_C4_contigs = SLU2_data_C4.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'SLU2_C4_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{item}\n" for item in SLU2_C4_contigs)

### Extract clusters
#### C5

In [None]:
# Select cluster C5 with a bounding box on the focused UMAP coordinates
# (UMAP1 = x axis, UMAP2 = y axis; both bounds of each range are inclusive).
# NOTE(review): the bounds are hard-coded coordinates; confirm they still
# enclose the cluster if the embedding is ever recomputed.
# NOTE(review): the UMAP2 range (2.64 to 3.67) is about ten times taller than
# the boxes of the other clusters -- confirm the upper bound is not a typo.
SLU2_data_focused_C5 = SLU2_UMAP_data_focused.loc[
    SLU2_UMAP_data_focused['UMAP1'].between(9.2, 9.4)
    & SLU2_UMAP_data_focused['UMAP2'].between(2.64, 3.67)
]

# Check: printed explicitly, because a bare expression that is not the last
# line of a cell is evaluated but never displayed
print(SLU2_data_focused_C5.head())

# Pull the matching rows out of the full data table via their contig IDs
SLU2_data_C5 = SLU2_data_focused[SLU2_data_focused.index.isin(SLU2_data_focused_C5['contig'])]
print(SLU2_data_C5)

In [None]:
# Export the contig IDs of SLU2 cluster C5 as plain text, one ID per line
SLU2_C5_contigs = SLU2_data_C5.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'SLU2_C5_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{item}\n" for item in SLU2_C5_contigs)

### Extract clusters
#### C6

In [None]:
# Select cluster C6 with a bounding box on the focused UMAP coordinates
# (UMAP1 = x axis, UMAP2 = y axis; both bounds of each range are inclusive).
# NOTE(review): the bounds are hard-coded coordinates; confirm they still
# enclose the cluster if the embedding is ever recomputed.
SLU2_data_focused_C6 = SLU2_UMAP_data_focused.loc[
    SLU2_UMAP_data_focused['UMAP1'].between(16.1, 16.15)
    & SLU2_UMAP_data_focused['UMAP2'].between(8.84, 8.86)
]

# Check: printed explicitly, because a bare expression that is not the last
# line of a cell is evaluated but never displayed
print(SLU2_data_focused_C6.head())

# Pull the matching rows out of the full data table via their contig IDs
SLU2_data_C6 = SLU2_data_focused[SLU2_data_focused.index.isin(SLU2_data_focused_C6['contig'])]
print(SLU2_data_C6)

In [None]:
# Export the contig IDs of SLU2 cluster C6 as plain text, one ID per line
SLU2_C6_contigs = SLU2_data_C6.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'SLU2_C6_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{item}\n" for item in SLU2_C6_contigs)

### Extract clusters
#### C7

In [None]:
# Select cluster C7 with a bounding box on the focused UMAP coordinates
# (UMAP1 = x axis, UMAP2 = y axis; both bounds of each range are inclusive).
# NOTE(review): the bounds are hard-coded coordinates; confirm they still
# enclose the cluster if the embedding is ever recomputed.
SLU2_data_focused_C7 = SLU2_UMAP_data_focused.loc[
    SLU2_UMAP_data_focused['UMAP1'].between(16, 16.2)
    & SLU2_UMAP_data_focused['UMAP2'].between(5, 5.1)
]

# Check: printed explicitly, because a bare expression that is not the last
# line of a cell is evaluated but never displayed
print(SLU2_data_focused_C7.head())

# Pull the matching rows out of the full data table via their contig IDs
SLU2_data_C7 = SLU2_data_focused[SLU2_data_focused.index.isin(SLU2_data_focused_C7['contig'])]
print(SLU2_data_C7)

In [None]:
# Export the contig IDs of SLU2 cluster C7 as plain text, one ID per line
SLU2_C7_contigs = SLU2_data_C7.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'SLU2_C7_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{item}\n" for item in SLU2_C7_contigs)

### Extract clusters
#### C8

In [None]:
# Select cluster C8 with a bounding box on the focused UMAP coordinates
# (UMAP1 = x axis, UMAP2 = y axis; both bounds of each range are inclusive).
# NOTE(review): the bounds are hard-coded coordinates; confirm they still
# enclose the cluster if the embedding is ever recomputed.
SLU2_data_focused_C8 = SLU2_UMAP_data_focused.loc[
    SLU2_UMAP_data_focused['UMAP1'].between(13.5, 13.6)
    & SLU2_UMAP_data_focused['UMAP2'].between(-1.88, -1.8)
]

# Check: printed explicitly, because a bare expression that is not the last
# line of a cell is evaluated but never displayed
print(SLU2_data_focused_C8.head())

# Pull the matching rows out of the full data table via their contig IDs
SLU2_data_C8 = SLU2_data_focused[SLU2_data_focused.index.isin(SLU2_data_focused_C8['contig'])]
print(SLU2_data_C8)

In [None]:
# Export the contig IDs of SLU2 cluster C8 as plain text, one ID per line
SLU2_C8_contigs = SLU2_data_C8.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'SLU2_C8_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{item}\n" for item in SLU2_C8_contigs)

# SLU3
## Import data

In [None]:
# Absolute path of the figure directory.
# NOTE(review): the cells visible in this section write to the relative
# 'UMAP_WW/' path instead of using this variable -- confirm which is intended.
path_to_images = '/scratch/project_2006608/Methylation/notebooks/UMAP_WW/'

## FILTERING
### > 100 lines in .gff

In [None]:
# Load the concatenated, flattened SLU3 matrices (tab-separated);
# the first column of the file becomes the row index.
file_path = '/scratch/project_2006608/Methylation/WW_data/SLU3_matrices_top100/flattened/SLU3_concat_matrices_top100.tsv'
SLU3_matrices = pd.read_csv(file_path, sep='\t', index_col=0, low_memory=False)

In [None]:
# Number of rows loaded
print(SLU3_matrices.shape[0])

In [None]:
# (rows, columns) of the loaded table, followed by a preview of the first rows
print(SLU3_matrices.shape)
SLU3_matrices.head()

In [None]:
# Keep only the rows with at least one non-zero value in the first 492 columns
SLU3_df = SLU3_matrices.loc[(SLU3_matrices.iloc[:, :492] != 0).any(axis=1)]
# Printed explicitly: a bare expression that is not the last line of a cell
# is evaluated but never displayed
print(SLU3_df.shape)
SLU3_df.head()

In [None]:
# Rows per value of the 'sample' column (printed explicitly; as a bare
# expression above another statement it would never be displayed)
print(SLU3_df['sample'].value_counts())
# Everything except the last column
print(SLU3_df.iloc[:, :-1])

In [None]:
# UMAP of SLU3 computed on every column of SLU3_df except the last one.
# Parameter grid that was explored:
#n_neighbors = [20, 30 ]
#min_dist = [0.01, 0.1, 0.2]
# The best setting, used below
n_neighbors = [20]
min_dist = [0.1]

# Single marker colour for this plot
color_dict = ['#df8275']

# Frame and grid styling shared by the x and y axes
axis_style = dict(
    showgrid=True,
    gridcolor='lightgray',
    zeroline=False,
    showline=True,
    linecolor='black',
    linewidth=1,
    mirror=True,
)

for n in n_neighbors:
    for m in min_dist:
        title = f' Wastewater SLU3 - UMAP with n_neighbors={n}, min_dist={m}'

        reducer = umap.UMAP(n_neighbors=n, min_dist=m, random_state=seed)
        embedding = reducer.fit_transform(SLU3_df.iloc[:, :-1])
        UMAP_SLU3_df = pd.DataFrame({
            'UMAP1': embedding[:, 0],
            'UMAP2': embedding[:, 1],
            'contig': SLU3_df.index,
        })

        fig = px.scatter(
            UMAP_SLU3_df,
            x='UMAP1',
            y='UMAP2',
            title=title,
            color_discrete_sequence=color_dict,
            hover_data={'contig': True},
        )
        fig.update_layout(
            height=1500,
            width=1200,
            title_text=title,
            showlegend=True,
            # Horizontal legend centred below the plot area
            legend={
                'x': 0.5,
                'y': -0.1,
                'traceorder': "normal",
                'xanchor': 'center',
                'yanchor': 'top',
                'orientation': 'h',
            },
            template='simple_white',
            xaxis=axis_style,
            yaxis=axis_style,
        )
        fig.show()
        fig.write_image(f'UMAP_WW/SLU3_UMAP_{n}_{m}_above100.png')
        fig.write_html(f'UMAP_WW/SLU3_UMAP_{n}_{m}_above100.html')

## Attach mod count data

In [None]:
## Load the SLU3 mod counts (tab-separated; first column is the row index)
file_path = '/scratch/project_2006608/Methylation/WW_data/SLU3_contigs/SLU3_mod_counts.txt'

SLU3_df_mod_counts = pd.read_csv(file_path, sep='\t', index_col=0, header=0, low_memory=False)
print(SLU3_df_mod_counts.shape[0])
# Preview of the raw counts, printed explicitly: a bare expression followed by
# further statements is evaluated but never displayed
print(SLU3_df_mod_counts.head())

# Natural-log transform of the whole table.
# NOTE(review): np.log maps a count of 0 to -inf -- confirm all counts are > 0.
SLU3_df_mod_counts = np.log(SLU3_df_mod_counts)

In [None]:
## Attach the log-transformed mod counts to the SLU3 table
SLU3_df_ext = SLU3_df.copy()

# Put the mod-count rows into the same row order as SLU3_df
SLU3_df_ordered = SLU3_df_mod_counts.loc[SLU3_df_ext.index]

# NOTE: from here on SLU3_df_mod_counts refers to the merged table
SLU3_df_mod_counts = pd.concat([SLU3_df_ext, SLU3_df_ordered], axis='columns')

# Merged table without its last two columns
print(SLU3_df_mod_counts.iloc[:, :-2])

In [None]:
# UMAP of SLU3 (merged table without its last two columns), coloured by mod_count
n_neighbors = [20]
min_dist = [0.1]

# Frame and grid styling shared by the x and y axes
axis_style = dict(
    showgrid=True,
    gridcolor='lightgray',
    zeroline=False,
    showline=True,
    linecolor='black',
    linewidth=1,
    mirror=True,
)

for n in n_neighbors:
    for m in min_dist:
        title = f' Wastewater SLU3 - UMAP with n_neighbors={n}, min_dist={m}'

        reducer = umap.UMAP(n_neighbors=n, min_dist=m, random_state=seed)
        embedding = reducer.fit_transform(SLU3_df_mod_counts.iloc[:, :-2])
        SLU3_mod_counts_UMAP_df = pd.DataFrame({
            'UMAP1': embedding[:, 0],
            'UMAP2': embedding[:, 1],
            'contig': SLU3_df_mod_counts.index,
            'mod_count': SLU3_df_mod_counts['mod_count'],
        })

        fig = px.scatter(
            SLU3_mod_counts_UMAP_df,
            x='UMAP1',
            y='UMAP2',
            color='mod_count',
            title=title,
            color_continuous_scale=px.colors.sequential.Rainbow,
            hover_data={'contig': True},
        )
        fig.update_layout(
            height=1700,
            width=1200,
            title_text=title,
            showlegend=True,
            # Horizontal legend centred below the plot area
            legend={
                'x': 0.5,
                'y': -0.05,
                'traceorder': "normal",
                'xanchor': 'center',
                'yanchor': 'top',
                'orientation': 'h',
            },
            template='simple_white',
            xaxis=axis_style,
            yaxis=axis_style,
        )
        fig.show()
        fig.write_image(f'UMAP_WW/SLU3_UMAP_{n}_{m}_contig_mod_counts_log_above100.png')
        fig.write_html(f'UMAP_WW/SLU3_UMAP_{n}_{m}_contig_mod_counts_log_above100.html')

## Attach ARG data
### Counts
### Names
### Contig lengths

### ARG Counts

In [None]:
# Load the SLU3 ARG counts (tab-separated; header on the first line,
# first column used as the row index)
file_path = '/scratch/project_2006608/Methylation/WW_data/SLU3_contigs/SLU3_ARG_counts.txt'

SLU3_df_ARG_counts = pd.read_csv(
    file_path,
    sep='\t',
    header=0,
    index_col=0,
    low_memory=False,
)
print(len(SLU3_df_ARG_counts))
SLU3_df_ARG_counts.head()

In [None]:
## Attach the ARG counts to the SLU3 table
SLU3_df_ext = SLU3_df.copy()

# Put the ARG-count rows into the same row order as SLU3_df
SLU3_df_ordered = SLU3_df_ARG_counts.loc[SLU3_df_ext.index]

# NOTE: from here on SLU3_df_ARG_counts refers to the merged table
SLU3_df_ARG_counts = pd.concat([SLU3_df_ext, SLU3_df_ordered], axis='columns')
SLU3_df_ARG_counts.tail()

### ARG Names

In [None]:
# Load the SLU3 ARG names (tab-separated; header on the first line,
# first column used as the row index)
file_path = '/scratch/project_2006608/Methylation/WW_data/SLU3_contigs/SLU3_ARG_names.txt'

SLU3_df_ARG_names = pd.read_csv(
    file_path,
    sep='\t',
    header=0,
    index_col=0,
    low_memory=False,
)
print(len(SLU3_df_ARG_names))
SLU3_df_ARG_names.head()

In [None]:
## Attach the ARG names to the table that already carries the ARG counts
SLU3_df_ext = SLU3_df_ARG_counts.copy()

# Put the ARG-name rows into the same row order as the merged table
SLU3_df_ordered = SLU3_df_ARG_names.loc[SLU3_df_ext.index]

# NOTE: from here on SLU3_df_ARG_names refers to the merged table
SLU3_df_ARG_names = pd.concat([SLU3_df_ext, SLU3_df_ordered], axis='columns')
SLU3_df_ARG_names.tail()

In [None]:
# Rows whose ARG_name contains the literal text 'erm(F)_3'
# (case-insensitive, plain substring match; missing names count as no match)
has_erm_F = SLU3_df_ARG_names['ARG_name'].str.contains('erm(F)_3', case=False, na=False, regex=False)
erm_F = SLU3_df_ARG_names.loc[has_erm_F]
print(erm_F)

# Row labels of the matches as a one-column table
row_names_df = pd.DataFrame(erm_F.index, columns=['0'])
print(row_names_df)

### Contig lengths

In [None]:
## Load the SLU3 contig lengths brought from Puhti
## (tab-separated; header on the first line, first column used as the row index)
file_path = '/scratch/project_2006608/Methylation/WW_data/SLU3_contigs/SLU3_contigs_lengths.txt'

SLU3_df_contigs_lengths = pd.read_csv(
    file_path,
    sep='\t',
    header=0,
    index_col=0,
    low_memory=False,
)
print(len(SLU3_df_contigs_lengths))
SLU3_df_contigs_lengths.head()

In [None]:
## Attach the contig lengths to the table that already carries ARG counts and names
SLU3_df_ext = SLU3_df_ARG_names.copy()

# Put the contig-length rows into the same row order as the merged table
SLU3_df_ordered = SLU3_df_contigs_lengths.loc[SLU3_df_ext.index]

SLU3_data = pd.concat([SLU3_df_ext, SLU3_df_ordered], axis='columns')

# Merged table without its last four columns
print(SLU3_data.iloc[:, :-4])

In [None]:
# UMAP of SLU3 (merged table without its last four columns), coloured by ARG count
n_neighbors = [20]
min_dist = [0.1]

colors = [0, 1, 2, 3, 4, 5]
color_map = {0: '#f7c0b7', 1: '#fa7a31', 2: '#eb340f', 3: '#d01af5', 4: '#2a2df6', 5: '#940785'}

# Ordered colour sequence handed to plotly
custom_colors = [color_map[key] for key in colors]

# Frame and grid styling shared by the x and y axes
axis_style = dict(
    showgrid=True,
    gridcolor='lightgray',
    zeroline=False,
    showline=True,
    linecolor='black',
    linewidth=1,
    mirror=True,
)

for n in n_neighbors:
    for m in min_dist:
        title = f' Wastewater SLU3 - UMAP with n_neighbors={n}, min_dist={m}'

        reducer = umap.UMAP(n_neighbors=n, min_dist=m, random_state=seed)
        embedding = reducer.fit_transform(SLU3_data.iloc[:, :-4])
        SLU3_UMAP_data = pd.DataFrame({
            'UMAP1': embedding[:, 0],
            'UMAP2': embedding[:, 1],
            'contig': SLU3_data.index,
            # Stored as text so that plotly colours it as discrete categories
            'ARG_count': SLU3_data['ARG_count'].astype(str),
            'ARG_name': SLU3_data['ARG_name'],
            'contig_length': SLU3_data['length'],
        })

        category_order = ["0", "1", "2", "3"]

        fig = px.scatter(
            SLU3_UMAP_data,
            x='UMAP1',
            y='UMAP2',
            color='ARG_count',
            category_orders={'ARG_count': category_order},
            title=title,
            color_discrete_sequence=custom_colors,
            hover_data={'contig': True, 'ARG_count': True, 'ARG_name': True, 'contig_length': True},
        )
        fig.update_layout(
            height=1700,
            width=1200,
            title_text=title,
            showlegend=True,
            # Horizontal legend centred below the plot area
            legend={
                'x': 0.5,
                'y': -0.05,
                'traceorder': "normal",
                'xanchor': 'center',
                'yanchor': 'top',
                'orientation': 'h',
            },
            template='simple_white',
            xaxis=axis_style,
            yaxis=axis_style,
        )
        fig.show()
        fig.write_image(f'UMAP_WW/SLU3_UMAP_{n}_{m}_contig_ARG_counts_above100.png')
        fig.write_html(f'UMAP_WW/SLU3_UMAP_{n}_{m}_contig_ARG_counts_above100.html')

## Focus on distinct clusters
### Leave out the most miscellaneous cluster and draw again

In [None]:
# Keep only the points inside a bounding box on the UMAP coordinates
# (UMAP1 = x axis, UMAP2 = y axis; both bounds of each range are inclusive).
# NOTE(review): the bounds are hard-coded coordinates of the embedding computed
# above; confirm they still fit if that embedding is recomputed.
SLU3_UMAP_data_focused = SLU3_UMAP_data.loc[
    SLU3_UMAP_data['UMAP1'].between(-15, 15)
    & SLU3_UMAP_data['UMAP2'].between(-7, 14)
]

# Check: printed explicitly, because a bare expression that is not the last
# line of a cell is evaluated but never displayed
print(SLU3_UMAP_data_focused.tail())

# Pull the matching rows out of the full data table via their contig IDs
SLU3_data_focused = SLU3_data[SLU3_data.index.isin(SLU3_UMAP_data_focused['contig'])]
print(SLU3_data_focused)

In [None]:
# Export the contig IDs of the focused SLU3 subset as plain text, one ID per line
SLU3_focused_contigs = SLU3_data_focused.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'SLU3_focused_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{item}\n" for item in SLU3_focused_contigs)

In [None]:
# Focused subset without its last four columns
print(SLU3_data_focused.iloc[:, :-4])
# Distinct ARG_count values among the focused points
print(SLU3_UMAP_data_focused['ARG_count'].unique())

In [None]:
# Distinct ARG_count values among the focused points (same output as the previous cell)
print(SLU3_UMAP_data_focused['ARG_count'].unique())

In [None]:
# UMAP settings to run (a single combination here)
n_neighbors = [20]
min_dist = [0.1]

# Palette lookup keyed by integer code. Only the codes listed in `colors`
# are taken, in that order, to form the discrete colour sequence for plotly.
colors = [0, 1, 2]
color_map = {
    0: '#f7c0b7',
    1: '#fa7a31',
    2: '#eb340f',
    3: '#d01af5',
    4: '#2a2df6',
    5: '#940785',
}
custom_colors = [color_map[code] for code in colors]

# Both axes get the same styling: light grid, black mirrored frame, no zero line
axis_style = dict(
    showgrid=True,
    gridcolor='lightgray',
    zeroline=False,
    showline=True,
    linecolor='black',
    linewidth=1,
    mirror=True,
)

for n in n_neighbors:
    for m in min_dist:
        # Re-embed the focused subset; the last four columns are left out of the input
        reducer = umap.UMAP(n_neighbors=n, min_dist=m, random_state=seed)
        embedding = reducer.fit_transform(SLU3_data_focused.iloc[:, :-4])

        # Embedding coordinates together with the per-contig annotation columns
        SLU3_UMAP_data_focused = pd.DataFrame({
            'UMAP1': embedding[:, 0],
            'UMAP2': embedding[:, 1],
            'contig': SLU3_data_focused.index,
            'ARG_count': SLU3_data_focused['ARG_count'],
            'ARG_name': SLU3_data_focused['ARG_name'],
            'contig_length': SLU3_data_focused['length'],
        })

        # ARG_count is turned into a string-valued categorical so plotly colours it discretely
        SLU3_UMAP_data_focused['ARG_count'] = (
            SLU3_UMAP_data_focused['ARG_count'].astype(str).astype('category')
        )
        # Transformed contig lengths; the square root is used as the marker size below
        SLU3_UMAP_data_focused['log_contig_length'] = np.log(SLU3_UMAP_data_focused['contig_length'])
        SLU3_UMAP_data_focused['sqrt_contig_length'] = np.sqrt(SLU3_UMAP_data_focused['contig_length'])

        # Legend/colour order follows the palette codes: "0", "1", "2"
        category_order = [str(code) for code in colors]

        title = f' Wastewater SLU3 - UMAP with n_neighbors={n}, min_dist={m}'
        fig = px.scatter(
            SLU3_UMAP_data_focused,
            x='UMAP1',
            y='UMAP2',
            color='ARG_count',
            size='sqrt_contig_length',
            title=title,
            category_orders={'ARG_count': category_order},
            color_discrete_sequence=custom_colors,
            hover_data=dict.fromkeys(['contig', 'ARG_count', 'ARG_name', 'contig_length'], True),
        )
        fig.update_layout(
            height=1700,
            width=1200,
            title_text=title,
            showlegend=True,
            # Horizontal legend centred below the plotting area
            legend={
                'x': 0.5,
                'y': -0.05,
                'traceorder': "normal",
                'xanchor': 'center',
                'yanchor': 'top',
                'orientation': 'h',
            },
            template='simple_white',
            xaxis=axis_style,
            yaxis=axis_style,
        )
        fig.show()

        # Alternative output names, kept disabled (presumably for a variant
        # without the length-based marker size — TODO confirm):
        #fig.write_image(f'UMAP_WW/SLU3_UMAP_{n}_{m}_ARG_counts_focused_above100.png')
        #fig.write_html(f'UMAP_WW/SLU3_UMAP_{n}_{m}_ARG_counts_focused_above100.html')
        fig.write_image(f'UMAP_WW/SLU3_UMAP_{n}_{m}_ARG_counts_lengths_sqrt_focused_above100.png')
        fig.write_html(f'UMAP_WW/SLU3_UMAP_{n}_{m}_ARG_counts_lengths_sqrt_focused_above100.html')

### Extract clusters
#### C1

In [None]:
# Cluster C1: points of the focused embedding inside the bounding box
# UMAP1 (x axis) in [-0.1, -0.02] and UMAP2 (y axis) in [2.82, 2.85], both ends inclusive.
SLU3_data_focused_C1 = SLU3_UMAP_data_focused.loc[
    SLU3_UMAP_data_focused['UMAP1'].between(-0.1, -0.02)
    & SLU3_UMAP_data_focused['UMAP2'].between(2.82, 2.85)
]

# Check: printed explicitly, because a bare expression that is not the last
# statement of a notebook cell is evaluated but not displayed by default.
print(SLU3_data_focused_C1.head())

# Rows of the focused table whose index (contig ID) belongs to the cluster
SLU3_data_C1 = SLU3_data_focused[SLU3_data_focused.index.isin(SLU3_data_focused_C1['contig'])]
print(SLU3_data_C1)

In [None]:
# Write the contig IDs of cluster C1 to a text file, one ID per line
SLU3_C1_contigs = SLU3_data_C1.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'SLU3_C1_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{item}\n" for item in SLU3_C1_contigs)

### Extract clusters
#### C2

In [None]:
# Cluster C2: points of the focused embedding inside the bounding box
# UMAP1 (x axis) in [-4.4, -4.3] and UMAP2 (y axis) in [5.36, 5.4], both ends inclusive.
SLU3_data_focused_C2 = SLU3_UMAP_data_focused.loc[
    SLU3_UMAP_data_focused['UMAP1'].between(-4.4, -4.3)
    & SLU3_UMAP_data_focused['UMAP2'].between(5.36, 5.4)
]

# Check: printed explicitly, because a bare expression that is not the last
# statement of a notebook cell is evaluated but not displayed by default.
print(SLU3_data_focused_C2.head())

# Rows of the focused table whose index (contig ID) belongs to the cluster
SLU3_data_C2 = SLU3_data_focused[SLU3_data_focused.index.isin(SLU3_data_focused_C2['contig'])]
print(SLU3_data_C2)

In [None]:
# Write the contig IDs of cluster C2 to a text file, one ID per line
SLU3_C2_contigs = SLU3_data_C2.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'SLU3_C2_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{item}\n" for item in SLU3_C2_contigs)

### Extract clusters
#### C3

In [None]:
# Cluster C3: points of the focused embedding inside the bounding box
# UMAP1 (x axis) in [8.2, 8.32] and UMAP2 (y axis) in [12.8, 12.9], both ends inclusive.
SLU3_data_focused_C3 = SLU3_UMAP_data_focused.loc[
    SLU3_UMAP_data_focused['UMAP1'].between(8.2, 8.32)
    & SLU3_UMAP_data_focused['UMAP2'].between(12.8, 12.9)
]

# Check: printed explicitly, because a bare expression that is not the last
# statement of a notebook cell is evaluated but not displayed by default.
print(SLU3_data_focused_C3.head())

# Rows of the focused table whose index (contig ID) belongs to the cluster
SLU3_data_C3 = SLU3_data_focused[SLU3_data_focused.index.isin(SLU3_data_focused_C3['contig'])]
print(SLU3_data_C3)

In [None]:
# Write the contig IDs of cluster C3 to a text file, one ID per line
SLU3_C3_contigs = SLU3_data_C3.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'SLU3_C3_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{item}\n" for item in SLU3_C3_contigs)

### Extract clusters
#### C4

In [None]:
# Cluster C4: points of the focused embedding inside the bounding box
# UMAP1 (x axis) in [-2.65, -2.5] and UMAP2 (y axis) in [8.6, 8.75], both ends inclusive.
SLU3_data_focused_C4 = SLU3_UMAP_data_focused.loc[
    SLU3_UMAP_data_focused['UMAP1'].between(-2.65, -2.5)
    & SLU3_UMAP_data_focused['UMAP2'].between(8.6, 8.75)
]

# Check: printed explicitly, because a bare expression that is not the last
# statement of a notebook cell is evaluated but not displayed by default.
print(SLU3_data_focused_C4.head())

# Rows of the focused table whose index (contig ID) belongs to the cluster
SLU3_data_C4 = SLU3_data_focused[SLU3_data_focused.index.isin(SLU3_data_focused_C4['contig'])]
print(SLU3_data_C4)

In [None]:
# Write the contig IDs of cluster C4 to a text file, one ID per line
SLU3_C4_contigs = SLU3_data_C4.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'SLU3_C4_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{item}\n" for item in SLU3_C4_contigs)

### Extract clusters
#### C5

In [None]:
# Cluster C5: points of the focused embedding inside the bounding box
# UMAP1 (x axis) in [1.1, 1.2] and UMAP2 (y axis) in [9.2, 9.25], both ends inclusive.
SLU3_data_focused_C5 = SLU3_UMAP_data_focused.loc[
    SLU3_UMAP_data_focused['UMAP1'].between(1.1, 1.2)
    & SLU3_UMAP_data_focused['UMAP2'].between(9.2, 9.25)
]

# Check: printed explicitly, because a bare expression that is not the last
# statement of a notebook cell is evaluated but not displayed by default.
print(SLU3_data_focused_C5.head())

# Rows of the focused table whose index (contig ID) belongs to the cluster
SLU3_data_C5 = SLU3_data_focused[SLU3_data_focused.index.isin(SLU3_data_focused_C5['contig'])]
print(SLU3_data_C5)

In [None]:
# Write the contig IDs of cluster C5 to a text file, one ID per line
SLU3_C5_contigs = SLU3_data_C5.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'SLU3_C5_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{item}\n" for item in SLU3_C5_contigs)

### Extract clusters
#### C6

In [None]:
# Cluster C6: points of the focused embedding inside the bounding box
# UMAP1 (x axis) in [0.3, 0.4] and UMAP2 (y axis) in [9.7, 9.8], both ends inclusive.
SLU3_data_focused_C6 = SLU3_UMAP_data_focused.loc[
    SLU3_UMAP_data_focused['UMAP1'].between(0.3, 0.4)
    & SLU3_UMAP_data_focused['UMAP2'].between(9.7, 9.8)
]

# Check: printed explicitly, because a bare expression that is not the last
# statement of a notebook cell is evaluated but not displayed by default.
print(SLU3_data_focused_C6.head())

# Rows of the focused table whose index (contig ID) belongs to the cluster
SLU3_data_C6 = SLU3_data_focused[SLU3_data_focused.index.isin(SLU3_data_focused_C6['contig'])]
print(SLU3_data_C6)

In [None]:
# Write the contig IDs of cluster C6 to a text file, one ID per line
SLU3_C6_contigs = SLU3_data_C6.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'SLU3_C6_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{item}\n" for item in SLU3_C6_contigs)

### Extract clusters
#### C7

In [None]:
# Cluster C7: points of the focused embedding inside the bounding box
# UMAP1 (x axis) in [1.2, 1.33] and UMAP2 (y axis) in [10.15, 10.22], both ends inclusive.
SLU3_data_focused_C7 = SLU3_UMAP_data_focused.loc[
    SLU3_UMAP_data_focused['UMAP1'].between(1.2, 1.33)
    & SLU3_UMAP_data_focused['UMAP2'].between(10.15, 10.22)
]

# Check: printed explicitly, because a bare expression that is not the last
# statement of a notebook cell is evaluated but not displayed by default.
print(SLU3_data_focused_C7.head())

# Rows of the focused table whose index (contig ID) belongs to the cluster
SLU3_data_C7 = SLU3_data_focused[SLU3_data_focused.index.isin(SLU3_data_focused_C7['contig'])]
print(SLU3_data_C7)

In [None]:
# Write the contig IDs of cluster C7 to a text file, one ID per line
SLU3_C7_contigs = SLU3_data_C7.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'SLU3_C7_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{item}\n" for item in SLU3_C7_contigs)

### Extract clusters
#### C8

In [None]:
# Cluster C8: points of the focused embedding inside the bounding box
# UMAP1 (x axis) in [1.8, 2] and UMAP2 (y axis) in [9.75, 9.9], both ends inclusive.
SLU3_data_focused_C8 = SLU3_UMAP_data_focused.loc[
    SLU3_UMAP_data_focused['UMAP1'].between(1.8, 2)
    & SLU3_UMAP_data_focused['UMAP2'].between(9.75, 9.9)
]

# Check: printed explicitly, because a bare expression that is not the last
# statement of a notebook cell is evaluated but not displayed by default.
print(SLU3_data_focused_C8.head())

# Rows of the focused table whose index (contig ID) belongs to the cluster
SLU3_data_C8 = SLU3_data_focused[SLU3_data_focused.index.isin(SLU3_data_focused_C8['contig'])]
print(SLU3_data_C8)

In [None]:
# Write the contig IDs of cluster C8 to a text file, one ID per line
SLU3_C8_contigs = SLU3_data_C8.index.tolist()

directory = 'UMAP_WW'
file_path = os.path.join(directory, 'SLU3_C8_contigs_above100.txt')

with open(file_path, 'w') as file:
    file.writelines(f"{item}\n" for item in SLU3_C8_contigs)