# Data understanding and Exploratory analysis (EDA)
## Topic: Examining the ecological processes influencing the assembly of molecules into OM assemblages


Dataset: 10 May 2022
Water with site and CFs 

## Data Processing

In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

sns.set_theme()

In [9]:
# Load dataset
fn_water = "../Dataset/Water_Prevalence_10__commat_2021-09-29.csv"
df_water = pd.read_csv(fn_water)
print('Water: ' + str(df_water.shape))

# Data quality check
print(df_water.columns.nunique())

# Join datasets
df = df_water.copy()

# Check if there is any identical columns / CF across all samples
print("Duplicated column: " + str(df.columns.duplicated().any()))

# Pre-processing
df = df.rename(columns={"Unnamed: 0": "sample_id"})
df = df.set_index("sample_id").fillna(0).reset_index()

# For analysis
sw = df_water.rename(columns={"Unnamed: 0": "sample_id"}).set_index("sample_id")
print("Shape:" + str(sw.shape))
sw.head()

Water: (265, 4936)
4936
Duplicated column: False
Shape:(265, 4935)


Unnamed: 0_level_0,C10H10O5,C10H10O5N2,C10H10O5S,C10H10O6,C10H10O6N2,C10H10O6S,C10H10O7,C10H10O7S,C10H10O8,C10H10O8S,...,C9H6O6,C9H6O7,C9H7O5N,C9H7O6N,C9H8O6,C9H8O6S,C9H8O7,C9H8O8,C9H9O5N,C9H9O6N
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SW_S19S.0003_U_1,1,0,0,1,0,0,1,0,1,0,...,1,1,0,0,1,0,1,1,1,1
SW_S19S.0003_U_2,1,0,0,1,0,0,1,0,1,0,...,1,1,0,0,1,0,1,0,0,1
SW_S19S.0003_U_3,1,0,0,1,0,0,1,0,1,0,...,1,1,0,0,1,0,1,0,0,1
SW_S19S.0004_U_1,1,0,0,1,0,0,1,0,1,0,...,1,1,0,0,1,0,1,0,1,1
SW_S19S.0004_U_2,1,0,0,1,0,0,1,0,1,0,...,1,1,1,0,1,0,1,1,0,1


## DOM analysis based on association rules

In [10]:
import mlxtend
from mlxtend.frequent_patterns import apriori

In [27]:
def draw_graph(rules, rules_to_show):
  import networkx as nx  
  G1 = nx.DiGraph()
   
  color_map=[]
  N = 50
  colors = np.random.rand(N)    
  strs=['R0', 'R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'R9', 'R10', 'R11']   
   
   
  for i in range (rules_to_show):      
    G1.add_nodes_from(["R"+str(i)])
    
     
    for a in rules.iloc[i]['antecedants']:
                
        G1.add_nodes_from([a])
        
        G1.add_edge(a, "R"+str(i), color=colors[i] , weight = 2)
       
    for c in rules.iloc[i]['consequents']:
             
            G1.add_nodes_from()
            
            G1.add_edge("R"+str(i), c, color=colors[i],  weight=2)
 
  for node in G1:
       found_a_string = False
       for item in strs: 
           if node==item:
                found_a_string = True
       if found_a_string:
            color_map.append('yellow')
       else:
            color_map.append('green')       
 
 
   
  edges = G1.edges()
  colors = [G1[u][v]['color'] for u,v in edges]
  weights = [G1[u][v]['weight'] for u,v in edges]
 
  pos = nx.spring_layout(G1, k=16, scale=1)
  nx.draw(G1, pos, edges=edges, node_color = color_map, edge_color=colors, width=weights, font_size=16, with_labels=False)            
   
  for p in pos:  # raise text positions
           pos[p][1] += 0.07
  nx.draw_networkx_labels(G1, pos)
  plt.show()

In [28]:
sw.shape

(265, 4935)

In [34]:
frequent_itemsets = apriori(sw.iloc[:,:1000], min_support=0.99, use_colnames=True)
print (frequent_itemsets)

MemoryError: Unable to allocate 71.1 GiB for an array with shape (5141079, 7, 265) and data type int64

In [None]:
from mlxtend.frequent_patterns import association_rules
 
association_rules(frequent_itemsets, metric="confidence", min_threshold=0.8)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)
print (rules)
 
support=rules['support'].values
confidence=rules['confidence'].values

In [None]:
import seaborn as sns

for i in range (len(support)):
    support[i] = support[i] 
    confidence[i] = confidence[i] 
     
plt.title('Association Rules')
plt.xlabel('support')
plt.ylabel('confidence')    
sns.regplot(x=support, y=confidence, fit_reg=False)
 
plt.gcf().clear()
draw_graph (rules, 10)  