In [1410]:
import numpy as np
from sklearn.preprocessing import FunctionTransformer
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
from sklearn.cluster import SpectralClustering, KMeans
import warnings
warnings.filterwarnings("ignore")
from genetic_utils import identify_spec, generate_pca_plot, generate_proportion_plot

In [1411]:
# Read the genetic and the morphological tables from the data directory.
gene_df, morph_df = map(pd.read_csv, ('data/gene_spec.csv', 'data/morph.csv'))

In [1412]:
# Columns that are excluded before the morphological clustering.
excluded_columns = [
    'idx', 'Date', 'classification', 'uncertainty',
    'Latitude', 'Longitude', 'Altitude.ft',
    'Multi.Single.stem', 'General.location.Habitat',
    'site', 'Putative_spp',
]

# Remove the excluded columns and incomplete rows, then swap exact zeros for a
# tiny positive value.
# NOTE(review): the 1e-10 substitution presumably guards a log/ratio transform
# that is not applied in this cell — confirm it is still wanted.
morph_df = (
    morph_df
    .drop(columns=excluded_columns)
    .dropna()
    .replace(0, 1e-10)
)

# Standardise every column except TreeNo, then keep five of them by position
# (positions refer to the column order after TreeNo has been removed).
scaler = StandardScaler()
data = scaler.fit_transform(morph_df.drop(columns='TreeNo'))[:, [0, 11, 7, 12, 16]]

# Partition the trees into three clusters with a fixed seed.
kmeans = KMeans(n_clusters=3, random_state=0)
cluster_labels = kmeans.fit_predict(data)

# Keep only the tree identifier and its cluster label, and persist the result.
morph_df = morph_df.assign(morph_cluster=cluster_labels)[['TreeNo', 'morph_cluster']]
morph_df.to_csv('data/morph_cluster.csv', index=False)

In [1413]:
# Reload the genetic table, leave out the rows whose `spec` is 'QB', and drop
# the two columns that are not used below.
gene_df = (
    pd.read_csv('data/gene_spec.csv')
    .loc[lambda frame: frame['spec'] != 'QB']
    .drop(columns=['DNA_ID', 'spec'])
)

# Standardise every column except TreeNo, then keep the columns at positions
# 1, 2 and 0 (in that order) as clustering features.
scaler = StandardScaler()
data = scaler.fit_transform(gene_df.drop(columns='TreeNo'))[:, [1, 2, 0]]

# Partition the trees into three clusters with a fixed seed.
kmeans = KMeans(n_clusters=3, random_state=0)
cluster_labels = kmeans.fit_predict(data)

# Attach the labels and persist the annotated table.
gene_df = gene_df.assign(gene_cluster=cluster_labels)
gene_df.to_csv('data/gene_cluster.csv', index=False)

In [1414]:
# Pair every tree's morphological cluster with its genetic cluster.
joined_df = morph_df.merge(gene_df, on='TreeNo')

# Draw a batch of rows to estimate how the two clusterings line up.
# The draw is seeded so the association matrix — and the cluster mapping
# derived from it — is identical on every run. The batch size is capped at the
# number of available rows because sample() raises ValueError when asked for
# more rows than the frame holds.
sample = joined_df.sample(n=min(50, len(joined_df)), random_state=0)

# association_matrix[i, j] counts the sampled trees that fall in morph
# cluster i and gene cluster j.
association_matrix = np.zeros((3, 3))
np.add.at(
    association_matrix,
    (sample['morph_cluster'].to_numpy(), sample['gene_cluster'].to_numpy()),
    1,
)
print(association_matrix)

# For each morph cluster (row) pick the gene cluster (column) it co-occurs
# with most often.
# NOTE(review): a row-wise argmax does not force a one-to-one mapping; two
# morph clusters can be assigned the same gene cluster — confirm that is
# acceptable for the downstream distance computation.
decision = np.argmax(association_matrix, axis=1)
print(decision)

[[ 0. 18.  3.]
 [ 0.  2. 13.]
 [ 9.  3.  2.]]
[1 2 0]


In [1415]:
# Translate each tree's morph cluster into the gene cluster that `decision`
# assigns to it.
morph_df["morph_predicted_gene_cluster"] = morph_df['morph_cluster'].map(lambda cluster: decision[cluster])

In [1416]:
# Mean of each principal-component column per gene cluster, stored as one
# list-valued `gene_centroids` entry per cluster.
pc_columns = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6']
cluster_means = gene_df.groupby('gene_cluster')[pc_columns].mean().reset_index()
average_gene = cluster_means[['gene_cluster']].assign(
    gene_centroids=cluster_means[pc_columns].values.tolist()
)

In [1417]:
# Collapse the six PC columns into one list-valued column and keep only the
# tree identifier next to it.
# NOTE(review): rebinding `gene_df` here drops the PC columns, so this cell
# cannot be re-run without first re-running the gene clustering cell.
gene_df = gene_df.assign(
    gene_coordinates=gene_df[['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6']].values.tolist()
)[['TreeNo', 'gene_coordinates']]

# Attach the coordinates to the morph-based predictions; the predicted cluster
# takes over the `gene_cluster` name so it can be used as the merge key below.
joined_df = (
    morph_df
    .merge(gene_df, on='TreeNo')
    .drop(columns='morph_cluster')
    .rename(columns={'morph_predicted_gene_cluster': 'gene_cluster'})
)

# Look up the centroid of the predicted gene cluster for every tree.
final_df = joined_df.merge(average_gene, on='gene_cluster')
final_df.head()

Unnamed: 0,TreeNo,gene_cluster,gene_coordinates,gene_centroids
0,QA-M11,0,"[-8.431936096, -0.500913737, -11.54222572, -2....","[-6.734983830851852, -0.017621014611111093, -8..."
1,QA-M5,0,"[-8.593978553, 0.414067943, -16.68961594, -4.1...","[-6.734983830851852, -0.017621014611111093, -8..."
2,QR-M1,2,"[19.47262912, -1.953619719, -2.119561558, 6.92...","[16.470120757953126, -1.844546040984375, 1.384..."
3,QR-M14,0,"[22.37592773, -2.003944438, -2.581493893, 8.57...","[-6.734983830851852, -0.017621014611111093, -8..."
4,QR-M15,0,"[22.73439088, -1.946043139, -1.961701937, 8.42...","[-6.734983830851852, -0.017621014611111093, -8..."


In [1418]:
def euclidean_distance(row):
    """Return the Euclidean distance between a row's coordinates and its centroid.

    Parameters
    ----------
    row : mapping
        Must provide 'gene_coordinates' and 'gene_centroids', two equally
        long numeric sequences.

    Returns
    -------
    float
        The L2 norm of the element-wise difference of the two sequences.
    """
    coordinates = np.array(row['gene_coordinates'])
    centroid = np.array(row['gene_centroids'])
    offset = coordinates - centroid
    return np.linalg.norm(offset)

# Distance from each tree's own coordinates to the centroid of the gene
# cluster that was predicted for it.
distances = final_df.apply(euclidean_distance, axis='columns')
final_df = final_df.assign(bkm_distance=distances)
final_df.head()

Unnamed: 0,TreeNo,gene_cluster,gene_coordinates,gene_centroids,bkm_distance
0,QA-M11,0,"[-8.431936096, -0.500913737, -11.54222572, -2....","[-6.734983830851852, -0.017621014611111093, -8...",6.097199
1,QA-M5,0,"[-8.593978553, 0.414067943, -16.68961594, -4.1...","[-6.734983830851852, -0.017621014611111093, -8...",28.524137
2,QR-M1,2,"[19.47262912, -1.953619719, -2.119561558, 6.92...","[16.470120757953126, -1.844546040984375, 1.384...",9.203637
3,QR-M14,0,"[22.37592773, -2.003944438, -2.581493893, 8.57...","[-6.734983830851852, -0.017621014611111093, -8...",31.52882
4,QR-M15,0,"[22.73439088, -1.946043139, -1.961701937, 8.42...","[-6.734983830851852, -0.017621014611111093, -8...",31.933072
