In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import validation_curve

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [2]:
mergedf = pd.read_csv('metadata_otu_merged_famID-to-str.csv', low_memory=False)
# mergedf = pd.read_csv('metadata_otu_merged_famID-to-str_onlyFamwPet.csv', low_memory=False)
# mergedf = pd.read_csv('metadata_otu_merged_non-rarefied.csv', low_memory=False)
dog_fam = pd.read_csv('dogFamily_toMerge.csv')
mergedf = mergedf.drop("Unnamed: 0", axis=1)
mergedf.head(1)

Unnamed: 0,sample_name,age,age_in_years,age_units,agegroup,anonymized_name,body_habitat,body_product,body_site,collection_timestamp,...,k__Bacteria; p__Actinobacteria; c__Actinobacteria; o__Actinomycetales; f__Nocardioidaceae; g__; s__.16,k__Bacteria; p__Firmicutes; c__Erysipelotrichi; o__Erysipelotrichales; f__Erysipelotrichaceae; g__; s__.7,k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__[Odoribacteraceae]; g__Odoribacter; s__.1,k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Prevotellaceae; g__Prevotella; s__intermedia,k__Bacteria; p__Firmicutes; c__Bacilli; o__Lactobacillales; f__Streptococcaceae; g__Streptococcus; s__.41,k__Bacteria; p__Cyanobacteria; c__Chloroplast; o__Streptophyta; f__; g__; s__.6,k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; s__.54,k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Prevotellaceae; g__Prevotella; s__.23,k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; s__.55,k__Bacteria; p__Actinobacteria; c__Actinobacteria; o__Actinomycetales; f__Propionibacteriaceae; g__Propionibacterium; s__granulosum
0,797.02.HA.Forehead,1,1,years,Adolescent,FS_01,UBERON:skin,UBERON:sebum,UBERON:face,4/6/2010,...,0,0,0,0,0,0,0,0,0,0


In [3]:
#MERGE with DOG_FAM info
mergedf = mergedf.merge(dog_fam, left_on="family", right_on="family", suffixes=(False,False), how="outer")
mergedf["have_dog"].fillna("no",inplace = True)
print(mergedf.shape)

(1050, 1088)


# Filtering Train and Test Datasets

In [4]:
#FILTER 1
# only_humans_with_dogs = 1

#FILTER 2
# sample_type = "skin"
#FILTER 3
# not_sample_type = "stool"

# #FILTER 4
# human_role = "Partner"
#FILTER 5
# not_human_role = "Offspring"

#FILTER 6
# not_familyID = 63

# SPLITTING TRAIN and TEST (test = dogs, train = humans)

In [5]:
human_data_train = mergedf[mergedf.host_common_name =="human"]

dog_data_test = mergedf[mergedf.host_common_name =="dog"]
dog_data_test.head(1)

Unnamed: 0,sample_name,age,age_in_years,age_units,agegroup,anonymized_name,body_habitat,body_product,body_site,collection_timestamp,...,k__Bacteria; p__Firmicutes; c__Erysipelotrichi; o__Erysipelotrichales; f__Erysipelotrichaceae; g__; s__.7,k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__[Odoribacteraceae]; g__Odoribacter; s__.1,k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Prevotellaceae; g__Prevotella; s__intermedia,k__Bacteria; p__Firmicutes; c__Bacilli; o__Lactobacillales; f__Streptococcaceae; g__Streptococcus; s__.41,k__Bacteria; p__Cyanobacteria; c__Chloroplast; o__Streptophyta; f__; g__; s__.6,k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; s__.54,k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Prevotellaceae; g__Prevotella; s__.23,k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Bacteroidaceae; g__Bacteroides; s__.55,k__Bacteria; p__Actinobacteria; c__Actinobacteria; o__Actinomycetales; f__Propionibacteriaceae; g__Propionibacterium; s__granulosum,have_dog
20,797.03.CA.BackLeftPaw,Not recorded,Not recorded,years,Not recorded,FS_175,UBERON:skin,UBERON:sebum,UBERON:foot skin,2/10/2010,...,0,0,0,0,0,0,0,0,0,yes


In [6]:
try:
    if only_humans_with_dogs == 1:
            human_data_train = human_data_train[human_data_train.have_dog =="yes"]
    
    elif only_humans_with_dogs ==0:
            human_data_train = human_data_train[human_data_train.have_dog =="no"]
except:
    print("no filter here 1")

try:    
    if sample_type:
        human_data_train = human_data_train[human_data_train.sample_type ==sample_type]
        dog_data_test = dog_data_test[dog_data_test.sample_type ==sample_type]
except:
    print("no filter here 2")
    
try:    
    if not_sample_type:
        human_data_train = human_data_train[human_data_train.sample_type !=not_sample_type]
        dog_data_test = dog_data_test[dog_data_test.sample_type !=not_sample_type]
except:
    print("no filter here 3")
    
try:    
    if human_role:
        human_data_train = human_data_train[human_data_train.role ==human_role]
except:
    print("no filter here 4")
    
try:
    if not_human_role:
        human_data_train = human_data_train[human_data_train.role !=not_human_role]    
except:
    print("no filter here 5")
    
try:    
    if not_familyID:
        human_data_train = human_data_train[human_data_train.role !=not_familyID]
        dog_data_test = dog_data_test[dog_data_test.role !=not_familyID]  
except:
    print("no filter here 6")

no filter here 1
no filter here 2
no filter here 3
no filter here 4
no filter here 5
no filter here 6


# Train/test split

In [7]:
X_train = human_data_train.iloc[:, -1035:-1]
y_train = human_data_train["familyID"].values.reshape(-1, 1)

y_test = dog_data_test["familyID"].values.reshape(-1, 1)
X_test = dog_data_test.iloc[:, -1035:-1]

# Hyperparameter Selection

In [8]:
n_estimators = 1000

# Non-Scaled Model Fitting

In [9]:
#from non-scaled data
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=n_estimators)
rf = rf.fit(X_train, y_train.ravel())
rf.score(X_test, y_test) #geni impurity coefficient

print(f'Test Score: {rf.score(X_test, y_test)}')
print(f'Train Score: {rf.score(X_train, y_train)}')

Test Score: 0.1267605633802817
Train Score: 1.0


# Scaled X Model Fitting

In [10]:
# from sklearn.preprocessing import StandardScaler
# X_scaler = StandardScaler().fit(X_train)

# X_train_scaled = X_scaler.transform(X_train)
# X_test_scaled = X_scaler.transform(X_test)

In [11]:
# from scaled data
# rf_scaled = RandomForestClassifier(n_estimators=n_estimators)
# rf_scaled = rf_scaled.fit(X_train_scaled, y_train.ravel())
# rf_scaled.score(X_test_scaled, y_test) #geni impurity coefficient

# print(f'Test Score: {rf_scaled.score(X_test_scaled, y_test)}')
# print(f'Train Score: {rf_scaled.score(X_train_scaled, y_train)}')

# Evaluate most-guessed family for each dog (7 collection site/predictions per animal) ==> 1 prediction

In [12]:
y_pred = rf.predict(X_test)

outputdf = pd.DataFrame({"SampleName": dog_data_test["sample_name"],"Anonymized_Name":dog_data_test["anonymized_name"],"Prediction": y_pred, "Actual": y_test.ravel()}).reset_index(drop=True)
print(outputdf.shape)
outputdf.head()

(213, 4)


Unnamed: 0,SampleName,Anonymized_Name,Prediction,Actual
0,797.03.CA.BackLeftPaw,FS_175,family#5,family#3
1,797.03.CA.BackRightPaw,FS_175,family#5,family#3
2,797.03.CA.Forehead,FS_175,family#5,family#3
3,797.03.CA.FrontRightPaw,FS_175,family#5,family#3
4,797.03.CA.Stool,FS_175,family#52,family#3


In [13]:
sumOutput = outputdf.groupby(["Anonymized_Name","Actual","Prediction"]).count()
sumOutput.reset_index()
sumOutput.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,SampleName
Anonymized_Name,Actual,Prediction,Unnamed: 3_level_1
FS_175,family#3,family#5,4
FS_175,family#3,family#52,1
FS_175,family#3,family#71,1
FS_176,family#6,family#49,1
FS_176,family#6,family#5,1


In [14]:
# sumOutput.groupby(["Anonymized_Name"])['SampleName'].max()
idx = sumOutput.groupby(["Anonymized_Name"])['SampleName'].transform(max) == sumOutput['SampleName']
maxOutput = sumOutput[idx]
maxOutput = maxOutput.reset_index()
print(maxOutput.shape)
# maxOutput

(51, 4)


In [15]:
maxOutput_noDup = maxOutput.drop_duplicates(subset=['Anonymized_Name'], keep="first")
print(maxOutput_noDup.shape)
# maxOutput_noDup

(36, 4)


In [16]:
maxOutput_C = maxOutput_noDup[maxOutput_noDup.Actual == maxOutput_noDup.Prediction]

print(maxOutput_C.shape)
# maxOutput_C

(6, 4)


In [17]:
maxOutput_I = maxOutput_noDup[maxOutput_noDup.Actual != maxOutput_noDup.Prediction]

print(maxOutput_I.shape)
# maxOutput_I

(30, 4)


In [18]:
correct_total = maxOutput_C["Anonymized_Name"].count()
incorrect_total = maxOutput_I["Anonymized_Name"].count()

accuracy_maxed_pred = correct_total/(correct_total + incorrect_total)
print(f' Accuracy of most-predicted (dog) family: {round(accuracy_maxed_pred,3)}')

 Accuracy of most-predicted (dog) family: 0.167
