In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import ElasticNetCV
from sklearn import cross_validation
import numpy as np
from sklearn import metrics
from FeatureWeights import FeatureWeights
from sklearn.pipeline import Pipeline
import os

In [None]:
# Location of the tab-separated input table. Set the AG_L6_TABLE environment variable to
# point at the file on another machine; when it is unset, the original hard-coded
# location is used, so existing behaviour is unchanged.
AG_TABLE_PATH = os.environ.get(
    "AG_L6_TABLE",
    "/Users/myazdaniUCSD/Documents/microbiome-regression/data/AG/notrim/ag-cleaned_L6.txt",
)
df = pd.read_table(AG_TABLE_PATH)
# Single-argument print call: same output on Python 2, and valid on Python 3.
print(df.shape)

In [None]:
# Preview the first five rows of every column from position 466 onward.
df.iloc[:, 466:].head(n=5)

In [None]:
# Keep only the rows whose HMP_SITE equals "FECAL".
# The explicit .copy() makes df_rel an independent frame: later cells assign converted
# columns into df_rel, and doing that on a filtered slice of df triggers pandas'
# SettingWithCopyWarning and relies on ambiguous view-vs-copy behaviour.
df_rel = df.loc[df["HMP_SITE"] == "FECAL"].copy()
print(df_rel.shape)

In [None]:
# Same preview as for the full table, restricted to the filtered rows in df_rel.
df_rel.iloc[:, 466:].head(n=5)

# BMI regression

In [None]:
# Convert BMI_CORRECTED to numbers; entries that cannot be parsed become NaN
# (errors="coerce") so they can be dropped afterwards.
df_rel["BMI_CORRECTED"] = pd.to_numeric(df_rel["BMI_CORRECTED"], errors="coerce")

In [None]:
# Discard every row that has a missing value in any column.
# NOTE(review): dropna() looks at all columns, not only BMI_CORRECTED -- confirm that
# dropping rows for gaps in unrelated columns is intended.
df_clean = df_rel.dropna(axis=0, how="any")
print(df_clean.shape)

In [None]:
# Summary statistics of the BMI target after cleaning.
df_clean.BMI_CORRECTED.describe()

In [None]:
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Convert AGE_CORRECTED to numbers (unparseable entries become NaN), then rebuild
# df_clean so that rows without a usable age are removed as well.
df_rel["AGE_CORRECTED"] = pd.to_numeric(df_rel["AGE_CORRECTED"], errors="coerce")
df_clean = df_rel.dropna(axis=0, how="any")
print(df_clean.shape)

In [None]:
# Scatter of BMI (y axis) against age (x axis), one dot per remaining row.
plt.plot(df_clean["AGE_CORRECTED"], df_clean["BMI_CORRECTED"], ".")

In [None]:
# Restrict the BMI analysis to rows with AGE_CORRECTED strictly greater than 20.
df_age_20 = df_clean.loc[df_clean["AGE_CORRECTED"] > 20]

In [None]:
# Target vector: BMI. Feature matrix: every column from position 467 onward.
# NOTE(review): the preview cells slice from 466 while the features start at 467 --
# confirm that 467 is the first feature column.
y = np.array(df_age_20.loc[:, "BMI_CORRECTED"])
X = np.array(df_age_20.iloc[:, 467:])
print("y shape %s" % (y.shape,))
print("X shape %s" % (X.shape,))

In [None]:
# Random forest with 1000 trees, trained on all available cores (n_jobs=-1).
# random_state is pinned so that the cross-validated errors computed with this estimator
# are reproducible from run to run; without it each run grows a different forest.
RF = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=42)

In [None]:
# 10-fold cross-validation of the forest on the BMI data. The sign of the returned
# scores is flipped so that `scores` holds positive per-fold errors.
scores = -cross_validation.cross_val_score(
    RF, X, y, cv=10, scoring="mean_absolute_error"
)

In [None]:
# Mean and standard deviation of the per-fold errors, separated by a single space.
print("%s %s" % (scores.mean(), scores.std()))

In [None]:
# k-nearest-neighbours with k tuned over 1..10 by an inner grid search; the outer
# 10-fold cross-validation scores the whole tuning procedure.
KNN = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid={"n_neighbors": range(1, 11)},
    n_jobs=-1,
    scoring="mean_absolute_error",
)
# Flip the sign so that `scores` holds positive per-fold errors.
scores = -cross_validation.cross_val_score(
    KNN, X, y, cv=10, scoring="mean_absolute_error"
)
print("%s %s" % (scores.mean(), scores.std()))

In [None]:
# Elastic net whose l1_ratio and alpha are selected by its own internal 3-fold CV;
# the outer 10-fold cross-validation estimates the error of that whole procedure.
ENet = ElasticNetCV(
    l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
    eps=0.001,
    n_alphas=100,
    normalize=False,
    max_iter=1000,
    tol=0.0001,
    cv=3,
    copy_X=True,
    n_jobs=-1,
)

# Flip the sign so that `scores` holds positive per-fold errors.
scores = -cross_validation.cross_val_score(
    ENet, X, y, cv=10, scoring="mean_absolute_error"
)
print("%s %s" % (scores.mean(), scores.std()))

In [None]:
# Fixed-seed split of the BMI data.
# NOTE(review): test_size=0.85 leaves only 15% of the rows in the training split that
# the following cells work on -- confirm this is deliberate and not meant to be
# train_size=0.85.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.85
)
print(X_train.shape)
print(y_train.shape)

In [None]:
# Random forest (1000 trees, all cores) evaluated by 10-fold cross-validation on the
# training split only. random_state is pinned so the reported error is reproducible;
# without it every run grows a different forest and prints different numbers.
RF = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=42)
# Flip the sign so that `scores` holds positive per-fold errors.
scores = -1 * cross_validation.cross_val_score(
    RF, X_train, y_train, cv=10, scoring="mean_absolute_error"
)
# Mean and standard deviation of the per-fold errors.
print("%s %s" % (np.mean(scores), np.std(scores)))

In [None]:
# Grid-searched k-nearest-neighbours (k in 1..10), scored by 10-fold cross-validation
# on the training split only.
KNN = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid={"n_neighbors": range(1, 11)},
    n_jobs=-1,
    scoring="mean_absolute_error",
)
# Flip the sign so that `scores` holds positive per-fold errors.
scores = -cross_validation.cross_val_score(
    KNN, X_train, y_train, cv=10, scoring="mean_absolute_error"
)
print("%s %s" % (scores.mean(), scores.std()))

In [None]:
# FeatureWeights comes from the project-local FeatureWeights module; it is used here as
# the first pipeline step, ahead of the grid-searched k-NN regressor.
# NOTE(review): presumably it rescales features by learned weights -- confirm against
# the FeatureWeights implementation.
FW = FeatureWeights()

# Inner grid search over k (1..10) and the Minkowski power p (1 or 2).
KNN = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid={"n_neighbors": range(1, 11), "p": [1, 2]},
    n_jobs=-1,
    scoring="mean_absolute_error",
)

metric_KNN = Pipeline(steps=[("metric", FW), ("knn", KNN)])

# Outer 10-fold cross-validation on the training split; the sign is flipped so that
# `scores` holds positive per-fold errors.
scores = -cross_validation.cross_val_score(
    metric_KNN, X_train, y_train, cv=10, scoring="mean_absolute_error"
)



In [None]:
# Mean and standard deviation of the per-fold errors, separated by a single space.
print("%s %s" % (scores.mean(), scores.std()))

In [None]:
# Fit a fresh FeatureWeights instance on the training split so that its learned
# `weights` attribute can be inspected in the following cells.
# The constructor and fit() are kept as separate statements because the return value of
# the project-local fit() is not visible here.
FW = FeatureWeights()
FW.fit(X_train, y_train)

In [None]:
# Fitted weights as a plain ndarray with all length-1 axes removed.
found_weights = np.squeeze(np.asarray(FW.weights))

In [None]:
# Names of the feature columns (position 467 onward), matching the columns used for X.
microbe_names = np.array(df_age_20.columns[467:])

In [None]:
# Feature names ordered from the largest fitted weight to the smallest
# (argsort of the negated weights gives descending order).
descending_weight_order = np.argsort(-found_weights)
list(microbe_names[descending_weight_order])

In [None]:
# Log-log scatter of one feature column against BMI. The 1e-8 offset keeps log10
# finite where the feature value is zero.
acetobacter_column = 'k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodospirillales;f__Acetobacteraceae;g__Acetobacter'
plt.plot(
    np.log10(1e-8 + df_age_20[acetobacter_column]),
    np.log10(df_age_20.BMI_CORRECTED),
    '.',
)

## Age regression

In [None]:
# Make sure AGE_CORRECTED is numeric; entries that cannot be parsed become NaN.
# The conversion is applied to the whole column in one vectorised call and written back
# with a plain column assignment. The previous form called pd.to_numeric once per cell
# value and assigned a Series to a list-of-columns key, which is slow and fragile.
df_rel["AGE_CORRECTED"] = pd.to_numeric(df_rel["AGE_CORRECTED"], errors="coerce")
# Summary statistics of the age target.
df_rel["AGE_CORRECTED"].describe()

In [None]:
# Rebuild df_clean for the age model by discarding rows with a missing value in any
# column.
df_clean = df_rel.dropna(axis=0, how="any")
print(df_clean.shape)

In [None]:
# Target vector: age. Feature matrix: every column from position 467 onward.
y = np.array(df_clean.loc[:, "AGE_CORRECTED"])
X = np.array(df_clean.iloc[:, 467:])

# Fixed-seed split that holds out 25% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.25
)

In [None]:
# Random forest (1000 trees, all cores) fitted on the age training split.
# random_state is pinned so the fitted forest, and the train/test scores read from it,
# are reproducible from run to run.
RF = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=42)
RF.fit(X_train, y_train)

In [None]:
# Score of the fitted forest on the data it was trained on (for scikit-learn
# regressors, score() is the coefficient of determination, R^2).
RF.score(X_train, y_train)

In [None]:
# Score of the fitted forest on the held-out test split (R^2 for scikit-learn
# regressors); compare it with the training score to gauge overfitting.
RF.score(X_test, y_test)

In [None]:
# 5-fold cross-validation of the forest on the full age data. The sign of the returned
# scores is flipped so that `scores` holds positive per-fold errors.
scores = -cross_validation.cross_val_score(
    RF, X, y, cv=5, scoring="mean_absolute_error"
)

In [None]:
# Display the per-fold errors stored in `scores`.
scores

In [None]:
# Average error across the folds.
scores.mean()

In [None]:
# k-nearest-neighbours for age with k tuned over 10..19 by an inner grid search; the
# outer 10-fold cross-validation scores the whole tuning procedure.
KNN = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid={"n_neighbors": range(10, 20)},
    n_jobs=-1,
    scoring="mean_absolute_error",
)
# Flip the sign so that `scores` holds positive per-fold errors.
scores = -cross_validation.cross_val_score(
    KNN, X, y, cv=10, scoring="mean_absolute_error"
)
# Average error across the folds (displayed as the cell output).
scores.mean()