#https://wwwn.cdc.gov/Nchs/Nhanes/Search/variablelist.aspx?Component=Questionnaire&CycleBeginYear=2013
##The variable dictionary to understand the code at the top of each column. 

In [None]:
import os
import re
import warnings
from pathlib import Path

import matplotlib
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import sklearn
from numpy.random import seed
from scipy import stats
from scipy.stats import norm
from scipy.stats import t

warnings.filterwarnings("ignore")

%matplotlib inline

In [None]:
# json_normalize has been exposed at the top level of pandas since 1.0 and the
# pandas.io.json location is deprecated; fall back to it only on old versions.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

# Folder holding the three CSV extracts. The default is the original location;
# set the NHANES_DATA_DIR environment variable to run the notebook elsewhere
# without editing this cell.
DATA_DIR = Path(os.environ.get('NHANES_DATA_DIR', '/Users/Sandi/Desktop/Projects/Capstone_2'))

data = pd.read_csv(DATA_DIR / 'questionnaire.csv')
demographics = pd.read_csv(DATA_DIR / 'demographic.csv')
labs = pd.read_csv(DATA_DIR / 'labs.csv')

**Data Set Merge and Select Attribute**

In [None]:
data.drop(['SEQN'], axis = 1, inplace=True)
demographics.drop(['SEQN'], axis = 1, inplace=True)
labs.drop(['SEQN'], axis = 1, inplace=True)
#https://www.kaggle.com/what0919/diabetes-prediction

In [None]:
df = pd.concat([data, demographics], axis=1, join='inner')
df = pd.concat([df, labs], axis=1, join='inner')
df.describe()
#merge

**NaN Handling and Feature Column Selection**

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Raw variable code -> readable column label. The dict order is also the column
# order of the resulting frame, so the labels are written only once.
column_labels = {
    'SEQN': 'ID',
    'RIAGENDR': 'Gender',
    'RIDAGEYR': 'AgeAtSurvey',
    'WHD120': 'WeightAt25Yrs',
    'WHD130': 'HeightAt25Yrs',
    'LBXPLTSI': 'PlateletCount',
    'LBXSCK': 'CreatinePhosphokinase',
    'MCQ160A': 'HasArthritis',
    'MCQ010': 'HasAsthma',
    'CSQ204': 'AllergyCongestionPast12Months',
    'DBQ010': 'Breast_fed',
    'DBQ229': 'UseMilk>=5x/wk',
    'DBQ235B': 'DrankMilkAge13to17yrs',
    'DBQ235C': 'DrankMilkAge18to35yrs',
    'PUQ100': 'ChemicalPesticideUsedInHomePast7days',
    'PUQ110': 'ChemicalWeedKillUsedExteriorHomePast7days',
    'PAQ610': 'NumberOfDays/WeekVigorousWork',
    'PAD615': 'Minutes/DayVigorousIntensityWork',
    'PAQ625': 'NumberOfDays/WeekModerateWork',
    'PAD630': 'Minutes/DayModerateIntensityWork',
    'PAQ635': 'WalkOrBicycle10+Min/weekYN',
    'PAQ640': 'NumberDays/WeekWalkBicycle',
    'PAD645': 'Minutes/DayWalkBicycleTransportation',
    'PAQ650': 'VigorousRecreationalActivities10+min/WeekYN',
    'PAQ655': 'Days/WeekVigorousRecreationalActivities',
    'PAD660': 'Minutes/DayVigorousRecreationalActivities',
    'PAQ665': 'ModerateRecreationalActivities10+min/WeekYN',
    'PAQ670': 'Days/WeekModerateRecreationalActivities',
    'PAD675': 'Minutes/DayModerateRecreationalActivities',
    'PAD680': 'Minutes/DaySedentaryActivity',
}

# Apply the readable labels, then keep only the labelled columns.
selected_labels = list(column_labels.values())
df = df.rename(columns=column_labels)
df = df.loc[:, selected_labels]

#'PAQ677' : 'Past7DaysCardio20Minutes+','PAQ678' : 'Past7DaysStrengthTone20Minutes+', 'PAQ706' : 'Past7DaysCardio60Minutes+'


In [None]:
# Print the dtype and non-null count of each column kept by the selection above.
df.info()


**Feature Generation**
A new column is created for BMI.  
The formula is $\mathrm{BMI} = \mathrm{kg}/\mathrm{m}^2$, where kg is a person's weight in kilograms and $\mathrm{m}^2$ is the square of their height in meters.  
That is, 'WeightAt25Yrs' in kilograms divided by the square of 'HeightAt25Yrs' in meters.  

In [None]:
# BMI at age 25 = weight [kg] / (height [m])**2.
# The previous expression divided the weight by the weight squared and never
# used the height column.
#
# Units: per the WHQ_H codebook linked below, WHD120 (weight at 25) is stored in
# pounds and WHD130 (height at 25) in inches, with 7777 = refused and
# 9999 = don't know. NOTE(review): confirm against the codebook for the cycle used.
#https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/WHQ_H.htm#Component_Description
#English Instructions:
    #ENTER HEIGHT IN FEET AND INCHES OR METERS AND CENTIMETERS.
    #RECORD CURRENT WEIGHT. ENTER WEIGHT IN POUNDS OR KILOGRAMS. 
    #IF PREGNANT, ASK FOR WEIGHT BEFORE PREGNANCY. 
NON_RESPONSE_CODES = [7777, 9999]
KG_PER_POUND = 0.45359237
METERS_PER_INCH = 0.0254

# Treat the refused / don't-know codes as missing so they cannot be read as
# real measurements.
weight_kg = df['WeightAt25Yrs'].replace(NON_RESPONSE_CODES, np.nan) * KG_PER_POUND
height_m = df['HeightAt25Yrs'].replace(NON_RESPONSE_CODES, np.nan) * METERS_PER_INCH

# Drop a BMI column left by an earlier run so re-running this cell replaces it
# instead of stacking a duplicate column.
if 'BMI' in df.columns:
    df = df.drop(columns='BMI')
df.insert(4, 'BMI', weight_kg / height_m**2)

In [None]:
# Print the dtypes and non-null counts again now that the BMI column has been inserted.
df.info()

In [None]:
# Pairwise correlations of every column except 'ID', drawn as an annotated heatmap.
corr_matrix = df.astype(float).drop(columns='ID').corr()

colormap = plt.cm.viridis
plt.figure(figsize=(20, 20))
sns.heatmap(
    corr_matrix,
    linewidths=0.1,
    vmax=1.0,
    square=True,
    cmap=colormap,
    annot=True,
)

In [None]:
# Codebook: https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/MCQ_H.htm#MCQ160a
# A value of 1.0 means the participant said yes: a doctor or health professional
# told them they had arthritis.
df.loc[:, 'HasArthritis'].value_counts()

**Question:  Is there a correlation between having been told they had arthritis and participants' blood counts?**
When arthritis = 1.0, report the Creatine Phosphokinase and the platelet count.  


In [None]:
# Blood-count columns for the participants whose arthritis answer is 1.
blood_count_cols = ['CreatinePhosphokinase', 'PlateletCount']
reported_arthritis = df['HasArthritis'].eq(1)
arthritis_bc_yes = df.loc[reported_arthritis, blood_count_cols]
arthritis_bc_yes.head(10)

In [None]:
# Discard every row that has a NaN in either blood-count column.
arthritis_bc = arthritis_bc_yes.dropna(axis=0, how='any')
arthritis_bc.head(n=10)

In [None]:
# Platelet count (x) against CPK (y) for participants who reported arthritis.
fig, ax = plt.subplots()
ax.scatter(arthritis_bc['PlateletCount'], arthritis_bc['CreatinePhosphokinase'])
ax.set_xlabel('Platelet Count 1000cells/uL')
ax.set_ylabel('Creatine Phosphokinase (CPK)')
ax.set_title('Identified with Having Arthritis');
# TODO: add cross hairs?

**Answer:**  
The scatterplot shows that most participants who reported arthritis fall in the lower-left quadrant, which indicates lower platelet and CPK counts.  Based on this visual alone, one hypothesis is that a participant with a low CPK level and a low platelet count has a good chance of being told by a health professional that they have arthritis.
Further, there are no participants in the upper-right quadrant, which suggests that higher counts for both CPK and platelets are associated with not being diagnosed with arthritis.  Because this plot only contains participants who reported arthritis, confirming either statement requires comparing it with the same plot for participants who did not.
There are definitely a few outliers in the upper-left quadrant, but they are too few to be significant and would not be included when making generalizations.  
There are more outliers in the lower-right quadrant, perhaps enough to justify looking further into whether higher platelet counts play a role in diagnosing possible arthritis.

In general, high CPK levels in the muscle suggest the presence of inflammatory muscle disease, but they can also be caused by trauma, injection into the muscle, or muscle disease due to hypothyroidism. Conversely, low levels of CPK can be indicative of rheumatoid arthritis.
https://www.arthritis-health.com/glossary/creatine-phosphokinase

**Question:  Examining participants who have been diagnosed with asthma.  Can machine learning help in predicting asthma in people based on certain factors of nutrition growing up, current chemical exposure, or current activity levels?**


In [None]:
# Columns kept for the asthma subset.
asthma_feature_cols = [
    'AllergyCongestionPast12Months',
    'Breast_fed',
    'UseMilk>=5x/wk',
    'DrankMilkAge13to17yrs',
    'DrankMilkAge18to35yrs',
    'ChemicalPesticideUsedInHomePast7days',
    'ChemicalWeedKillUsedExteriorHomePast7days',
    'NumberOfDays/WeekVigorousWork',
    'Minutes/DayVigorousIntensityWork',
    'NumberOfDays/WeekModerateWork',
    'Minutes/DayModerateIntensityWork',
    'WalkOrBicycle10+Min/weekYN',
    'NumberDays/WeekWalkBicycle',
    'Minutes/DayWalkBicycleTransportation',
    'VigorousRecreationalActivities10+min/WeekYN',
    'Days/WeekVigorousRecreationalActivities',
    'Minutes/DayVigorousRecreationalActivities',
    'ModerateRecreationalActivities10+min/WeekYN',
    'Days/WeekModerateRecreationalActivities',
    'Minutes/DayModerateRecreationalActivities',
    'Minutes/DaySedentaryActivity',
    'BMI',
]

# Rows whose asthma answer is 1, restricted to the columns listed above.
reported_asthma = df['HasAsthma'].eq(1)
asthma_yes_all = df.loc[reported_asthma, asthma_feature_cols]


**Look at participants identified as having asthma and relate that to their answers about milk consumption, being breast-fed, chemical pesticide use, BMI at age 25, and the columns related to exercise and activity.**
Planned approach: clustering analysis and machine-learning prediction.  Clustering will produce groups; pick one group and plot it.





In [None]:
from sklearn import linear_model
from sklearn.svm import SVC
from sklearn.model_selection import KFold
#from sklearn.cross_validation import KFold;
from sklearn.metrics import mean_squared_error, r2_score


In [None]:
# Print the dtype and non-null count of each column in the asthma subset.
asthma_yes_all.info()

**The code below will be used, with changes, to work for HasAsthma.**

# NOTE(review): this snippet is kept as a template to adapt. It refers to the
# 'GlycoHemoglobin' and 'Diabetes' columns, which are not among the columns this
# notebook selects into df, so the column names must be replaced before it is run.
#Drop all except the milk consumption, breastfed, and chemical pesticide and weed kill.

df.drop(['GlycoHemoglobin'], axis = 1, inplace=True)


#data -> attributes, target -> diabetes
# NOTE(review): `data` is also the name of the questionnaire table loaded
# earlier; assigning it here overwrites that table.

data = df.drop(['Diabetes'], axis=1)
target = df[['Diabetes']]

#separate training set and test set
# Positional split: the first 6000 rows train, the remaining rows test (no shuffling).

train_X = data[:6000]
test_X = data[6000:]
train_Y = target[:6000]
test_Y = target[6000:]

#create linear regression obj

lr_regr = linear_model.LinearRegression()

#training via linear regression model

lr_regr.fit(train_X, train_Y)

#make prediction using the test set
# lr_score is not used again; the last print below calls score() a second time.

lr_pred_diabetes = lr_regr.predict(test_X)
lr_score = lr_regr.score(test_X, test_Y)

print('LRr_Coefficients: ', lr_regr.coef_)
print('LR_Mean Square Error: %.2f' % mean_squared_error(test_Y, lr_pred_diabetes))
print('LR_Variance score: %.2f' % r2_score(test_Y, lr_pred_diabetes))
print('Score: %.2f' % lr_regr.score(test_X, test_Y))


**Next, look at participants identified as having allergy congestion and relate that to their answers about milk consumption, being breast-fed, and chemical pesticide use.**

In [None]:
#This can be used for a separate study on children
#'PAQ677' : 'Past7DaysCardio20Minutes+'
#'PAQ678' : 'Past7DaysStrengthTone20Minutes+'
#'PAQ706' : 'Past7DaysCardio60Minutes+'
#**Exercise conversion to be counted at yes if indicated 3 or more days that week**
#old value(string) into new value(integer)  #1 == "yes"
#df.loc[(df['Past7DaysCardio20Minutes+'] >=3) & (df['Past7DaysCardio20Minutes+'] <=7),'Past7DaysCardio20Minutes+'] = 1
#df.loc[(df['Past7DaysStrengthTone20Minutes+'] >=3) & (df['Past7DaysStrengthTone20Minutes+'] <=7),'Past7DaysStrengthTone20Minutes+'] = 1
#df.loc[(df['Past7DaysCardio60Minutes+'] >=3) & (df['Past7DaysCardio60Minutes+'] <=7),'Past7DaysCardio60Minutes+'] = 1