In [None]:
#Import libraries needed
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

from piechart import corr_plot
from piechart import glm_regression
# %matplotlib inline #matplotlib formatting


# Load the dataset into `cardio`; the file is parsed with a semicolon as the field delimiter (sep=';').
# NOTE(review): the path "~./cardio_disease.csv" looks like a typo for "~/..." or "./..." — confirm the intended location.
cardio = pd.read_csv( "~./cardio_disease.csv", sep = ';')

In [None]:
# Preview the first ten records.
cardio.head(n=10)

In [None]:
# Report the share of missing values in each column, rounded to a whole percentage.
for col in cardio.columns:
    pct_missing = cardio[col].isnull().mean()
    print(f'{col} - {round(pct_missing*100)}%')

In [None]:
# Number of rows that are exact repeats of an earlier row.
cardio.duplicated(keep='first').sum()

In [None]:
# Remove exact duplicate rows, keeping the first occurrence.
# Rebinding the name (instead of inplace=True) makes the data lineage explicit and follows the
# pandas recommendation to avoid in-place mutation.
cardio = cardio.drop_duplicates()

In [None]:
# Re-count duplicate rows to confirm the clean-up.
cardio.duplicated(keep='first').sum()

In [None]:
# Print the column dtypes, non-null counts and memory usage.
# `info` is a method: without the call parentheses the cell only shows the bound-method repr.
cardio.info()

In [None]:
# Add a BMI column computed as WEIGHT / (HEIGHT / 100)^2, rounded to two decimals.
# The division by 100 assumes HEIGHT is in centimetres and WEIGHT in kilograms — TODO confirm.
bmi = (cardio['WEIGHT'] / (cardio['HEIGHT'] / 100) ** 2).round(2)
if 'BMI' in cardio.columns:
    # Re-running the cell: DataFrame.insert would raise ValueError for an existing
    # column, so refresh the values in place instead.
    cardio['BMI'] = bmi
else:
    cardio.insert(4, 'BMI', bmi)

In [None]:
# Print the frame summary again to confirm the new column is present.
# `info` must be called; the bare attribute only displays the bound-method repr.
cardio.info()

In [None]:
# Row count per GENDER value, returned as a two-column frame (GENDER, Cnt).
gender_sizes = cardio.groupby("GENDER").size()
gender = gender_sizes.reset_index(name="Cnt")

In [None]:
# Display the per-GENDER row counts computed above.
gender

In [None]:
# Row count per AGE value, returned as a two-column frame (AGE, Cnt).
age_sizes = cardio.groupby("AGE").size()
age = age_sizes.reset_index(name="Cnt")

In [None]:
# Display the per-AGE row counts computed above.
age

In [None]:
# Summary statistics for the columns of the frame.
cardio.describe()

In [None]:
# Three side-by-side histograms (10 bins, with KDE overlay) for AGE, HEIGHT and WEIGHT.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 4))
sns.histplot(cardio["AGE"], ax=axes[0], bins=10, kde=True)
sns.histplot(cardio["HEIGHT"], ax=axes[1], bins=10, kde=True)
sns.histplot(cardio["WEIGHT"], ax=axes[2], bins=10, kde=True)

In [None]:
# AGE distribution in six bins.
sns.histplot(data=cardio, x="AGE", bins=6)

In [None]:
# Smallest AGE value in the data.
cardio["AGE"].min()

In [None]:
# Largest AGE value in the data.
cardio["AGE"].max()

In [None]:
# Box plots of AGE (left) and BMI (right), split by GENDER.
fig, ax = plt.subplots(1, 2, figsize=(20, 10))
sns.boxplot(x='GENDER', y='AGE', data=cardio, ax=ax[0])
sns.boxplot(x='GENDER', y='BMI', data=cardio, ax=ax[1])

In [None]:
# Box plot of WEIGHT split by GENDER.
sns.boxplot(x='GENDER', y='WEIGHT', data=cardio)

In [None]:
# HEIGHT distribution in ten bins.
sns.histplot(data=cardio, x="HEIGHT", bins=10)

In [None]:
# WEIGHT distribution in seven bins.
sns.histplot(data=cardio, x="WEIGHT", bins=7)

In [None]:
# Pass the cardio frame to the project helper `corr_plot` (imported from `piechart`).
# Per the original author's note the output is a correlation heatmap — the helper's
# implementation is not visible here, so verify against its source.
corr_plot(cardio)

In [None]:
# Split the records by outcome so the two classes can be balanced.
# NOTE(review): the variable names treat CARDIO_DISEASE == 1 as the larger class —
# confirm with cardio.CARDIO_DISEASE.value_counts().
cardio_majority = cardio[cardio.CARDIO_DISEASE == 1]
cardio_minority = cardio[cardio.CARDIO_DISEASE == 0]

# Draw from the CARDIO_DISEASE == 0 rows with replacement until there are as many as
# CARDIO_DISEASE == 1 rows. The target size is taken from the data (previously the
# hard-coded 33053) so the classes stay balanced if the input or the cleaning changes.
cardio_minority_upsampled = resample(
    cardio_minority,
    replace=True,
    n_samples=len(cardio_majority),
    random_state=34,
)

# Combined, class-balanced frame; print the class counts as a sanity check.
cardio_fin = pd.concat([cardio_majority, cardio_minority_upsampled])
print(cardio_fin.CARDIO_DISEASE.value_counts())

In [None]:
# Features are every column except the target; the target is taken as a NumPy array.
# NOTE(review): the split is built from `cardio`, not from the class-balanced
# `cardio_fin` created earlier — confirm that this is intended.
X = cardio.drop(columns=['CARDIO_DISEASE'])
y = cardio['CARDIO_DISEASE'].values

# 60/40 stratified, shuffled split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.40,
    random_state=1,
    shuffle=True,
    stratify=y,
)

In [None]:
# Call the project helper `glm_regression` (imported from `piechart`) with the train and test splits.
# Its return value is later handed to RFE as `estimator`, so it is presumably a
# scikit-learn-compatible model — TODO confirm against the helper's source.
logistic_regression = glm_regression(X_train, y_train, X_test,y_test)

In [None]:
# Recursive feature elimination driven by the object returned from glm_regression,
# fitted on the full feature matrix and target.
rfe_selector = RFE(
    estimator=logistic_regression,
    n_features_to_select=13,
    step=10,
    verbose=5,
)
rfe_selector.fit(X, y)

# Boolean mask of retained columns, then the matching column names.
rfe_support = rfe_selector.get_support()
rfe_feature = X.columns[rfe_support].tolist()
print(f'{len(rfe_feature)} selected features')
print(rfe_feature)

In [None]:
rdg = Ridge()
alphas = 10**np.linspace(10,-2,100)*0.5
ridge_cv = RidgeCV(alphas = alphas, scoring = 'neg_mean_squared_error', normalize = True)
ridge_cv.fit(X_train, y_train)
ridge_cv.alpha_

In [None]:
ridge_reg = Ridge(alpha = ridge_cv.alpha_, normalize = True)
ridge_reg.fit(X_train, y_train)
mean_squared_error(y_test, ridge_reg.predict(X_test))

In [None]:
ridge_reg.fit(X, y)
pd.Series(ridge_reg.coef_, index = X.columns)

In [None]:
# The `normalize` argument of scikit-learn's linear models was deprecated in 1.0 and
# removed in 1.2, so passing it raises TypeError on current releases. Features are
# standardized with an explicit StandardScaler pipeline step instead. The pipeline fits
# the estimator objects handed to it, so `lasso_cv.alpha_` and `lasso.coef_` remain
# available afterwards (coefficients are on the standardized-feature scale).
lasso = Lasso(max_iter = 10000)

# 5-fold cross-validation over LassoCV's automatically generated alpha grid.
lasso_cv = LassoCV(cv = 5, max_iter = 100000)
lasso_cv_model = make_pipeline(StandardScaler(), lasso_cv)
lasso_cv_model.fit(X_train, y_train)

# Refit a plain Lasso at the selected penalty and score it on the held-out split.
lasso.set_params(alpha=lasso_cv.alpha_)
lasso_model = make_pipeline(StandardScaler(), lasso)
lasso_model.fit(X_train, y_train)
y_preds_lasso = lasso_model.predict(X_test)
mean_squared_error(y_test, y_preds_lasso)

In [None]:
# Display the penalty strength chosen by the fitted LassoCV.
lasso_cv.alpha_

In [None]:
# Fitted lasso coefficients labelled by feature name.
pd.Series(data=lasso.coef_, index=X.columns)



In [None]:
# Coefficient of determination of the lasso predictions on the held-out split.
# (`r2_score` is imported with the other dependencies in the notebook's first cell.)
r2_score(y_test, y_preds_lasso)