In [None]:
# importing libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the dataset from the local CSV file 'smoking.csv' into a DataFrame
df = pd.read_csv('smoking.csv')
# Report the (rows, columns) shape, then preview the first rows
print(df.shape)
df.head()

In [None]:
# Treat '?' placeholders as missing values, then drop every row that
# contains at least one missing value (dropna() removes rows, not columns).
df = df.replace('?', np.nan).dropna()
df.head()

In [None]:
# Check the (rows, columns) shape of df at this point in the notebook
print(df.shape)

In [None]:
# Encode the two-valued text features that are kept as 0/1 integers.
# 'oral' is not encoded here because it is dropped below anyway.
df = df.replace({
    'gender': {'F': 0, 'M': 1},
    'tartar': {'Y': 1, 'N': 0},
})

# Remove the identifier and the features left out of the analysis in a single
# call (one copy of the frame instead of one per dropped column).
df = df.drop(columns=[
    'ID',
    'oral',
    'eyesight(left)',
    'eyesight(right)',
    'hearing(left)',
    'hearing(right)',
    'LDL',
    'Urine protein',
    'HDL',
    'Cholesterol',
    'AST',
])

In [None]:
# Preview the first rows of df after the encoding / column-drop step
df.head()

In [None]:
# Histogram of each selected column, one panel per column.
num_list = [
    'age', 'systolic', 'relaxation', 'height(cm)', 'weight(kg)',
    'triglyceride', 'waist(cm)', 'hemoglobin', 'serum creatinine', 'Gtp',
]
fig = plt.figure(figsize=(10, 30))

# Panels fill a 15x2 grid left-to-right, top-to-bottom; subplot slots are
# 1-based, hence i + 1. Only len(num_list) of the slots are used.
for i, column in enumerate(num_list):
    ax = fig.add_subplot(15, 2, i + 1)
    ax.set_title(column)
    ax.hist(df[column], color='green', alpha=0.5)

plt.tight_layout()

In [None]:
# Violin plot of each selected column, one panel per column.
num_list4 = [
    'age', 'systolic', 'relaxation', 'height(cm)', 'weight(kg)',
    'triglyceride', 'waist(cm)', 'hemoglobin', 'serum creatinine', 'Gtp',
]
fig = plt.figure(figsize=(10, 40))

# Panels fill an 11x2 grid; subplot slots are 1-based, hence i + 1.
for i, column in enumerate(num_list4):
    ax = fig.add_subplot(11, 2, i + 1)
    ax.set_title(column)
    ax.violinplot(df[column])

plt.tight_layout()

In [None]:
# Pairwise relationships between the selected columns, with points coloured
# by the value of the 'smoking' column.
pair_columns = [
    'age', 'systolic', 'relaxation', 'height(cm)', 'weight(kg)', 'waist(cm)',
    'triglyceride', 'hemoglobin', 'serum creatinine', 'Gtp',
]
sns.pairplot(df, hue='smoking', vars=pair_columns)

In [None]:
# Pairwise correlation between the columns of df
# (pandas uses the Pearson coefficient unless another method is requested)
df.corr()

In [None]:
# Visualise the correlation matrix as an annotated heatmap;
# fmt='.0%' prints each coefficient as a whole-number percentage.
plt.figure(figsize=(20, 20))
corr_matrix = df.corr()
sns.heatmap(data=corr_matrix, annot=True, fmt='.0%')

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the columns
df.describe()

In [None]:
# Number of rows for each value of the target column 'smoking' (class balance)
df['smoking'].value_counts()

In [None]:
# Mean of every feature per 'smoking' value, transposed so that each row is a
# feature and each column is one smoking label.
df1 = df.groupby('smoking').mean().T

# Relative difference (in percent) between the label-1 mean and the label-0
# mean, measured against the label-1 mean. Columns are addressed by the labels
# 0 and 1, i.e. this expects 'smoking' to hold the integers 0 and 1.
df1['gap'] = (df1[1] - df1[0]) / df1[1] * 100

# Show the features with the largest gap first.
df1.sort_values(by="gap", ascending=False)

In [None]:
# Separate the target column (Y) from the remaining predictor columns (X).
Y = df['smoking']
X = df.drop(columns=['smoking'])

In [None]:
# Hold out 25% of the rows as the test split; random_state=0 makes the split repeatable
X_train, X_test,Y_train,Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)

In [None]:
# Scale the data (feature scaling).
# The scaler is fitted on the training split only; the test split is then
# transformed with the mean/std learned from the training data. Re-fitting on
# the test split would scale it with different statistics than the models were
# trained on and would let test-set information into the preprocessing.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Create a function that fits all of the models
def models(X_train, Y_train):
    """Fit three classifiers on the training data and report their training accuracy.

    Parameters
    ----------
    X_train : training features, passed unchanged to each estimator's ``fit``.
    Y_train : training labels, passed unchanged to each estimator's ``fit``.

    Returns
    -------
    tuple
        ``(log, tree, forest)``: the fitted LogisticRegression,
        DecisionTreeClassifier and RandomForestClassifier, in that order.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.tree import DecisionTreeClassifier

    # (report label, estimator) pairs; the order defines the returned tuple.
    labelled_estimators = [
        ('[0]Logistic Regression Training Accuracy:',
         LogisticRegression(random_state=0)),
        ('[1]Decision Tree Training Accuracy:',
         DecisionTreeClassifier(criterion='entropy', random_state=0)),
        ('[2]Random Forest Training Accuracy:',
         RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)),
    ]

    # Fit every estimator first, then print the accuracies together.
    for _, estimator in labelled_estimators:
        estimator.fit(X_train, Y_train)

    # Accuracy on the same data the models were fitted on (not a test score).
    for label, estimator in labelled_estimators:
        print(label, estimator.score(X_train, Y_train))

    return tuple(estimator for _, estimator in labelled_estimators)

In [None]:
# Fit all models on the training split; `model` is the tuple
# (log, tree, forest) returned by models(), indexed 0..2 below
model = models(X_train, Y_train)

In [None]:
# Evaluate every fitted model on the test split via its confusion matrix
from sklearn.metrics import confusion_matrix

for i in range(len(model)):
    print('Model ', i)
    cm = confusion_matrix(Y_test, model[i].predict(X_test))

    # scikit-learn layout: rows are the true labels, columns the predicted
    # labels, both in sorted label order. With labels 0 (negative) and
    # 1 (positive) that gives [[TN, FP], [FN, TP]].
    TN = cm[0][0]
    FP = cm[0][1]
    FN = cm[1][0]
    TP = cm[1][1]

    print(cm)
    # Accuracy = correctly classified samples / all samples
    print('Testing Accuracy = ', (TP + TN)/(TP+TN+FN+FP))
    print()

In [None]:
# Per-class precision / recall / F1 report and overall accuracy on the test split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

for i in range(len(model)):
    print('Model ', i)
    # Predict once per model and reuse the result for both metrics.
    Y_pred = model[i].predict(X_test)
    print(classification_report(Y_test, Y_pred))
    print(accuracy_score(Y_test, Y_pred))