In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# Modeling
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# Imbalanced learning
from imblearn.pipeline import Pipeline

# Metrics
from sklearn.metrics import (accuracy_score, f1_score, precision_score, 
                             recall_score, roc_auc_score, classification_report, 
                             confusion_matrix, roc_curve, precision_recall_curve)

In [4]:
df = pd.read_csv(r'/home/imran/Desktop/Nayeem/Youth-Tobacco-Survey/CODE/South_East_Asia.csv')

In [5]:
df['current_tobacco'] = np.where(
    ((df['q1'] == 1) & (df['q2'] == 2) & (df['q3'] == 2)),'no','yes')

df.drop(columns=['q1', 'q2','q3'], inplace=True)

df['Susceptibility'] = np.where(
    ((df['q7'] == 1) & (df['q8'] == 1) ),'non-susceptible','susceptible')

df.drop(columns=['q7', 'q8'], inplace=True)

df['tobacco_naive'] = np.where(
    ((df['q4'] == 2) & (df['q5'] == 2) & (df['q6'] == 2)),'no','yes')

df.drop(columns=['q4', 'q5','q6'], inplace=True)

In [6]:
mapping = {
    1: '13 or below', 2: '13 or below', 3: '13 or below',
    4: '14 years',
    5: '15 or above', 6: '15 or above', 7: '15 or above'
}

df['q9'] = df['q9'].map(mapping)

mapping = {
    1: 'Male',
    2: 'Female'
}

df['q10'] = df['q10'].map(mapping)

mapping = {
    1: 'No',
    2: 'Yes', 3: 'Yes', 4: 'Yes', 5: 'Yes'
}

df['q11'] = df['q11'].map(mapping)

mapping = {
    1: 'No',
    2: 'Yes', 3: 'Yes', 4: 'Yes', 5: 'Yes'
}

df['q12'] = df['q12'].map(mapping)

mapping = {
    1: 'No',
    2: 'Yes', 3: 'Yes', 4: 'Yes', 5: 'Yes'
}

df['q13'] = df['q13'].map(mapping)

mapping = {
    1: 'Yes',
    2: 'No'
}

df['q14'] = df['q14'].map(mapping)

mapping = {
    1: 'No',
    2: 'Yes', 3: 'Yes', 4: 'Yes'
}

df['q15'] = df['q15'].map(mapping)

mapping = {
    1: 'Yes',
    2: 'No'
}

df['q16'] = df['q16'].map(mapping)

mapping = {
    1: 'Yes',
    2: 'No'
}

df['q17'] = df['q17'].map(mapping)

mapping = {
    1: 'Yes',
    2: 'No'
}

df['q18'] = df['q18'].map(mapping)

mapping = {
    1: 'No',
    2: 'Yes',
    3 : 'No'
}

df['q19'] = df['q19'].map(mapping)

mapping = {
    1: 'Yes',
    2: 'Yes',
    3 : 'No'
}

df['q20'] = df['q20'].map(mapping)

mapping = {
    1: 'Yes',
    2: 'No',
    3 : 'No'
}

df['q21'] = df['q21'].map(mapping)

mapping = {
    1: 'No',
    2: 'Yes',
    3 : 'No'
}

df['q22'] = df['q22'].map(mapping)

mapping = {
    1: 'No',
    2: 'Yes',
    3 : 'No'
}

df['q23'] = df['q23'].map(mapping)

df['q27'] = np.where(
    ((df['q24'] == 3) & (df['q25'] == 2) & (df['q26'] == 2)),'no','yes')

df.drop(columns=['q24', 'q25','q26'], inplace=True)

In [7]:
df.head(2)

Unnamed: 0,q9,q10,q11,q12,q13,q14,q15,q16,q17,q18,...,q22,q23,Country,Year,Region,Income Group,current_tobacco,Susceptibility,tobacco_naive,q27
0,13 or below,Female,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,...,Yes,Yes,Bangladesh,2013,South East Asia,Lower middle income,no,non-susceptible,no,yes
1,13 or below,Female,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,...,Yes,Yes,Bangladesh,2013,South East Asia,Lower middle income,yes,susceptible,yes,yes


In [8]:
df.shape

(35461, 23)

In [9]:
df.duplicated().sum()
df = df.drop_duplicates()

In [10]:
X = df.drop(columns = ['current_tobacco','Susceptibility','tobacco_naive','Country','Year','Region','Income Group'])
y = df['Susceptibility']
y = np.where(y == 'non-susceptible', 0, 1)

In [11]:
X.columns

Index(['q9', 'q10', 'q11', 'q12', 'q13', 'q14', 'q15', 'q16', 'q17', 'q18',
       'q19', 'q20', 'q21', 'q22', 'q23', 'q27'],
      dtype='str')

In [12]:
cat_columns = ['q9','q10','q11','q12','q13','q14','q15','q16',
              'q17','q18','q19','q20','q21','q22','q23','q27']

pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat',pipeline, cat_columns)
    ],
    remainder='passthrough'
)

In [13]:
X = preprocessor.fit_transform(X)

In [14]:
from imblearn.over_sampling import SMOTE
X_train,X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state= 42)
smote = SMOTE(random_state= 42)
X_train,y_train = smote.fit_resample(X_train,y_train)

In [15]:
X_train[1]

array([1., 0., 0., 1., 1., 0., 1., 1., 1., 1., 0., 0., 1., 0., 0., 0., 1.])

In [16]:
logistic = LogisticRegression()
model = logistic.fit(X_train,y_train)

In [17]:
y_pred = model.predict(X_test)
accuracy_score(y_test,y_pred)

0.6393134552743673

In [18]:
random = RandomForestClassifier()
model = random.fit(X_train,y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test,y_pred)

0.6256577298922575

In [19]:
xg = XGBClassifier()
model = xg.fit(X_train,y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test,y_pred)

0.6722625908293661

In [20]:
svm = SVC()
model = svm.fit(X_train,y_train)
y_pred = model.predict(X_test)
accuracy_score(y_test,y_pred)
f1 = f1_score(y_test,y_pred)
precision = precision_score(y_test,y_pred)
recall = recall_score(y_test,y_pred)
roc_auc = roc_auc_score(y_test,y_pred)