In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
dataset = pd.read_csv("./datasets/Holiday Package Travel.csv")
df = dataset
df.head()

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,200004,0,,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


In [3]:
## removing not neccessary column
df.drop('CustomerID', axis=1, inplace=True)
df.head()

Unnamed: 0,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
1,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0
3,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,0,,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


In [4]:
## info of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4888 entries, 0 to 4887
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ProdTaken                 4888 non-null   int64  
 1   Age                       4662 non-null   float64
 2   TypeofContact             4863 non-null   object 
 3   CityTier                  4888 non-null   int64  
 4   DurationOfPitch           4637 non-null   float64
 5   Occupation                4888 non-null   object 
 6   Gender                    4888 non-null   object 
 7   NumberOfPersonVisiting    4888 non-null   int64  
 8   NumberOfFollowups         4843 non-null   float64
 9   ProductPitched            4888 non-null   object 
 10  PreferredPropertyStar     4862 non-null   float64
 11  MaritalStatus             4888 non-null   object 
 12  NumberOfTrips             4748 non-null   float64
 13  Passport                  4888 non-null   int64  
 14  PitchSat

In [5]:
## Checking Null values
#for feature in df.columns:
features_with_na = [column for column in df.columns if df[column].isnull().sum() > 0]
for feaure in features_with_na:
    print(feaure, df[feaure].isnull().sum())

Age 226
TypeofContact 25
DurationOfPitch 251
NumberOfFollowups 45
PreferredPropertyStar 26
NumberOfTrips 140
NumberOfChildrenVisiting 66
MonthlyIncome 233


In [6]:
numeric_features = [feature for feature in df.columns if df[feature].dtype != 'O']
categorical_features = [feature for feature in df.columns if df[feature].dtype == 'O']
numeric_features, categorical_features

(['ProdTaken',
  'Age',
  'CityTier',
  'DurationOfPitch',
  'NumberOfPersonVisiting',
  'NumberOfFollowups',
  'PreferredPropertyStar',
  'NumberOfTrips',
  'Passport',
  'PitchSatisfactionScore',
  'OwnCar',
  'NumberOfChildrenVisiting',
  'MonthlyIncome'],
 ['TypeofContact',
  'Occupation',
  'Gender',
  'ProductPitched',
  'MaritalStatus',
  'Designation'])

In [7]:
discrete_features = [feature for feature in numeric_features if df[feature].unique().sum() < 25]
continuous_features = [feature for feature in numeric_features if feature not in discrete_features]
discrete_features, continuous_features

(['ProdTaken',
  'CityTier',
  'NumberOfPersonVisiting',
  'Passport',
  'PitchSatisfactionScore',
  'OwnCar'],
 ['Age',
  'DurationOfPitch',
  'NumberOfFollowups',
  'PreferredPropertyStar',
  'NumberOfTrips',
  'NumberOfChildrenVisiting',
  'MonthlyIncome'])

In [8]:
categorical_features

['TypeofContact',
 'Occupation',
 'Gender',
 'ProductPitched',
 'MaritalStatus',
 'Designation']

In [9]:
for feature in categorical_features:
    print("-----------------------",feature,"---------------------")
    print(df[feature].value_counts())
    print("\n")

----------------------- TypeofContact ---------------------
TypeofContact
Self Enquiry       3444
Company Invited    1419
Name: count, dtype: int64


----------------------- Occupation ---------------------
Occupation
Salaried          2368
Small Business    2084
Large Business     434
Free Lancer          2
Name: count, dtype: int64


----------------------- Gender ---------------------
Gender
Male       2916
Female     1817
Fe Male     155
Name: count, dtype: int64


----------------------- ProductPitched ---------------------
ProductPitched
Basic           1842
Deluxe          1732
Standard         742
Super Deluxe     342
King             230
Name: count, dtype: int64


----------------------- MaritalStatus ---------------------
MaritalStatus
Married      2340
Divorced      950
Single        916
Unmarried     682
Name: count, dtype: int64


----------------------- Designation ---------------------
Designation
Executive         1842
Manager           1732
Senior Manager     742
AVP 

In [10]:
## merging Gender similar columns
df['Gender'].replace('Fe Male', 'Female', inplace=True)
df['Gender'].value_counts()

Gender
Male      2916
Female    1972
Name: count, dtype: int64

In [11]:
## merging the Maritalstatus similar columns
df['MaritalStatus'].replace('Unmarried', 'Single', inplace=True)
df['MaritalStatus'].value_counts()

MaritalStatus
Married     2340
Single      1598
Divorced     950
Name: count, dtype: int64

In [None]:
df.head()

Unnamed: 0,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
1,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0
3,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,0,,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


In [13]:
features_with_na

['Age',
 'TypeofContact',
 'DurationOfPitch',
 'NumberOfFollowups',
 'PreferredPropertyStar',
 'NumberOfTrips',
 'NumberOfChildrenVisiting',
 'MonthlyIncome']

In [14]:
## imputing the Null Values
for feature in features_with_na:
    if feature in numeric_features:
        if feature in discrete_features:
            df[feature].fillna(df[feature].mode()[0], inplace=True)
        else:
            df[feature].fillna(df[feature].median(), inplace=True)
    else:
        df[feature].fillna(df[feature].mode()[0], inplace=True)

In [15]:
df.isnull().sum()

ProdTaken                   0
Age                         0
TypeofContact               0
CityTier                    0
DurationOfPitch             0
Occupation                  0
Gender                      0
NumberOfPersonVisiting      0
NumberOfFollowups           0
ProductPitched              0
PreferredPropertyStar       0
MaritalStatus               0
NumberOfTrips               0
Passport                    0
PitchSatisfactionScore      0
OwnCar                      0
NumberOfChildrenVisiting    0
Designation                 0
MonthlyIncome               0
dtype: int64

In [16]:
## checking the duplicated values
df.duplicated().sum(), df.shape

(141, (4888, 19))

In [17]:
## dropping duplicated values
df.drop_duplicates(inplace=True), df.shape

(None, (4747, 19))

In [18]:
df['TotalPersonsVisiting'] = df['NumberOfPersonVisiting'] + df['NumberOfChildrenVisiting']
df.drop(columns=['NumberOfPersonVisiting', 'NumberOfChildrenVisiting'], axis=1, inplace=True)

In [19]:
## diving the dataset into X and y
X = df.drop("ProdTaken", axis=1)
y = df['ProdTaken']
X

Unnamed: 0,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,Designation,MonthlyIncome,TotalPersonsVisiting
0,41.0,Self Enquiry,3,6.0,Salaried,Female,3.0,Deluxe,3.0,Single,1.0,1,2,1,Manager,20993.0,3.0
1,49.0,Company Invited,1,14.0,Salaried,Male,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,Manager,20130.0,5.0
2,37.0,Self Enquiry,1,8.0,Free Lancer,Male,4.0,Basic,3.0,Single,7.0,1,3,0,Executive,17090.0,3.0
3,33.0,Company Invited,1,9.0,Salaried,Female,3.0,Basic,3.0,Divorced,2.0,1,5,1,Executive,17909.0,3.0
4,36.0,Self Enquiry,1,8.0,Small Business,Male,3.0,Basic,4.0,Divorced,1.0,0,5,1,Executive,18468.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4883,49.0,Self Enquiry,3,9.0,Small Business,Male,5.0,Deluxe,4.0,Single,2.0,1,1,1,Manager,26576.0,4.0
4884,28.0,Company Invited,1,31.0,Salaried,Male,5.0,Basic,3.0,Single,3.0,1,3,1,Executive,21212.0,6.0
4885,52.0,Self Enquiry,3,17.0,Salaried,Female,4.0,Standard,4.0,Married,7.0,0,1,1,Senior Manager,31820.0,7.0
4886,19.0,Self Enquiry,3,16.0,Small Business,Male,4.0,Basic,3.0,Single,3.0,0,5,0,Executive,20289.0,5.0


In [20]:
y

0       1
1       0
2       1
3       0
4       0
       ..
4883    1
4884    1
4885    1
4886    1
4887    1
Name: ProdTaken, Length: 4747, dtype: int64

In [21]:
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [22]:
cat_features = X_train.select_dtypes(include='object').columns
num_features = X_train.select_dtypes(exclude='object').columns

preprocessing = ColumnTransformer (
    [
        ('cat', OneHotEncoder(), cat_features),
        ('num', StandardScaler(), num_features)
    ]
)

X_train = preprocessing.fit_transform(X_train)
X_test = preprocessing.transform(X_test)

In [31]:
# Initialize the AdaBoostClassifier model with hyperparameters
model = AdaBoostClassifier()
# Train the model using the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred_train = model.predict(X_train)

# Make predictions on the test data
y_pred_test = model.predict(X_test)

# calculating train dataset scores
# Calculate accuracy score
print("accuracy_score :", accuracy_score(y_train, y_pred_train))
# Calculate precision score
print("precssion_score :", precision_score(y_train, y_pred_train))
# Calculate recall score
print("recall_score :", recall_score(y_train, y_pred_train))
# Calculate F1 score
print("f1_score :", f1_score(y_train, y_pred_train))
# Generate confusion matrix
print("confusion_matrix :\n", confusion_matrix(y_train, y_pred_train))
# Generate classification report
print("classification_report :\n", classification_report(y_train, y_pred_train))

print("---------------------------------------------------------------------")

# calculating test dataset scores
# Calculate accuracy score
print("accuracy_score :", accuracy_score(y_test, y_pred_test))
# Calculate precision score
print("precision_score :", precision_score(y_test, y_pred_test))
# Calculate recall score
print("recall_score :", recall_score(y_test, y_pred_test))
# Calculate F1 score
print("f1_score :", f1_score(y_test, y_pred_test))
# Generate confusion matrix
print("confusion_matrix :\n", confusion_matrix(y_test, y_pred_test))
# Generate classification report
print("classification_report :\n", classification_report(y_test, y_pred_test))

accuracy_score : 0.8480337078651685
precssion_score : 0.6984615384615385
recall_score : 0.33880597014925373
f1_score : 0.4562814070351759
confusion_matrix :
 [[2792   98]
 [ 443  227]]
classification_report :
               precision    recall  f1-score   support

           0       0.86      0.97      0.91      2890
           1       0.70      0.34      0.46       670

    accuracy                           0.85      3560
   macro avg       0.78      0.65      0.68      3560
weighted avg       0.83      0.85      0.83      3560

---------------------------------------------------------------------
accuracy_score : 0.8424599831508003
precision_score : 0.6554621848739496
recall_score : 0.3482142857142857
f1_score : 0.45481049562682213
confusion_matrix :
 [[922  41]
 [146  78]]
classification_report :
               precision    recall  f1-score   support

           0       0.86      0.96      0.91       963
           1       0.66      0.35      0.45       224

    accuracy           

### Lets Train and predict the data for different alogirthms

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [25]:
# Initialize the RandomForestClassifier model with hyperparameters
models = {

    'logistic_regression' : LogisticRegression(),
    'knn' : KNeighborsClassifier(),
    'naive_bayes' : GaussianNB(),
    'decision_tree' : DecisionTreeClassifier(),
    'random_forest' : RandomForestClassifier(),
    'adaboost': AdaBoostClassifier(),
}
models

{'logistic_regression': LogisticRegression(),
 'knn': KNeighborsClassifier(),
 'naive_bayes': GaussianNB(),
 'decision_tree': DecisionTreeClassifier(),
 'random_forest': RandomForestClassifier(),
 'adaboost': AdaBoostClassifier()}

In [26]:
for m in range(len(list(models))):

    model = list(models.values())[m]

# Train the model using the training data
    model.fit(X_train, y_train)


# Make predictions on the test data
    y_pred_train = model.predict(X_train)


# Make predictions on the test data
    y_pred_test = model.predict(X_test)

    print(f"----------------- {list(models.keys())[m]} ---------------------")

    # calculating train dataset scores
    print("train dataset prediction values")
# Calculate accuracy score
    print("- accuracy_score :", accuracy_score(y_train, y_pred_train))
# Calculate precision score
    print("- precssion_score :", precision_score(y_train, y_pred_train))
# Calculate recall score
    print("- recall_score :", recall_score(y_train, y_pred_train))
# Calculate F1 score
    print("- f1_score :", f1_score(y_train, y_pred_train))
# Generate confusion matrix
    print("- confusion_matrix :\n", confusion_matrix(y_train, y_pred_train))
# Generate classification report
    print("- classification_report :\n", classification_report(y_train, y_pred_train))

    print("---------------------------------------------------------------------")

    # calculating test dataset scores
    print("test dataset prediction values")
# Calculate accuracy score
    print("- accuracy_score :", accuracy_score(y_test, y_pred_test))
# Calculate precision score
    print("- precision_score :", precision_score(y_test, y_pred_test))
# Calculate recall score
    print("- recall_score :", recall_score(y_test, y_pred_test))
# Calculate F1 score
    print("- f1_score :", f1_score(y_test, y_pred_test))
# Generate confusion matrix
    print("- confusion_matrix :\n", confusion_matrix(y_test, y_pred_test))
# Generate classification report
    print("- classification_report :\n", classification_report(y_test, y_pred_test))
    print("*"*50)
    print("\n")
        

----------------- logistic_regression ---------------------
train dataset prediction values
- accuracy_score : 0.8438202247191011
- precssion_score : 0.7021276595744681
- recall_score : 0.2955223880597015
- f1_score : 0.41596638655462187
- confusion_matrix :
 [[2806   84]
 [ 472  198]]
- classification_report :
               precision    recall  f1-score   support

           0       0.86      0.97      0.91      2890
           1       0.70      0.30      0.42       670

    accuracy                           0.84      3560
   macro avg       0.78      0.63      0.66      3560
weighted avg       0.83      0.84      0.82      3560

---------------------------------------------------------------------
test dataset prediction values
- accuracy_score : 0.8407750631844987
- precision_score : 0.6635514018691588
- recall_score : 0.3169642857142857
- f1_score : 0.42900302114803623
- confusion_matrix :
 [[927  36]
 [153  71]]
- classification_report :
               precision    recall  f1-sc

## Hyperparameter Tuning

In [27]:
rf_params = {
    "criterion" : ["gini", "entropy", "log_loss"],
    "max_depth" : [1,2,3,4,5,6,7,8,9,10,11,15,20],
    "max_features": ["sqrt", "log2"],
    "class_weight": ["balanced", "balanced_subsample"]
}

adaboost_params = {

    "algorithm" : ["SAMME", "SAMME.R"],
    "n_estimators" : [50,100,150,200],
    "learning_rate" : [0.001, 0.01, 0.1, 1]
}

In [28]:
# Initialize the AdaBoostClassifier model with hyperparameters
randomcv_models = [

    ('Random Forest', RandomForestClassifier(), rf_params),
    ('Adaboost', AdaBoostClassifier(), adaboost_params)
]

In [33]:
# hyperparameter tuning using randomizedCV
model_params = {}
for name, model, params in randomcv_models:

    random_cv = RandomizedSearchCV(estimator=model,
                                param_distributions=params,
                                scoring="accuracy",
                                n_jobs=-1,
                                verbose=1,
                                cv=5,
                                refit=True,
                                )
    random_cv.fit(X_train, y_train)
    model_params[name] = random_cv.best_params_

model_params

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits


{'Random Forest': {'max_features': 'log2',
  'max_depth': 20,
  'criterion': 'log_loss',
  'class_weight': 'balanced'},
 'Adaboost': {'n_estimators': 100, 'learning_rate': 1, 'algorithm': 'SAMME.R'}}

In [34]:
# Initialize the AdaBoostClassifier model with hyperparameters
model = AdaBoostClassifier(n_estimators=100, learning_rate=1, algorithm="SAMME.R" )
# Train the model using the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred_train = model.predict(X_train)

# Make predictions on the test data
y_pred_test = model.predict(X_test)

# calculating train dataset scores
print("train dataset prediction values")
# Calculate accuracy score
print("- accuracy_score :", accuracy_score(y_train, y_pred_train))
# Calculate precision score
print("- precssion_score :", precision_score(y_train, y_pred_train))
# Calculate recall score
print("- recall_score :", recall_score(y_train, y_pred_train))
# Calculate F1 score
print("- f1_score :", f1_score(y_train, y_pred_train))
# Generate confusion matrix
print("- confusion_matrix :\n", confusion_matrix(y_train, y_pred_train))
# Generate classification report
print("- classification_report :\n", classification_report(y_train, y_pred_train))

print("---------------------------------------------------------------------")

# calculating test dataset scores
print("test dataset prediction values")
# Calculate accuracy score
print("- accuracy_score :", accuracy_score(y_test, y_pred_test))
# Calculate precision score
print("- precision_score :", precision_score(y_test, y_pred_test))
# Calculate recall score
print("- recall_score :", recall_score(y_test, y_pred_test))
# Calculate F1 score
print("- f1_score :", f1_score(y_test, y_pred_test))
# Generate confusion matrix
print("- confusion_matrix :\n", confusion_matrix(y_test, y_pred_test))
# Generate classification report
print("- classification_report :\n", classification_report(y_test, y_pred_test))
print("*"*50)
print("\n")
        

train dataset prediction values
- accuracy_score : 0.854494382022472
- precssion_score : 0.7222222222222222
- recall_score : 0.3686567164179104
- f1_score : 0.4881422924901186
- confusion_matrix :
 [[2795   95]
 [ 423  247]]
- classification_report :
               precision    recall  f1-score   support

           0       0.87      0.97      0.92      2890
           1       0.72      0.37      0.49       670

    accuracy                           0.85      3560
   macro avg       0.80      0.67      0.70      3560
weighted avg       0.84      0.85      0.83      3560

---------------------------------------------------------------------
test dataset prediction values
- accuracy_score : 0.8424599831508003
- precision_score : 0.6412213740458015
- recall_score : 0.375
- f1_score : 0.4732394366197183
- confusion_matrix :
 [[916  47]
 [140  84]]
- classification_report :
               precision    recall  f1-score   support

           0       0.87      0.95      0.91       963
       