## Importing neccessary libraries

In [56]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import chi2
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder, MinMaxScaler
import warnings
warnings.filterwarnings("ignore")

## Loading the dataset

In [57]:
df = pd.read_csv('Traveling_Data.csv')
df.head()

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6,Salaried,Female,3,3,Deluxe,3,Single,1,1,2,1,0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14,Salaried,Male,3,4,Deluxe,4,Divorced,2,0,3,1,2,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8,Free Lancer,Male,3,4,Basic,3,Single,7,1,3,0,0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9,Salaried,Female,2,3,Basic,3,Divorced,2,1,5,1,1,Executive,17909.0
4,200005,0,32.0,Company Invited,1,8,Salaried,Male,3,3,Basic,3,Single,1,0,5,1,1,Executive,18068.0


## Checking the distribution of target veriable

In [58]:
df.ProdTaken.value_counts()

ProdTaken
0    3331
1     797
Name: count, dtype: int64

# Separating Input and Output Features

In [59]:
from sklearn.feature_selection import chi2

In [60]:
le = LabelEncoder()
TypeofContact_encoded = le.fit_transform(df["TypeofContact"])
chi_scores, p_values = chi2(TypeofContact_encoded.reshape(-1,1), df["ProdTaken"])
p_values

array([0.0522694])

In [61]:
chi_scores, p_values = chi2(df["CityTier"].values.reshape(-1,1), df["ProdTaken"])
p_values

array([3.0520272e-05])

In [62]:
le = LabelEncoder()
Occupation_encoded = le.fit_transform(df["Occupation"])
chi_scores, p_values = chi2(Occupation_encoded.reshape(-1,1), df["ProdTaken"])
p_values

array([0.1727841])

In [63]:
le = LabelEncoder()
Gender_encoded = le.fit_transform(df["Gender"])
chi_scores, p_values = chi2(Gender_encoded.reshape(-1,1), df["ProdTaken"])
p_values

array([0.25146654])

In [64]:
le = LabelEncoder()
NumberOfPersonVisiting_encoded = le.fit_transform(df["NumberOfPersonVisiting"])
chi_scores, p_values = chi2(NumberOfPersonVisiting_encoded.reshape(-1,1), df["ProdTaken"])
p_values

array([0.9021769])

In [65]:
le = LabelEncoder()
NumberOfFollowups_encoded = le.fit_transform(df["NumberOfFollowups"])
chi_scores, p_values = chi2(NumberOfFollowups_encoded.reshape(-1,1), df["ProdTaken"])
p_values

array([1.46039745e-05])

In [66]:
le = LabelEncoder()
ProductPitched_encoded = le.fit_transform(df["ProductPitched"])
chi_scores, p_values = chi2(ProductPitched_encoded.reshape(-1,1), df["ProdTaken"])
p_values

array([3.96256846e-31])

In [67]:
chi_scores, p_values = chi2(df["PreferredPropertyStar"].values.reshape(-1,1), df["ProdTaken"])
p_values

array([0.00675455])

In [68]:
le = LabelEncoder()
MaritalStatus_encoded = le.fit_transform(df["MaritalStatus"])
chi_scores, p_values = chi2(MaritalStatus_encoded.reshape(-1,1), df["ProdTaken"])
p_values

array([1.73635525e-15])

In [69]:
chi_scores, p_values = chi2(df["NumberOfTrips"].values.reshape(-1,1), df["ProdTaken"])
p_values

array([0.31094093])

In [70]:
chi_scores, p_values = chi2(df["Passport"].values.reshape(-1,1), df["ProdTaken"])
p_values

array([1.74994525e-48])

In [71]:
chi_scores, p_values = chi2(df["PitchSatisfactionScore"].values.reshape(-1,1), df["ProdTaken"])
p_values

array([0.00921053])

In [72]:
chi_scores, p_values = chi2(df["OwnCar"].values.reshape(-1,1), df["ProdTaken"])
p_values

array([0.7283217])

In [73]:
chi_scores, p_values = chi2(df["NumberOfChildrenVisiting"].values.reshape(-1,1), df["ProdTaken"])
p_values

array([0.67414873])

In [74]:
le = LabelEncoder()
Designation_encoded = le.fit_transform(df["Designation"])
chi_scores, p_values = chi2(Designation_encoded.reshape(-1,1), df["ProdTaken"])
p_values

array([1.44491492e-06])

# Separating Input and Output Features

In [75]:
X = df[["Age", "CityTier", "NumberOfFollowups", "ProductPitched", "PreferredPropertyStar", "MaritalStatus", "Passport", "PitchSatisfactionScore", "Designation"]]
y = df["ProdTaken"]

In [76]:
df["Passport"] = df["Passport"].astype(str).str.replace("0", "No").str.replace("1", "Yes")
df["Passport"].value_counts()

Passport
No     2909
Yes    1219
Name: count, dtype: int64

In [77]:
X.Age.min(), X.Age.max()

(18.0, 61.0)

In [78]:
X.Designation.value_counts()

Designation
Executive         1615
Manager           1422
Senior Manager     737
AVP                250
VP                 104
Name: count, dtype: int64

# Train_Test_Split

In [79]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

# Data Preprocessing

In [80]:
preprocessor = ColumnTransformer(
    transformers=[
        ("age_scaled", MinMaxScaler(), ["Age"]),
        ("product_pitched_encoded", OrdinalEncoder(categories=[["Basic", "Standard", "Deluxe", "Super Deluxe", "King"]]), ["ProductPitched"]),
        ("marital_status_encoded", OneHotEncoder(handle_unknown="ignore"), ["MaritalStatus"]),
        ("passport_encoded", OneHotEncoder(handle_unknown="ignore"), ["Passport"]),
        ("designation_encoded", OneHotEncoder(handle_unknown="ignore"), ["Designation"]),
    ],
    remainder="passthrough"
)

In [81]:
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)
X_train_transformed.shape

(2889, 17)

In [82]:
y_train.value_counts(), y_test.value_counts()

(ProdTaken
 0    2340
 1     549
 Name: count, dtype: int64,
 ProdTaken
 0    991
 1    248
 Name: count, dtype: int64)

# Target Variable is imbalanced so let's balanced it 

In [83]:
smote = SMOTE(random_state=42)
Xtrain_resampled_smote, ytrain_resampled_smote = smote.fit_resample(X_train_transformed,y_train)
ytrain_resampled_smote.value_counts()

ProdTaken
1    2340
0    2340
Name: count, dtype: int64

In [84]:
Xtest_resampled_smote, ytest_resampled_smote = smote.fit_resample(X_test_transformed, y_test)
ytest_resampled_smote.value_counts()

ProdTaken
0    991
1    991
Name: count, dtype: int64

In [85]:
Xtrain_resampled_smote.shape

(4680, 17)

In [86]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "SVM": SVC(),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
}


In [87]:
from sklearn.metrics import accuracy_score, classification_report

In [88]:
results = {}

for name, model in models.items():
    model.fit(Xtrain_resampled_smote, ytrain_resampled_smote)
    y_pred = model.predict(Xtest_resampled_smote)
    

    acc = accuracy_score(ytest_resampled_smote, y_pred)
    results[name] = acc

    print(f"\n{name}")
    print("Accuracy:", acc)
   # print(classification_report(ytest_resampled_smote, y_pred))



Logistic Regression
Accuracy: 0.7386478304742684

KNN
Accuracy: 0.8037336024217961

SVM
Accuracy: 0.7492431886982845

Naive Bayes
Accuracy: 0.6962663975782039

Decision Tree
Accuracy: 0.8319878910191726

Random Forest
Accuracy: 0.884460141271443

Gradient Boosting
Accuracy: 0.8748738647830474


## Checking the cross-validation score

In [89]:
from sklearn.model_selection import cross_val_score

model = RandomForestClassifier()
scores = cross_val_score(
    model,
    np.vstack([Xtrain_resampled_smote, Xtest_resampled_smote]),
    np.hstack([ytrain_resampled_smote, ytest_resampled_smote]),
    cv = 5,
    scoring = "accuracy"
)

print("Accuracy", scores)
print("Mean Accuracy", np.mean(scores))

Accuracy [0.86196549 0.95573893 0.93993994 0.91066066 0.96171171]
Mean Accuracy 0.9260033476837679


## Checking the best hyperparameter values 

In [90]:
from sklearn.model_selection import GridSearchCV

In [98]:
rf = RandomForestClassifier(random_state=42)
param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [None, 10,11,12,13,14,15,16,17,18,19, 20],
    "min_samples_split": [1, 2, 3, 4, 5],
    "min_samples_leaf": [1, 2, 3, 4, 5],
}

grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
    verbose=1
)

grid_search.fit(Xtrain_resampled_smote, ytrain_resampled_smote)
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)


Fitting 5 folds for each of 600 candidates, totalling 3000 fits
Best Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best Accuracy: 0.9192307692307692


## Checking the metrics

In [92]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    classification_report
)

In [93]:
model = RandomForestClassifier(random_state=42, max_depth= 14,min_samples_leaf= 1, min_samples_split= 2, class_weight='balanced', n_estimators= 100)

model.fit(Xtrain_resampled_smote, ytrain_resampled_smote)
y_pred = model.predict(Xtest_resampled_smote)

accuracy = accuracy_score(ytest_resampled_smote, y_pred)
cm = confusion_matrix(ytest_resampled_smote, y_pred)
precision = precision_score(ytest_resampled_smote, y_pred)
recall = recall_score(ytest_resampled_smote, y_pred)
f1 = f1_score(ytest_resampled_smote, y_pred)

print("Accuracy:", accuracy)
print("\nConfusion Matrix:\n", cm)
print("\nPrecision:", precision)
print("\nRecall:", recall)
print("\nF1-Score:", f1)
print("\nClassification Report :")
print(classification_report(ytest_resampled_smote, y_pred))

Accuracy: 0.8864783047426842

Confusion Matrix:
 [[941  50]
 [175 816]]

Precision: 0.9422632794457275

Recall: 0.8234106962663976

F1-Score: 0.8788368336025848

Classification Report :
              precision    recall  f1-score   support

           0       0.84      0.95      0.89       991
           1       0.94      0.82      0.88       991

    accuracy                           0.89      1982
   macro avg       0.89      0.89      0.89      1982
weighted avg       0.89      0.89      0.89      1982



# Save the model

In [94]:
import pickle

In [95]:
with open("tourism_model.pkl", "wb") as f:
    pickle.dump(model, f)

with open("preprocessor.pkl", "wb") as f:
    pickle.dump(preprocessor, f)

print("Model and Preprocessor saved successfully")
    

Model and Preprocessor saved successfully


In [96]:
import sklearn
print(sklearn.__version__)

1.5.1
