In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [3]:
data = pd.read_csv("Travel.csv")
data.head(4)

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0


# Step 1<br>
1.Cleaning the data<br>
2.Handling Duplicates<br>
3.Check Datatype<br>
4.Understand the dataset<br>

In [None]:
print(data.size)
print(data.shape)
data.isnull().sum()

97760
(4888, 20)


CustomerID                    0
ProdTaken                     0
Age                         226
TypeofContact                25
CityTier                      0
DurationOfPitch             251
Occupation                    0
Gender                        0
NumberOfPersonVisiting        0
NumberOfFollowups            45
ProductPitched                0
PreferredPropertyStar        26
MaritalStatus                 0
NumberOfTrips               140
Passport                      0
PitchSatisfactionScore        0
OwnCar                        0
NumberOfChildrenVisiting     66
Designation                   0
MonthlyIncome               233
dtype: int64

In [None]:
data['MaritalStatus'].value_counts()
data['MaritalStatus'] = data['MaritalStatus'].replace('Single','Unmarried')

In [None]:
data['Gender'].value_counts()
data['Gender'] = data['Gender'].replace('Fe Male','Female')

In [None]:
# percentage of missing values present in features(columns)
per_of_na_value = [col for col in data.columns if data[col].isnull().sum() >=1]
for col in per_of_na_value:
    per = data[col].isnull().mean()*100
    print(col,round(per,4),"% missing value")

Age 4.6236 % missing value
TypeofContact 0.5115 % missing value
DurationOfPitch 5.135 % missing value
NumberOfFollowups 0.9206 % missing value
PreferredPropertyStar 0.5319 % missing value
NumberOfTrips 2.8642 % missing value
NumberOfChildrenVisiting 1.3502 % missing value
MonthlyIncome 4.7668 % missing value


In [63]:
data[per_of_na_value].describe(exclude='object')

Unnamed: 0,Age,DurationOfPitch,NumberOfFollowups,PreferredPropertyStar,NumberOfTrips,NumberOfChildrenVisiting,MonthlyIncome
count,4662.0,4637.0,4843.0,4862.0,4748.0,4822.0,4655.0
mean,37.622265,15.490835,3.708445,3.581037,3.236521,1.187267,23619.853491
std,9.316387,8.519643,1.002509,0.798009,1.849019,0.857861,5380.698361
min,18.0,5.0,1.0,3.0,1.0,0.0,1000.0
25%,31.0,9.0,3.0,3.0,2.0,1.0,20346.0
50%,36.0,13.0,4.0,3.0,3.0,1.0,22347.0
75%,44.0,20.0,4.0,4.0,4.0,2.0,25571.0
max,61.0,127.0,6.0,5.0,22.0,3.0,98678.0


In [96]:
data['Age'].fillna(data.Age.median(),inplace=True)

data.TypeofContact.fillna(data.TypeofContact.mode()[0],inplace=True)

data.DurationOfPitch.fillna(data.DurationOfPitch.median(),inplace=True)

data.NumberOfFollowups.fillna(data.NumberOfFollowups.mode()[0],inplace=True)

data.PreferredPropertyStar.fillna(data.PreferredPropertyStar.mode()[0],inplace=True)

data.NumberOfTrips.fillna(data.NumberOfTrips.median(),inplace=True)

data.NumberOfChildrenVisiting.fillna(data.NumberOfChildrenVisiting.mode()[0],inplace=True)

data.MonthlyIncome.fillna(data.MonthlyIncome.median(),inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data.DurationOfPitch.fillna(data.DurationOfPitch.median(),inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data.NumberOfTrips.fillna(data.NumberOfTrips.median(),inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the inte

In [153]:
data.isnull().sum()
data.drop(['CustomerID'],axis=1,inplace=True)

### Feature Engineering<br>
Features Extract

In [None]:
data['TotalVisit'] = data['NumberOfChildrenVisiting'] + data['NumberOfPersonVisiting']
data.drop(columns=['NumberOfChildrenVisiting','NumberOfPersonVisiting'],axis=1,inplace=True)
data.TotalVisit = data.TotalVisit.astype(np.int32)

In [156]:
X = data.drop(columns=['ProdTaken'],axis=1)
y = data['ProdTaken']

In [201]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=20)

In [202]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer

# create column transform
catagorical_col = X.select_dtypes(include="object").columns
numerical_col = X.select_dtypes(exclude="object").columns

scaler = StandardScaler()
encoder = OneHotEncoder(drop='first')

preprocessor = ColumnTransformer([
    ('OnHotEncoder',encoder,catagorical_col),
    ('StandardScaler',scaler,numerical_col)
])

X_train_trans = pd.DataFrame(preprocessor.fit_transform(X_train))
X_test_trans = pd.DataFrame(preprocessor.transform(X_test))


### Model training

In [208]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,precision_score,f1_score

In [212]:
model = {"randomForest" : RandomForestClassifier(),
         "decisionTRee": DecisionTreeClassifier()}
for key,value in model.items():
    value.fit(X_train_trans, y_train)
    
    y_pred = value.predict(X_test_trans)
    print(key,"Scores")
    print("Accuracy Score : ",accuracy_score(y_pred,y_test))
    print("f1 Score : ",f1_score(y_pred,y_test))
    
    

randomForest Scores
Accuracy Score :  0.9396728016359919
f1 Score :  0.8102893890675241
decisionTRee Scores
Accuracy Score :  0.934560327198364
f1 Score :  0.825136612021858


In [213]:
params = {
    "max_depth" : [5,8,15,None,10],
    "max_features" : [5,7,"auto",10],
    "min_samples_split" : [2,8,15,20],
    "n_estimators" : [100,200,500,1000]
}
random_cv_model = [("RF",RandomForestClassifier(),params)]

In [214]:
from sklearn.model_selection import RandomizedSearchCV

model_params = {}
for name,model , params in random_cv_model:
    random = RandomizedSearchCV(estimator=model,param_distributions=params,n_iter=100,
                                cv=3,verbose=2,n_jobs=1)
    random.fit(X_train_trans,y_train)
    model_params[name] = random.best_params_

for model_name in model_params:
    print(model_params[model_name])

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END max_depth=None, max_features=5, min_samples_split=2, n_estimators=100; total time=   0.7s
[CV] END max_depth=None, max_features=5, min_samples_split=2, n_estimators=100; total time=   0.5s
[CV] END max_depth=None, max_features=5, min_samples_split=2, n_estimators=100; total time=   0.5s
[CV] END max_depth=10, max_features=5, min_samples_split=20, n_estimators=100; total time=   0.5s
[CV] END max_depth=10, max_features=5, min_samples_split=20, n_estimators=100; total time=   0.4s
[CV] END max_depth=10, max_features=5, min_samples_split=20, n_estimators=100; total time=   0.4s
[CV] END max_depth=None, max_features=10, min_samples_split=8, n_estimators=100; total time=   0.9s
[CV] END max_depth=None, max_features=10, min_samples_split=8, n_estimators=100; total time=   0.7s
[CV] END max_depth=None, max_features=10, min_samples_split=8, n_estimators=100; total time=   0.8s
[CV] END max_depth=5, max_features=7, min_samp

KeyboardInterrupt: 

In [219]:
model = {
    "random_forest" : RandomForestClassifier(n_estimators=1000,min_samples_split=2,max_features=7,max_depth=None)
}

for key ,value in model.items():
    value.fit(X_train_trans,y_train)
    y_pred = value.predict(X_test_trans)
    
    print(key)
    print("Accuracy:",accuracy_score(y_test,y_pred))
    print("Precision:",precision_score(y_test,y_pred,average='macro'))
    print("f1_score",f1_score(y_pred,y_test))

random_forest
Accuracy: 0.9539877300613497
Precision: 0.9511399216829359
f1_score 0.8640483383685801
