In [160]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# pandas / scikit-learn deprecation warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt ###For visualizations

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [161]:
# Load the leads CSV from the Kaggle input directory into a DataFrame.
leads =pd.read_csv("/kaggle/input/leads-data-upgrad/Leads.csv")

In [162]:
# Preview the first rows to sanity-check the load

leads.head()

In [163]:
# Inspect the shape (rows, columns) of the dataset

leads.shape

In [164]:
# Inspect the different columns in the dataset

leads.columns

In [165]:
# Summary statistics of the dataset.
leads.describe()

In [166]:
# Column dtypes and non-null counts.
leads.info()

Cleaning the data 

In [167]:
# Count the missing values in each column.
leads.isnull().sum()

We see that many columns have a high number of missing values. Let us eliminate every column with more than 3000 missing values (about 30% of the rows).

In [168]:
# Drop every column that has more than MAX_MISSING_VALUES missing entries.
# The columns are selected in one vectorised pass and removed with the `columns=`
# keyword: passing the axis positionally (drop(col, 1)) is rejected by pandas >= 2.0.
MAX_MISSING_VALUES = 3000
missing_per_column = leads.isnull().sum()
sparse_columns = missing_per_column[missing_per_column > MAX_MISSING_VALUES].index
leads.drop(columns=sparse_columns, inplace=True)

In [169]:
# Re-check the missing-value counts after removing the sparse columns.
leads.isnull().sum()

Let us drop variables that wouldn't be required for the particular analysis.

In [170]:
# Remove the two location columns, which are not used in this analysis.
location_columns = ['City', 'Country']
leads.drop(columns=location_columns, inplace=True)

In [171]:
# Missing values per column as a percentage of all rows, rounded to 2 decimals.
round(100*(leads.isnull().sum()/len(leads.index)), 2)

In [172]:
# Absolute missing-value counts for the remaining columns.
leads.isnull().sum()

In [173]:
# Print the frequency of every distinct value, one column at a time.

for col_name in leads.columns:
    frequencies = leads[col_name].astype('category').value_counts()
    print(frequencies)

In [174]:
# Bar chart of how often each 'Lead Profile' level occurs.
# The counts are computed once and reused; the original recomputed them three times
# and discarded the first result.
profile_counts = leads['Lead Profile'].astype('category').value_counts()
fig = plt.figure(figsize=(15,4))
ax = fig.add_axes([0,0,1,1])
ax.bar(profile_counts.index.values, profile_counts.values)
# Label the figure so it can be read without the surrounding code.
ax.set_title('Lead Profile distribution')
ax.set_xlabel('Lead Profile')
ax.set_ylabel('Number of leads')
plt.show()

In [175]:
# Frequency of each answer to 'How did you hear about X Education'.
leads['How did you hear about X Education'].value_counts()

In [176]:
# Frequency of each 'Specialization' level (the 'Select' level is removed later,
# when the dummy variables are built).
leads['Specialization'].value_counts()

In [177]:
# Drop the two columns inspected above.
# NOTE(review): presumably dropped because most of their values are uninformative —
# confirm against the value counts shown in the preceding cells.
leads.drop(['Lead Profile', 'How did you hear about X Education'], axis = 1, inplace = True)

In [178]:
# Drop a batch of flag-style columns in one call.
# NOTE(review): presumably these are (near-)constant and carry no signal — confirm
# against the per-column value counts printed earlier.
leads.drop(['Do Not Call', 'Search', 'Magazine', 'Newspaper Article', 'X Education Forums', 'Newspaper', 
            'Digital Advertisement', 'Through Recommendations', 'Receive More Updates About Our Courses', 
            'Update me on Supply Chain Content', 'Get updates on DM Content', 
            'I agree to pay the amount through cheque'], axis = 1, inplace = True)

In [179]:
# Frequency of each answer to 'What matters most to you in choosing a course'.
leads['What matters most to you in choosing a course'].value_counts()

In [181]:
# Drop the column inspected in the previous cell.
# NOTE(review): presumably dominated by a single answer — confirm from its counts.
leads.drop(['What matters most to you in choosing a course'], axis = 1, inplace=True)

In [182]:
# Missing-value counts after all the column drops.
leads.isnull().sum()

Drop the rows where the column `What is your current occupation` is null.

In [183]:
# Keep only the rows that have a value for the occupation column.
leads = leads.dropna(subset=['What is your current occupation'])

In [184]:
# Missing-value counts after removing the rows without an occupation.
leads.isnull().sum()

In [185]:
# Remove every row that lacks a value in any of these three categorical columns.
required_columns = ['Specialization', 'Lead Source', 'Last Activity']
leads = leads.dropna(subset=required_columns)

In [186]:
# Missing-value counts after the row-level drops.
leads.isnull().sum()

In [187]:
# Size of the dataset that remains after the drops above.
leads.shape

In [188]:
import numpy as np
from sklearn.impute import SimpleImputer
# Imputer that replaces NaN entries with the per-column mean learned during fit.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

In [189]:
# Mean-impute the two numeric columns. fit_transform already fits the imputer,
# so the separate fit call that preceded it was redundant and has been removed.
# NOTE(review): the imputer is fitted on all rows before the train/test split, so
# test rows influence the fill value — confirm this is acceptable.
impute_columns = ['TotalVisits','Page Views Per Visit']
transformed_data=imp.fit_transform(leads[impute_columns])
# The array is wrapped with a fresh 0..n-1 index, NOT the index of `leads`; `leads`
# must carry the same positional index before these columns are assigned back.
transformed_data=pd.DataFrame(transformed_data,columns=impute_columns)

In [190]:
# Re-number the rows 0..n-1 (discarding the old index) so that `leads` lines up
# with `transformed_data`, which was built with a default positional index.
leads.reset_index(inplace=True,drop=True)

In [191]:
# Write the imputed values back; the assignment aligns on the index, so it relies
# on `leads` and `transformed_data` sharing the same 0..n-1 index.
leads['TotalVisits']=transformed_data['TotalVisits']
leads['Page Views Per Visit']=transformed_data['Page Views Per Visit']

In [192]:
# Verify that no missing values remain after imputation.
leads.isnull().sum()

In [193]:
# Remove the two identifier columns.
# `columns=` replaces the positional axis argument, which pandas >= 2.0 rejects.
leads.drop(columns=['Prospect ID', 'Lead Number'], inplace=True)

In [194]:
# Summary statistics of the cleaned dataset.
leads.describe()

In [195]:
import seaborn as sns
# Horizontal box plots of the three numeric columns, drawn on a shared axis.
boxplot_columns = ['TotalVisits','Total Time Spent on Website','Page Views Per Visit']
boxplot_frame = leads[boxplot_columns]
ax = sns.boxplot(data=boxplot_frame, orient="h", palette="Set2")

In [196]:
# Histogram of 'Total Time Spent on Website'; the trailing ';' hides the repr output.
leads['Total Time Spent on Website'].hist();

In [197]:
# Check the columns which are of type 'object' — these are the ones that still
# need to be encoded before modelling.

temp = leads.loc[:, leads.dtypes == 'object']
temp.columns

In [198]:
# Categorical columns that are one-hot encoded with their first level dropped.
dummy_source_columns = [
    'Lead Origin', 'Lead Source', 'Do Not Email', 'Last Activity',
    'What is your current occupation', 'A free copy of Mastering The Interview',
    'Last Notable Activity',
]
dummy = pd.get_dummies(leads[dummy_source_columns], drop_first=True)

# Append the indicator columns to the master dataframe
frames_to_join = [leads, dummy]
leads = pd.concat(frames_to_join, axis=1)

In [199]:
# One-hot encode 'Specialization' and remove the 'Select' indicator instead of the
# first level (presumably 'Select' is the unanswered placeholder — confirm).
# Both drops use `columns=`: the positional axis argument is rejected by pandas >= 2.0.
dummy_spl = pd.get_dummies(leads['Specialization'], prefix = 'Specialization')
dummy_spl = dummy_spl.drop(columns=['Specialization_Select'])
leads = pd.concat([leads, dummy_spl], axis = 1)

# Drop the original categorical variables now that dummy variables exist for them
leads = leads.drop(columns=['Lead Origin', 'Lead Source', 'Do Not Email', 'Last Activity',
                            'Specialization', 'What is your current occupation',
                            'A free copy of Mastering The Interview', 'Last Notable Activity'])

In [200]:
# Preview the dataset after encoding.
leads.head()

Test-Train Split

In [201]:
from sklearn.model_selection import train_test_split

In [202]:
# Feature matrix: every column except the target 'Converted'.
# `columns=` replaces the positional axis argument, which pandas >= 2.0 rejects.
X = leads.drop(columns=['Converted'])
X.head()

In [203]:
# Target vector: the 'Converted' column.
y = leads['Converted']

y.head()

In [204]:
# Class balance of the target, as proportions.
y.value_counts(normalize=True)

In [205]:
# Split the dataset into 70% train and 30% test; random_state fixes the shuffle
# so that the split is reproducible.
# NOTE(review): the split is not stratified on y — confirm the class proportions
# are close enough in both parts.

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=100)

In [206]:
from sklearn.preprocessing import MinMaxScaler

# The three continuous columns that get min-max scaled.
scaled_columns = ['TotalVisits', 'Page Views Per Visit', 'Total Time Spent on Website']
scaler = MinMaxScaler()

# Learn the scaling from the training rows only, then apply the same
# transformation to the test rows.
train_scaled = scaler.fit_transform(X_train[scaled_columns])
X_train[scaled_columns] = train_scaled
test_scaled = scaler.transform(X_test[scaled_columns])
X_test[scaled_columns] = test_scaled

X_train.head()

Correlation Analysis

In [207]:
# Pairwise correlations across all columns of `leads` (the unscaled, unsplit
# frame, so the target 'Converted' is included).
leads.corr()

In [208]:
# Heatmap of the correlations between the three continuous columns only.
ax = sns.heatmap(leads[['TotalVisits', 'Page Views Per Visit', 'Total Time Spent on Website']].corr())

In [209]:
import xgboost as xgb

In [221]:
# Baseline XGBoost classifier: 300 trees of depth up to 7, learning rate 0.3, no
# row or column subsampling, binary logistic objective, tracking error and AUC.
# NOTE(review): `gpu_id` is version-dependent in xgboost — confirm that it is still
# accepted by the installed release.
xgb_clf = xgb.XGBClassifier(max_depth=7, n_estimators=300, learning_rate=0.3,
                            n_jobs=-1, verbosity=1, gpu_id=-1,subsample=1, colsample_bytree=1,min_child_weight=1,
                           objective="binary:logistic",eval_metric=['error','auc'] )

In [222]:
# Display the configured (not yet fitted) classifier.
xgb_clf

In [223]:
# Fit the baseline model on the full training feature set.
xgb_clf.fit(X_train, y_train)

In [224]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
def print_score(clf, X_train, y_train, X_test, y_test, train=True, cv=5):
    '''
    Print the accuracy score, classification report and confusion matrix of a classifier.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    X_train, y_train : training features and labels.
    X_test, y_test : hold-out features and labels.
    train : bool, default True
        If truthy, report on the training split and append the cross-validated
        accuracy; any falsy value (False, 0, None) reports on the test split.
    cv : int or None, default 5
        Number of folds for the cross-validated accuracy printed with the
        training report. ``None`` or 0 skips cross-validation.

    Returns
    -------
    None. Everything is written to stdout.
    '''
    if train:
        print("Train Result:\n")
        features, labels = X_train, y_train
    else:
        # Plain `else`: the former `elif train==False` printed nothing at all
        # for falsy non-bool values such as None.
        print("Test Result:\n")
        features, labels = X_test, y_test

    # Predict once and reuse the result for all three metrics.
    predictions = clf.predict(features)
    print("accuracy score: {0:.4f}\n".format(accuracy_score(labels, predictions)))
    print("Classification Report: \n {}\n".format(classification_report(labels, predictions)))
    print("Confusion Matrix: \n {}\n".format(confusion_matrix(labels, predictions)))

    if train and cv:
        # cross_val_score refits clones of `clf` on each fold of the training data.
        res = cross_val_score(clf, X_train, y_train, cv=cv, scoring='accuracy')
        print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
        print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))
        

In [225]:
# Training-split report (plus 5-fold CV accuracy) for the baseline model.
print_score(xgb_clf, X_train, y_train, X_test, y_test, train=True)

In [226]:
# Test-split report for the baseline model.
print_score(xgb_clf, X_train, y_train, X_test, y_test, train=False)

In [237]:
###Feature selection using variable importance#######
from xgboost import plot_importance
# Pair each training feature with its importance in the fitted baseline model and
# order the table from most to least important.
importance_columns = {
    'feature_names': xgb_clf.feature_names_in_,
    'feature_importances': xgb_clf.feature_importances_,
}
var_imp_data = pd.DataFrame(importance_columns).sort_values('feature_importances', ascending=False)

In [239]:
# Show only the features with a non-zero importance.
var_imp_data[var_imp_data.feature_importances>0]

In [242]:
# Keep only the features with non-zero importance in the baseline model.
# Previously every feature name was taken, so the "reduced" feature space was
# just the full feature set re-ordered by importance.
reduced_feature_space=var_imp_data.loc[var_imp_data.feature_importances>0, 'feature_names'].values

In [245]:
# Restrict the train and test matrices to the columns in reduced_feature_space.
X_train_1=X_train[reduced_feature_space]
X_test_1=X_test[reduced_feature_space]

In [246]:
# Second classifier: same settings as the baseline except for shallower trees
# (max_depth=4) and a lower learning rate (0.2).
xgb_clf1 = xgb.XGBClassifier(max_depth=4, n_estimators=300, learning_rate=0.2,
                            n_jobs=-1, verbosity=1, gpu_id=-1,subsample=1, colsample_bytree=1,min_child_weight=1,
                           objective="binary:logistic",eval_metric=['error','auc'] )

In [247]:
# Fit the second model on the selected feature columns.
xgb_clf1.fit(X_train_1, y_train)

In [249]:
# Training-split report (plus 5-fold CV accuracy) for the second model.
print_score(xgb_clf1, X_train_1, y_train, X_test_1, y_test, train=True)

In [250]:
# Test-split report for the second model.
print_score(xgb_clf1, X_train_1, y_train, X_test_1, y_test, train=False)

In [251]:
###Grid Search#####
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV 
# Single-step pipeline wrapping an XGBoost classifier. The step name 'xgb' is the
# prefix used for the grid-search parameter names (xgb__<param>).
pipeline_xgb = Pipeline([('xgb', xgb.XGBClassifier(max_depth=4, n_estimators=300, learning_rate=0.2,
                            n_jobs=-1, verbosity=1, gpu_id=-1,subsample=1, colsample_bytree=1,min_child_weight=1,
                           objective="binary:logistic",eval_metric=['error','auc'] ))])


In [258]:
# Search grid: 2 depths x 2 tree counts x 2 learning rates = 8 combinations.
params_xgb = {'xgb__max_depth':(4,7),
               'xgb__n_estimators':(300,1000),
               'xgb__learning_rate':(0.2, 0.3)} 

In [259]:
# Exhaustive 5-fold cross-validated search over params_xgb, scored on accuracy.
# NOTE(review): both the search and the wrapped classifier request n_jobs=-1, which
# may oversubscribe the CPU — confirm on the target machine.
xgb_grid_pipeline = GridSearchCV(pipeline_xgb,
                                 params_xgb,
                                 n_jobs=-1,
                                 cv=5,
                                 verbose=1,
                                 scoring='accuracy')

In [260]:
# Run the grid search on the selected feature columns of the training split.
xgb_grid_pipeline.fit(X_train_1,y_train) 

In [261]:
# Mean cross-validated accuracy of the best parameter combination.
xgb_grid_pipeline.best_score_

In [263]:
# All parameters of the refit best pipeline (not only the tuned ones).
# NOTE(review): xgb_grid_pipeline.best_params_ would show just the winning grid values.
best = xgb_grid_pipeline.best_estimator_.get_params() 

In [264]:
# Display the parameter dictionary captured above.
best

In [268]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
def print_score1(clf, X_train, y_train, X_test, y_test, train=True):
    '''
    Print the accuracy score, classification report and confusion matrix of a classifier.

    Unlike a cross-validating report, this function only predicts with the
    already-fitted ``clf`` and never refits it.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    X_train, y_train : training features and labels.
    X_test, y_test : hold-out features and labels.
    train : bool, default True
        If truthy, report on the training split; any falsy value (False, 0,
        None) reports on the test split.

    Returns
    -------
    None. Everything is written to stdout.
    '''
    if train:
        print("Train Result:\n")
        features, labels = X_train, y_train
    else:
        # Plain `else`: the former `elif train==False` printed nothing at all
        # for falsy non-bool values such as None.
        print("Test Result:\n")
        features, labels = X_test, y_test

    # Predict once and reuse the result for all three metrics.
    predictions = clf.predict(features)
    print("accuracy score: {0:.4f}\n".format(accuracy_score(labels, predictions)))
    print("Classification Report: \n {}\n".format(classification_report(labels, predictions)))
    print("Confusion Matrix: \n {}\n".format(confusion_matrix(labels, predictions)))
        

In [269]:
# Training-split report for the grid-searched model (no extra cross-validation).
print_score1(xgb_grid_pipeline, X_train_1, y_train, X_test_1, y_test, train=True)

In [270]:
# Test-split report for the grid-searched model.
print_score1(xgb_grid_pipeline, X_train_1, y_train, X_test_1, y_test, train=False)