In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('20200402_Dataset_final.csv')
test = pd.read_csv('20200422_validation_dataset.csv')
data.head()
test.head()

Unnamed: 0,contactid,K2_Createdtime,K1_Createdtime,industry,leadsource,source,City,Converted,approval_date,Converted_days,MKT_Leadsource,Telesale
0,10928669,2020-01-23,2020-01-23,Tạp hóa,Facebook CTW_Team3,Facebook,Đồng Nai,0,,,1,1
1,10831457,2020-01-15,2020-01-15,Bar & Cafe & Nhà hàng,Google Search Ads_Team4,Google,An Giang,1,2020-01-20 00:00:00 UTC,5.0,1,1
2,10660931,2020-01-03,2020-01-03,Bar & Cafe & Nhà hàng,LBKhachCu,Khác,Lâm Đồng,0,,,1,0
3,10958027,2020-01-31,2020-01-31,Ngành khác,Partner,Partner,Hà Nội,1,2020-02-01 00:00:00 UTC,1.0,0,1
4,10943553,2020-01-30,2020-01-26,Nội thất,Facebook CTW_Team3,Facebook,Đồng Nai,1,2020-02-01 00:00:00 UTC,6.0,1,1


In [3]:
#Check the number of contact from mkt and telesale
print('Total contact in dataset:', data.iloc[:,0].count())
print('Contact from MKT_leadsource:',data.MKT_Leadsource.sum())
print('Contact from telesale:',data.Telesale.sum())
print('Contact from MKT_leadsource or telesale:', data[(data.MKT_Leadsource==1) | (data.Telesale==1)].iloc[:,0].count())

Total contact in dataset: 150605
Contact from MKT_leadsource: 127077
Contact from telesale: 145835
Contact from MKT_leadsource or telesale: 150605


In [4]:
data = data[['contactid', 'industry', 'leadsource', 'City', 'Converted']]
test = test[['contactid', 'industry', 'leadsource', 'City', 'Converted']]
data.rename(columns={'contactid': 'id', 'City': 'city', 'Converted': 'converted'}, inplace=True)
test.rename(columns={'contactid': 'id', 'City': 'city', 'Converted': 'converted'}, inplace=True)
data.head()

Unnamed: 0,id,industry,leadsource,city,converted
0,10055792,Bar & Cafe & Nhà hàng,Google Search Ads_Team4,Hà Nội,0
1,4911554,Điện thoại & Điện máy,DTD_Bình Định,Bình Định,0
2,5851200,Điện thoại & Điện máy,Partner,Hà Nội,1
3,8798186,Ngành khác,GHECHUA,Hồ Chí Minh,0
4,7864381,Bar & Cafe & Nhà hàng,Google Search Ads_Team4,Kon Tum,0


In [5]:
# model, evaluation metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, roc_curve, auc, precision_recall_curve, f1_score, accuracy_score, precision_score, recall_score

In [6]:
feature_columns = ['industry', 'leadsource', 'city']

y = data['converted']
X = data[feature_columns]
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=1)

y_test = test['converted']
X_test = test[feature_columns]

In [13]:
# One-hot Encoder
oh_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

oh_X_train = pd.DataFrame(oh_encoder.fit_transform(X_train[feature_columns]))
oh_X_valid = pd.DataFrame(oh_encoder.fit_transform(X_valid[feature_columns]))
oh_X_test = pd.DataFrame(oh_encoder.fit_transform(X_test[feature_columns]))

In [14]:
# One-hot encoding removed index; put it back
oh_X_train.index = X_train.index
oh_X_valid.index = X_valid.index
oh_X_test.index = X_test.index

In [15]:
rdf_model = RandomForestClassifier(n_estimators=1000, class_weight='balanced', random_state=0, n_jobs=4)
rdf_model.fit(oh_X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=1000, n_jobs=4, oob_score=False,
                       random_state=0, verbose=0, warm_start=False)

In [16]:
pred = rdf_model.predict_proba(oh_X_valid)[:,1]

fpr, tpr, thresholds = roc_curve(y_valid, pred, pos_label=1)
roc_auc = auc(fpr, tpr)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

ValueError: Number of features of the model must match the input. Model n_features is 177 and input n_features is 156 

In [11]:
#final score on valid dataset
from sklearn.metrics import confusion_matrix, classification_report
y_valid_pred = rdf_model.predict(oh_X_valid)
print(classification_report(y_valid,y_valid_pred))
print(confusion_matrix(y_valid,y_valid_pred))

              precision    recall  f1-score   support

           0       0.89      0.75      0.81     25223
           1       0.30      0.54      0.38      4898

    accuracy                           0.71     30121
   macro avg       0.59      0.65      0.60     30121
weighted avg       0.80      0.71      0.74     30121

[[18854  6369]
 [ 2231  2667]]


In [17]:
# Final score on test dataset
y_test_pred = rdf_model.predict(oh_X_test)
print(classification_report(y_test,y_test_pred))
print(confusion_matrix(y_test,y_test_pred))

ValueError: Number of features of the model must match the input. Model n_features is 177 and input n_features is 129 