## Import Dataset

In [1]:
import pandas as pd

# Let pandas display up to 500 rows and 500 columns before truncating output.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

# Read the Stata file into a DataFrame.
df = pd.read_stata('data_for_ML.dta')

In [2]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head(n=5)

Unnamed: 0,date_time,visitor_id,session_id,log_id,order_id,page_id,reg_time,gender,birthday,county,dist,reg_status,reg_state,history_id,order,mon,Mon,cluster3,dur,duration,obss,visit,DP,pages,VP,datetime,month
0,2015-10-12 01:11:33,1.0.185.4999691,1.0,2009407.0,,1,,,,,,,0,1.0,0.0,1.0,1.0,4.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,20373.0,Oct
1,2016-12-23 15:33:51,1.1.125.10020250,2.0,8371956.0,,8,,,,,,,0,2.0,0.0,1.0,1.0,4.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,20811.0,Dec
2,2016-08-17 15:34:44,1.1.125.10133673,3.0,4109637.0,,13,,,,,,,0,3.0,0.0,1.0,1.0,4.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,20683.0,Aug
3,2016-03-29 20:50:46,1.1.125.10531582,4.0,8439479.0,,21,,,,,,,0,4.0,0.0,1.0,1.0,4.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,20542.0,Mar
4,2015-11-29 12:26:18,1.1.125.10629483,5.0,6343429.0,,36,,,,,,,0,5.0,0.0,1.0,1.0,4.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,20421.0,Nov


## Split the Data into Training and Testing Sets

In [3]:
from sklearn.model_selection import train_test_split

# Keep only the target ('order') and the five predictor columns.
data = df[['order', 'duration', 'pages', 'mon', 'cluster3', 'reg_state']]

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is reproducible.
# `columns=` replaces the positional axis argument (`drop('order', 1)`), which
# pandas deprecated in 1.3 and rejects with a TypeError from 2.0 onward.
x_train, x_test, y_train, y_test = train_test_split(
    data.drop(columns='order'),  # predictors
    data['order'],               # target
    test_size=0.2,
    random_state=42,
)


## Implement the Training Model

In [4]:
from sklearn.ensemble import RandomForestClassifier

# 500-tree forest; the seed is fixed for reproducibility and n_jobs=-1 uses
# every available CPU core.
rf = RandomForestClassifier(
    n_estimators=500,
    random_state=42,
    n_jobs=-1,
)
# Left as the last expression so the fitted estimator's repr is displayed.
rf.fit(X=x_train, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [5]:
# Mean accuracy on the data the forest was fitted on.
rf.score(X=x_train, y=y_train)

0.9994333003842409

In [6]:
# Mean accuracy on the held-out rows. The test set is heavily imbalanced (see
# the per-class support in the classification report), so read this together
# with the per-class precision and recall rather than on its own.
rf.score(X=x_test, y=y_test)

0.9912301563707712

In [7]:
from sklearn.metrics import classification_report, confusion_matrix

actual = y_test
prediction = rf.predict(x_test)

# A bare expression is only displayed when it is the last statement of a cell,
# so the confusion matrix was previously computed and thrown away. Print it
# explicitly (rows = actual class, columns = predicted class).
print(confusion_matrix(actual, prediction))

# Per-class precision, recall, F1 and support.
print(classification_report(actual, prediction))

             precision    recall  f1-score   support

        0.0       0.99      1.00      1.00    133212
        1.0       0.58      0.44      0.50      1340

avg / total       0.99      0.99      0.99    134552



### Tune `max_features` with 10-Fold Cross-Validation

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Fresh, unfitted forest used as the template estimator for the search.
# Note: this rebinds `rf`, replacing the model fitted earlier in the notebook.
rf = RandomForestClassifier(n_estimators=500, random_state=42, n_jobs=-1)

# Candidate numbers of features to consider at each split.
param_grid = dict(max_features=[1, 2, 3, 4, 5])

# 10-fold cross-validated search over the grid, fitted on the training split only.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=10)
grid_search.fit(x_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_features': [1, 2, 3, 4, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [9]:
# Parameter setting from `param_grid` that achieved the best mean
# cross-validated score in the grid search above.
grid_search.best_params_

{'max_features': 5}