# Read the data

## Choose a dataset
- Online Shoppers Purchasing Intention Dataset
- 原因：特徵都有先命名了

## Analyze the data (statistics, correlation...)
- 所有的特徵都沒有缺失值問題
- 部分特徵 min max 差異數值多出兩到三個數量級，可能比較有影響
- Month, VisitorType 為 String，需做數值轉換

## Define a reasonable problem
- 我認為 Revenue 適合當所求問題答案 y
- 以其他特徵當作影響的因素 x
- 問題的意義：哪些特徵對獲利最有影響
- Revenue 的值為 true or false，不需再做 Label

In [370]:
import time
import pandas as pd
import numpy as np

EXECUTION_START_TIME = time.time()

df = pd.read_csv('./data/online_shoppers_intention.csv')
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  object 
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,OperatingSystems,Browser,Region,TrafficType
count,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0
mean,2.315166,80.818611,0.503569,34.472398,31.731468,1194.74622,0.022191,0.043073,5.889258,0.061427,2.124006,2.357097,3.147364,4.069586
std,3.321784,176.779107,1.270156,140.749294,44.475503,1913.669288,0.048488,0.048597,18.568437,0.198917,0.911325,1.717277,2.401591,4.025169
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,0.0,7.0,184.1375,0.0,0.014286,0.0,0.0,2.0,2.0,1.0,2.0
50%,1.0,7.5,0.0,0.0,18.0,598.936905,0.003112,0.025156,0.0,0.0,2.0,2.0,3.0,2.0
75%,4.0,93.25625,0.0,0.0,38.0,1464.157213,0.016813,0.05,0.0,0.0,3.0,2.0,4.0,4.0
max,27.0,3398.75,24.0,2549.375,705.0,63973.52223,0.2,0.2,361.763742,1.0,8.0,13.0,9.0,20.0


In [371]:
df[['Administrative_Duration', 'Informational_Duration', 'ProductRelated_Duration']].describe()

Unnamed: 0,Administrative_Duration,Informational_Duration,ProductRelated_Duration
count,12330.0,12330.0,12330.0
mean,80.818611,34.472398,1194.74622
std,176.779107,140.749294,1913.669288
min,0.0,0.0,0.0
25%,0.0,0.0,184.1375
50%,7.5,0.0,598.936905
75%,93.25625,0.0,1464.157213
max,3398.75,2549.375,63973.52223


In [372]:
print(df[['Month', 'VisitorType']])

Month        VisitorType
0       Feb  Returning_Visitor
1       Feb  Returning_Visitor
2       Feb  Returning_Visitor
3       Feb  Returning_Visitor
4       Feb  Returning_Visitor
...     ...                ...
12325   Dec  Returning_Visitor
12326   Nov  Returning_Visitor
12327   Nov  Returning_Visitor
12328   Nov  Returning_Visitor
12329   Nov        New_Visitor

[12330 rows x 2 columns]


# 分出 train_x, train_y

In [373]:
from sklearn.preprocessing import LabelEncoder

def label_encoder(data_df):
    le = LabelEncoder()
    le.fit(data_df)
    return le.transform(data_df)

train_x = df[['Administrative', 'Administrative_Duration', 'Informational', 'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration', 'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'Month', 'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType', 'Weekend']]
train_y = df['Revenue']

train_x['Month'] = label_encoder(train_x['Month'])
train_x['VisitorType'] = label_encoder(train_x['VisitorType'])

# 訓練模型並預測
- 決定仿照作業二的訓練方式及計算準確度的方式
- 首先先在不做任何額外 preprocess 條件下嘗試

In [374]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay Month OperatingSystems Browser Region TrafficType VisitorType Weekend

def train_and_valid_acc(kf_n_splits, rf_max_depth, rf_n_estimators):
    kf = KFold(n_splits=kf_n_splits, random_state=0, shuffle=True)
    kf.get_n_splits(train_x)
    train_acc_list = []
    valid_acc_list = []
    for train_index, valid_index in kf.split(train_x):
        train_x_split = train_x.iloc[train_index]
        train_y_split = train_y.iloc[train_index]
        valid_x_split = train_x.iloc[valid_index]
        valid_y_split = train_y.iloc[valid_index]
        model = RandomForestClassifier(max_depth=rf_max_depth,  n_estimators=rf_n_estimators, random_state=0)
        model.fit(train_x_split, train_y_split)
        train_pred_y = model.predict(train_x_split)
        train_acc = accuracy_score(train_y_split, train_pred_y)
        valid_pred_y = model.predict(valid_x_split)
        valid_acc = accuracy_score(valid_y_split, valid_pred_y)
        train_acc_list.append(train_acc)
        valid_acc_list.append(valid_acc)
    print("Predict Result Accuracy:\n")
    print((
        'average train accuracy: {}\n' +
        '    min train accuracy: {}\n' +
        '    max train accuracy: {}\n' +
        'average valid accuracy: {}\n' +
        '    min valid accuracy: {}\n' +
        '    max valid accuracy: {}').format(
        np.mean(train_acc_list),
        np.min(train_acc_list),
        np.max(train_acc_list),
        np.mean(valid_acc_list),
        np.min(valid_acc_list),
        np.max(valid_acc_list)
    ))

train_and_valid_acc(kf_n_splits = 10, rf_max_depth=None rf_n_estimators = 100)


Predict Result Accuracy:

average train accuracy: 0.9466432369108768
    min train accuracy: 0.9443993872217716
    max train accuracy: 0.9489952239343967
average valid accuracy: 0.9020275750202756
    min valid accuracy: 0.8905109489051095
    max valid accuracy: 0.910786699107867


# Improved Results Step1
- 結果明顯 overfitting
- 推測是使用的特徵太多
- Drop 掉原始資料 std 過小的特徵

In [375]:
train_x.drop(['BounceRates', 'ExitRates', 'SpecialDay'], axis=1)
train_and_valid_acc(kf_n_splits = 10, rf_max_depth=None, rf_n_estimators = 100)

Predict Result Accuracy:

average train accuracy: 0.9999819771109308
    min train accuracy: 0.9999098855546544
    max train accuracy: 1.0
average valid accuracy: 0.9030008110300081
    min valid accuracy: 0.8929440389294404
    max valid accuracy: 0.9148418491484185


# Improced Results Step2
- 結果依然 overfitting
- 繼續 drop

In [376]:
train_x.drop(['Administrative', 'Informational', 'ProductRelated'], axis=1)
train_and_valid_acc(kf_n_splits = 10, rf_max_depth=None, rf_n_estimators = 100)

Predict Result Accuracy:

average train accuracy: 0.9999819771109308
    min train accuracy: 0.9999098855546544
    max train accuracy: 1.0
average valid accuracy: 0.9030008110300081
    min valid accuracy: 0.8929440389294404
    max valid accuracy: 0.9148418491484185


In [377]:
train_x.drop(['Month', 'OperatingSystems', 'Browser'], axis=1)
train_and_valid_acc(kf_n_splits = 10, rf_max_depth=None, rf_n_estimators = 100)

Predict Result Accuracy:

average train accuracy: 0.9999819771109308
    min train accuracy: 0.9999098855546544
    max train accuracy: 1.0
average valid accuracy: 0.9030008110300081
    min valid accuracy: 0.8929440389294404
    max valid accuracy: 0.9148418491484185


In [365]:
train_x.drop(['Region', 'TrafficType', 'VisitorType', 'Weekend'], axis=1)
train_and_valid_acc(kf_n_splits = 10, rf_max_depth=None, rf_n_estimators = 100)

Predict Result Accuracy:

average train accuracy: 0.9999819771109308
    min train accuracy: 0.9999098855546544
    max train accuracy: 1.0
average valid accuracy: 0.9030008110300081
    min valid accuracy: 0.8929440389294404
    max valid accuracy: 0.9148418491484185


# Improved Results Step3
- 依據 HW2 經驗，推測可能是 RandomForest 取樣參數過大
- 也可能是 RandomForest 中 tree depth 過大

In [366]:
train_and_valid_acc(kf_n_splits = 10, rf_max_depth=15, rf_n_estimators = 50)

Predict Result Accuracy:

average train accuracy: 0.9885013967739029
    min train accuracy: 0.9865729476435072
    max train accuracy: 0.9900874110119853
average valid accuracy: 0.9026763990267639
    min valid accuracy: 0.8921330089213301
    max valid accuracy: 0.9148418491484185


In [367]:
train_and_valid_acc(kf_n_splits = 10, rf_max_depth=15, rf_n_estimators = 25)

Predict Result Accuracy:

average train accuracy: 0.9866630620888529
    min train accuracy: 0.9854915742993602
    max train accuracy: 0.9879246643236911
average valid accuracy: 0.9014598540145986
    min valid accuracy: 0.8872668288726683
    max valid accuracy: 0.9132197891321979


In [368]:
train_and_valid_acc(kf_n_splits = 10, rf_max_depth=10, rf_n_estimators = 20)


Predict Result Accuracy:

average train accuracy: 0.9466432369108768
    min train accuracy: 0.9443993872217716
    max train accuracy: 0.9489952239343967
average valid accuracy: 0.9020275750202756
    min valid accuracy: 0.8905109489051095
    max valid accuracy: 0.910786699107867


# Final Result
- average train accuracy: 0.9466432369108768
- average valid accuracy: 0.9020275750202746


In [369]:
EXECUTION_END_TIME = time.time()
print('total execution time: {}'.format(EXECUTION_END_TIME - EXECUTION_START_TIME))



total execution time: 69.86223816871643
