
# 期中考
***
Kaggle期中考知識點目標
競賽結束後你可以學會
- 如何處理存在各種缺陷的真實資料
- 使用 val / test data 來了解機器學習模型的訓練情形
- 使用適當的評估函數了解預測結果
- 應用適當的特徵工程提升模型的準確率
- 調整機器學習模型的超參數來提升準確率
- 清楚的說明文件讓別人了解你的成果

https://www.kaggle.com/c/3rd-ml100marathon-midterm/data?select=train_data.csv

In [1]:

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [2]:
# Load the labelled training set and the unlabelled test features.

data_path = './'
train_data, test_data = (
    pd.read_csv(f'{data_path}{csv_name}')
    for csv_name in ('train_data.csv', 'test_features.csv')
)

In [3]:
# Show (rows, columns) of the training frame as loaded (still includes the 'poi' label).
train_data.shape

(113, 22)

In [4]:
# Show (rows, columns) of the test frame as loaded.
test_data.shape

(33, 21)

In [5]:
# Columns that exist in the training frame but not in the test frame.
set(train_data.columns).difference(test_data.columns)

{'poi'}

In [6]:
# Separate the target column from the training features.
train_Y = train_data['poi']
train_data = train_data.drop(columns='poi')
for frame_name, frame in (('train_data', train_data), ('test_data', test_data)):
    print(f'{frame_name}.shape: {frame.shape}')

train_data.shape: (113, 21)
test_data.shape: (33, 21)


In [7]:
# Stack the train and test rows so the same cleaning/encoding is applied to both.
# ignore_index=True gives the combined frame a unique 0..n-1 row index; without it
# each input keeps its own row labels and the combined index contains duplicates.
df = pd.concat([train_data, test_data], ignore_index=True)
print(f'df.shape: {df.shape}')

df.shape: (146, 21)


In [8]:
# Preview the first rows of the combined train + test frame.
df.head()

Unnamed: 0,name,bonus,deferral_payments,deferred_income,director_fees,email_address,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,...,loan_advances,long_term_incentive,other,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
0,RICE KENNETH D,1750000.0,,-3504386.0,,ken.rice@enron.com,19794175.0,46950.0,18.0,42.0,...,,1617011.0,174839.0,2748364.0,,420636.0,864.0,905.0,505050.0,22542539.0
1,SKILLING JEFFREY K,5600000.0,,,,jeff.skilling@enron.com,19250000.0,29336.0,108.0,88.0,...,,1920000.0,22122.0,6843672.0,,1111258.0,2042.0,3627.0,8682716.0,26093672.0
2,SHELBY REX,200000.0,,-4167.0,,rex.shelby@enron.com,1624396.0,22884.0,39.0,13.0,...,,,1573324.0,869220.0,,211844.0,91.0,225.0,2003885.0,2493616.0
3,KOPPER MICHAEL J,800000.0,,,,michael.kopper@enron.com,,118134.0,,,...,,602671.0,907502.0,985032.0,,224305.0,,,2652612.0,985032.0
4,CALGER CHRISTOPHER F,1250000.0,,-262500.0,,christopher.calger@enron.com,,35818.0,144.0,199.0,...,,375304.0,486.0,126027.0,,240189.0,2188.0,2598.0,1639297.0,126027.0


In [9]:
# Display the element-wise missing-value mask (True where a cell is null).
# The per-column totals are computed in the next cell.
df.isnull()

Unnamed: 0,name,bonus,deferral_payments,deferred_income,director_fees,email_address,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,...,loan_advances,long_term_incentive,other,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
0,False,False,True,False,True,False,False,False,False,False,...,True,False,False,False,True,False,False,False,False,False
1,False,False,True,True,True,False,False,False,False,False,...,True,False,False,False,True,False,False,False,False,False
2,False,False,True,False,True,False,False,False,False,False,...,True,True,False,False,True,False,False,False,False,False
3,False,False,True,True,True,False,True,False,True,True,...,True,False,False,False,True,False,True,True,False,False
4,False,False,True,False,True,False,True,False,False,False,...,True,False,False,False,True,False,False,False,False,False
5,False,True,False,True,True,False,False,False,True,True,...,True,True,False,True,True,True,True,True,False,False
6,False,True,True,True,True,False,False,False,True,True,...,True,True,False,False,True,False,True,True,False,False
7,False,False,False,False,True,False,True,False,False,False,...,True,True,False,False,True,False,False,False,False,False
8,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
9,False,False,True,True,True,False,False,False,False,False,...,True,False,False,False,True,False,False,False,False,False


In [10]:
# Count of missing values in each column of the combined frame.
df.isnull().sum()

name                           0
bonus                         64
deferral_payments            107
deferred_income               97
director_fees                129
email_address                 35
exercised_stock_options       44
expenses                      51
from_messages                 60
from_poi_to_this_person       60
from_this_person_to_poi       60
loan_advances                142
long_term_incentive           80
other                         53
restricted_stock              36
restricted_stock_deferred    128
salary                        51
shared_receipt_with_poi       60
to_messages                   60
total_payments                21
total_stock_value             20
dtype: int64

In [11]:
# Number of rows in the combined frame (the denominator for the missing ratios below).
len(df)

146

In [12]:
# Report the share of missing values per column of a DataFrame
def na_check(df_data):
    """Print the percentage of missing values for every column that has any.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Frame to inspect.

    Columns without missing values are omitted. The remaining columns are
    printed as a one-column table ('Missing Ratio', in percent), sorted from
    most to least missing. Nothing is returned.
    """
    # Count nulls on the frame that was passed in, not on the notebook-level `df`.
    data_na = df_data.isnull().sum() / len(df_data) * 100
    data_na = data_na.drop(data_na[data_na==0].index).sort_values(ascending=False)
    missing_data = pd.DataFrame({'Missing Ratio': data_na})
    print(missing_data)
# Print the missing-value percentages for the combined frame.
na_check(df)

                           Missing Ratio
loan_advances                  97.260274
director_fees                  88.356164
restricted_stock_deferred      87.671233
deferral_payments              73.287671
deferred_income                66.438356
long_term_incentive            54.794521
bonus                          43.835616
from_messages                  41.095890
from_poi_to_this_person        41.095890
from_this_person_to_poi        41.095890
shared_receipt_with_poi        41.095890
to_messages                    41.095890
other                          36.301370
expenses                       34.931507
salary                         34.931507
exercised_stock_options        30.136986
restricted_stock               24.657534
email_address                  23.972603
total_payments                 14.383562
total_stock_value              13.698630


## 特徵處理

In [13]:
# Partition the columns by dtype: float64/int64 columns go to int_feature,
# every other column goes to obj_feature.
numeric_dtypes = ('float64', 'int64')
int_feature = [col for col, col_dtype in df.dtypes.items() if col_dtype in numeric_dtypes]
obj_feature = [col for col, col_dtype in df.dtypes.items() if col_dtype not in numeric_dtypes]

print(f'obj_feature: {obj_feature}')
print(f'int_feature: {int_feature}')

obj_feature: ['name', 'email_address']
int_feature: ['bonus', 'deferral_payments', 'deferred_income', 'director_fees', 'exercised_stock_options', 'expenses', 'from_messages', 'from_poi_to_this_person', 'from_this_person_to_poi', 'loan_advances', 'long_term_incentive', 'other', 'restricted_stock', 'restricted_stock_deferred', 'salary', 'shared_receipt_with_poi', 'to_messages', 'total_payments', 'total_stock_value']


In [14]:
# Replace missing values in the numeric columns with 0
df[int_feature] = df[int_feature].fillna(0)

In [15]:
# Re-check missing values after the numeric columns were filled.
na_check(df)

               Missing Ratio
email_address      23.972603


In [16]:
# Missing e-mail addresses become the literal string 'None'
df = df.assign(email_address=df['email_address'].fillna('None'))

In [17]:
# Confirm that no column has missing values left.
na_check(df)

Empty DataFrame
Columns: [Missing Ratio]
Index: []


In [18]:
# One-hot encoding of the non-numeric columns ('name' and 'email_address').
# NOTE(review): both columns look close to unique per row, so most of the resulting
# indicator columns mark a single person; consider dropping them or replacing
# them with a coarser feature before modelling.
df = pd.get_dummies(df)

In [19]:
# Shape of the frame after one-hot encoding.
df.shape

(146, 277)

In [20]:
# Split the encoded frame back into training rows and test rows
# (the training rows come first because of the concat order).
train_num = len(train_Y)
train_X, test_X = df.iloc[:train_num], df.iloc[train_num:]
for label, value in (('train_num', train_num),
                     ('train_X.shape', train_X.shape),
                     ('test_X.shape', test_X.shape)):
    print(f'{label}: {value}')

train_num: 113
train_X.shape: (113, 277)
test_X.shape: (33, 277)


In [21]:
# Hold out 20% of the labelled rows for validation, with a fixed seed.
train_x, val_x, train_y, val_y = train_test_split(
    train_X,
    train_Y,
    test_size=0.2,
    random_state=4,
)

for split_name, split in (('train_x', train_x), ('val_x', val_x)):
    print(f'{split_name}.shape: {split.shape}')

train_x.shape: (90, 277)
val_x.shape: (23, 277)


In [22]:
# LogisticRegression
lr =LogisticRegression()
lr.fit(train_x, train_y)
val_pred = lr.predict(val_x)
# accuracy_score is documented as (y_true, y_pred); pass the arguments in that order.
print(f'LogisticRegression {accuracy_score(val_y, val_pred)}')

# Export a submission CSV: one row per test name with the probability of the
# second class (column 1 of predict_proba).
lr_test_pred = lr.predict_proba(test_X)[:,1]
sub = pd.DataFrame({'name': test_data['name'], 'poi': lr_test_pred})
sub.to_csv('3dr_100_lr_v2.csv', index=False)

LogisticRegression 0.8260869565217391


In [23]:
# RandomForestClassifier
# Seeded so the validation score and the exported probabilities are reproducible
# across runs (same seed as the train/validation split).
rf = RandomForestClassifier(random_state=4)
rf.fit(train_x, train_y)
val_pred = rf.predict(val_x)
# accuracy_score is documented as (y_true, y_pred); pass the arguments in that order.
print(f'RandomForestClassifier :{accuracy_score(val_y, val_pred)}')

# Export a submission CSV: one row per test name with the probability of the
# second class (column 1 of predict_proba).
rf_test_pred = rf.predict_proba(test_X)[:,1]
sub = pd.DataFrame({'name': test_data['name'], 'poi': rf_test_pred})
sub.to_csv('3dr_100_rf_v2.csv', index=False)

RandomForestClassifier :0.782608695652174


In [24]:
# GradientBoostingClassifier
# Seeded so the validation score and the exported probabilities are reproducible
# across runs (same seed as the train/validation split).
gdbt = GradientBoostingClassifier(random_state=4)
gdbt.fit(train_x, train_y)
val_pred = gdbt.predict(val_x)
# accuracy_score is documented as (y_true, y_pred); pass the arguments in that order.
print(f'GradientBoostingClassifier :{accuracy_score(val_y, val_pred)}')

# Export a submission CSV: one row per test name with the probability of the
# second class (column 1 of predict_proba).
gdbt_test_pred = gdbt.predict_proba(test_X)[:,1]
sub = pd.DataFrame({'name': test_data['name'], 'poi': gdbt_test_pred})
sub.to_csv('3dr_100_gdbt_v2.csv', index=False)

GradientBoostingClassifier :0.8695652173913043


In [25]:
# Blending: weighted sum of the three models' test-set probabilities.
w_lr, w_rf, w_gdbt = 0.3, 0.3, 0.4
blending_pred = lr_test_pred * w_lr + rf_test_pred * w_rf + gdbt_test_pred * w_gdbt
sub = test_data[['name']].assign(poi=blending_pred)
sub.to_csv('3dr_100_bleading_v2.csv', index=False)

In [36]:
# Stacking: the three base classifiers feed a gradient-boosting meta-classifier.
from mlxtend.classifier import StackingClassifier
# random_state is set because subsample < 1 makes the meta-classifier stochastic.
meta_estimator = GradientBoostingClassifier(tol=10, subsample=0.44, n_estimators=100,
                                            max_features='log2', max_depth=4, learning_rate=0.1,
                                            random_state=4)
stacking = StackingClassifier(classifiers=[lr, gdbt, rf], meta_classifier=meta_estimator)

stacking.fit(train_x, train_y)
val_pred = stacking.predict(val_x)
# The model is a classifier, so label it as such; accuracy_score takes (y_true, y_pred).
print(f'StackingClassifier {accuracy_score(val_y, val_pred)}')

# Export a submission CSV: one row per test name with the probability of the
# second class (column 1 of predict_proba).
stacking_test_pred = stacking.predict_proba(test_X)[:,1]
sub = pd.DataFrame({'name': test_data['name'], 'poi': stacking_test_pred})
sub.to_csv('3dr_100_stacking_v2.csv', index=False)

StackingRegressor 0.8695652173913043


In [30]:
# Inspect the stacked model's test-set predictions.
# NOTE(review): the recorded output is a boolean array, while the stacking cell
# assigns predict_proba(...)[:, 1]; this cell's execution count is also lower
# than the stacking cell's, so the output looks stale. Re-run to refresh.
stacking_test_pred

array([False, False, False, False, False, False,  True, False, False,
       False, False, False, False, False,  True, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False, False])