# 訓練 😉

### 載入函式庫

In [1]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split 
from sklearn import tree
from sklearn import svm
from sklearn.metrics import accuracy_score
import graphviz
from collections import defaultdict
import pandas as pd
import data_generator
import os
import lightgbm as lgb
import numpy as np

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


### 讀入資料

In [2]:
df = pd.read_csv("data/student_data_30000.csv")
df.head()

Unnamed: 0,Avg sleep time,Avg study time,Avg video game time,BMI,In a relationship,Family financial status,GPA,Laptop brand,TOEIC grade,IQ,Grade
0,7,3,13,33.0,Yes,high,2.6,Apple,700,142,80+
1,7,3,12,24.0,No,average,2.6,Leveno,910,115,60-
2,8,3,11,31.0,Yes,high,3.2,Acer,545,128,60-
3,6,4,8,32.0,No,low,4.2,Asus,455,115,80+
4,7,4,10,32.0,Yes,average,2.9,Asus,945,148,60-


### 特徵

In [3]:
features = df.drop("Grade", axis=1)
features["In a relationship"] = features["In a relationship"].map({"Yes":1, "No":0})
features["Family financial status"] = features["Family financial status"].map({"low":0, "average":1, "high":2})
features["Laptop brand"] = features["Laptop brand"].map({"Leveno":0, "HP":1, "Dell":2, "Acer":3, "Asus":4, "Apple":5})
features.head()

Unnamed: 0,Avg sleep time,Avg study time,Avg video game time,BMI,In a relationship,Family financial status,GPA,Laptop brand,TOEIC grade,IQ
0,7,3,13,33.0,1,2,2.6,5,700,142
1,7,3,12,24.0,0,1,2.6,0,910,115
2,8,3,11,31.0,1,2,3.2,3,545,128
3,6,4,8,32.0,0,0,4.2,4,455,115
4,7,4,10,32.0,1,1,2.9,4,945,148


### 類別

In [4]:
label = df["Grade"]
label = label.map({"60-":0, "60~80":1, "80+":2})
label.value_counts()

2    10000
1    10000
0    10000
Name: Grade, dtype: int64

### 將資料分成訓練集(0.8)與測試集(0.2)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.20)

### Decision Tree 🌲

In [6]:
clf = tree.DecisionTreeClassifier(min_samples_split=2000, min_samples_leaf=1000, max_depth=8)
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1000, min_samples_split=2000,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [7]:
pred_label = clf.predict(X_test)

In [8]:
accuracy_score(y_test, pred_label)

0.7366666666666667

In [9]:
dot_data = tree.export_graphviz(clf, out_file=None)
graph = graphviz.Source(dot_data)
dot_data = tree.export_graphviz(clf, out_file="tree.dot", feature_names=list(features.columns), 
                                class_names=["60-", "60~80", "80+"], 
                                filled=True, rounded=True, 
                                special_characters=True)
# graph = graphviz.Source(dot_data)
# graph

### Support Vector Machine 🗿

In [10]:
clf = svm.SVC(gamma='scale')
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [11]:
pred_label = clf.predict(X_test)

In [12]:
accuracy_score(y_test, pred_label)

0.5665

### LightGBM 🍃

In [13]:
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

In [14]:
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'num_class': 3,
    'is_provide_training_metric': True,
    'learning_rate': 0.01,
    'num_threads': 6,
    'num_leaves': 30,
    'max_depth': 7
}

In [15]:
gbm = lgb.train(params, lgb_train, num_boost_round=10000, valid_sets=lgb_eval,
                early_stopping_rounds=300, verbose_eval=100)

Training until validation scores don't improve for 300 rounds.
[100]	valid_0's multi_logloss: 0.593373
[200]	valid_0's multi_logloss: 0.448243
[300]	valid_0's multi_logloss: 0.39713
[400]	valid_0's multi_logloss: 0.378482
[500]	valid_0's multi_logloss: 0.371765
[600]	valid_0's multi_logloss: 0.369561
[700]	valid_0's multi_logloss: 0.368604
[800]	valid_0's multi_logloss: 0.368433
[900]	valid_0's multi_logloss: 0.368793
[1000]	valid_0's multi_logloss: 0.369133
[1100]	valid_0's multi_logloss: 0.369656
Early stopping, best iteration is:
[819]	valid_0's multi_logloss: 0.368355


In [16]:
pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)

In [17]:
acc = accuracy_score(np.argmax(pred, axis=1), y_test)

In [18]:
acc

0.7888333333333334

In [19]:
# !jupyter nbconvert --to script training.ipynb