<a href="https://colab.research.google.com/github/ngngocsonan2610/note/blob/master/1.PreProcessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

>>[Preprocessing](#scrollTo=-2COeSc5ml1_)

>>[Transformation Scaling/Standardize](#scrollTo=PcX1fpPrrg-_)

>>[Encoding](#scrollTo=qeLaVzFW0Se6)

>>>[Onehot Encode](#scrollTo=HULldo6Zm6l1)

>>>[Label Encode](#scrollTo=Hz-vTHED0Vbc)

>>>[Reduce memory](#scrollTo=WzFvPys_F9KB)

>>>[Reduce dimensional checking](#scrollTo=a_a8z7m7uec_)

>>[Feature Generation](#scrollTo=ffkEEAsbC4EW)

>>[Feature Selection](#scrollTo=LfdUhklbDAQa)

>>>[High correlation](#scrollTo=k9prSardKWXm)

>>>[WOE & IV](#scrollTo=t-8cO9PJ3ryk)

>>>[RFE](#scrollTo=cmnU0NRff_ck)

>>[Sampling](#scrollTo=JfMtLkkO7fxr)

>>>[Under-sampling](#scrollTo=5mzGc0BE7xWq)

>>>[Over-sampling](#scrollTo=4il7y5CI8O8D)

>>>>[RandomOverSampler](#scrollTo=2ioqMRx98Syz)

>>>>[SMOTE](#scrollTo=8A8WLw5B8dnc)



## Preprocessing

## Transformation Scaling/Standardize
News:
- https://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing


Căn bản:
- [Scale, Standardize, or Normalize with Scikit-Learn](https://towardsdatascience.com/scale-standardize-or-normalize-with-scikit-learn-6ccc7d176a02)
![alt text](https://miro.medium.com/max/4800/1*z-C9ANBC4rjsk-ZK4wzijg.png)
- [Why, How and When to Scale your Features](https://medium.com/greyatom/why-how-and-when-to-scale-your-features-4b30ab09db5e)


- Đối với từng model nên xét thêm





In [0]:
from sklearn.preprocessing import RobustScaler
# Numeric columns that are excluded from scaling.
nottrans_num_col = [
    'AGSegCode',
    'MAPASegCode',
    'Leader_AGSegCode',
    'Leader_MAPASegCode',
]

# Every remaining numeric WOE column gets scaled.
scale_col = list(filter(lambda name: name not in nottrans_num_col, WOE_numeric_col))
print('scale cols' ,scale_col)

# NOTE: the scaler is fitted on values rounded to 2 decimals but is then
# applied to the unrounded values.
robust_scale = RobustScaler()
robust_scale.fit(df_WOE[scale_col].round(2))
df_WOE[scale_col] = robust_scale.transform(df_WOE[scale_col])

## Encoding


### Onehot Encode

In [0]:
# One-hot encode each code column; the indicator columns are appended to df
# and their names are collected in dummies_col.
dummies_col = []
for c in ('Leader_RegionCode', 'RegionCode', 'ContactProvince_Code'):
    print('> Get dummies with prefix {}'.format(c+'_'))
    df_dummies = pd.get_dummies(df[c], prefix=c)
    dummies_col.extend(df_dummies.columns.tolist())
    df = pd.concat([df, df_dummies], axis=1)

print('dummies col: ',dummies_col)


### Label Encode

In [0]:
# Map the leader level labels to integer codes (only in column Leader_AGLevel).
_ag_level_codes = {'PM': 1, 'UM': 2, 'BM': 3}
df = df.replace(to_replace={'Leader_AGLevel': _ag_level_codes})

In [0]:
# Overwrite each listed column with the integer codes of its categories.
for col in ('Leader_RegionCode', 'RegionCode', 'ContactProvince_Code', 'EducationCode'):
    print('> ',col)
    _as_category = df[col].astype('category')
    df[col] = _as_category.cat.codes

### Reduce memory

In [0]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    Handling per column dtype:

    * ``object``            -> converted to ``category``.
    * signed integers       -> smallest of int8/int16/int32/int64 that holds
      the column's min and max (bounds are inclusive).
    * unsigned integers     -> smallest of uint8/uint16/uint32/uint64.
    * floats                -> float16 if the range fits, else float32, else
      float64. float16/float32 keep fewer significant digits, so values may
      be rounded by the cast.
    * anything else (bool, datetime, category, pandas extension dtypes)
      is left untouched, which also makes the function safe to call twice.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    # Local import keeps this cell runnable on its own.
    from pandas.api import types as pdt

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Extension dtypes (category, nullable ints, tz-aware datetimes, ...)
        # do not support the numpy casts below, so leave them alone.
        if pdt.is_extension_array_dtype(col_type):
            continue

        if pdt.is_integer_dtype(col_type):
            c_min = df[col].min()
            c_max = df[col].max()
            if pdt.is_unsigned_integer_dtype(col_type):
                candidates = unsigned_types
            else:
                candidates = signed_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif pdt.is_float_dtype(col_type):
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Also reached for all-NaN columns, where the comparisons
                # above are False.
                df[col] = df[col].astype(np.float64)
        # bool, datetime64, timedelta64, ...: nothing to downcast.

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

### Reduce dimensional checking


In [0]:
# Keep only the columns selected for transformation, in both sets.
df_V = df_V[c_V_trans]
df_V_test = df_V_test[c_V_trans]

# Standardize: the scaler is fitted on the training data only and the SAME
# fitted scaler is applied to the test data, so both sets share one scale.
scaler = StandardScaler().fit(df_V)
df_V_scaled = scaler.transform(df_V)
print('Scaled mean df_V',df_V_scaled[:,0].mean())  # zero (or very close)
print('Scaled std df_V',df_V_scaled[:,0].std())
df_V_test_scaled = scaler.transform(df_V_test)
# With the training scaler these are only approximately 0 / 1 on test data.
print('Scaled mean df_V_test',df_V_test_scaled[:,0].mean())
print('Scaled std df_V_test',df_V_test_scaled[:,0].std())

# plot cumulative explained variance
# pca = PCA().fit(df_V_scaled)
# plt.plot(np.cumsum(pca.explained_variance_ratio_))
# #plt.xlim(0,7,1)
# plt.xlabel('Number of components')
# plt.ylabel('Cumulative explained variance')


# Setup Principal component analysis
pca = PCA(n_components=125)
#pca = PCA(n_components=0.96)  #v2: n_components=0.95
df_V_pca = pca.fit_transform(df_V_scaled)
np.save('df_V_pca_v3.npy',df_V_pca)
#df_V_pca.to_csv('df_V_pca', sep='\t')
# Project the test set onto the components learned from the training set;
# re-fitting here would put train and test in different component spaces.
df_V_test_pca = pca.transform(df_V_test_scaled)
np.save('df_V_test_pca_v3.npy',df_V_test_pca)


In [0]:
import itertools
import warnings
warnings.filterwarnings("ignore")
import io
import plotly.offline as py#visualization
py.init_notebook_mode(connected=True)#visualization
import plotly.graph_objs as go#visualization
import plotly.tools as tls#visualization
import plotly.figure_factory as ff#visualization

# Scatter plot of three columns of the data, coloured by TARGET.
# It gives a first look at the data, to see whether the classes look separable.


def _member_trace(frame, label, colour):
    """Build one 3-D marker trace (income, fees, term years) for ``frame``."""
    return go.Scatter3d(x = frame["MEMBER_ANNUAL_INCOME"],
                        y = frame["ANNUAL_FEES"],
                        z = frame["MEMBERSHIP_TERM_YEARS"],
                        mode = "markers",
                        name = label,
                        text = "Id : " + frame["MEMBERSHIP_NUMBER"],
                        marker = dict(size = 1,color = colour)
                       )


def _scene_axis(axis_title):
    """Return the axis styling shared by the x, y and z axes of the scene."""
    return dict(title = axis_title,
                gridcolor='rgb(255, 255, 255)',
                zerolinecolor='rgb(255, 255, 255)',
                showbackground=True,
                backgroundcolor='rgb(230, 230,230)')


trace1 = _member_trace(churn, "Churn customers", "red")
trace2 = _member_trace(not_churn, "Non churn customers", "green")

_camera = dict(up=dict(x= 0 , y=0, z=0),
               center=dict(x=0, y=0, z=0),
               eye=dict(x=1.25, y=1.25, z=1.25))

_scene = dict(camera = _camera,
              xaxis  = _scene_axis("annual incomes"),
              yaxis  = _scene_axis("annual fees"),
              zaxis  = _scene_axis("term years"))

layout = go.Layout(dict(title = "Monthly charges,total charges & tenure in customer attrition",
                        scene = _scene,
                        height = 700,
                       )
                  )


data = [trace1,trace2]
fig  = go.Figure(data = data,layout = layout)
py.iplot(fig)

In [0]:
from sklearn.decomposition import PCA

# Project the feature columns (everything except id and target columns)
# onto the first two principal components.
pca = PCA(n_components = 2)

X = tmp[[i for i in tmp.columns if i not in Id_col + target_col]]
Y = tmp[target_col + Id_col]

principal_components = pca.fit_transform(X)
pca_data = pd.DataFrame(principal_components,columns = ["PC1","PC2"])
# pca_data has a fresh 0..n-1 index, while Y keeps tmp's index. Reset Y's
# index for the join so rows are matched by position even when tmp's index
# is not a default RangeIndex (e.g. after filtering).
pca_data = pca_data.merge(Y.reset_index(drop=True),
                          left_index=True,right_index=True,how="left")
pca_data["CHURN"] = pca_data["CHURN"].replace({1:"CANCELLED",0:"INFORCE"})

def pca_scatter(target,color) :
    """Return a 2-D marker trace of the ``pca_data`` rows whose CHURN equals ``target``.

    ``target`` is also used as the trace name and ``color`` as the marker
    colour; the hover text shows the row's MEMBERSHIP_NUMBER.
    """
    subset = pca_data[pca_data["CHURN"] == target]
    marker_style = dict(color = color,
                        line = dict(width = .5),
                        symbol =  "diamond-open")
    hover_text = "Customer Id : " + subset['MEMBERSHIP_NUMBER']
    return go.Scatter(x = subset["PC1"],
                      y = subset["PC2"],
                      name = target,
                      mode = "markers",
                      marker = marker_style,
                      text = hover_text)

def _pc_axis(axis_title):
    """Return the axis styling shared by both principal-component axes."""
    return dict(gridcolor = 'rgb(255, 255, 255)',
                title = axis_title,
                zerolinewidth=1,ticklen=5,gridwidth=2)


layout = go.Layout(dict(title = "Visualising data with principal components",
                        plot_bgcolor  = "rgb(243,243,243)",
                        paper_bgcolor = "rgb(243,243,243)",
                        xaxis = _pc_axis("principal component 1"),
                        yaxis = _pc_axis("principal component 2"),
                        height = 600
                       )
                  )
# One trace per class; INFORCE is listed first in the figure data.
trace1 = pca_scatter("CANCELLED",'red')
trace2 = pca_scatter("INFORCE",'royalblue')
data = [trace2,trace1]
fig = go.Figure(data=data,layout=layout)
py.iplot(fig)

## Feature Generation

[Simple FE](https://machinelearningcoban.com/general/2017/02/06/featureengineering/)
-    Trực tiếp lấy raw data
-    Bag-of-words
  -    Bag-of-Words trong Computer Vision
-    Feature Scaling and Normalization
  -        Rescaling
  -        Standardization
  -        Scaling to unit length

  - Feature Generation: ở bước này tập trung các kĩ thuật để tạo ra feature: xử lí nlp, image, binning, scaling, grouping, aggregate.
  - Kết hợp với Modelling để tìm ra nhóm feature tốt và tập trung mạnh vào đó


## Feature Selection




Research:
- [sklearn.feature_selection](https://scikit-learn.org/stable/modules/feature_selection.html#univariate-feature-selection)
- [2](https://www.kaggle.com/sz8416/6-ways-for-feature-selection) ví dụ SelectKBest và RFE trên Kaggle
- [3](https://www.kaggle.com/dkim1992/feature-selection-ranking) chưa đọc
- [4](https://machinelearningmastery.com/feature-selection-machine-learning-python/) tổng quát
- [sklearn.feature_selection](https://scikit-learn.org/stable/modules/feature_selection.html)
- [Cách tìm feature bằng cách chạy các linear reg](https://blog.datadive.net/selecting-good-features-part-iv-stability-selection-rfe-and-everything-side-by-side/) chua doc
- [6. Blog tổng hợp các method](https://mlwhiz.com/blog/2019/08/07/feature_selection/)

- Feature Selection: ở bước này tập trung các kĩ thuật chọn feature để tối ưu mô hình
  - Chọn được feature tốt và tối ưu được bộ nhớ
  - Chọn được model parameters tốt nhất phù hợp với bộ features tốt nhất
  - Kết hợp với các kĩ thuật chia dataset (CV, leave-one)

- Tiêu chí đánh giá **metric of feature**
  - AUC: thể hiện khả năng dự đoán
  - Correlation: kiểm tra độ tương quan giữa feature với target hoặc với các feature quan trọng khác
  - Coverage: kiểm tra tỉ lệ null; feature có nhiều null thì mang ít thông tin
  - Weight of evidence (WOE) và information value (IV):
    - $\text{WOE} = \ln \left(\frac{P(\text{Good})}{P(\text{Bad})}\right)$
    - $\text{IV} = \sum \left(P(\text{Good}) - P(\text{Bad})\right) \cdot \ln \left(\frac{P(\text{Good})}{P(\text{Bad})}\right)$



[Brute Force Approach](https://www.kdnuggets.com/2017/11/rapidminer-basic-concepts-feature-selection.html)
- Cách tiếp cận bằng cách chạy thử tất cả các feature combination và so sánh trên các metric of feature

[Feature Selection]
- Forwarding: ta bắt đầu với tập feature rỗng, sau đó lần lượt add thêm feature vào tập này. Nếu thấy performance của model tăng ta sẽ tiếp tục quá trình này, ngược lại sẽ dừng lại.
- Backwarding: ta bắt đầu với toàn bộ tập feature, sau đó lần lượt remove từng feature khỏi tập này. Nếu thấy performance của model tăng hoặc giảm không quá nhiều, ta sẽ tiếp tục quá trình này, ngược lại nếu performance bị drop quá mạnh sẽ dừng lại.
- Hybrid: kết hợp cả 2 hướng trên




### High correlation 

**High correlation features**
Có thể xóa bỏ bớt các cột có high correlate với nhau vì: 
- các cột có ý nghĩa như nhau với model
- có cơ hội học được từ các cột khác
- **quan trọng trong việc train NN** vì giảm được khối lượng dữ liệu

In [0]:
# drop columns with correlation greater than 0.9
# Only numeric/bool columns can be correlated; selecting them explicitly keeps
# this working on pandas versions where corr() no longer skips other dtypes.
_numeric = df.drop(columns="isMale").select_dtypes(include=["number", "bool"])
corr_matrix = _numeric.corr().abs()
# Keep the strict upper triangle so each pair is inspected once.
# (The builtin bool replaces the np.bool alias that NumPy removed.)
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if (upper[column] > 0.9).any()]
print("high correlation features", to_drop)
df.drop(columns=to_drop, inplace=True)

In [0]:
#TESTING FOR MULTICOLLINEARITY:
from patsy import dmatrices
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
#RUNNING FOR MULTIPLE REGRESSION:
# %%capture
# Gather the candidate features: df_train minus the columns listed below.
vif_test = df_train
vif_test = vif_test.drop('new_Active_Net.1', axis = 1)
to_drop = ['new_Leader_ActNet_ratio_6m','new_Leader_RegionCode','new_Leader_APE_ratio_3m','new_NewRe_APE_ratio_3m','new_CaseNet_SumLast3m','new_Leader_APE_ratio_6m','new_NewRe_Manpower_SumLas3p','new_Leader_Case_ratio_3m','new_AG_Case_ratio_3m','new_NewRe_Case_Net_SumLas3p']
vif_test = vif_test.drop(to_drop,axis =1)
features = "+".join(vif_test.drop('label_1',axis=1).columns)
# get y and X dataframes based on this regression:
y, X = dmatrices('label_1 ~' + features, df_train, return_type='dataframe')
vif = pd.DataFrame()
# One VIF per design-matrix column. The builtin range is used here; the
# previous global named `range` shadowed it for every later cell.
_design = X.values
vif["VIF Factor"] = [variance_inflation_factor(_design, i) for i in range(X.shape[1])]
vif["features"] = X.columns

# Row 0 is dropped when collecting the feature names.
col_vif = vif.drop(0).features
df_train = df_train.drop(to_drop,axis =1)
df_test = df_test.drop(to_drop,axis =1)
vif.sort_values('VIF Factor',ascending = False)


### WOE & IV
- https://www.one-tab.com/page/0V6bYIX4ShSQmuW4bpD7Zg


In [0]:
import pandas as pd
import numpy as np
import pandas.core.algorithms as algos
from pandas import Series
import scipy.stats.stats as stats
import re
import traceback
import string

max_bin = 20
force_bin = 3

# define a binning function
def mono_bin(Y, X, n = max_bin):
    """Bin a numeric feature and compute WOE / IV per bin.

    Starting from ``n`` quantile buckets, the bucket count is reduced until
    the Spearman correlation between the per-bucket means of X and Y has
    absolute value 1 (or is NaN). If that leaves no usable split, or only a
    single bucket, ``force_bin`` quantile edges are used instead. Rows with
    missing X are summarised in one extra, final row.

    Parameters
    ----------
    Y : array-like
        Target values; they are summed per bucket to count events.
    X : array-like
        Numeric feature values (may contain missing values).
    n : int
        Initial number of quantile buckets to try.

    Returns
    -------
    pandas.DataFrame
        One row per bucket with columns VAR_NAME, MIN_VALUE, MAX_VALUE, COUNT,
        EVENT, EVENT_RATE, NONEVENT, NON_EVENT_RATE, DIST_EVENT,
        DIST_NON_EVENT, WOE and IV. Infinite values are replaced by 0 and IV
        holds the summed information value on every row.

    Raises
    ------
    ValueError
        If X has no non-missing value (nothing can be binned).
    """
    # Local import: avoids depending on the deprecated scipy.stats.stats path.
    from scipy.stats import spearmanr

    df1 = pd.DataFrame({"X": X, "Y": Y})
    missing_mask = df1.X.isnull()
    justmiss = df1.loc[missing_mask, ['X', 'Y']]
    notmiss = df1.loc[~missing_mask, ['X', 'Y']]
    if notmiss.empty:
        raise ValueError("mono_bin needs at least one non-missing X value")

    r = 0
    d2 = None
    # The `n >= 1` guard stops the search once no bucket count is left to
    # try; without it the loop never ends when every attempt raises.
    while np.abs(r) < 1 and n >= 1:
        try:
            d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.qcut(notmiss.X, n)})
            d2 = d1.groupby('Bucket', as_index=True, observed=False)
            bucket_means = d2.mean()
            r, p = spearmanr(bucket_means.X, bucket_means.Y)
        except Exception:
            # Best effort: a bucket count that cannot be built or scored
            # (e.g. duplicate quantile edges) is skipped.
            pass
        n = n - 1

    if d2 is None or len(d2) == 1:
        n = force_bin
        # Linear-interpolated quantiles of the non-missing values.
        bins = np.quantile(notmiss.X.to_numpy(dtype=float), np.linspace(0, 1, n))
        if len(np.unique(bins)) == 2:
            bins = np.insert(bins, 0, 1)
            bins[1] = bins[1]-(bins[1]/2)
        d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.cut(notmiss.X, np.unique(bins),include_lowest=True)})
        d2 = d1.groupby('Bucket', as_index=True, observed=False)

    counts = d2["Y"].count()
    events = d2["Y"].sum()
    d3 = pd.DataFrame({
        "MIN_VALUE": d2["X"].min(),
        "MAX_VALUE": d2["X"].max(),
        "COUNT": counts,
        "EVENT": events,
        "NONEVENT": counts - events,
    })
    d3 = d3.reset_index(drop=True)

    if len(justmiss.index) > 0:
        miss_count = justmiss.Y.count()
        miss_events = justmiss.Y.sum()
        d4 = pd.DataFrame({
            "MIN_VALUE": np.nan,
            "MAX_VALUE": np.nan,
            "COUNT": miss_count,
            "EVENT": miss_events,
            "NONEVENT": miss_count - miss_events,
        }, index=[0])
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        d3 = pd.concat([d3, d4], ignore_index=True)

    d3["EVENT_RATE"] = d3.EVENT/d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT/d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT/d3.EVENT.sum()
    d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.NONEVENT.sum()
    d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT-d3.DIST_NON_EVENT)*d3["WOE"]
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]
    # Buckets with zero events or zero non-events produce +/-inf; zero them.
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()

    return(d3)

def char_bin(Y, X):
    """Compute WOE / IV for a categorical (or low-cardinality) feature.

    Every distinct non-missing value of X becomes one bin; rows with missing
    X are summarised in one extra, final row.

    Parameters
    ----------
    Y : array-like
        Target values; they are summed per bin to count events.
    X : array-like
        Feature values (may contain missing values).

    Returns
    -------
    pandas.DataFrame
        One row per bin with columns VAR_NAME, MIN_VALUE, MAX_VALUE, COUNT,
        EVENT, EVENT_RATE, NONEVENT, NON_EVENT_RATE, DIST_EVENT,
        DIST_NON_EVENT, WOE and IV. MIN_VALUE and MAX_VALUE both hold the
        bin's value. Infinite values are replaced by 0 and IV holds the
        summed information value on every row.
    """
    df1 = pd.DataFrame({"X": X, "Y": Y})
    missing_mask = df1.X.isnull()
    justmiss = df1.loc[missing_mask, ['X', 'Y']]
    notmiss = df1.loc[~missing_mask, ['X', 'Y']]
    df2 = notmiss.groupby('X', as_index=True, observed=False)

    counts = df2["Y"].count()
    events = df2["Y"].sum()
    d3 = pd.DataFrame({"COUNT": counts, "EVENT": events, "NONEVENT": counts - events})
    # The group keys are the bin values.
    d3["MIN_VALUE"] = d3.index
    d3["MAX_VALUE"] = d3["MIN_VALUE"]
    d3 = d3.reset_index(drop=True)

    if len(justmiss.index) > 0:
        miss_count = justmiss.Y.count()
        miss_events = justmiss.Y.sum()
        d4 = pd.DataFrame({
            "MIN_VALUE": np.nan,
            "MAX_VALUE": np.nan,
            "COUNT": miss_count,
            "EVENT": miss_events,
            "NONEVENT": miss_count - miss_events,
        }, index=[0])
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        d3 = pd.concat([d3, d4], ignore_index=True)

    d3["EVENT_RATE"] = d3.EVENT/d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT/d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT/d3.EVENT.sum()
    d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.NONEVENT.sum()
    d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT-d3.DIST_NON_EVENT)*d3["WOE"]
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]
    # Bins with zero events or zero non-events produce +/-inf; zero them.
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()
    d3 = d3.reset_index(drop=True)

    return(d3)

def data_vars(df1, target):
    """Compute the WOE / IV table for every feature column of ``df1``.

    Numeric columns with more than two distinct values (NaN counts as a
    value) go through ``mono_bin``; every other column goes through
    ``char_bin``. The target column itself is skipped. It is recognised
    case-insensitively by exact name, from ``target.name`` when available and
    from the last identifier inside the parentheses of the calling source
    line, e.g. ``label`` in ``data_vars(df, df.label)``.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Feature frame (may still contain the target column).
    target : array-like
        Target values, passed unchanged to the binning functions.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``iv_df``: the per-bin tables of all variables stacked together;
        ``iv``: one row per VAR_NAME with its IV.

    Raises
    ------
    ValueError
        If no feature column is left to bin.
    """
    skip_names = set()

    target_name = getattr(target, "name", None)
    if isinstance(target_name, str):
        skip_names.add(target_name.upper())

    # Best effort: read the caller's source line to find how the target was
    # spelled. The line is unavailable in some contexts (e.g. exec), so every
    # step tolerates a missing value instead of raising.
    stack = traceback.extract_stack()
    filename, lineno, function_name, code = stack[-2]
    if code:
        call_args = re.search(r'\((.*?)\).*$', code)
        if call_args is not None:
            tokens = re.findall(r"[\w']+", call_args.group(1))
            if tokens:
                # df['label'] yields the token "'label'": strip the quotes.
                skip_names.add(tokens[-1].strip("'").upper())

    tables = []
    for i in df1.dtypes.index:
        # Exact comparison: a substring test would also drop any column whose
        # name happens to be contained in the target's name.
        if str(i).upper() in skip_names:
            continue

        column = df1[i]
        is_binnable_number = (
            pd.api.types.is_numeric_dtype(column)
            and not pd.api.types.is_bool_dtype(column)
            and column.nunique(dropna=False) > 2
        )
        if is_binnable_number:
            conv = mono_bin(target, column)
        else:
            conv = char_bin(target, column)
        conv["VAR_NAME"] = i
        tables.append(conv)

    if not tables:
        raise ValueError("data_vars found no feature column to bin")

    # pd.concat replaces the repeated DataFrame.append, removed in pandas 2.0.
    iv_df = pd.concat(tables, ignore_index=True)

    iv = pd.DataFrame({'IV':iv_df.groupby('VAR_NAME').IV.max()})
    iv = iv.reset_index()
    return(iv_df,iv)

In [0]:
#from tqdm import tqdm
# Map every categorical value to the WOE of its bin (matched on MIN_VALUE).
woe = pd.DataFrame()
for col in WOE_cat_col:
    t_final_iv = final_iv[final_iv.VAR_NAME == col]
    print('> ',col)
    print(' no of bins: ',t_final_iv.shape[0])
    # Show all bins when there are at most 10, otherwise a random sample of 5.
    _preview = t_final_iv if t_final_iv.shape[0] <= 10 else t_final_iv.sample(5)
    print(_preview[['VAR_NAME','MIN_VALUE']])
    print('\n')
    # .values[0] raises IndexError for a value that matches no bin.
    l = [t_final_iv[t_final_iv.MIN_VALUE == x].WOE.values[0]
         for x in df_WOE[col]]
    woe[col+'_woe'] = l

In [0]:
# Map every numeric value to the WOE of the bin whose
# [MIN_VALUE, MAX_VALUE] range contains it.
for col in WOE_numeric_col:
    t_final_iv = final_iv[final_iv.VAR_NAME == col]
    print('> ',col)
    print(' no of bins: ',t_final_iv.shape[0])
    # Show all bins when there are at most 10, otherwise a random sample of 5.
    _preview = t_final_iv if t_final_iv.shape[0] <= 10 else t_final_iv.sample(5)
    print(_preview[['VAR_NAME','MIN_VALUE','MAX_VALUE']])
    print('\n')
    # .values[0] raises IndexError for a value that falls in no bin.
    l = [t_final_iv[(t_final_iv.MIN_VALUE <= x) & (t_final_iv.MAX_VALUE >= x)].WOE.values[0]
         for x in df_WOE[col]]
    woe[col+'_woe'] = l

### **RFE**



[theory](https://www.kaggle.com/nroman/recursive-feature-elimination)

Trong mini course này, tôi sẽ áp dụng hướng “backwarding”. Các bước thực hiện như sau:

- Đặt: n là số lần lặp feature selection, k là số feature sẽ drop ở mỗi lần lặp, p là AUC sau mỗi lần train
- Train model với XGBoost
- Lấy kết quả feature important sắp xếp giảm dần và loại ra k feature có giá trị thấp nhất
- Lưu lại performance hiện tại để so sánh với performance tiếp theo.
- Nếu thấp hơn ngưỡng p sẽ dừng
- Tiếp tục quá trình selection

Tuỳ theo số lượng feature và cài đặt hyper-parameter của model thì thời gian sẽ nhanh chậm khác nhau.

Additional:
- Genetic algorithm for feature selection

Source:
- [1](https://ongxuanhong.wordpress.com/2019/04/17/data-science-mini-course/#more-15645)
- [2](https://towardsdatascience.com/feature-selection-in-python-recursive-feature-elimination-19f1c39b8d15) doc them



In [0]:
# Hold out 20% of the rows as the test set; random_state=2 fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

# Candidate definitions of lr_best (the estimator used by the RFECV cell),
# currently disabled:
#lr_best = LogisticRegressionCV
#lr_best = LogisticRegression(C=7.7, penalty='l2' ,class_weight='balanced')

In [0]:
from sklearn.feature_selection import RFECV
# Create the RFE object and compute a cross-validated score.
# Each candidate feature subset is scored by ROC AUC over 10 folds.
rfecv = RFECV(estimator=lr_best, step=1, cv=10, scoring='roc_auc')
rfecv.fit(X_train, y_train)

best_features = X_train.columns[rfecv.support_].tolist()

print("Optimal number of features: %d" % rfecv.n_features_)
print('Selected features: %s' % best_features)

# Mean cross-validation score per feature count. Newer scikit-learn exposes
# it through cv_results_; grid_scores_ only exists on older releases.
if hasattr(rfecv, "cv_results_"):
    _cv_scores = rfecv.cv_results_["mean_test_score"]
else:
    _cv_scores = rfecv.grid_scores_

# Plot number of features VS. cross-validation scores
# (the x axis assumes step=1, starting from a single feature)
plt.figure(figsize=(10,6))
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (ROC AUC)")
plt.plot(np.arange(1, len(_cv_scores) + 1), _cv_scores)
plt.show()

### 

## Sampling

### Under-sampling

In [0]:
# under-sampling 7:3 (rows with label_2 != 1 : rows with label_2 == 1)
_positives = train[train.label_2==1]
_negatives = train[train.label_2!=1]
sample = _positives.shape[0]/3*7
# Never ask for more negatives than exist (sample() would raise), and seed
# the draw so that re-running the cell gives the same training set.
_n_negatives = min(int(sample), _negatives.shape[0])
train_US73=pd.concat([_negatives.sample(_n_negatives, random_state=0),_positives], axis = 0, ignore_index=True)
print('sample shape',train_US73.shape)

# under-sampling 5:5
# sample = train[train.label_2==1].shape[0]/5*5
# train_US55=pd.concat([train[train.label_2!=1].sample(int(sample)),train[train.label_2==1]], axis = 0, ignore_index=True)
# print('sample shape',train_US55.shape)

In [0]:
#X = train[dummies_col+numeric_col]
# Features are all columns of train except the label and First_month;
# training rows come from the under-sampled frame, test rows from test.
_non_features = ('label_2', 'First_month')
_feature_cols = [name for name in train.columns if name not in _non_features]
X_train = train_US73[_feature_cols]
y_train = train_US73['label_2']
X_test = test[_feature_cols]
y_test = test['label_2']

### Over-sampling

#### RandomOverSampler

In [0]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC

from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import RandomOverSampler
from imblearn.base import BaseSampler

from sklearn.metrics import roc_auc_score, f1_score

In [0]:
#X = train[dummies_col+numeric_col]
# Features are all columns of train except the label and First_month.
_non_features = ('label_2', 'First_month')
_feature_cols = [name for name in train.columns if name not in _non_features]
X_train = train[_feature_cols]
y_train = train['label_2']
X_test = test[_feature_cols]
y_test = test['label_2']

In [0]:
# Logistic regression trained on randomly over-sampled data
# (sampling_strategy=1, fixed seed).
model_smote = LogisticRegression()
_oversampler = RandomOverSampler(sampling_strategy=1, random_state=0)

pipe = make_pipeline(_oversampler, model_smote)
pipe.fit(X_train, y_train)

In [0]:
# Score the over-sampled pipeline on the test set.
model_smote_predictions = pipe.predict_proba(X_test)
model_smote_pred_label = pipe.predict(X_test)
# ROC AUC uses the probability of the positive class (column 1).
model_smote_roc_score = roc_auc_score(y_test, model_smote_predictions[:,1])
model_smote_f1_score = f1_score(y_test, model_smote_pred_label)
# The fitted model is a LogisticRegression, so the labels say so
# (they previously read "random forest").
print('logistic regression roc score on test: ', model_smote_roc_score)
print('logistic regression f1 score on test: ', model_smote_f1_score)

confu_matrix = confusion_matrix(y_test, model_smote_pred_label)
sns.heatmap(confu_matrix , annot=True, fmt='d')
print(classification_report(y_test, model_smote_pred_label) )

#### SMOTE

In [0]:
#X = train[dummies_col+numeric_col]
# Features are all columns of train except the label and First_month.
_non_features = ('label_2', 'First_month')
_feature_cols = [name for name in train.columns if name not in _non_features]
X_train = train[_feature_cols]
y_train = train['label_2']
X_test = test[_feature_cols]
y_test = test['label_2']

In [0]:
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import (SMOTE, BorderlineSMOTE, SVMSMOTE, SMOTENC)
from imblearn.over_sampling import RandomOverSampler
from imblearn.base import BaseSampler

# NOTE: despite the rf_ prefix, the classifier is a logistic regression.
rf_clf = LogisticRegression()

# Display name of each sampler, keyed by its position in _samplers below.
smotes = {0 : 'SMOTE',
          1 : 'BorderlineSMOTE',
          2 : 'SVMSMOTE',
          3 : 'ADASYN'}

_samplers = (SMOTE(sampling_strategy = 1, random_state=0),
             BorderlineSMOTE(sampling_strategy = 1, random_state=0, kind='borderline-1'),
             SVMSMOTE(sampling_strategy = 1, random_state=0),
             ADASYN(sampling_strategy = 1, random_state=0))

# Fit one pipeline per over-sampling method and report its test scores.
for i, sampler in enumerate(_samplers):
    pipe_line = make_pipeline(sampler, rf_clf)
    pipe_line.fit(X_train, y_train)
    rf_predictions = pipe_line.predict_proba(X_test)
    rf_pred_label = pipe_line.predict(X_test)
    # ROC AUC uses the probability of the positive class (column 1).
    rf_roc_score = roc_auc_score(y_test, rf_predictions[:,1])
    rf_f1_score = f1_score(y_test, rf_pred_label)
    print('------------------------------------------------')
    print('SMOTE method: ', smotes[i])
    # Labels name the model actually fitted (they previously read
    # "random forest").
    print('logistic regression roc score on test: ', rf_roc_score)
    print('logistic regression f1 score on test: ', rf_f1_score)