In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import accuracy_score as acs
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import KNNImputer

In [2]:
# Read the labelled training data from CSV into a DataFrame.
# NOTE(review): this is an absolute path on one machine; the notebook will not
# run elsewhere until it is pointed at a local copy of train.csv.
path_2_train_file = '/Users/zhangmeng/Desktop/study/ESCP_SEP/machine learning with python/individual/train.csv'
df = pd.read_csv(filepath_or_buffer=path_2_train_file)

In [3]:
# Collapse the target into a binary indicator: 1 where the raw label is
# positive, 0 otherwise.
# Bracket indexing is used for the assignment because attribute-style
# assignment (`df.label = ...`) would create a plain Python attribute instead
# of a column if 'label' were ever missing from the frame.
# The stray `df.columns` expression was removed: it was not the last
# expression of the cell, so it displayed nothing.
df['label'] = (df['label'] > 0) * 1

In [4]:
# Show how often each distinct value of column C7 occurs.
df['C7'].value_counts()

2303407174    24113
1342367140     4123
2411180110     1420
730319316       940
3649211661      764
549342611        68
Name: C7, dtype: int64

In [8]:
# Build the modelling frame by dropping the columns that are not used as
# model inputs.
dff = df.drop(
    columns=[
        'id', 'visitTime', 'purchaseTime',
        'C1', 'C2', 'C3', 'C10', 'hour',
        'C4', 'C5', 'C6', 'C8', 'C11', 'C12',
    ]
)

In [9]:
# Columns to be treated as categorical features.
my_cat = [
    'C7',
    'C9',
]

In [10]:
# Give every column named in my_cat the pandas categorical dtype.
dff = dff.astype({column: 'category' for column in my_cat})

In [11]:
# Inspect which columns remain in the modelling frame.
dff.columns

Index(['label', 'C7', 'N1', 'C9', 'N2', 'N3', 'N4', 'N5', 'N6', 'N7', 'N8',
       'N9', 'N10'],
      dtype='object')

In [12]:
# One-hot encode the categorical columns; the remaining columns are kept as-is.
dff = pd.get_dummies(data=dff)

In [13]:
# Names of the ten numeric feature columns, N1 through N10.
my_num = ['N{}'.format(position) for position in range(1, 11)]

In [14]:
# Fill missing values in the numeric columns with a k-nearest-neighbours
# imputer that uses 10 neighbours.
# NOTE(review): the imputer is fitted on the whole frame before the train/test
# split, so rows that later end up in the test split influence the imputed
# training values.
imputer = KNNImputer(n_neighbors=10)
imputed_numeric = imputer.fit_transform(dff[my_num])
dff[my_num] = imputed_numeric

In [15]:
# Number of columns after one-hot encoding (label included).
dff.shape[1]

77

In [16]:
# DataFrame.dropna returns a new frame and leaves the original untouched, so
# the result has to be assigned for the rows with missing values to actually
# be removed from the data used further down.
# NOTE(review): presumably removing those rows was the intent of the original
# bare `dff.dropna()` call — confirm.
dff = dff.dropna()

# Preview a few rows instead of rendering the entire frame.
dff.head()

Unnamed: 0,label,N1,N2,N3,N4,N5,N6,N7,N8,N9,...,C9_3623356211,C9_3738539033,C9_3790028721,C9_3828882129,C9_3863488499,C9_3878556116,C9_3898085784,C9_3911032568,C9_4054959403,C9_4287642944
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
1,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31423,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
31424,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
31425,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
31426,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1


In [17]:
# Target vector: the binary label column.
Y = dff['label']

In [18]:
# Feature matrix: every column except the label.
X = dff.drop(columns=['label'])

# Split data into training and testing

In [19]:
# Hold out 20% of the rows for testing. The split is shuffled with a fixed
# seed and stratified on the label.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    shuffle=True,
    stratify=Y,
    random_state=666,
)

# Training features and their labels side by side in one frame.
Train = X_train.join(Y_train)

In [20]:
# Class balance of the training split (count of rows per label value).
Y_train.value_counts()

0    25096
1       46
Name: label, dtype: int64

In [21]:
# Number of feature columns in the training split.
print(X_train.shape[1])

76


## Resampling

In [171]:

# Rebalance the training split in two stages: SMOTE first over-samples the
# minority class, then the majority class is randomly under-sampled.
# Both stages are configured with sampling_strategy=0.02 and the same seed.
over = SMOTE(random_state=666, sampling_strategy=0.02)
under = RandomUnderSampler(random_state=666, sampling_strategy=0.02)
steps = [
    ('o', over),
    ('u', under),
]
pipeline = Pipeline(steps=steps)

# X_totrain / Y_totrain hold the resampled data that the models are fitted on.
X_totrain, Y_totrain = pipeline.fit_resample(X_train, Y_train)

In [172]:
# Class balance after resampling (count of rows per label value).
Y_totrain.value_counts()

0    25050
1      501
Name: label, dtype: int64

## Modeling: several models were tried, and a random forest was chosen in the end

In [174]:
# Fit a random forest on the resampled training data, then report accuracy and
# a per-class report on the original training split and on the test split.
print('Training')
rf = RandomForestClassifier(max_depth = 15, random_state=666,max_leaf_nodes=60)
rf = rf.fit(X_totrain, Y_totrain)
train_pred = rf.predict(X_train)
print(metrics.accuracy_score(Y_train,train_pred))
print(metrics.classification_report(Y_train,train_pred))


print('\n')

print('Testing')


y_pred = rf.predict(X_test)
# Evaluate predictions on the held-out test split
print(metrics.accuracy_score(Y_test,y_pred))
print(metrics.classification_report(Y_test,y_pred))

# Pair every feature name with its importance, rounded to two decimals
feature_list = X.columns
importances = list(rf.feature_importances_)
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print with a plain loop: the previous list comprehension was used only for
# its side effect and made the cell display a list of None values.
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))

Training
0.999681807334341
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     25096
           1       1.00      0.83      0.90        46

    accuracy                           1.00     25142
   macro avg       1.00      0.91      0.95     25142
weighted avg       1.00      1.00      1.00     25142



Testing
0.999522748965956
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6275
           1       1.00      0.73      0.84        11

    accuracy                           1.00      6286
   macro avg       1.00      0.86      0.92      6286
weighted avg       1.00      1.00      1.00      6286

Variable: N10                  Importance: 0.3
Variable: N9                   Importance: 0.23
Variable: N6                   Importance: 0.16
Variable: N8                   Importance: 0.08
Variable: N3                   Importance: 0.05
Variable: N4                   Importance: 0.04
Vari

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

## Now let's select the most important features and rebuild a better model

In [175]:
# Build a feature selector on top of the random forest `rf`, using an
# importance threshold of 0.001.
sfm = SelectFromModel(estimator=rf, threshold=0.001)

# Fit the selector on the resampled training data.
sfm.fit(X_totrain, Y_totrain)

SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                                 class_weight=None,
                                                 criterion='gini', max_depth=15,
                                                 max_features='auto',
                                                 max_leaf_nodes=60,
                                                 max_samples=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100, n_jobs=None,
                                                 oob_score=False,
                                                 ran

In [176]:
# Print the names of the most important features
my_list = []
for feature_list_index in sfm.get_support(indices=True):
    print(X.columns[feature_list_index])
    my_list.append(X.columns[feature_list_index])

N1
N2
N3
N4
N5
N6
N7
N8
N9
N10
C7_730319316
C7_1342367140
C7_2303407174
C7_2411180110
C7_3649211661
C9_14692648
C9_385980380
C9_399159304
C9_477276257
C9_1421481761
C9_1690004026
C9_2226682598
C9_2698786965
C9_2711086415
C9_3272771221
C9_3310009198
C9_3329757940
C9_3598157468
C9_3911032568
C9_4287642944


In [184]:
# Reduce the resampled training data and the test split to the selected features.
X_important_train = sfm.transform(X_totrain)
X_important_test = sfm.transform(X_test)

# A new random forest that is fitted on the selected features only.
clf_important = RandomForestClassifier(
    n_estimators=500,
    max_depth=13,
    max_leaf_nodes=14,
    random_state=666,
)
clf_important.fit(X_important_train, Y_totrain)

# Scores on the resampled data the classifier was fitted on.
y_important_pred = clf_important.predict(X_important_train)
print('train')
print(metrics.accuracy_score(Y_totrain, y_important_pred))

print(metrics.classification_report(Y_totrain, y_important_pred))

# Scores on the held-out test split.
print('test')
y_important_test_pred = clf_important.predict(X_important_test)
print(metrics.accuracy_score(Y_test, y_important_test_pred))
print(metrics.classification_report(Y_test, y_important_test_pred))

train
0.9967515948495167
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     25050
           1       0.98      0.85      0.91       501

    accuracy                           1.00     25551
   macro avg       0.99      0.92      0.95     25551
weighted avg       1.00      1.00      1.00     25551

test
0.999522748965956
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6275
           1       0.90      0.82      0.86        11

    accuracy                           1.00      6286
   macro avg       0.95      0.91      0.93      6286
weighted avg       1.00      1.00      1.00      6286



In [51]:
# Display the selected feature names that are later used to subset the test data.
my_list

['N1',
 'N2',
 'N3',
 'N4',
 'N6',
 'N7',
 'N8',
 'N9',
 'N10',
 'C7_730319316',
 'C7_1342367140',
 'C7_2303407174',
 'C9_14692648',
 'C9_1421481761',
 'C9_3310009198',
 'C9_3911032568',
 'C9_4287642944']

# Save model

In [33]:
# Persist the fitted classifier to disk, then read it back.
filename = '/Users/zhangmeng/Desktop/study/ESCP_SEP/machine learning with python/individual/my_model.sav'

# Context managers make sure each file handle is flushed and closed; the
# previous bare open(...) calls left both handles open.
with open(filename, 'wb') as model_file:
    pickle.dump(clf_important, model_file)

# Load the model from disk.
# NOTE: unpickling can execute arbitrary code — only load files you created yourself.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [86]:
# Read the unlabelled data to be scored.
# NOTE(review): absolute path on one machine — adjust before running elsewhere.
dftest = pd.read_csv(
    filepath_or_buffer="/Users/zhangmeng/Desktop/study/ESCP_SEP/machine learning with python/individual/test1.csv",
    encoding='utf8',
)

In [87]:
# Drop the same unused columns that were dropped from the training frame.
dfftest = dftest.drop(
    columns=[
        'id', 'visitTime', 'purchaseTime',
        'C1', 'C2', 'C3', 'C10', 'hour',
        'C4', 'C5', 'C6', 'C8', 'C11', 'C12',
    ]
)

In [88]:
# Columns to be treated as categorical features (same as for the training data).
my_cat = [
    'C7',
    'C9',
]

In [89]:
# Give every column named in my_cat the pandas categorical dtype.
dfftest = dfftest.astype({column: 'category' for column in my_cat})

In [90]:
# One-hot encode the categorical columns of the data to be scored.
dfftest = pd.get_dummies(data=dfftest)

In [91]:
# Apply the imputer fitted on the training data so the numeric features go
# through the same preprocessing here as they did for training.
dfftest[my_num] = imputer.transform(dfftest[my_num])

# Align the columns with the features the model expects, in the same order.
# reindex (unlike plain `dfftest[my_list]`) does not raise KeyError when a
# one-hot level seen in training is absent from this file; the missing
# indicator column is created and filled with 0, i.e. "level not present".
dfftest = dfftest.reindex(columns=my_list, fill_value=0)

In [92]:
# Score the rows once and reuse the result; the model was previously invoked
# twice to produce the two columns.
# Column 0 / column 1 of predict_proba follow the order of loaded_model.classes_.
test_proba = loaded_model.predict_proba(dfftest)
dftest['prob0'], dftest['prob1'] = test_proba[:, 0], test_proba[:, 1]

In [94]:
# Keep only the row identifier and the first probability column for export.
# NOTE(review): only prob0 is exported — confirm that this is the probability
# the consumer of the file expects.
dftest = dftest.loc[:, ['id', 'prob0']]

In [101]:
# Write the predictions to CSV (the DataFrame index is written as well, as by default).
dftest.to_csv(
    path_or_buf='/Users/zhangmeng/Desktop/study/ESCP_SEP/machine learning with python/individual/meng_result.csv'
)