In [54]:
import pandas as pd
import numpy as np

In [112]:
# Read the three CSV inputs of the notebook. Later cells take feature columns
# from df_train / df_test and the 'label' column from df_label.
df_train, df_test, df_label = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/train_solved_1.csv',
        'data/test_solved_1.csv',
        'data/label_1.csv',
    )
)

In [76]:
# Min-max normalisation of the numeric columns (the author notes that this
# step gave poor results).
from sklearn.preprocessing import MinMaxScaler, StandardScaler
scale_list = ['Age','DistanceFromHome', 'Education', 'EnvironmentSatisfaction', 'JobInvolvement', 'JobSatisfaction',
              'MonthlyIncome', 'NumCompaniesWorked', 'PercentSalaryHike', 'RelationshipSatisfaction', 'StockOptionLevel',
              'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsAtCompany',
              'YearsWithCurrManager']
# Only the columns named in scale_list are rescaled; every other column is
# left untouched.
for item in scale_list:
    # Learn the min/max from the training data only and reuse that mapping for
    # the test data. Fitting a second scaler on the test set would map the same
    # raw value to different scaled values in the two frames.
    scaler = MinMaxScaler()
    train_values = df_train[item].values.reshape(-1, 1).astype('float64')
    test_values = df_test[item].values.reshape(-1, 1).astype('float64')
    df_train[item] = scaler.fit_transform(train_values).ravel()
    # Test values outside the training range end up outside [0, 1]; that is
    # expected when the scaler is fitted on the training data alone.
    df_test[item] = scaler.transform(test_values).ravel()

In [9]:
# Principal component analysis (the author notes the results were mediocre).
def pca_data(df):
    """Project ``df`` onto its principal components.

    The number of components is selected by scikit-learn's ``'mle'`` option.

    Args:
        df: The data to decompose; it is passed straight to
            ``PCA.fit_transform``.

    Returns:
        The transformed data as returned by ``PCA.fit_transform``.
    """
    from sklearn.decomposition import PCA

    # NOTE(review): copy=False permits scikit-learn to overwrite the input
    # buffer -- confirm that callers do not reuse `df` afterwards.
    reducer = PCA(copy=False, n_components='mle')
    return reducer.fit_transform(df)

In [115]:
# One-hot encode every feature value so that no single feature carries an
# outsized weight.
from sklearn.preprocessing import OneHotEncoder
from scipy import sparse

feats = [
    "Age", "BusinessTravel", "Department", "DistanceFromHome", "Education", "EducationField",
    "EnvironmentSatisfaction", "Gender", "JobInvolvement", "JobLevel", "JobRole", "JobSatisfaction",
    "MaritalStatus", "MonthlyIncome", "NumCompaniesWorked", "OverTime", "PercentSalaryHike",
    "PerformanceRating", "RelationshipSatisfaction", "StockOptionLevel", "TotalWorkingYears",
    "TrainingTimesLastYear", "WorkLifeBalance", "YearsAtCompany", "YearsInCurrentRole",
    "YearsSinceLastPromotion", "YearsWithCurrManager", "AgeDistance", "AgeEnvir", "JobRoleLevel",
    "OverPerRating",
]

# One sparse block per feature, kept in the order of `feats`.
train_blocks = []
test_blocks = []
for feat in feats:
    train_column = df_train[feat].values
    test_column = df_test[feat].values
    # The categories are learned from train and test together, so every value
    # seen in either frame gets its own output column.
    encoder = OneHotEncoder(categories='auto')
    encoder.fit(np.hstack((train_column, test_column)).reshape(-1, 1))
    train_blocks.append(encoder.transform(train_column.reshape(-1, 1)))
    test_blocks.append(encoder.transform(test_column.reshape(-1, 1)))

# Join the per-feature blocks side by side into the final sparse matrices.
X_train = sparse.hstack(train_blocks)
X_test = sparse.hstack(test_blocks)

In [116]:
# Bind the names used by the modelling cells: the encoded feature matrices and
# the 'label' column of df_label as the training target.
y_train = df_label['label']
x_train, x_test = X_train, X_test

In [118]:
# Cross-validate several candidate classifiers on the training data and print
# the per-fold and mean accuracy of each one.
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
import sklearn.neural_network as sk_nn
from sklearn.model_selection import cross_val_score

# Fixed seed for the estimators that use randomness internally, so that the
# model comparison below gives the same numbers on every run.
RANDOM_STATE = 42

models = {
    'LR': LogisticRegression(solver='liblinear', penalty='l2', C=1),
    'SVM': SVC(C=1, gamma='auto'),
    'DT': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'RF' : RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),
    'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=RANDOM_STATE),
    'GBDT': GradientBoostingClassifier(n_estimators=100, random_state=RANDOM_STATE),
    'NN': sk_nn.MLPClassifier(activation='relu', solver='adam', alpha=0.0001, learning_rate='adaptive',
                              learning_rate_init=0.001, max_iter=1000, random_state=RANDOM_STATE)
}

for k, clf in models.items():
    print("the model is {}".format(k))
    # 10-fold cross-validation with the estimator's default scorer.
    scores = cross_val_score(clf, x_train, y_train, cv=10)
    print(scores)
    print("Mean accuracy is {}".format(np.mean(scores)))
    print("*" * 100)

the model is LR
[0.9009009  0.86486486 0.89090909 0.89090909 0.90909091 0.86363636
 0.85454545 0.85454545 0.86238532 0.86238532]
Mean accuracy is 0.8754172771603963
****************************************************************************************************
the model is SVM
[0.83783784 0.83783784 0.83636364 0.83636364 0.83636364 0.83636364
 0.83636364 0.83636364 0.8440367  0.8440367 ]
Mean accuracy is 0.8381930888352906
****************************************************************************************************
the model is DT
[0.85585586 0.83783784 0.82727273 0.83636364 0.83636364 0.83636364
 0.85454545 0.84545455 0.78899083 0.80733945]
Mean accuracy is 0.8326387605286687
****************************************************************************************************
the model is RF
[0.85585586 0.84684685 0.83636364 0.84545455 0.85454545 0.83636364
 0.84545455 0.85454545 0.86238532 0.85321101]
Mean accuracy is 0.8491026305705205
************************************

- 可以看到，就不调参情况而言，首选的逻辑回归具有较好的准确率，接下来对逻辑回归进行调参

In [82]:
# Grid search over the penalty type and the regularisation constant C of a
# liblinear logistic regression, scored by 10-fold cross-validation.
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

penaltys = ['l1', 'l2']
Cs = np.arange(1, 10, 0.1)  # candidate C values: 1.0, 1.1, ... up to (not including) 10
parameters = {'penalty': penaltys, 'C': Cs}

lr_penalty = LogisticRegression(solver='liblinear')
grid = GridSearchCV(estimator=lr_penalty, param_grid=parameters, cv=10)
grid.fit(x_train, y_train)

# Bare expression: it is not the last statement of the cell, so nothing is
# displayed for it.
grid.cv_results_

# Best mean cross-validation score, then the parameters that achieved it.
print(grid.best_score_, grid.best_params_, sep='\n')

0.860909090909091
{'C': 1.0, 'penalty': 'l1'}


In [117]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Logistic regression with the library's default settings, evaluated with
# 10-fold cross-validation. `clf` is reused by the prediction cell.
clf = LogisticRegression()
print(clf)

scores = cross_val_score(clf, X=x_train, y=y_train, cv=10)
# Per-fold scores first, then their mean.
print(scores, "Mean accuracy is {}".format(scores.mean()), sep="\n")

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
[0.9009009  0.86486486 0.89090909 0.89090909 0.90909091 0.86363636
 0.85454545 0.85454545 0.86238532 0.86238532]
Mean accuracy is 0.8754172771603963




In [680]:
# Fit the classifier on the full training set, predict the test set and write
# the predictions to a one-column CSV file.
result = clf.fit(x_train, y_train).predict(x_test)
file = pd.DataFrame({'result': result})
file.to_csv('data/result.csv', encoding='utf-8', index=False)