In [1]:
# https://signate.jp/competitions/752#evaluation
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")
sample = pd.read_csv("./sample_submit.csv",header=None)

In [2]:
import utils as Utils
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_california_housing

# データ読み込み
train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")

# trainを変換
train["BusinessTravel_E"] = Utils.label_encoding(train["BusinessTravel"])
train["Department_E"] = Utils.label_encoding(train["Department"])
train["EducationField_E"] = Utils.label_encoding(train["EducationField"])
train["Gender_E"] = Utils.label_encoding(train["Gender"])
train["JobRole_E"] = Utils.label_encoding(train["JobRole"])
train["MaritalStatus_E"] = Utils.label_encoding(train["MaritalStatus"])
train["Over18_E"] = Utils.label_encoding(train["Over18"])
train["OverTime_E"] = Utils.label_encoding(train["OverTime"])

train["Age_Binned"] = Utils.data_binned(train["Age"], [16, 25, 30, 35, 40, 45, 50, 55, 60], [0, 1, 2, 3, 4, 5, 6, 7])

train = train.drop(['BusinessTravel','Department','EducationField','Gender','JobRole','MaritalStatus','Over18','OverTime'], axis=1)

# 特徴量と目的変数の分割
X = train[['id', 'Age', 'DailyRate', 'DistanceFromHome', 'Education',
       'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel',
       'JobSatisfaction', 'MonthlyIncome', 'NumCompaniesWorked',
       'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
       'StandardHours', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany',
       'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager',
       'BusinessTravel_E', 'Department_E', 'EducationField_E',
       'Gender_E', 'JobRole_E', 'MaritalStatus_E', 'Over18_E', 'OverTime_E',
       'Age_Binned']]
y = train['Attrition']

# データの分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

#
# モデル構築（重回帰）と精度検証（R2）
#
# インスタンス
model = LinearRegression()
# 学習（学習データ利用）
model.fit(X_train, y_train)
# 精度検証（決定係数R2）
print('決定係数R2（学習データ）:', 
      model.score(X_train, y_train))
print('決定係数R2（テストデータ）:', 
      model.score(X_test, y_test))

#
# モデル構築（ランダムフォレスト）と精度検証（R2）
#
# インスタンス
model_rf = RandomForestRegressor()
# 学習（学習データ利用）
model_rf.fit(X_train, y_train)
# 精度検証（決定係数R2）
print('決定係数R2（学習データ）:', 
      model_rf.score(X_train, y_train))
print('決定係数R2（テストデータ）:', 
      model_rf.score(X_test, y_test))

決定係数R2（学習データ）: 0.19819750168266248
決定係数R2（テストデータ）: 0.16445988933896127
決定係数R2（学習データ）: 0.8878815878378379
決定係数R2（テストデータ）: 0.21040027499809033


In [None]:
# 相関係数（学習データ）
cor = pd.concat([X_train, y_train], axis=1).corr()
# 目的変数との相関係数の絶対値
target_cor = abs(cor['Attrition'])
print(target_cor)

In [None]:
#
# 特徴量選択（変数選択）
#
# 基準
c = 0.1
# 選択の実施
X_selected = target_cor[target_cor > c]
X_selected = X_selected.drop('Attrition').index
# 選択した特徴量（説明変数）
print(X_selected)

# print(X_train[X_selected].corr())

In [10]:
#
# 重回帰
#
# 学習（学習データ利用）
model.fit(X_train[X_selected], y_train)
# 精度検証（決定係数R2）
print('決定係数R2（学習データ）:', 
      model.score(X_train[X_selected],y_train))
print('決定係数R2（テストデータ）:', 
      model.score(X_test[X_selected],y_test))

#
# ランダムフォレスト
#
# 学習（学習データ利用）
model_rf.fit(X_train[X_selected], y_train)
# 精度検証（決定係数R2）
print('決定係数R2（学習データ）:', 
      model_rf.score(X_train[X_selected], y_train))
print('決定係数R2（テストデータ）:', 
      model_rf.score(X_test[X_selected], y_test))

決定係数R2（学習データ）: 0.16262969434049057
決定係数R2（テストデータ）: 0.14976297216373935
決定係数R2（学習データ）: 0.8799381756756757
決定係数R2（テストデータ）: 0.16270643953861452


# ラッパー法（Wrapper Method）で特徴量選択（変数選択）

In [None]:
#
# 重回帰の学習とその結果（係数のp値の出力）
#
# 切片追加
X_ = sm.add_constant(X)
# 重回帰学習
lr = sm.OLS(y,X_).fit()
# 各係数のp値
lr.pvalues

In [14]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif

# データ読み込み
train = pd.read_csv("train.csv")

# trainを変換
train["BusinessTravel_E"] = Utils.label_encoding(train["BusinessTravel"])
train["Department_E"] = Utils.label_encoding(train["Department"])
train["EducationField_E"] = Utils.label_encoding(train["EducationField"])
train["Gender_E"] = Utils.label_encoding(train["Gender"])
train["JobRole_E"] = Utils.label_encoding(train["JobRole"])
train["MaritalStatus_E"] = Utils.label_encoding(train["MaritalStatus"])
train["Over18_E"] = Utils.label_encoding(train["Over18"])
train["OverTime_E"] = Utils.label_encoding(train["OverTime"])

train = train.drop(['BusinessTravel','Department','EducationField','Gender','JobRole','MaritalStatus','Over18','OverTime'], axis=1)

# 特徴量と目的変数の分割
X = train.drop('Attrition', axis=1)
y = train['Attrition']

# Filter法（相関係数に基づく特徴量選択）
selector = SelectKBest(score_func=f_classif, k=10)
X_selected = selector.fit_transform(X, y)

# 選択された特徴量の列名を取得
selected_features = X.columns[selector.get_support()].tolist()

# 選択された特徴量の表示
print("Selected Features:", selected_features)

Selected Features: ['Age', 'EnvironmentSatisfaction', 'JobLevel', 'StockOptionLevel', 'TotalWorkingYears', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsWithCurrManager', 'MaritalStatus_E', 'OverTime_E']


  f = msb / msw


In [None]:
import pandas as pd
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# データ読み込み
train = pd.read_csv("train.csv")

# trainを変換
train["BusinessTravel_E"] = Utils.label_encoding(train["BusinessTravel"])
train["Department_E"] = Utils.label_encoding(train["Department"])
train["EducationField_E"] = Utils.label_encoding(train["EducationField"])
train["Gender_E"] = Utils.label_encoding(train["Gender"])
train["JobRole_E"] = Utils.label_encoding(train["JobRole"])
train["MaritalStatus_E"] = Utils.label_encoding(train["MaritalStatus"])
train["Over18_E"] = Utils.label_encoding(train["Over18"])
train["OverTime_E"] = Utils.label_encoding(train["OverTime"])

train = train.drop(['BusinessTravel','Department','EducationField','Gender','JobRole','MaritalStatus','Over18','OverTime'], axis=1)

# 特徴量と目的変数の分割
X = train.drop('Attrition', axis=1)
y = train['Attrition']

# データの分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Wrapper法（再帰的特徴量除去）
estimator = LogisticRegression(max_iter=2000)
selector = RFECV(estimator)
X_selected = selector.fit_transform(X_train, y_train)

# 選択された特徴量の列名を取得
selected_features = X_train.columns[selector.support_].tolist()

# 選択された特徴量の表示
print("Selected Features:", selected_features)

In [20]:
import pandas as pd
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# データ読み込み
train = pd.read_csv("train.csv")

# trainを変換
train["BusinessTravel_E"] = Utils.label_encoding(train["BusinessTravel"])
train["Department_E"] = Utils.label_encoding(train["Department"])
train["EducationField_E"] = Utils.label_encoding(train["EducationField"])
train["Gender_E"] = Utils.label_encoding(train["Gender"])
train["JobRole_E"] = Utils.label_encoding(train["JobRole"])
train["MaritalStatus_E"] = Utils.label_encoding(train["MaritalStatus"])
train["Over18_E"] = Utils.label_encoding(train["Over18"])
train["OverTime_E"] = Utils.label_encoding(train["OverTime"])

train = train.drop(['BusinessTravel','Department','EducationField','Gender','JobRole','MaritalStatus','Over18','OverTime'], axis=1)

# 特徴量と目的変数の分割
X = train.drop('Attrition', axis=1)
y = train['Attrition']

# データの分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Embedded法（L1正則化による特徴量選択）
estimator = LogisticRegression(penalty='l1', solver='liblinear')
selector = SelectFromModel(estimator)
X_selected = selector.fit_transform(X_train, y_train)

# 選択された特徴量の列名を取得
selected_features = X_train.columns[selector.get_support()].tolist()

# 選択された特徴量の表示
print("Selected Features:", selected_features)

Selected Features: ['id', 'Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'NumCompaniesWorked', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'BusinessTravel_E', 'Department_E', 'EducationField_E', 'JobRole_E', 'MaritalStatus_E', 'OverTime_E']


In [7]:
# 相関係数の計算
correlation_matrix = train.corr()['Attrition']
# "Attrition"との相関係数を降順で表示
print(correlation_matrix.sort_values(ascending=False))
# ['OverTime_E','MaritalStatus_E','DistanceFromHome','Age','JobLevel','YearsWithCurrManager','YearsInCurrentRole']

Attrition                   1.000000
OverTime_E                  0.200415
MaritalStatus_E             0.155337
DistanceFromHome            0.119646
Department_E                0.113929
BusinessTravel_E            0.101148
JobRole_E                   0.093558
HourlyRate                  0.032017
PerformanceRating           0.024884
Gender_E                    0.006407
EducationField_E            0.006105
id                         -0.003991
WorkLifeBalance            -0.015836
TrainingTimesLastYear      -0.017048
JobInvolvement             -0.024697
Education                  -0.026255
JobSatisfaction            -0.027633
MonthlyIncome              -0.029431
NumCompaniesWorked         -0.037084
RelationshipSatisfaction   -0.045726
PercentSalaryHike          -0.048952
DailyRate                  -0.060977
YearsSinceLastPromotion    -0.104601
EnvironmentSatisfaction    -0.121957
StockOptionLevel           -0.124432
TotalWorkingYears          -0.136075
YearsAtCompany             -0.137592
A

In [11]:
# 情報利得を計算
from sklearn.feature_selection import mutual_info_classif

# Xは特徴量（エンコーディング済みのカテゴリカル特徴量を含む）, yはターゲット変数
information_gains = mutual_info_classif(X, y)
print(information_gains)

[0.00807841 0.01889232 0.         0.02695699 0.00053123 0.00406679
 0.02824724 0.00778775 0.02577912 0.         0.04717235 0.0150325
 0.00349416 0.         0.         0.02362113 0.03275779 0.04149435
 0.00292245 0.         0.02729487 0.03045367 0.01612522 0.04115012
 0.02424116 0.00705852 0.00154091 0.01154464 0.01660772 0.00950344
 0.         0.02241516 0.01800628]


In [12]:
# 相互情報量を計算
from sklearn.feature_selection import mutual_info_regression

# Xは特徴量（エンコーディング済みのカテゴリカル特徴量を含む）, yはターゲット変数
mutual_informations = mutual_info_regression(X, y)
print(mutual_informations)

[0.00580261 0.0077143  0.         0.02695699 0.         0.
 0.01966513 0.0129088  0.01615229 0.         0.06353736 0.01520102
 0.01308358 0.0299478  0.01453878 0.         0.05558406 0.02837453
 0.00653134 0.         0.01457839 0.02244025 0.         0.03177405
 0.02535901 0.         0.02517749 0.00135089 0.039009   0.02924609
 0.00272701 0.038203   0.00544183]


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# 訓練データとテストデータに分割する
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ランダムフォレストモデルを構築する
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# 特徴量の重要度を取得する
feature_importances = model.feature_importances_

# 特徴量の重要度を表示する（降順で表示）
feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)

                     Feature  Importance
3           DistanceFromHome    0.065401
23      YearsWithCurrManager    0.062041
0                         id    0.060506
21        YearsInCurrentRole    0.058549
2                  DailyRate    0.057290
10             MonthlyIncome    0.054688
12         PercentSalaryHike    0.054668
17         TotalWorkingYears    0.053511
6                 HourlyRate    0.046884
1                        Age    0.045414
20            YearsAtCompany    0.043046
28                 JobRole_E    0.036089
11        NumCompaniesWorked    0.028135
31                OverTime_E    0.027285
8                   JobLevel    0.026815
18     TrainingTimesLastYear    0.024917
26          EducationField_E    0.024146
5    EnvironmentSatisfaction    0.023484
16          StockOptionLevel    0.023038
32                Age_Binned    0.022675
14  RelationshipSatisfaction    0.022542
22   YearsSinceLastPromotion    0.021698
9            JobSatisfaction    0.018838
4               

In [7]:
# ランダムフォレスト 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# データを読み込む（仮想的なデータを使用）
data = pd.read_csv('train.csv')

# 不要なカラムを削除する
data = data.drop(['id'], axis=1)

# カテゴリカルな特徴量を数値に変換する
label_encoder = LabelEncoder()
categorical_cols = ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole','MaritalStatus', 'OverTime', 'Over18']
for col in categorical_cols:
    data[col] = label_encoder.fit_transform(data[col])

# 欠損値があれば、それらを処理する（例えば、欠損値を平均値で埋めるなど）
data = data.fillna(data.mean())

# X = data.drop('Attrition', axis=1)
# X = data[['DistanceFromHome','YearsWithCurrManager','YearsInCurrentRole','DailyRate','MonthlyIncome','PercentSalaryHike','TotalWorkingYears']]
X = data[['OverTime','MaritalStatus','DistanceFromHome','Age','JobLevel','YearsWithCurrManager','YearsInCurrentRole']]
y = data['Attrition']
# print(X.columns)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)



Accuracy: 0.80
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.94      0.89       193
           1       0.50      0.26      0.34        47

    accuracy                           0.80       240
   macro avg       0.67      0.60      0.61       240
weighted avg       0.77      0.80      0.78       240



In [18]:
# ランダムフォレスト 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

# データを読み込む（仮想的なデータを使用）
data = pd.read_csv('train.csv')

# 不要なカラムを削除する
data = data.drop(['id'], axis=1)

# カテゴリカルな特徴量を数値に変換する
label_encoder = LabelEncoder()
categorical_cols = ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole','MaritalStatus', 'OverTime', 'Over18']
for col in categorical_cols:
    data[col] = label_encoder.fit_transform(data[col])

# 欠損値があれば、それらを処理する（例えば、欠損値を平均値で埋めるなど）
data = data.fillna(data.mean())

# X = data.drop('Attrition', axis=1)
X = data[['YearsWithCurrManager','OverTime','YearsInCurrentRole','RelationshipSatisfaction','JobRole','JobLevel','EnvironmentSatisfaction','HourlyRate','Department','BusinessTravel','Gender']]
y = data['Attrition']
# print(X.columns)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.45, random_state=42)

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# モデルの学習のためのパラメータ設定
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_depth': 6,
    'eta': 0.3,
    'gamma': 0,
    'min_child_weight': 1,
    'subsample': 1,
    'colsample_bytree': 1,
    'lambda': 1,
    'alpha': 0,
    'seed': 42,
}

# モデルの学習
num_round = 100  # 学習の繰り返し回数（エポック数）
model = xgb.train(params, dtrain, num_round)

# 評価用データの予測
y_pred_proba = model.predict(dtest)  # 予測確率を出力
y_pred = (y_pred_proba > 0.5).astype(int)  # 0.5を閾値にして0または1に変換

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)



Accuracy: 0.82
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.94      0.90       436
           1       0.57      0.33      0.41       104

    accuracy                           0.82       540
   macro avg       0.71      0.63      0.65       540
weighted avg       0.80      0.82      0.80       540



In [13]:
import pandas as pd
import xgboost as xgb

# データを読み込む（仮想的なデータを使用）
data = pd.read_csv('train.csv')  # 'employee_data.csv'にはデータが保存されているファイル名を記述してください

# 不要なカラムを削除する
data = data.drop(['id'], axis=1)

# カテゴリカルな特徴量を数値に変換する
label_encoder = LabelEncoder()
categorical_cols = ['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole','MaritalStatus', 'OverTime', 'Over18']
for col in categorical_cols:
    data[col] = label_encoder.fit_transform(data[col])

# 欠損値があれば、それらを処理する（例えば、欠損値を平均値で埋めるなど）
data = data.fillna(data.mean())

# 特徴量とターゲット変数に分割する
X = data.drop('Attrition', axis=1)
y = data['Attrition']

# XGBoost用のデータ構造に変換する
dtrain = xgb.DMatrix(data=X, label=y)

# XGBoostモデルを構築する
params = {'objective': 'binary:logistic', 'eval_metric': 'logloss'}
model = xgb.train(params, dtrain)

# 特徴量の重要度を取得する
feature_importances = model.get_score(importance_type='gain')

# 特徴量の重要度を表示する（降順で表示）
feature_importance_df = pd.DataFrame({'Feature': list(feature_importances.keys()), 'Importance': list(feature_importances.values())})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)


                     Feature  Importance
27      YearsWithCurrManager   14.228306
17                  OverTime    6.438156
25        YearsInCurrentRole    5.748655
20  RelationshipSatisfaction    4.309811
12                   JobRole    4.086659
11                  JobLevel    3.924931
7    EnvironmentSatisfaction    3.649502
9                 HourlyRate    3.620637
3                 Department    3.545303
1             BusinessTravel    3.526254
8                     Gender    3.500268
14             MaritalStatus    3.279509
21          StockOptionLevel    3.258085
22         TotalWorkingYears    3.223164
0                        Age    3.213730
6             EducationField    3.208443
15             MonthlyIncome    2.938428
4           DistanceFromHome    2.696337
18         PercentSalaryHike    2.597253
24            YearsAtCompany    2.410704
16        NumCompaniesWorked    2.311994
19         PerformanceRating    2.122531
5                  Education    2.094749
2               