In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.utils.multiclass import class_distribution
from imblearn.over_sampling import RandomOverSampler

In [23]:
# Read the pre-split feature and target tables from the working directory.
X_train, X_test = pd.read_csv('X_train.csv'), pd.read_csv('X_test.csv')
y_train, y_test = pd.read_csv('y_train.csv'), pd.read_csv('y_test.csv')

# Stack train on top of test (features and targets separately). A fresh
# 0..n-1 index is generated so the two stacked frames share the same row labels.
combined_X = pd.concat([X_train, X_test], ignore_index=True)
combined_y = pd.concat([y_train, y_test], ignore_index=True)

# Put features and target side by side in a single frame.
df = pd.concat([combined_X, combined_y], axis=1)

# Preview the first six rows of the combined frame.
print(df.head(6))

   CustomerId    Surname  CreditScore Geography  Gender  Age  Tenure  \
0    15799217   Zetticci          791   Germany  Female   35       7   
1    15748986    Bischof          705   Germany    Male   42       8   
2    15722004     Hsiung          543    France  Female   31       4   
3    15780966  Pritchard          709    France  Female   32       2   
4    15636731      Ts'ai          714   Germany  Female   36       1   
5    15670646      Moore          499     Spain  Female   42       0   

     Balance  NumOfProducts  HasCrCard  IsActiveMember  EstimatedSalary  \
0   52436.20              1          1               0        161051.75   
1  166685.92              2          1               1         55313.51   
2  138317.94              1          0               0         61843.73   
3       0.00              2          0               0        109681.29   
4  101609.01              2          1               1           447.73   
5  147187.84              1          1       

In [24]:
# Remove the identifier columns ('CustomerId', 'Surname').
# Assigning the result instead of mutating in place, together with
# errors='ignore', keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError for columns that are already gone.
df = df.drop(columns=['CustomerId', 'Surname'], errors='ignore')

In [25]:
# Normalise the 'Gender' labels: trim surrounding whitespace, then upper-case
# the first character and lower-case the rest.
df['Gender'] = (
    df['Gender']
    .str.strip()
    .str.capitalize()
)
# Show the distinct labels that remain after normalisation.
print(df['Gender'].unique())

['Female' 'Male']


In [26]:
# Encoder used in the following cells to turn the 'Gender' labels into integer codes.
label_encoder = LabelEncoder()
# Bare expression so the notebook displays the (not yet fitted) encoder.
label_encoder

In [27]:
# Learn the label-to-integer mapping from the 'Gender' column.
# NOTE(review): the following cell calls fit_transform, which fits the encoder
# again, so this separate fit appears to be redundant.
label_encoder.fit(df['Gender'])

In [28]:
# Replace the text labels in 'Gender' with their integer codes (per the output
# below, the 'Female' rows become 0 and the 'Male' row becomes 1).
df['Gender'] = label_encoder.fit_transform(df['Gender'])
# Preview the first five encoded values.
df['Gender'].head()

0    0
1    1
2    0
3    0
4    0
Name: Gender, dtype: int32

In [29]:
# One-hot encoder configured to return a dense array rather than a sparse matrix.
onehot_encoder = OneHotEncoder(sparse_output=False)
# Display the configured encoder.
onehot_encoder

In [30]:
# Fit the encoder on 'Geography' (the double brackets pass a one-column
# DataFrame) and produce one indicator column per category.
geography_encoded = onehot_encoder.fit_transform(df[['Geography']])
# Display the resulting array.
geography_encoded

array([[0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]])

In [31]:
# Column names generated by the fitted encoder for the indicator columns
# (e.g. 'Geography_France'); used below to label the encoded array.
new_columns_names = onehot_encoder.get_feature_names_out()
# Display the names.
new_columns_names

array(['Geography_France', 'Geography_Germany', 'Geography_Spain'],
      dtype=object)

In [32]:
# Work on a copy so `df` keeps its original 'Geography' column.
temp = df.copy()
# Append the one-hot indicators as integer columns named after the encoder's
# feature names.
temp[new_columns_names] = geography_encoded.astype(int)
# The text column is now redundant; drop it from the working frame.
temp = temp.drop(columns='Geography')
temp.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,791,0,35,7,52436.2,1,1,0,161051.75,0,0,1,0
1,705,1,42,8,166685.92,2,1,1,55313.51,0,0,1,0
2,543,0,31,4,138317.94,1,0,0,61843.73,0,1,0,0
3,709,0,32,2,0.0,2,0,0,109681.29,0,1,0,0
4,714,0,36,1,101609.01,2,1,1,447.73,0,0,1,0


In [33]:
# Print column dtypes, non-null counts and memory usage for the encoded frame.
temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   CreditScore        10000 non-null  int64  
 1   Gender             10000 non-null  int32  
 2   Age                10000 non-null  int64  
 3   Tenure             10000 non-null  int64  
 4   Balance            10000 non-null  float64
 5   NumOfProducts      10000 non-null  int64  
 6   HasCrCard          10000 non-null  int64  
 7   IsActiveMember     10000 non-null  int64  
 8   EstimatedSalary    10000 non-null  float64
 9   Exited             10000 non-null  int64  
 10  Geography_France   10000 non-null  int32  
 11  Geography_Germany  10000 non-null  int32  
 12  Geography_Spain    10000 non-null  int32  
dtypes: float64(2), int32(4), int64(7)
memory usage: 859.5 KB


In [34]:

# Count missing (NaN) values in every column.
print("NaN values in dataset:")
print(temp.isna().sum())

# Count infinite (+inf / -inf) values in every column.
print("inf or -inf values in dataset:")
for column in temp.columns:
    n_inf = np.isinf(temp[column]).sum()
    print(f"{column}: {n_inf} inf values")



NaN values in dataset:
CreditScore          0
Gender               0
Age                  0
Tenure               0
Balance              0
NumOfProducts        0
HasCrCard            0
IsActiveMember       0
EstimatedSalary      0
Exited               0
Geography_France     0
Geography_Germany    0
Geography_Spain      0
dtype: int64
inf or -inf values in dataset:
CreditScore: 0 inf values
Gender: 0 inf values
Age: 0 inf values
Tenure: 0 inf values
Balance: 0 inf values
NumOfProducts: 0 inf values
HasCrCard: 0 inf values
IsActiveMember: 0 inf values
EstimatedSalary: 0 inf values
Exited: 0 inf values
Geography_France: 0 inf values
Geography_Germany: 0 inf values
Geography_Spain: 0 inf values


In [35]:
# Separate the target column ('Exited') from the predictor columns.
y = temp['Exited']
X = temp.drop(columns='Exited')

In [36]:
# Estimator, metric, model-selection and resampling imports used by the cells
# below. Each name is imported exactly once (StratifiedKFold was previously
# imported twice) and all imports precede the first statement.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from imblearn.over_sampling import SMOTE

# Stratified 5-fold splitter, shuffled with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [37]:
# Scorer names keyed by a short metric label.
scoring = dict(
    accuracy='accuracy',
    precision='precision_weighted',
    recall='recall_weighted',
    f1='f1_weighted',
)

# Random-forest hyper-parameter candidates for the grid search
# (3 * 4 * 3 * 3 * 3 = 324 combinations).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7, 9],
    max_samples=[0.5, 0.7, 0.9],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [38]:
# Base random-forest estimator handed to the grid search: fixed seed,
# n_jobs=-1 to use all available cores.
model = RandomForestClassifier(
    n_jobs=-1,
    random_state=42,
)

In [39]:
# Stratified 5-fold splitter (shuffled, seeded) used for both the outer loop
# and the grid search's internal cross-validation.
skf = StratifiedKFold(
    n_splits=5,
    random_state=42,
    shuffle=True,
)
# Seeded SMOTE oversampler applied to the training folds.
smote = SMOTE(random_state=42)

In [40]:
# Exhaustive search over `param_grid` for `model`, cross-validated with `skf`
# and ranked by accuracy.
# NOTE(review): the multi-metric `scoring` dict defined earlier is not passed
# here; only the literal 'accuracy' scorer is used.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=skf,
    n_jobs=-1,
)


In [41]:
# Accumulators for the per-fold evaluation metrics; each is its own empty list.
f1_scores, accuracy_scores, precision_scores, recall_scores = [], [], [], []

In [None]:
# Outer stratified K-fold evaluation: for every fold, oversample the training
# part with SMOTE, tune the random forest with GridSearchCV, and score the
# refit best estimator on the untouched held-out part.
#
# NOTE(review): SMOTE is applied before GridSearchCV, so the inner validation
# splits used for hyper-parameter selection contain synthetic samples. Wrapping
# SMOTE and the classifier in an imblearn Pipeline would avoid this; it is left
# unchanged here because it requires re-keying `param_grid`.

# Start from empty accumulators so re-running this cell does not append to the
# results of a previous run.
for fold_scores in (f1_scores, accuracy_scores, precision_scores, recall_scores):
    fold_scores.clear()

for fold, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Oversample only in the training set
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    # Fit model with GridSearchCV
    grid_search.fit(X_train_resampled, y_train_resampled)

    # Predict on test set
    y_pred = grid_search.predict(X_test)

    # Calculate scores
    f1 = f1_score(y_test, y_pred, average='weighted')
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')

    # Record the fold's metrics; previously they were computed but never
    # stored, leaving the accumulator lists empty.
    f1_scores.append(f1)
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)

    # Print results
    print(f'Fold completed. F1 Score: {f1}, Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}')
    # The search is re-run on every fold, so report each fold's own winner.
    print(f'Fold {fold} best parameters: {grid_search.best_params_}')

# Summarise the cross-validated performance across all folds.
for metric_name, metric_values in (
    ('F1', f1_scores),
    ('Accuracy', accuracy_scores),
    ('Precision', precision_scores),
    ('Recall', recall_scores),
):
    print(f'Mean {metric_name}: {np.mean(metric_values):.4f} (std {np.std(metric_values):.4f})')

# Show the best parameters found by GridSearchCV.
# grid_search is refit inside the loop, so this reflects the LAST fold only.
print(f"Best Parameters: {grid_search.best_params_}")

In [102]:
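# import joblib
# 
# # Assume your model is `model`
# # Save the trained model
# # NOTE(review): in the cells shown, `model` itself is never fitted (it is only
# # passed to GridSearchCV as the estimator); the tuned model would be
# # `grid_search.best_estimator_` — confirm which object should be saved.
# joblib.dump(model, 'saved_model.pkl')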


['saved_model.pkl']