## Treat training dataset

### Import libraries and initial analysis

In [None]:
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter('ignore')
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from sklearn.impute import KNNImputer
%matplotlib inline

In [None]:
# Load the training feature table and preview its first rows.
train_features = pd.read_csv('train_features.csv')
train_features.head()

In [None]:
# (rows, columns) of the raw training features.
train_features.shape

In [None]:
# Load the training labels and preview the first rows.
train_labels = pd.read_csv('train_labels.csv')
train_labels.head()

In [None]:
# (rows, columns) of the label table.
train_labels.shape

In [None]:
# Attach the labels to the feature table as the target column 'jumlah_promosi'.
# NOTE(review): a whole DataFrame is assigned to one column here; this presumably
# relies on train_labels holding exactly one column, row-aligned with
# train_features — confirm against the CSV.
train_features['jumlah_promosi'] = train_labels.copy()

In [None]:
# Preview the feature table now that the target column is attached.
train_features.head()

In [None]:
# Shape after adding the target column.
train_features.shape

In [None]:
# Work on a copy so train_features itself is left as loaded; then list dtypes
# and non-null counts per column.
df = train_features.copy()
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

### Data Preprocessing

#### Missing Values Exploration

In [None]:
# Drop the 'tanggal_menjadi_anggota' column, then re-check the missing counts.
df = df.drop(columns=['tanggal_menjadi_anggota'])
df.isna().sum()

In [None]:
# Print how often each value occurs in every object-dtype (text) column.
cat_cols = df.select_dtypes(include=['object']).columns
for i in cat_cols:
    category_counts = df[i].value_counts()
    print(f'{category_counts}')
    print('\n')

In [None]:
# Treat the literal string '5' in the two categorical columns as missing.
for column_name in ('pendidikan', 'status_pernikahan'):
    df[column_name] = df[column_name].replace('5', np.nan)

# Print the category frequencies again after the replacement.
for i in cat_cols:
    category_counts = df[i].value_counts()
    print(f'{category_counts}')
    print('\n')

In [None]:
def null_statistics(df):
    """Summarise the columns of ``df`` that contain missing values.

    Prints one sentence stating how many columns (and what share of all
    columns) contain nulls, surrounded by blank lines.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one null, with the columns
        'Column Name', 'Column Data Type', 'Null Values' and
        'Null Percentage', sorted by 'Null Values' descending and indexed
        0..k-1.
    """
    missing_mask = df.isnull()

    # Per-column overview; the column-name index is replaced by a plain 0..n-1 index
    overview = pd.DataFrame({
        'Column Name': df.columns,
        'Column Data Type': df.dtypes,
        'Null Values': missing_mask.sum(),
        'Null Percentage': missing_mask.mean() * 100
    }).reset_index(drop=True)

    # Keep only the columns that actually have nulls, worst first
    has_nulls = overview["Null Values"] > 0
    null_info = (
        overview.loc[has_nulls]
        .sort_values(by='Null Values', ascending=False)
        .reset_index(drop=True)
    )

    affected_count = null_info.shape[0]
    total_count = len(df.columns)

    print("")
    print(f"There are {affected_count} columns ({affected_count / total_count * 100:,.2f} %) with Null values out of {total_count} columns in Dataframe.")
    print("")

    return null_info

# Summarise which training columns contain missing values and display the table.
train_null_data = null_statistics(df)

train_null_data

In [None]:
# Separate copy to run the imputation on.
df_null_handled = df.copy()

In [None]:
def impute_with_knn(data):
    """Impute missing values in the numeric columns with k-nearest neighbours.

    Only ``int64``/``float64`` columns are considered. The imputer uses 5
    neighbours weighted by distance and is fitted on ``data`` itself.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose numeric columns may contain NaN. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The imputed numeric columns only, carrying the same row index as
        ``data`` so the result can be assigned back without misaligning rows.
    """
    num_columns = data.select_dtypes(['int64', 'float64']).columns

    num_imputer = KNNImputer(n_neighbors=5, weights='distance')
    num_imputed_data = num_imputer.fit_transform(data[num_columns])
    # Keep the caller's index: a fresh 0..n-1 index would not line up with a
    # frame whose rows have been filtered or re-indexed.
    num_imputed_byKNN_df = pd.DataFrame(num_imputed_data, columns=num_columns, index=data.index)
    return num_imputed_byKNN_df

# KNN-impute the numeric training columns and count any nulls left in the result.
num_imputed_byKNN_df = impute_with_knn(df_null_handled)

num_imputed_byKNN_df.isna().sum()

In [None]:
def impute_cat_with_mode(data):
    """Fill missing values in every object-dtype column with that column's mode.

    The mode is computed from ``data`` itself, so the function does not depend
    on any notebook-level frame. ``data`` is modified in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose object-dtype columns may contain missing values.

    Returns
    -------
    pandas.DataFrame
        The object-dtype columns of ``data`` after imputation.
    """
    cat_columns = data.select_dtypes(['object']).columns

    for col in cat_columns:
        modes = data[col].mode()
        if modes.empty:
            # Column is entirely missing: there is no mode to fill with.
            continue
        # Assign the filled column back explicitly; an in-place fillna on the
        # selected column is not guaranteed to write through to `data`.
        data[col] = data[col].fillna(modes.iloc[0])

    return data[cat_columns]

# Write the imputed categorical columns back using the returned column labels
# as the key. A bare `'pendidikan', 'status_pernikahan'` subscript is a tuple
# and would address a single tuple-named column instead of the two columns.
cat_imputed = impute_cat_with_mode(df_null_handled)
df_null_handled[cat_imputed.columns] = cat_imputed

In [None]:
# df_mk = df with nulls handled by mode (categorical) and KNN (numeric)
num_columns = df_null_handled.select_dtypes(['int64', 'float64']).columns

# Overwrite the numeric columns with their KNN-imputed values.
df_null_handled[num_columns] = num_imputed_byKNN_df
df_mk = df_null_handled.copy()

# Expected to show no remaining missing values — verify in the output.
df_mk.isna().sum()

#### Duplicate Values Exploration

In [None]:
# (rows, columns) of the imputed frame.
df_mk.shape

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df_mk.duplicated().sum()

#### Outliers Exploration

In [None]:
# Box plot of every numeric column in df_mk; each title reports how many
# values fall outside the 1.5 * IQR fences.

# Select numerical columns from df_mk

numerical_columns = df_mk.select_dtypes(include='number')

# Columns are plotted in their existing DataFrame order (no sorting is applied)

sorted_columns = numerical_columns.columns

# One subplot per numeric column

num_plots = len(sorted_columns)

# Grid layout: 3 plots per row, with as many rows as needed

num_cols = 3
num_rows = (num_plots - 1) // num_cols + 1

# Create subplots. squeeze=False keeps `axes` two-dimensional even when a
# single row is enough, so the [row, column] indexing below always works.

fig, axes = plt.subplots(num_rows, num_cols, figsize=(20, num_rows * 5), squeeze=False)

# Add a main title to the entire set of subplots

fig.suptitle("Box Plots For Original Numerical Features", y=1, fontsize=20)

# Create a box plot for each feature

for i, col in enumerate(sorted_columns):

    # Position of this plot in the grid

    row_idx = i // num_cols
    col_idx = i % num_cols

    # Flag values beyond 1.5 * IQR from the quartiles and count them once

    Q1 = df_mk[col].quantile(0.25)
    Q3 = df_mk[col].quantile(0.75)
    IQR = Q3 - Q1
    iqr_outliers = (df_mk[col] < Q1 - 1.5 * IQR) | (df_mk[col] > Q3 + 1.5 * IQR)
    outlier_count = int(iqr_outliers.sum())
    percent_iqr_outliers = (outlier_count / len(df_mk)) * 100

    sns.boxplot(x=df_mk[col], ax=axes[row_idx, col_idx])
    axes[row_idx, col_idx].set_xlabel(col, fontsize=12)
    axes[row_idx, col_idx].set_title(f"\n"
                                     f'Box Plot for {col}\n'
                                     f"\n"
                                     f'Outliers: {outlier_count} ({percent_iqr_outliers:.2f}%)\n', fontsize=14, color='red')

    axes[row_idx, col_idx].grid(True)

# Remove any empty subplots

for i in range(num_plots, num_rows * num_cols):
    fig.delaxes(axes.flatten()[i])

# Adjust layout

plt.tight_layout()

plt.show()

In [None]:
# Remove outliers in tahun_kelahiran (birth year): it would be odd for this
# data to contain people born about 100 years ago.

Q1 = df_mk['tahun_kelahiran'].quantile(0.25)
Q3 = df_mk['tahun_kelahiran'].quantile(0.75)
IQR = Q3 - Q1

# Lower and upper fences for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep only rows whose tahun_kelahiran lies within the fences. The explicit
# .copy() makes df_mk_filtered an independent frame, so the column assignments
# made on it in later cells modify a real frame rather than a slice of df_mk.
df_mk_filtered = df_mk[(df_mk['tahun_kelahiran'] >= lower_bound) & (df_mk['tahun_kelahiran'] <= upper_bound)].copy()

In [None]:
# Box plot of every numeric column in df_mk_filtered; each title reports how
# many values fall outside the 1.5 * IQR fences.

# Select numerical columns from df_mk_filtered

numerical_columns = df_mk_filtered.select_dtypes(include='number')

# Columns are plotted in their existing DataFrame order (no sorting is applied)

sorted_columns = numerical_columns.columns

# One subplot per numeric column

num_plots = len(sorted_columns)

# Grid layout: 3 plots per row, with as many rows as needed

num_cols = 3
num_rows = (num_plots - 1) // num_cols + 1

# Create subplots. squeeze=False keeps `axes` two-dimensional even when a
# single row is enough, so the [row, column] indexing below always works.

fig, axes = plt.subplots(num_rows, num_cols, figsize=(20, num_rows * 5), squeeze=False)

# Add a main title to the entire set of subplots

fig.suptitle("Box Plots For Original Numerical Features", y=1, fontsize=20)

# Create a box plot for each feature

for i, col in enumerate(sorted_columns):

    # Position of this plot in the grid

    row_idx = i // num_cols
    col_idx = i % num_cols

    # Flag values beyond 1.5 * IQR from the quartiles and count them once

    Q1 = df_mk_filtered[col].quantile(0.25)
    Q3 = df_mk_filtered[col].quantile(0.75)
    IQR = Q3 - Q1
    iqr_outliers = (df_mk_filtered[col] < Q1 - 1.5 * IQR) | (df_mk_filtered[col] > Q3 + 1.5 * IQR)
    outlier_count = int(iqr_outliers.sum())
    percent_iqr_outliers = (outlier_count / len(df_mk_filtered)) * 100

    sns.boxplot(x=df_mk_filtered[col], ax=axes[row_idx, col_idx])
    axes[row_idx, col_idx].set_xlabel(col, fontsize=12)
    axes[row_idx, col_idx].set_title(f"\n"
                                     f'Box Plot for {col}\n'
                                     f"\n"
                                     f'Outliers: {outlier_count} ({percent_iqr_outliers:.2f}%)\n', fontsize=14, color='red')

    axes[row_idx, col_idx].grid(True)

# Remove any empty subplots

for i in range(num_plots, num_rows * num_cols):
    fig.delaxes(axes.flatten()[i])

# Adjust layout

plt.tight_layout()

plt.show()

#### Fix Possibly Incorrect Inputs

In [None]:
# Frequency of each distinct value in jumlah_anak_balita.
df_mk_filtered['jumlah_anak_balita'].value_counts()

Some values in `jumlah_anak_balita` are continuous (non-integer), although a number of children should be a whole number.

In [None]:
# Round incorrectly entered fractional values to a whole number
def round_value(value):
    """Return ``value`` rounded to the nearest integer as a Python ``int``.

    A fractional part below 0.5 rounds down; 0.5 or more rounds up
    (towards positive infinity).
    """
    fractional_part = value % 1
    rounding = np.floor if fractional_part < 0.5 else np.ceil
    return int(rounding(value))
    

# Visualize the updated data
def show_countplot(data):
    """Draw one count plot per rounded column of ``data`` on a two-column grid.

    The plotted columns are fixed: 'jumlah_anak_balita', 'jumlah_anak_remaja',
    'pembelian_diskon', 'pembelian_web', 'pembelian_toko', 'keluhan' and
    'terakhir_belanja'. The figure is shown immediately; nothing is returned.
    """
    count_columns = ['jumlah_anak_balita', 'jumlah_anak_remaja', 'pembelian_diskon', 'pembelian_web', 'pembelian_toko', 'keluhan', 'terakhir_belanja']
    subset = data[count_columns]

    # Two plots per row, rounding the number of rows up
    grid_width = 2
    grid_height = (subset.shape[1] + grid_width - 1) // grid_width

    fig, axes = plt.subplots(grid_height, grid_width, figsize=(20, grid_height * 5))

    # Main title for the whole figure
    fig.suptitle("Countplot for outlier column in the dataset", y=1, fontsize=20)

    for position, column_name in enumerate(count_columns):
        # divmod gives the (row, column) cell of this plot in the grid
        target_ax = axes[divmod(position, grid_width)]

        drawn = sns.countplot(subset, x=column_name, ax=target_ax)
        target_ax.set_xlabel(column_name, fontsize=12)
        drawn.margins(x=0.1)

    # Delete the grid cells that received no plot
    for spare in range(len(subset.columns), grid_height * grid_width):
        fig.delaxes(axes.flatten()[spare])

    plt.tight_layout()
    plt.show()


# Columns whose values are rounded to whole numbers
possible_wrong_input_cols = ['jumlah_anak_balita', 'jumlah_anak_remaja', 'pembelian_diskon', 'pembelian_web', 'pembelian_toko', 'keluhan', 'terakhir_belanja']

# Apply the rounding helper element-wise to each listed column
for col in possible_wrong_input_cols:
    df_mk_filtered[col] = df_mk_filtered[col].apply(round_value)

show_countplot(df_mk_filtered)

#### Encode Categorical Data

In [None]:
# Print how often each value occurs in every object-dtype column before encoding.
cat_cols = df_mk_filtered.select_dtypes(include=['object']).columns
for i in cat_cols:
    category_counts = df_mk_filtered[i].value_counts()
    print(f'{category_counts}')
    print('\n')

In [None]:
def apply_custom_ordinal_encoding_mappings(df, default_value = 0):
    """Replace the text categories of known columns with fixed integer codes.

    'pendidikan' and 'status_pernikahan' are encoded when present in ``df``;
    other columns are left untouched. Values without an entry in the mapping
    (including missing values) become ``default_value``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.
    default_value : scalar, default 0
        Code used for values that are not in the mapping.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after encoding.
    """
    education_codes = {'SMP': 1, 'SMA': 2, 'Sarjana': 3, 'Magister': 4, 'Doktor': 5}
    marital_codes = {'Sendiri': 1, 'Rencana Menikah': 2, 'Menikah': 3, 'Cerai': 4, 'Cerai Mati': 5}
    lookup_tables = {
        'pendidikan': education_codes,
        'status_pernikahan': marital_codes,
    }

    # Only encode the columns that this frame actually has
    present_columns = [name for name in lookup_tables if name in df.columns]

    for name in present_columns:
        encoded = df[name].map(lookup_tables[name])
        df[name] = encoded.fillna(default_value)

    return df

In [None]:
# Replace the text categories with their ordinal codes (the helper receives a
# sub-frame of the categorical columns and its result is written back), then
# print the frequencies of the encoded values.
df_mk_filtered[cat_cols] = apply_custom_ordinal_encoding_mappings(df_mk_filtered[cat_cols])
for i in cat_cols:
    print(f'{df_mk_filtered[i].value_counts()}')
    print('\n')

In [None]:
# Dtypes and non-null counts after encoding.
df_mk_filtered.info()

### EDA

#### Statistics Summary

In [None]:
# Summary statistics, one row per numeric column, extended with skewness and
# value range.
numeric_part = df_mk_filtered.select_dtypes(include=[np.number])

df_mk_filtered_desc = df_mk_filtered.describe().T
df_mk_filtered_desc['skewness'] = numeric_part.skew()
# max - min computed as a Series so it is matched to the summary rows by
# column name rather than by position.
df_mk_filtered_desc['range'] = numeric_part.max() - numeric_part.min()
df_mk_filtered_desc

#### Correlation Exploration

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(df_mk_filtered.corr(), annot=True, annot_kws={"fontsize": 8}, ax=ax)
plt.show()

#### Feature Scaling

In [None]:
from sklearn.preprocessing import RobustScaler

# Model inputs and target taken from the cleaned training frame
feature_selected = ['tahun_kelahiran', 'pendidikan', 'status_pernikahan', 'pendapatan', 'jumlah_anak_balita', 'jumlah_anak_remaja', 'terakhir_belanja', 'belanja_buah', 'belanja_daging', 'belanja_ikan','pembelian_diskon', 'belanja_kue', 'pembelian_web', 'pembelian_toko']
feature_df = df_mk_filtered[feature_selected].copy()
target_df = df_mk_filtered['jumlah_promosi'].copy()

# Fit and transform features
scaler = RobustScaler()
scaled_features = scaler.fit_transform(feature_df)

# Convert scaled features array to DataFrame. The original row index is kept
# so the scaled features stay label-aligned with target_df, whose index has
# gaps after the outlier rows were removed.
scaled_feature_df = pd.DataFrame(scaled_features, columns=feature_selected, index=feature_df.index)

# Fit and transform target
# NOTE(review): the modelling cells in this notebook train on the unscaled
# target_df; scaled_target_df does not appear to be used — confirm it is needed.
target_scaler = RobustScaler()
scaled_target = target_scaler.fit_transform(target_df.values.reshape(-1, 1))

# Convert scaled target array to DataFrame (same row index as target_df)
scaled_target_df = pd.DataFrame(scaled_target, columns=['jumlah_promosi'], index=target_df.index)

scaled_feature_df


### Modelling

#### Model and Evaluation

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV, RandomizedSearchCV, learning_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import mean_squared_error
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


In [None]:
X = scaled_feature_df.copy()
y = target_df.copy()

# Hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Keep the estimator instances in a dict instead of rebinding the imported
# class names (e.g. `RandomForestClassifier = RandomForestClassifier()`).
# Rebinding makes this cell fail when run a second time and breaks any later
# cell that needs to instantiate the class again.
svc = SVC(probability=True)
classifiers = {
    'LogisticRegression': LogisticRegression(),
    'RidgeClassifier': RidgeClassifier(),
    # 'log_loss' is the current name of the logistic-regression loss; the
    # older alias 'log' is no longer accepted by recent scikit-learn releases.
    'SGDClassifier': SGDClassifier(loss='log_loss'),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'BaggingClassifier': BaggingClassifier(),
    'ExtraTreesClassifier': ExtraTreesClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'svc': svc,
    'XGBClassifier': XGBClassifier(),
    'LGBMClassifier': LGBMClassifier(),
    'CatBoostClassifier': CatBoostClassifier(verbose=0),
}

# Fit every candidate on the training split
for classifier in classifiers.values():
    classifier.fit(X_train, y_train)

# Hold-out predictions, exposed under the names the evaluation cell reads
y_pred_logisticRegression = classifiers['LogisticRegression'].predict(X_test)
y_pred_RidgeClassifier = classifiers['RidgeClassifier'].predict(X_test)
y_pred_SGDClassifier = classifiers['SGDClassifier'].predict(X_test)
y_pred_DecisionTreeClassifier = classifiers['DecisionTreeClassifier'].predict(X_test)
y_pred_RandomForestClassifier = classifiers['RandomForestClassifier'].predict(X_test)
y_pred_BaggingClassifier = classifiers['BaggingClassifier'].predict(X_test)
y_pred_ExtraTreesClassifier = classifiers['ExtraTreesClassifier'].predict(X_test)
y_pred_AdaBoostClassifier = classifiers['AdaBoostClassifier'].predict(X_test)
y_pred_GradientBoostingClassifier = classifiers['GradientBoostingClassifier'].predict(X_test)
y_pred_KNeighborsClassifier = classifiers['KNeighborsClassifier'].predict(X_test)
y_pred_svc = classifiers['svc'].predict(X_test)
y_pred_XGBClassifier = classifiers['XGBClassifier'].predict(X_test)
y_pred_LGBMClassifier = classifiers['LGBMClassifier'].predict(X_test)
y_pred_CatBoostClassifier = classifiers['CatBoostClassifier'].predict(X_test)

In [None]:
# Hold-out predictions keyed by model name. A single mapping keeps each name
# paired with its predictions, and includes LogisticRegression and
# RidgeClassifier, which are fitted and predicted in the previous cell.
predictions_by_model = {
    'LogisticRegression': y_pred_logisticRegression,
    'RidgeClassifier': y_pred_RidgeClassifier,
    'SGDClassifier': y_pred_SGDClassifier,
    'DecisionTreeClassifier': y_pred_DecisionTreeClassifier,
    'RandomForestClassifier': y_pred_RandomForestClassifier,
    'BaggingClassifier': y_pred_BaggingClassifier,
    'ExtraTreesClassifier': y_pred_ExtraTreesClassifier,
    'AdaBoostClassifier': y_pred_AdaBoostClassifier,
    'GradientBoostingClassifier': y_pred_GradientBoostingClassifier,
    'KNeighborsClassifier': y_pred_KNeighborsClassifier,
    'svc': y_pred_svc,
    'XGBClassifier': y_pred_XGBClassifier,
    'LGBMClassifier': y_pred_LGBMClassifier,
    'CatBoostClassifier': y_pred_CatBoostClassifier,
}
predictions = list(predictions_by_model.values())
model_names = list(predictions_by_model.keys())

# Evaluation metric: macro-averaged F1 on the hold-out split, one row per model
metrics = pd.DataFrame({
    'Model Name': model_names,
    'F1 Score': [f1_score(y_test, model_prediction, average='macro') for model_prediction in predictions],
})
metrics

## Treat Testing Dataset

In [None]:
# Load the test feature table and preview its first rows.
df_test = pd.read_csv('test_features.csv')
df_test.head()

In [None]:
# Dtypes and non-null counts of the test features.
df_test.info()

In [None]:
# Number of missing values in each test column.
df_test.isna().sum()

In [None]:
# Drop the 'tanggal_menjadi_anggota' column from the test frame, then re-check
# the missing counts.
df_test = df_test.drop(columns=['tanggal_menjadi_anggota'])
df_test.isna().sum()

In [None]:
# Print how often each value occurs in every object-dtype column of the test
# frame. The frame is named df_test; `df_test_test` is not defined anywhere.
cat_cols = df_test.select_dtypes(include=['object']).columns
for i in cat_cols:
    print(f'{df_test[i].value_counts()}')
    print('\n')

In [None]:
# Treat the literal string '5' in the two categorical test columns as missing.
for column_name in ('pendidikan', 'status_pernikahan'):
    df_test[column_name] = df_test[column_name].replace('5', np.nan)

# Print the category frequencies again after the replacement.
for i in cat_cols:
    category_counts = df_test[i].value_counts()
    print(f'{category_counts}')
    print('\n')

In [None]:
# Summarise which test columns contain missing values and display the table.
test_null_data = null_statistics(df_test)
test_null_data

In [None]:
# Separate copy of the test frame to run the imputation on.
df_test_null_handled = df_test.copy()

# KNN-impute the numeric test columns (the imputer is fitted on the test frame itself).
num_test_imputed_byKNN_df = impute_with_knn(df_test_null_handled)

# Count any nulls left in the imputed result.
num_test_imputed_byKNN_df.isna().sum()

In [None]:
# Write the imputed categorical columns back using the returned column labels
# as the key. A bare `'pendidikan', 'status_pernikahan'` subscript is a tuple
# and would address a single tuple-named column instead of the two columns.
cat_test_imputed = impute_cat_with_mode(df_test_null_handled)
df_test_null_handled[cat_test_imputed.columns] = cat_test_imputed

# df_test_mk = test frame with nulls handled by mode (categorical) and KNN (numeric)
num_columns = df_test_null_handled.select_dtypes(['int64', 'float64']).columns

# Use the imputation computed from the TEST frame. num_imputed_byKNN_df holds
# the training rows and must not be written into the test data.
df_test_null_handled[num_columns] = num_test_imputed_byKNN_df
df_test_mk = df_test_null_handled.copy()

# Expected to show no remaining missing values — verify in the output.
df_test_mk.isna().sum()

In [None]:
# Columns whose values are rounded to whole numbers
possible_wrong_input_cols = ['jumlah_anak_balita', 'jumlah_anak_remaja', 'pembelian_diskon', 'pembelian_web', 'pembelian_toko', 'keluhan', 'terakhir_belanja']

# Apply the rounding helper element-wise to each listed test column
for col in possible_wrong_input_cols:
    df_test_mk[col] = df_test_mk[col].apply(round_value)

show_countplot(df_test_mk)

In [None]:
# Replace the text categories of the test frame with their ordinal codes, then
# print the frequencies of the encoded values.
df_test_mk[cat_cols] = apply_custom_ordinal_encoding_mappings(df_test_mk[cat_cols])
for i in cat_cols:
    print(f'{df_test_mk[i].value_counts()}')
    print('\n')

In [None]:
# Dtypes and non-null counts of the test frame after encoding.
df_test_mk.info()

In [None]:
# Same feature columns as used for training
feature_selected = ['tahun_kelahiran', 'pendidikan', 'status_pernikahan', 'pendapatan', 'jumlah_anak_balita', 'jumlah_anak_remaja', 'terakhir_belanja', 'belanja_buah', 'belanja_daging', 'belanja_ikan','pembelian_diskon', 'belanja_kue', 'pembelian_web', 'pembelian_toko']
feature_df_test = df_test_mk[feature_selected].copy()

# Transform the test features with the scaler that was fitted on the training
# features. Fitting a second scaler on the test data would put the test
# features on a different scale from the one the models were trained on.
scaler_test = scaler  # name kept for any code that still refers to scaler_test
scaled_features_test = scaler_test.transform(feature_df_test)

# Convert scaled features array to DataFrame, keeping the test row index
scaled_feature_df_test = pd.DataFrame(scaled_features_test, columns=feature_selected, index=feature_df_test.index)

In [None]:
# Re-import the two estimator classes locally: an earlier modelling cell may
# have rebound these names to fitted instances, which cannot be called to
# create new estimators.
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier

# Train on the full training set and predict the real test set
X_train = scaled_feature_df.copy()
y_train = target_df.copy()
# .copy() must be called; without the parentheses X_test would be the bound
# method object rather than a DataFrame.
X_test = scaled_feature_df_test.copy()

# Fitted estimators get their own names so the class names stay usable
final_random_forest = RandomForestClassifier().fit(X_train, y_train)
final_extra_trees = ExtraTreesClassifier().fit(X_train, y_train)

y_pred_RandomForestClassifier = final_random_forest.predict(X_test)
y_pred_ExtraTreesClassifier = final_extra_trees.predict(X_test)


In [None]:
# Wrap each model's test-set predictions in a DataFrame.
rf_df = pd.DataFrame(y_pred_RandomForestClassifier)
etc_df = pd.DataFrame(y_pred_ExtraTreesClassifier)

# Write the predictions to CSV files in the working directory.
# NOTE(review): to_csv is called with its default options — confirm the
# resulting layout matches what the consumer of these files expects.
rf_df.to_csv('rf.csv')
etc_df.to_csv('etc.csv')