# **MODULE AND PACKAGE**

---



In [None]:
# Data manipulation
import pandas as pd
import numpy as np

# Data visualization
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.express as px

# Modeling
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from imblearn.over_sampling import SMOTE,SMOTENC
from sklearn.metrics import f1_score,recall_score,precision_score,confusion_matrix,roc_curve,roc_auc_score,classification_report,accuracy_score, auc # performance metrics
from scipy import stats
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

# Algorithms for supervised learning methods
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Filtering future warnings
import warnings
warnings.filterwarnings('ignore')

# **DATA UNDERSTANDING**

---

In [None]:
test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Data Test.csv')
train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Data Train.csv')

In [None]:
# Work on an independent copy: later cells drop rows from `data` in place,
# which would otherwise silently modify `train` as well.
data = train.copy()

In [None]:
data.head()

In [None]:
# Report the dimensions of the dataframe (rows first, then columns)
n_rows, n_cols = data.shape
print(f"The number of rows: {n_rows}")
print(f"The number of columns:{n_cols}")

In [None]:
data.describe()

In [None]:
data.info()

In [None]:
# Numerical Columns
print(f"Numerical Columns: {data.select_dtypes(include='number').columns}\n")
# Categorical Columns
print(f"Categorical Columns: {data.select_dtypes(include='object').columns}")

In [None]:
data.isnull().sum()

# **DATA PREPARATION**

---

## **DATA CLEANING**

In [None]:
def check_missing_values(df):
    """
    Summarise the missing values of a DataFrame.

    Returns the string "No missing values found." when no cell is null.
    Otherwise returns a DataFrame indexed by column name (sorted by
    descending null count) with the number of nulls per column and that
    number as a percentage of the row count, rounded to two decimals.
    """
    null_counts = df.isnull().sum().sort_values(ascending=False)

    # Guard clause: nothing is missing anywhere
    if null_counts.sum() == 0:
        return "No missing values found."

    null_share = round(null_counts / len(df) * 100, 2)
    summary = pd.concat(
        [null_counts, null_share],
        axis=1,
        keys=['Number of Missing Values', 'Percentage of Missing Values'],
    )
    return summary

def check_duplicates(df):
    """
    Report whether a DataFrame contains duplicated rows.

    Prints the number of duplicated rows (every repeat after the first
    occurrence, as flagged by ``DataFrame.duplicated``), or "No duplicates"
    when there are none. Returns None.
    """
    duplicates = df.duplicated()
    if duplicates.any():
        # Format the count into the message. Passing the template and the
        # Series as separate print arguments printed a literal "{}" followed
        # by the whole boolean Series.
        print("Duplicates found: {}".format(int(duplicates.sum())))
    else:
        print("No duplicates")

In [None]:
check_missing_values(data)

In [None]:
check_duplicates(data)

## **EXPLORATORY DATA ANALYSIS**

---

**CHURN FEATURE**

In [None]:
def figs(ds, df=None):
    """
    Plot the value-count distribution of one column as a Plotly bar chart.

    Parameters
    ----------
    ds : str
        Name of the column to plot; also used (title-cased, underscores
        replaced by spaces) to build the chart title.
    df : pandas.DataFrame, optional
        Frame that holds the column. Defaults to the notebook-level ``data``
        so existing ``figs('column')`` calls keep working.
    """
    frame = data if df is None else df
    class_counts = frame[ds].value_counts()

    # Bar chart of the value counts
    fig = go.Figure(
        data=[go.Bar(x=class_counts.index, y=class_counts.values)],
        layout=go.Layout(
            title=ds.title().replace('_', ' ') + ' Distribution',
            hovermode='closest',
            width=600,
        ),
    )

    fig.show()

In [None]:
figs('churn')

**AREA CODE FEATURE**

In [None]:
# Donut chart showing how the rows are spread across area codes
area = data['area_code'].value_counts()
transanction, quantity = area.index, area.values

# hole=.5 cuts out the centre, turning the pie into a donut
figure = px.pie(
    data,
    names=transanction,
    values=quantity,
    title='Distribution of Area Code Feature',
    hole=.5,
)
figure.show()

**NUMERICAL FEATURE**

In [None]:
# Distribution of every numeric feature, drawn as a grid of histograms
numeric_features = ['account_length', 'number_vmail_messages', 'total_day_minutes',
       'total_day_calls', 'total_day_charge', 'total_eve_minutes',
       'total_eve_calls', 'total_eve_charge', 'total_night_minutes',
       'total_night_calls', 'total_night_charge', 'total_intl_minutes',
       'total_intl_calls', 'total_intl_charge',
       'number_customer_service_calls']

# At most three plots per row; the row count is a ceiling division by 3
nrows = (len(numeric_features) - 1) // 3 + 1
ncols = min(3, len(numeric_features))

# squeeze=False always yields a 2-D array of axes, so flattening is valid for
# every grid shape. The former `[axes]` fallback wrapped a whole 1-D array
# when a single row held more than one plot.
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(12, 10), squeeze=False)
axes = axes.flatten()

# One histogram (with KDE overlay) per feature
for ax, feature in zip(axes, numeric_features):
    sns.histplot(data[feature], kde=True, ax=ax)
    ax.set_xlabel(feature)
    ax.set_ylabel("Count")

# Remove the axes left unused in the last row
for ax in axes[len(numeric_features):]:
    fig.delaxes(ax)

# Adjust subplot spacing
fig.tight_layout()

# Display the plot
plt.show()

In [None]:
figs('state')

**CATEGORICAL FEATURE**

In [None]:
figs('international_plan')

In [None]:
figs('voice_mail_plan')

**BOX PLOT**

In [None]:
# Boxplot of customer-service calls per churn class, split by area code
fig, ax = plt.subplots(figsize=(14, 5))
sns.boxplot(data=data, x='churn', y='number_customer_service_calls', hue='area_code', ax=ax);
ax.legend(loc='upper right');

In [None]:
#Checking the distribution of categorical features based on churn rate
def plot_categorical_distribution(data, feature):
    """
    Plot churn / non-churn counts for the ten categories of `feature` that
    contain the most churned customers.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing `feature` and a ``churn`` column. ``churn`` may hold
        yes/no style strings, booleans or 0/1 values.
    feature : str
        Name of the categorical column to plot.
    """
    churn = data["churn"]
    if pd.api.types.is_bool_dtype(churn) or pd.api.types.is_numeric_dtype(churn):
        churned = churn.astype(bool)
    else:
        # Summing raw 'yes'/'no' strings does not count churners, so the
        # ranking built from it was not a churn ranking. Convert to a flag.
        churned = churn.astype(str).str.strip().str.lower().isin(["yes", "true", "1"])

    # Number of churned customers per category, largest first
    churn_counts = churned.groupby(data[feature]).sum().sort_values(ascending=False)
    top_10_categories = churn_counts.head(10).index.tolist()

    plt.figure(figsize=(10, 4))
    sns.countplot(x=feature, hue="churn", data=data, order=top_10_categories)
    plt.xticks(rotation=90)
    plt.legend(loc="upper right")
    plt.show()

In [None]:
plot_categorical_distribution(data, 'state')

In [None]:
plot_categorical_distribution(data, 'international_plan')

In [None]:
plot_categorical_distribution(data, 'voice_mail_plan')

In [None]:
def plot_churn_kde(data, x_column, charge_type):
    """
    Draw filled KDE curves of `x_column`, one curve per churn class.

    `charge_type` is used for labelling only: it is inserted into the x-axis
    label and into the title.
    """
    _, ax = plt.subplots(figsize=(8, 4))
    sns.kdeplot(data=data, x=x_column, hue='churn', fill=True, ax=ax)
    ax.set_xlabel(f'Total {charge_type} Charge')
    ax.set_ylabel('Density')
    ax.set_title(f'Churn Distribution by {charge_type} Charges')
    plt.show()

In [None]:
# Churn by day charges
plot_churn_kde(data, 'total_day_charge', 'Day')

In [None]:
# Churn by evening charges
plot_churn_kde(data, 'total_eve_charge', 'Evening')

In [None]:
# Churn by night charges (previously plotted the evening column under the "Night" label)
plot_churn_kde(data, 'total_night_charge', 'Night')

In [None]:
plot_churn_kde(data, 'total_intl_charge', 'International')

### **OUTLIER**

In [None]:
def drop_numerical_outliers(df, z_thresh=3):
    """
    Drop, in place, every row that holds an outlier in any numeric column.

    A value is an outlier when its absolute z-score (computed per column with
    the population standard deviation, ddof=0) is at least `z_thresh`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is modified in place.
    z_thresh : float, default 3
        Absolute z-score at or above which a value counts as an outlier.

    Returns
    -------
    None
    """
    numeric = df.select_dtypes(include=[np.number])

    # A zero spread would divide by zero; turn it into NaN so a constant
    # column simply never flags anything.
    spread = numeric.std(ddof=0).replace(0, np.nan)
    z_scores = (numeric - numeric.mean()) / spread

    # Flag only demonstrable outliers. NaN z-scores (missing values, constant
    # columns) compare False here, whereas the former `abs(z) < z_thresh` test
    # treated them as violations and removed every row.
    is_outlier = (z_scores.abs() >= z_thresh).any(axis=1)
    df.drop(df.index[is_outlier.to_numpy()], inplace=True)

drop_numerical_outliers(data)
print(data.shape)

### **FEATURE CORRELATION**

In [None]:
# Finding correlation between features using a heatmap
def corrmatrix(df):
    ''' Plot the lower-triangle correlation heatmap of a dataframe's numeric columns '''
    plt.figure(figsize=(14,14))

    # Correlate numeric/boolean columns only: pandas >= 2.0 raises on string
    # columns instead of silently skipping them.
    corr = df.select_dtypes(include=[np.number, 'bool']).corr()

    # Keep the diagonal and everything below it; the rest becomes NaN and is
    # left blank by the heatmap
    corr_tri = corr.where(np.tril(np.ones(corr.shape)).astype(np.bool_))

    sns.heatmap(data = corr_tri, center = 0, cmap = "YlGnBu", annot = True, fmt='.1g',vmin=-1);
    plt.title('Correlation of Features')
    plt.show()

In [None]:
corrmatrix(data)

### MULTICOLLINEARITY CHECK

In [None]:
# Absolute pairwise correlations of the numeric/boolean columns. String
# columns are excluded explicitly because pandas >= 2.0 raises on them.
corr_matrix = data.select_dtypes(include=[np.number, 'bool']).corr().abs()

# Hide the diagonal and the upper triangle so each pair is inspected once
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
tri_df = corr_matrix.mask(mask)

# Columns that correlate above 0.90 with at least one other column
to_drop = [c for c in tri_df.columns if (tri_df[c] > 0.90).any()]

data = data.drop(to_drop, axis=1) # Drop the features
data

## FEATURE ENGINEERING



> LABEL ENCODING



In [None]:
# Convert columns with 'yes' or 'no' to binary using LabelEncoder
label_encoder = LabelEncoder()
data['churn'] = label_encoder.fit_transform(data['churn'])



> ONE HOT ENCODING



In [None]:
data = pd.get_dummies(data,columns = ['state', 'area_code','international_plan','voice_mail_plan'])
data.head()

> SCALING DATA



In [None]:
scaler = MinMaxScaler()

def scaling(columns):
    """
    Return the min-max scaled values of ``data[columns]`` as an (n, 1) array.

    The shared ``scaler`` is re-fitted on every call, so afterwards it only
    remembers the last column it was given.
    """
    return scaler.fit_transform(data[columns].values.reshape(-1,1))

# Scale the numeric features only: 'churn' is the class label and must stay an
# integer code instead of being turned into a float.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split, so test rows influence the scaling.
feature_columns = data.select_dtypes(include=[np.number]).columns.drop('churn', errors='ignore')
for i in feature_columns:
    data[i] = scaling(i)
data.head()

# **MODELING**

In [None]:
#Defining X and y
X = data.drop("churn", axis=1)
y = data["churn"]

In [None]:
#splitting the data in to train and test sets
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.25, random_state=123)

In [None]:
# Oversample the minority class of the training split with SMOTENC
from imblearn.over_sampling import SMOTE, SMOTENC

# Positions of the one-hot encoded (categorical) columns, located by the
# prefixes pd.get_dummies gives them. The former hard-coded [1, 2] appears to
# point at continuous columns, since the dummy columns are appended after the
# numeric ones - TODO confirm against X_train.columns.
dummy_prefixes = ('state_', 'area_code_', 'international_plan_', 'voice_mail_plan_')
categorical_idx = [
    position
    for position, column in enumerate(X_train.columns)
    if str(column).startswith(dummy_prefixes)
]

smote = SMOTENC(categorical_features = categorical_idx,random_state = 123)
resampled_X_train, resampled_y_train = smote.fit_resample(X_train,y_train)

In [None]:
def plot_confusion_matrix(y_true, y_pred, classes):
    """
    Draw the confusion matrix of `y_pred` against `y_true` as an annotated
    heatmap; `classes` supplies the tick labels of both axes.
    """
    matrix = confusion_matrix(y_true, y_pred)
    heatmap_options = dict(
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=classes,
        yticklabels=classes,
    )

    plt.figure()
    sns.heatmap(matrix, **heatmap_options)
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.show()

## **DECISION TREE CLASSIFIER**

In [None]:
#Instantiate DecisionTreeClassifier
dt_clf = DecisionTreeClassifier(random_state=123)

#Fit on the training data
dt_clf.fit(resampled_X_train,resampled_y_train)

#predict on the test set
y_pred_dt = dt_clf.predict(X_test)

In [None]:
plot_confusion_matrix(y_test, y_pred_dt, [0,1])

In [None]:
print(classification_report(y_test,y_pred_dt))

In [None]:
# Feature importances of the baseline decision tree (ten most important)
feature_names = list(resampled_X_train.columns)
importances = dt_clf.feature_importances_
# Rank all features first, then keep the ten largest. Slicing [0:10] before
# sorting only showed the first ten columns, whatever their importance.
indices = np.argsort(importances)[-10:]

plt.figure(figsize=(8,6))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], align='center')
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()

## **RANDOM FOREST CLASSIFIER**

In [None]:
#Instantiate the classifier
rf_clf= RandomForestClassifier(random_state=123)

#Fit on the training data
rf_clf.fit(resampled_X_train,resampled_y_train)

In [None]:
#predict on the test data
y_pred_rf = rf_clf.predict(X_test)

In [None]:
plot_confusion_matrix(y_test, y_pred_rf, [0,1])

In [None]:
print(classification_report(y_test,y_pred_rf))

In [None]:
# Feature importances of the baseline random forest (ten most important)
feature_names = list(resampled_X_train.columns)
importances = rf_clf.feature_importances_
# Rank all features first, then keep the ten largest. Slicing [0:10] before
# sorting only showed the first ten columns, whatever their importance.
indices = np.argsort(importances)[-10:]

plt.figure(figsize=(8,6))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], align='center')
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()

# **MODEL EVALUATION**



> MODEL COMPARISON - RECALL SCORE



In [None]:
# Seed NumPy's global RNG, which the classifiers below fall back to because
# they are created without a random_state
np.random.seed(123)

classifiers = [RandomForestClassifier(),
               DecisionTreeClassifier()]

# Train the models and collect one result row per classifier
recall_rows = []
for cls in classifiers:
    model = cls.fit(resampled_X_train, resampled_y_train)
    y_pred = model.predict(X_test)

    recall = recall_score(y_test, y_pred)

    recall_rows.append({'classifiers': cls.__class__.__name__,
                        'recall': recall})

# Build the table in one step: DataFrame.append was removed in pandas 2.0
result_table = pd.DataFrame(recall_rows, columns=['classifiers', 'recall'])

# Set name of the classifiers as index labels
result_table.set_index('classifiers', inplace=True)

result_table



> MODEL COMPARISON - ROC CURVE



In [None]:
# Seed NumPy's global RNG, which the classifiers below fall back to because
# they are created without a random_state
np.random.seed(123)
classifiers = [RandomForestClassifier(),
               DecisionTreeClassifier()]


# Train the models and collect one ROC record per classifier
roc_rows = []
for cls in classifiers:
    model = cls.fit(resampled_X_train, resampled_y_train)
    yproba = model.predict_proba(X_test)[::,1]

    fpr, tpr, _ = roc_curve(y_test,  yproba)
    # Do not name this `auc`: that would rebind the `auc` function imported
    # from sklearn.metrics and break later `auc(fpr, tpr)` calls.
    auc_score = roc_auc_score(y_test, yproba)

    roc_rows.append({'classifiers':cls.__class__.__name__,
                     'fpr':fpr,
                     'tpr':tpr,
                     'auc':auc_score})

# Build the table in one step: DataFrame.append was removed in pandas 2.0
result_table = pd.DataFrame(roc_rows, columns=['classifiers', 'fpr','tpr','auc'])

# Set name of the classifiers as index labels
result_table.set_index('classifiers', inplace=True)

fig = plt.figure(figsize=(8,6))

# One ROC curve per classifier, labelled with its AUC
for i in result_table.index:
    plt.plot(result_table.loc[i]['fpr'],
             result_table.loc[i]['tpr'],
             label="{}, AUC={:.3f}".format(i, result_table.loc[i]['auc']))

# Diagonal reference line (random classifier)
plt.plot([0,1], [0,1], color='orange', linestyle='--')

plt.xticks(np.arange(0.0, 1.1, step=0.1))
plt.xlabel("False Positive Rate", fontsize=15)

plt.yticks(np.arange(0.0, 1.1, step=0.1))
plt.ylabel("True Positive Rate", fontsize=15)

plt.title('ROC Curve Analysis', fontsize=15)
plt.legend(prop={'size':13}, loc='lower right')

plt.show()

## MODEL TUNING



> TUNING RANDOM FOREST



In [None]:
# Hyper-parameter grid explored for the random forest
param_grid_rf = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive 5-fold cross-validated search on the non-resampled training split
rf = RandomForestClassifier(random_state=42)
grid_search_rf = GridSearchCV(rf, param_grid_rf, cv=5)
grid_search_rf.fit(X_train, y_train)

print("Best Parameters for Random Forest:", grid_search_rf.best_params_)

In [None]:
# Best random forest found by the grid search. GridSearchCV (refit=True by
# default) has already re-fitted it on the whole X_train / y_train, so the
# extra fit call was redundant training on the same data.
best_rf = grid_search_rf.best_estimator_
best_rf



> TUNING DECISIONTREE CLASSIFIER



In [None]:
# Hyper-parameter grid explored for the decision tree
param_grid_dt = dict(
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive 5-fold cross-validated search on the non-resampled training split
dt = DecisionTreeClassifier(random_state=42)
grid_search_dt = GridSearchCV(dt, param_grid_dt, cv=5)
grid_search_dt.fit(X_train, y_train)

print("Best Parameters for Decision Tree:", grid_search_dt.best_params_)

In [None]:
# Best decision tree found by the grid search. GridSearchCV (refit=True by
# default) has already re-fitted it on the whole X_train / y_train, so the
# extra fit call was redundant training on the same data.
best_dt = grid_search_dt.best_estimator_
best_dt

## MODEL ACCURACY AND ROC

In [None]:
# Test-set predictions of both tuned models
y_pred_rf, y_pred_dt = best_rf.predict(X_test), best_dt.predict(X_test)

# Share of correctly classified test samples per model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
accuracy_dt = accuracy_score(y_test, y_pred_dt)

for model_label, model_accuracy in (("Random Forest", accuracy_rf),
                                    ("Decision Tree", accuracy_dt)):
    print(f"Accuracy of {model_label}:", model_accuracy)


In [None]:
# Classification reports of both tuned models
for report_name, predictions in (("Random Forest", y_pred_rf),
                                 ("Decision Tree", y_pred_dt)):
    print(f"Classification Report for {report_name}:")
    # A blank line separates the first report from the second heading
    print(classification_report(y_test, predictions) + ("\n" if report_name == "Random Forest" else ""), end="\n" if report_name == "Decision Tree" else "")

In [None]:
# ROC curves of the tuned models on the test set.
# The areas are computed with roc_auc_score, which gives the same value as
# auc(fpr, tpr) but keeps working even if the name `auc` has been rebound to
# a number by another cell.

# Random Forest
y_prob_rf = best_rf.predict_proba(X_test)[:, 1]
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_prob_rf)
roc_auc_rf = roc_auc_score(y_test, y_prob_rf)

# Decision Tree
y_prob_dt = best_dt.predict_proba(X_test)[:, 1]
fpr_dt, tpr_dt, _ = roc_curve(y_test, y_prob_dt)
roc_auc_dt = roc_auc_score(y_test, y_prob_dt)

# Plot ROC Curve
plt.figure()
plt.plot(fpr_rf, tpr_rf, color='darkorange', lw=2, label='Random Forest (AUC = %0.2f)' % roc_auc_rf)
plt.plot(fpr_dt, tpr_dt, color='blue', lw=2, label='Decision Tree (AUC = %0.2f)' % roc_auc_dt)
# Diagonal reference line (random classifier)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Ten most important features of the tuned decision tree
importances_dt = best_dt.feature_importances_
feature_names_dt = list(X_train.columns)
# argsort is ascending, so the last ten positions are the ten largest values
indices_dt = np.argsort(importances_dt)[-10:]

bar_positions_dt = range(len(indices_dt))
bar_labels_dt = [feature_names_dt[i] for i in indices_dt]

plt.figure(figsize=(8, 6))
plt.barh(bar_positions_dt, importances_dt[indices_dt], align='center')
plt.yticks(bar_positions_dt, bar_labels_dt)
plt.title('Decision Tree Feature Importances')
plt.xlabel('Relative Importance')
plt.show()


In [None]:
# Ten most important features of the tuned random forest
importances_rf = best_rf.feature_importances_
feature_names_rf = list(X_train.columns)
# argsort is ascending, so the last ten positions are the ten largest values
indices_rf = np.argsort(importances_rf)[-10:]

bar_positions_rf = range(len(indices_rf))
bar_labels_rf = [feature_names_rf[i] for i in indices_rf]

plt.figure(figsize=(8, 6))
plt.barh(bar_positions_rf, importances_rf[indices_rf], align='center')
plt.yticks(bar_positions_rf, bar_labels_rf)
plt.title('Random Forest Feature Importances')
plt.xlabel('Relative Importance')
plt.show()


# CONCLUSION
Skor recall pengklasifikasi adalah 73%. Meskipun belum sempurna, model ini masih merupakan model prediktif yang baik.

## Recommendations

* Tawarkan diskon atau penawaran promosi kepada pelanggan di kode area 415 dan 510, karena area ini memiliki tingkat churn yang lebih tinggi. Hal ini dapat membantu memberi insentif kepada pelanggan untuk tetap bersama perusahaan.

* Meningkatkan kualitas layanan pelanggan dan mengurangi jumlah panggilan layanan pelanggan. Meningkatkan program pelatihan bagi perwakilan layanan pelanggan untuk memastikan penyelesaian masalah pelanggan dengan cepat dan efektif, sehingga menghasilkan kepuasan pelanggan yang lebih tinggi dan mengurangi churn.

* Evaluasi struktur harga untuk tarif siang, sore, malam, dan internasional. Pertimbangkan untuk menyesuaikan paket harga atau memperkenalkan paket diskon untuk mengatasi biaya lebih tinggi yang terkait dengan pelanggan yang melakukan churn.

* Fokus pada strategi retensi pelanggan di negara bagian dengan tingkat churn yang lebih tinggi, seperti Texas, New Jersey, Maryland, Miami, dan New York. Hal ini dapat melibatkan kampanye pemasaran yang ditargetkan, penawaran yang dipersonalisasi, atau peningkatan dukungan pelanggan yang disesuaikan dengan kebutuhan dan preferensi spesifik pelanggan di negara bagian tersebut.

* Meningkatkan proposisi nilai rencana pesan suara untuk meningkatkan adopsi di kalangan pelanggan. Soroti manfaat dan kenyamanan layanan pesan suara, dan pertimbangkan untuk menawarkan fitur atau diskon tambahan untuk mendorong pelanggan mendaftar.