<a href="https://colab.research.google.com/github/nicolascavalli/Forma-o-em-Dados/blob/main/Case_Churn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'telco-customer-churn:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F13996%2F18858%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240325%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240325T220131Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D224c941e0d74f25f8af11c8e3c4a5d7bdecf459ac78da289c000606034d0f0d87cf4e854638b875f5faca8f581ed462af278312a07450f8415062c26bcb1f9336590889f6537954de047c1495c7e6c657a1c6f400f304311cd462d7f3db2c660bd47258593ac4ade7c902481a18981c7f883bb83342d1e34472704f92c75c5abd7850b40394c12577fb1558184259e159d24e6110f2f5793f65b728257416d7442a91d61348311fc625d192aaa71e177d885d8e6bcc724a5cf2dfc87711bf0811ac76e3898d0f3c23bdaee30b6f52a5db9769ba436ffc104b0ca9c94892f32e3b681cbbf84f2706b57db6b2a20a8d54f218b8acddeb6fd773cee0d27c1c03682'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Expose the Kaggle directories one level up as ../input and ../working.
for link_target, link_name in ((KAGGLE_INPUT_PATH, 'input'), (KAGGLE_WORKING_PATH, 'working')):
  link_path = os.path.join("..", link_name)
  try:
    os.symlink(link_target, link_path, target_is_directory=True)
  except FileExistsError:
    # Something with that name already exists; leave it as it is.
    pass

# Download every "<directory>:<url-encoded URL>" entry of DATA_SOURCE_MAPPING into
# a temporary file and unpack it under KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so the encoded URL part is always kept whole.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # The Content-Length header can be missing; in that case only a byte
            # counter is shown instead of crashing on int(None).
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            size_label = total_length if total_length else 'unknown'
            print(f'Downloading {directory}, {size_label} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                if total_length:
                    # Clamp to the bar width in case more bytes arrive than announced.
                    done = min(50, int(50 * dl / total_length))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Make sure every buffered byte is in the file and rewind before the
            # archive readers start consuming it.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: reusing `tarfile` here would
                # overwrite the imported module and break every later tar source.
                with tarfile.open(fileobj=tfile) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Signed download URLs expire; report it and move on to the next source.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, scale, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV

In [None]:
#Reading the file and visualize the DataFrame
df = pd.read_csv('/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv')
df.head()

In [None]:
#Dataframe shape
df.shape

In [None]:
#Dataframe Columns Types
df.info()

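Notice that `df.info()` reports no missing (null) values. Note, however, that `TotalCharges` is stored as text and contains blank strings, which are handled in a later cell.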

In [None]:
#Dropping CustomerID Column
# Assign the result instead of mutating in place, and ignore a missing column,
# so that re-running this cell does not raise a KeyError.
df = df.drop(columns='customerID', errors='ignore')
df.head()

Why drop the `customerID` column?
> For our purposes, this column is not an attribute that impacts the target, so there is no need to keep it.

In [None]:
#Convert TotalCharges Column to float
# Blank strings (' ') are treated as a charge of 0. The converted column is
# assigned back explicitly: calling replace(..., inplace=True) on a selected
# column is chained assignment, which warns in pandas 2.x and does not update
# `df` once copy-on-write is active.
df['TotalCharges'] = df['TotalCharges'].replace(' ', '0').astype(float)
df.info()

In [None]:
#Verifying target's distribution
df['Churn'].value_counts(normalize=True)

In [None]:
#Verifying the target's distribution across attributes
sns.catplot(data=df, x='Churn', y='MonthlyCharges')

In [None]:
sns.catplot(data=df, x='Churn', y='TotalCharges')

In [None]:
#Verifying outliers for Charges Columns
sns.catplot(data=df, x='Contract', y='MonthlyCharges', hue='Churn', kind='box')

In [None]:
sns.catplot(data=df, x='Contract', y='TotalCharges', hue='Churn', kind='box')

In [None]:
sns.catplot(data=df, x='Contract', y='MonthlyCharges', hue='Churn', kind='violin')

In [None]:
sns.catplot(data=df, x='Contract', y='tenure', hue='Churn', kind='violin')

In [None]:
#Analysing tenure column distribution
sns.histplot(data=df, x='tenure', hue='Churn');

In [None]:
sns.catplot(data=df, x='Contract', y='tenure', hue='Churn', kind='violin')

In [None]:
#Analysing PaymentMethod Column distribution
sns.catplot(data=df, x='PaymentMethod', y='MonthlyCharges', hue='Churn', kind='violin')

In [None]:
#Applying LabelEncoder in the features
# Columns that are already numeric are set aside; everything else will be encoded.
numeric_cols = ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']
le = LabelEncoder()
num_features = df[numeric_cols]
var_features = df.drop(columns=numeric_cols)

In [None]:
# Label-encode every column of `var_features`. `le.fit_transform` is called once
# per column, so the single encoder is re-fitted each time and afterwards only
# remembers the classes of the last column it saw.
# `var_features` was built by dropping only the four numeric columns, so the
# `Churn` target is still part of it and gets encoded here as well.
var_features = var_features.apply(le.fit_transform)
# Re-attach the untouched numeric columns (encoded columns come first).
df = pd.concat([var_features, num_features], axis=1)
df.head()

In [None]:
df.describe(percentiles=[0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99])

In [None]:
#Create a function to detect outliers
def is_outlier(array, extreme=False):
  """Flag the values of ``array`` that fall outside its IQR fences.

  The fences sit ``k * IQR`` below the first quartile and above the third
  quartile, with ``k = 1.5`` by default and ``k = 3`` when ``extreme`` is True.

  Returns a boolean mask of the same shape as ``array`` (True = outlier).
  """
  if extreme:
    whisker = 3.
  else:
    whisker = 1.5

  first_quartile, third_quartile = np.quantile(array, [.25, .75])
  spread = third_quartile - first_quartile

  too_low = array < first_quartile - whisker * spread
  too_high = array > third_quartile + whisker * spread
  return too_low | too_high

In [None]:
#Applying the function in the original numeric features
df[['tenure', 'MonthlyCharges', 'TotalCharges']].apply(is_outlier).sum()

In [None]:
#Splitting dataset
X = df.drop(columns='Churn')
# Keep the target as a 1-D Series: a one-column DataFrame makes every
# scikit-learn `fit` call emit a DataConversionWarning.
y = df['Churn']

In [None]:
#Normalizing X
std_scaler = StandardScaler()
X = pd.DataFrame(std_scaler.fit_transform(X), columns=X.columns)
X

In [None]:
#Splitting dataset in train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify=y, random_state=42)

### Modelling

Here, we are going to train and test several models to find out which one best accomplishes our goal (predicting churn in the dataset).

#### Logistic Regression

In [None]:
#Training Logistic Regression Model
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [None]:
#Predicting Results
lr_pred = lr.predict(X_test)

In [None]:
#Analysing the train datasets score
lr.score(X_train, y_train)

In [None]:
#Visualize some metrics
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, roc_auc_score
print(f'Accuracy: {accuracy_score(y_test, lr_pred):.2f}')
print(f'Balanced Accuracy: {balanced_accuracy_score(y_test, lr_pred):.2f}')
print(f'F1 Score: {f1_score(y_test, lr_pred):.2f}')
# ROC AUC must be computed from continuous scores (probability of the positive
# class), not from hard 0/1 predictions, which only yield a single ROC point.
print(f'ROC_AUC: {roc_auc_score(y_test, lr.predict_proba(X_test)[:, 1]):.2f}')

#### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Fixed seed (same value as the train/test split) so the tree, and every metric
# derived from it, is identical on each run.
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train, y_train)

In [None]:
dtc.score(X_train, y_train)

In [None]:
dtc_pred = dtc.predict(X_test)

In [None]:
print(f'Accuracy: {accuracy_score(y_test, dtc_pred):.2f}')
print(f'Balanced Accuracy: {balanced_accuracy_score(y_test, dtc_pred):.2f}')
print(f'F1 Score: {f1_score(y_test, dtc_pred):.2f}')
# ROC AUC needs scores for the positive class rather than hard predictions.
print(f'ROC_AUC: {roc_auc_score(y_test, dtc.predict_proba(X_test)[:, 1]):.2f}')

#### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fixed seed (same value as the train/test split) so the forest, and every
# metric derived from it, is identical on each run.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)

In [None]:
rfc.score(X_test, y_test)

In [None]:
rfc_pred = rfc.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, rfc_pred):.2f}')
print(f'Balanced Accuracy: {balanced_accuracy_score(y_test, rfc_pred):.2f}')
print(f'F1 Score: {f1_score(y_test, rfc_pred):.2f}')
# ROC AUC needs scores for the positive class rather than hard predictions.
print(f'ROC_AUC: {roc_auc_score(y_test, rfc.predict_proba(X_test)[:, 1]):.2f}')

#### SVM

In [None]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train, y_train)

In [None]:
svc.score(X_test, y_test)

In [None]:
svc_pred = svc.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, svc_pred):.2f}')
print(f'Balanced Accuracy: {balanced_accuracy_score(y_test, svc_pred):.2f}')
print(f'F1 Score: {f1_score(y_test, svc_pred):.2f}')
# SVC() is created without probability estimates, so the signed distance to the
# decision boundary is used as the continuous score that ROC AUC requires.
print(f'ROC_AUC: {roc_auc_score(y_test, svc.decision_function(X_test)):.2f}')

#### Neural Network

In [None]:
from sklearn.neural_network import MLPClassifier
# Fixed seed (same value as the train/test split) so weight initialisation and
# batch shuffling, and therefore the reported metrics, are reproducible.
mlp = MLPClassifier(random_state=42)
mlp.fit(X_train, y_train)


In [None]:
mlp.score(X_test, y_test)

In [None]:
mlp_pred = mlp.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, mlp_pred):.2f}')
print(f'Balanced Accuracy: {balanced_accuracy_score(y_test, mlp_pred):.2f}')
print(f'F1 Score: {f1_score(y_test, mlp_pred):.2f}')
# ROC AUC needs scores for the positive class rather than hard predictions.
print(f'ROC_AUC: {roc_auc_score(y_test, mlp.predict_proba(X_test)[:, 1]):.2f}')

In [None]:
#Tuning model
# The default 'lbfgs' solver does not support the 'l1' penalty, so half of the
# grid would fail to fit and be scored as NaN. 'liblinear' handles both 'l1'
# and 'l2'; it is part of the grid so that it ends up in `best_params_`.
params = {
    'C': np.logspace(-3, 3, 7),
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],
    'max_iter': [200, 500, 1000],
}
gscv = GridSearchCV(lr, param_grid=params, cv=5, n_jobs=-1)
gscv.fit(X_train, y_train)

In [None]:
results = pd.DataFrame(gscv.cv_results_)
results.sort_values(by='rank_test_score').head(10)

In [None]:
gscv.best_estimator_.get_params()

In [None]:
# Use the model selected by the grid search instead of a hand-copied parameter
# list: the copy could drift from the actual search result and passed the
# deprecated `multi_class` argument. With the default `refit=True`,
# `best_estimator_` has already been re-fitted on the whole training set.
lr_grid = gscv.best_estimator_
lr_grid

In [None]:
lr_grid_pred = lr_grid.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, lr_grid_pred):.2f}')
print(f'Balanced Accuracy: {balanced_accuracy_score(y_test, lr_grid_pred):.2f}')
print(f'F1 Score: {f1_score(y_test, lr_grid_pred):.2f}')
# ROC AUC needs scores for the positive class rather than hard predictions.
print(f'ROC_AUC: {roc_auc_score(y_test, lr_grid.predict_proba(X_test)[:, 1]):.2f}')