"""This code base is adopted from the below notebook

https://www.kaggle.com/code/edwingeevarughese/internet-service-churn-analysis
'''"""

Copyright (C) 2022 Intel Corporation
SPDX-License-Identifier: BSD-3-Clause

pylint: disable=C0209,C0301,W0108,C0103,E1101,E1137,E1136


Industry: Telecommunications

Aim: This task aims to predict customer behavior in the telecommunications industry.

Task: Predict if a customer will unsubscribe and create churn or not (classification and regression)

Dataset: Structured telecom internet subscriber data. The data set is made up of 72274 rows and 11 columns.

Type of Learning: Supervised learning
Models: Logistic regression, random forest classification
Output: Yes or No if customer churn is predicted

Use Case
Our use case for this project was the Customer Churn Dataset sourced from kaggle
(https://www.kaggle.com/datasets/mehmetsabrikunt/internet-service-churn) directory. 

Objectives
Our objective is to determine whether a customer will leave the service provider and which factors could be driving the customer's behavior.

We will fit 2 classification models and find the best model to describe our data. The models are:
        Random Forest
        Logistic Regression
        
Data Cleaning



***** DATA DESCRIPTION *****

Our data set has 72274 rows and 11 columns. Each entry contains the following information;

       1. Id:This is customers unique identification number.

       2. is_tv_subscriber: Customers that subscribe to TV packages; 1 means subscriber, 0 means non-subscriber
        
       3. is_movie_package_subscriber: Customers that subscribe to movie packages; 1 means subscriber, 0 means non-subscriber

       4. subscription_age: Age of the subscription (how long the customer has been subscribed).
        
       5. bill_avg: customers billing average.
        
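       6. reamining_contract: Remaining contract term. A missing (null) value is treated as "no contract" in the data-cleaning step of this notebook.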
        
       7. service_failure_count: Number of times the customer called the call center about a service failure in the last 3 months

       8. download_avg

       9. upload_avg
        
      10. download_over_limit
        
      11. churn: Describes the loss of customers who do not re-sign (renew) their contract at the time of its renewal.

In [None]:
#importing the necessary libraries

import sys
import time
import logging
import warnings
#import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')

In [None]:
import pandas as pd

In [None]:
DATASET_FILE = 'data/internet_service_churn.csv'

# The dataset is loaded from the `data` folder where it is saved,
# so the path is made up of the folder name followed by the name of the data file.

In [None]:
# Configure logging at DEBUG level and create a module-level logger; the
# data-preparation code uses it to report elapsed time and sample counts.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

Defining the helper functions for loading and saving models

In [None]:
# Helper functions that wrap model load/save failures in an IOError

In [None]:
def loadmodel(modelfile):
    """Read a joblib-serialised model back from ``modelfile``.

    Any exception raised while loading is re-raised as ``IOError`` with the
    original exception attached as its cause.

    NOTE: joblib persistence is pickle-based, so only load model files that
    come from a trusted source.
    """
    try:
        return joblib.load(modelfile)
    except Exception as load_err:
        message = "Error loading model data from disk: {}".format(str(load_err))
        raise IOError(message) from load_err



In [None]:
# Store the trained model on disk so it can be reused for future predictions.
def savemodel(save_model, modelfile):
    """Serialise ``save_model`` to ``modelfile`` with joblib.

    Any exception raised while writing is re-raised as ``IOError`` with the
    original exception attached as its cause.
    """
    try:
        joblib.dump(save_model, modelfile)
    except Exception as dump_err:
        message = "Error saving model data to disk: {}".format(str(dump_err))
        raise IOError(message) from dump_err


##  preprocessing the data

In [None]:

def data_preparation(data_max_size, test_split):
    """Load the churn dataset and return scaled train/test splits.

    Previously this function only read the CSV and returned ``None``; the
    remaining steps and the ``return`` statement were stranded in separate
    notebook cells. They are reassembled here so the function is usable on
    its own.

    Steps:
        1. Read ``DATASET_FILE``.
        2. Add ``is_contract`` (0 where ``reamining_contract`` is null, else 1)
           and impute the null ``reamining_contract`` values with 0.
        3. Drop rows whose ``download_avg`` or ``upload_avg`` is missing.
        4. Keep at most the first ``data_max_size`` rows.
        5. Split into train/test sets stratified on ``churn`` and standardise
           the numeric columns.

    Args:
        data_max_size: Number of rows to keep from the top of the cleaned frame.
        test_split: Forwarded to ``train_test_split`` as ``test_size``.

    Returns:
        Tuple ``(x_train, x_test, y_train, y_test)``.

    Raises:
        SystemExit: If the dataset file cannot be read.
    """
    start_time = time.time()
    try:
        df = pd.read_csv(DATASET_FILE)  # reading the data set file
    except IOError:
        sys.exit('Dataset file not found')

    # A missing remaining-contract value is treated as "no contract".
    df['is_contract'] = df['reamining_contract'].notna().astype(int)
    # Assign back instead of using inplace=True on a selected column, which
    # is chained assignment and does not update df under Copy-on-Write.
    df['reamining_contract'] = df['reamining_contract'].fillna(0)

    # Rearranging columns so is_contract sits next to reamining_contract
    column_names = ['id', 'is_tv_subscriber', 'is_movie_package_subscriber', 'subscription_age', 'bill_avg',
                    'reamining_contract',
                    'is_contract', 'service_failure_count', 'download_avg', 'upload_avg', 'download_over_limit',
                    'churn']
    df = df.reindex(columns=column_names)

    # Empty strings count as missing; rows lacking either average are dropped.
    df['download_avg'] = df['download_avg'].replace('', np.nan)
    df['upload_avg'] = df['upload_avg'].replace('', np.nan)
    df = df.dropna(subset=['download_avg', 'upload_avg'])

    # Splitting up training and Label features for training
    x = df.drop(columns=['churn'])
    y = df['churn'].values

    if data_max_size != x.shape[0]:
        x = x.head(data_max_size)
        y = y[:data_max_size]
    logger.info('[Data] DataPreparation Time Taken in seconds --> %f secs', time.time() - start_time)
    logger.info('[Data] Total Data samples ---> %s', x.shape[0])

    # Preparing dataset for Training
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_split, random_state=40, stratify=y)
    # Work on explicit copies so the scaled values are written to independent frames.
    x_train = x_train.copy()
    x_test = x_test.copy()

    # Since the numerical features are distributed over different value ranges,
    # StandardScaler is used to scale them down to the same range.
    num_cols = ['subscription_age', 'reamining_contract', 'download_avg', 'upload_avg',
                'download_over_limit', 'bill_avg', 'service_failure_count']

    # Fit on the training split only, then apply the same transform to the test split.
    scaler = StandardScaler()
    x_train[num_cols] = scaler.fit_transform(x_train[num_cols])
    x_test[num_cols] = scaler.transform(x_test[num_cols])
    return x_train, x_test, y_train, y_test


In [None]:
# Load the raw dataset at notebook scope so the following cells can work on it step by step.
df= pd.read_csv(DATASET_FILE)
# Display the (rows, columns) of the raw frame.
df.shape

In [None]:
# Creating is_contract column: a missing `reamining_contract` value is treated
# as "no contract" (0), any other value as 1.
df['is_contract'] = df['reamining_contract'].notna().astype(int)
# Imputing null values with 0. The result is assigned back instead of calling
# replace(..., inplace=True) on the selected column: that form is chained
# assignment and does not update df when pandas Copy-on-Write is active.
df['reamining_contract'] = df['reamining_contract'].fillna(0)
# Rearranging columns so is_contract sits next to reamining_contract
column_names = ['id', 'is_tv_subscriber', 'is_movie_package_subscriber', 'subscription_age', 'bill_avg',
                'reamining_contract',
                'is_contract', 'service_failure_count', 'download_avg', 'upload_avg', 'download_over_limit',
                'churn']

df = df.reindex(columns=column_names)

# Empty strings count as missing; rows lacking either average are dropped.
df['download_avg'] = df['download_avg'].replace('', np.nan)
df['upload_avg'] = df['upload_avg'].replace('', np.nan)
df = df.dropna(subset=['download_avg', 'upload_avg'])




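Observation
A new column called is_contract was created (0 where reamining_contract is null, 1 otherwise).
Null values in reamining_contract were then replaced with 0, and rows with a missing download_avg or upload_avg were dropped.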


In [None]:
# The current number of rows and columns after adding is_contract and
# dropping the rows with a missing download_avg / upload_avg
df.shape

In [None]:
# Correlation of every column with the churn target,
# sorted in descending order (strongest positive correlation first)
df.corr()['churn'].sort_values(ascending=False)

In [None]:
# Preview the first rows of the cleaned frame
df.head()

In [None]:
# Splitting up training and Label features for training
# x keeps every column except the target; y is the churn column as an array.
x = df.drop(columns=['churn'])
y = df['churn'].values

# NOTE(review): in the visible code `data_max_size` and `start_time` are only
# defined inside data_preparation() (parameter / local variable). They must be
# defined at notebook scope before this cell can run -- TODO confirm.
# When data_max_size differs from the row count, keep only the first data_max_size rows.
if data_max_size != x.shape[0]:
    x = x.head(data_max_size)
    y = y[:data_max_size]
logger.info('[Data] DataPreparation Time Taken in seconds --> %f secs', time.time() - start_time)
logger.info('[Data] Total Data samples ---> %s', x.shape[0])


In [None]:
    # Preparing dataset for Training
    # NOTE(review): this cell is indented, reads `test_split` and ends with
    # `return`; it looks like the tail of data_preparation() and is presumably
    # not runnable as a standalone cell -- TODO confirm intended placement.
    # The split is stratified on y and uses a fixed random_state.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_split, random_state=40, stratify=y)
    # Since the numerical features are distributed over different value ranges,
    # StandardScaler is used to scale them down to the same range.
    num_cols = ['subscription_age', 'reamining_contract', 'download_avg', 'upload_avg',
                'download_over_limit', 'bill_avg', 'service_failure_count']

    scaler = StandardScaler()

    # The scaler is fitted on the training split only; the same fitted
    # transform is then applied to the test split.
    x_train[num_cols] = scaler.fit_transform(x_train[num_cols])
    x_test[num_cols] = scaler.transform(x_test[num_cols])
    return x_train, x_test, y_train, y_test
