In [1]:
#Importing needed libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [2]:
#Reading our csv files as a pandas dataframes
X_test = pd.read_csv('X_test.csv')
X_train = pd.read_csv('X_train.csv')

In [3]:
#Concatenating the x_test and x_train into a single dataframe
df = pd.concat([X_train, X_test])

In [4]:
df

Unnamed: 0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,15799217,Zetticci,791,Germany,Female,35,7,52436.20,1,1,0,161051.75,0
1,15748986,Bischof,705,Germany,Male,42,8,166685.92,2,1,1,55313.51,0
2,15722004,Hsiung,543,France,Female,31,4,138317.94,1,0,0,61843.73,0
3,15780966,Pritchard,709,France,Female,32,2,0.00,2,0,0,109681.29,0
4,15636731,Ts'ai,714,Germany,Female,36,1,101609.01,2,1,1,447.73,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3496,15733966,Johnstone,496,Germany,Female,55,4,125292.53,1,1,1,31532.96,1
3497,15669994,Greece,556,Germany,Female,31,1,128663.81,2,1,0,125083.29,0
3498,15712403,McMillan,589,France,Female,61,1,0.00,1,1,0,61108.56,1
3499,15643819,Dawson,714,France,Female,25,4,0.00,2,0,0,82500.84,0


In [5]:
#Visualising correlation with the Exited Column
df.corr()['Exited']

CustomerId        -0.006248
CreditScore       -0.027094
Age                0.285323
Tenure            -0.014001
Balance            0.118533
NumOfProducts     -0.047820
HasCrCard         -0.007138
IsActiveMember    -0.156128
EstimatedSalary    0.012097
Exited             1.000000
Name: Exited, dtype: float64

In [6]:
#Value_counts method on Geography column to examine how many unique values there are
df['Geography'].value_counts()

France     5014
Germany    2509
Spain      2477
Name: Geography, dtype: int64

In [7]:
#Value_counts method on Gender column to examine how many unique values there are
df['Gender'].value_counts()

Male      5324
Female    4423
 male      133
female     120
Name: Gender, dtype: int64

### Customer churn refers to when a customer (player, subscriber, user, etc.) ceases his or her relationship with a company.

### Age


##### Customer's Age has a major influence in their relationship with a company. If the company's target group are young people (16 - 35 y.o) (for example a gaming company ), there is a very big possibility that a 35 y.o man will cease his relationship with the company because he has other priorities like family etc. Also, teenagers are also prone to that because maybe they cannot afford paying for it

### Balance

##### Balance in customers account is also a factor, there are people with low income for example that cannot afford a long subscription and will look for more affordable options. On the other side, if the quality of services arent good enough, people with high income will also cease their subscribtion. If we would be talking of a gaming company,low balance would be the teenagers ( with exceptions) and people with very high balance would maybe consider they have other priorities ( not if making money on gaming)

### Gender

##### It is known that boys and girls usually have diffrent hobbies, interests and needs.If talking of a gaming company - the company would have normally more males customers , and probably females would be the ones with a highest rate of churn per number of customers.

### Active Member

##### The active member features would be one of the best methods to predict churn, its obviously that a customer active a long period and also with consistency, would have a lower chance of becoming a churn. While a short period being very active could determine the customer to go after a that period .

#### Geography

##### People from different countries can have diffrent oppinions and also needs, also it can influence where the company was founded and its history in that country. A company that was founded long ago in X country would have more customers from that country because of people trust.

## Data cleaning

In [8]:
#Getting dummies for fuel_type column
df = pd.get_dummies(df, columns=['Geography'], drop_first = False )

In [9]:
#Mapping through Gender column to clear our data
df['Gender'] = df['Gender'].map({'Male' : 0,'Female': 1, ' male': 0, 'female': 1})

In [10]:
#Getting rid of columns with less than 0.10 correlation
df.drop(['CustomerId','Surname', 'CreditScore', 'Tenure', 'NumOfProducts', 'HasCrCard', 'EstimatedSalary'
           ], axis=1, inplace=True)

In [11]:
#Visualising correlation with the Exited Column
df.corr()['Exited']

Gender               0.106512
Age                  0.285323
Balance              0.118533
IsActiveMember      -0.156128
Exited               1.000000
Geography_France    -0.104955
Geography_Germany    0.173488
Geography_Spain     -0.052667
Name: Exited, dtype: float64

## The Logistic Regression model

In [12]:
#Defining the features columns - X, and target values - y
X = df[['Gender', 'Age', 'Balance', 'IsActiveMember',
       'Geography_France', 'Geography_Germany']]

y = df['Exited']

In [13]:
#Splitting the data in train and test with train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [14]:
#Create an instance of the Logistic Regression class from sklearn.
model = LogisticRegression()

In [15]:
#Fitting the model
model.fit(X_train, y_train)

LogisticRegression()

In [16]:
#Making predictions
y_pred = model.predict(X_test)

In [17]:
#Visualising accuracy score
accuracy_score(y_test, y_pred)

0.7943333333333333

In [18]:
#Visualizing the confusion matrix
confusion_matrix(y_test, y_pred)

array([[2383,    0],
       [ 617,    0]], dtype=int64)

In [19]:
#How many unique values predicted 
np.unique(y_pred)

array([0], dtype=int64)

In [20]:
#How many unique values were actually in dataset - 0(False) and 1(True)
np.unique(y_test)

array([0, 1], dtype=int64)

#### The model predicted only 0s - didnot predict any people that exited

In [21]:
y_test.value_counts()

0    2383
1     617
Name: Exited, dtype: int64

## Implimentation of Logistic Regression model

In [22]:
class LogisticRegression2():
    #Define a Linear Regression class to store our relevant functions in
    def __init__(self, learning_rate: float = 0.05, max_iter: int = 100000) -> None:
        '''
        The constructor of the Logistic Regression model.
            :param learning_rate: float, default = 0.05
                The learning rate of the model.
            :param max_iter: int, default = 1000000
                The number of iteration to go through.
        '''
        #Setting up the hyperparameters.
        self.__learning_rate = learning_rate
        self.__max_iter = max_iter
    def sigmoid( self, y: 'np.array') -> 'np.array':
        '''
        The sigmoid function.
        :param y: numpy.ndarray
            The predictions of the linear function .
        '''
        return 1 / (1 + np.exp(-y))
    def fit( self, X: 'np.array', y: 'np.array') -> 'LogisticRegression2':
        '''
        The fit function of the model.
        : param X : 2-D numpy.ndarray
            The X matrix with the features.
        : param y : 1-D numpy.ndarray
            The target vector.
        '''
        #Creating the weights vector
        self.coef_ = np.zeros( len(X[0]) + 1)
        #Adding the intercept column.
        X = np.hstack((X, np.ones((len(X), 1))))
        #The weights updating process.
        for i in range(self.__max_iter):
            #Prediction.
            pred = self.sigmoid(np.dot(X, self.coef_))
            
            #Computing the gradient.
            gradient = np.dot(X.T, (pred - y)) / y.size
            
            #Updating the weights.
            self.coef_ -= gradient * self.__learning_rate
        return self
    def predict_proba( self, X: 'np.array') -> 'np.array':
        '''
        This function returns the class probabilities.
        : param X : 2-D numpy.ndarray
            The X matrix with the features.
        : return : 2-D numpy.ndarray
            The array with the probabilities for every class and sample.
        '''
        
        #Adding the intercept column.
        X = np.hstack((X, np.ones((len(X), 1))))
        
        #Computing the probabilities.
        prob = self.sigmoid(np.dot(X, self.coef_))
        
        #Returning the probabilities.
        return np.hstack(((1 - prob).reshape(-1, 1), prob.reshape(-1, 1)))

    def predict( self, X: 'np.array') -> 'np.array':
        '''
        This functions returns the predictions.
        : param X : 2-D numpy.ndarray
            The X matrix with the features.
        : return : 2-D numpy.ndarray
            The array with the predictions
        '''
        
        #Adding the intercept column.
        X = np.hstack((X, np.ones((len(X), 1))))
        
        return ( self.sigmoid(np.dot(X, self.coef_)) > 0.7 ) * 1





In [23]:
#Implimentation of our model 
gr = LogisticRegression2()

In [24]:
#The sigmoid function from our model
gr.sigmoid(y_train)

2570    0.500000
2603    0.500000
1239    0.500000
1579    0.500000
5058    0.500000
          ...   
5734    0.500000
5191    0.731059
5390    0.500000
860     0.500000
771     0.731059
Name: Exited, Length: 7000, dtype: float64

In [25]:
#Converting dataframe to numpy array
X_train = X_train.to_numpy()

In [26]:
#The fit function from our model
gr.fit(X_train, y_train)



<__main__.LogisticRegression2 at 0x1d3091f76c8>

In [27]:
#The predict_proba from our model, returning the probabilities
gr.predict_proba(X_test)



array([[1.00000000e+00, 0.00000000e+00],
       [1.00000000e+00, 0.00000000e+00],
       [1.00000000e+00, 0.00000000e+00],
       ...,
       [1.00000000e+00, 0.00000000e+00],
       [1.00000000e+00, 0.00000000e+00],
       [1.00000000e+00, 4.67920848e-61]])

In [28]:
#The predict function that returns the predictions
y_pred = gr.predict(X_test)



In [29]:
#Visualizing the confusion matrix
confusion_matrix(y_test, y_pred)

array([[2345,   38],
       [ 587,   30]], dtype=int64)

In [30]:
#Accuracy score of our model
accuracy_score(y_test, y_pred)

0.7916666666666666

##### The accuracy score obtained by our implimentation of Logistic Regression is better because we have churns predicted correct ( 30 ), while in Logistic Regression from sklearn there aren`t