# Probability. Naive Bayes

In [198]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Lift pandas' display limits so frames are rendered without truncating
# columns or rows
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Read the raw dataset from the relative path 'dataset.csv'
data = pd.read_csv(filepath_or_buffer='dataset.csv')

# Describing our data
We have data about the customers of a telecom company. It covers:
- The customer's account: customer ID (we will probably drop this column, as it is irrelevant), type of contract, payment method, how much they pay monthly, and how much they have paid overall.
- The customer's personal details, such as gender, whether they are a senior citizen, and whether they have dependents.
- Whether the customer uses specific telecom services such as Phone Service, Streaming TV, Streaming Movies and so on.
- Whether the customer stopped using our services within the last month (`Churn`).

In [199]:
# Print a summary of the frame: column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


As we can see, none of the columns contain null values.

In [200]:
# customerID only identifies the row, so it is removed before encoding
data = data.drop(columns=['customerID'])

# One-hot encode gender: one indicator column per gender category
one_hot_encoder = OneHotEncoder()
encoded = one_hot_encoder.fit_transform(data[['gender']])
gender_encoded = pd.DataFrame(
    encoded.toarray(),
    columns=one_hot_encoder.get_feature_names_out(['gender']),
)

# Every other categorical column is integer-coded with a LabelEncoder.
# The same encoder object is re-fitted for each column in turn, so after the
# loop it only remembers the classes of the last column it was fitted on.
label_encoder = LabelEncoder()

# Columns with several named categories
multi_category_columns = ['Contract', 'PaymentMethod', 'MultipleLines', 'InternetService']

# Yes/No style columns. Some of them carry a third value besides Yes/No
# (no service provided), so the result is an integer code that can reach 2
# rather than a strict 0/1 flag -- see the 2s in the head() output below.
yes_no_columns = [
    'Partner',
    'Dependents',
    'PhoneService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'PaperlessBilling',
    'Churn',
]

for column in multi_category_columns + yes_no_columns:
    data[column] = label_encoder.fit_transform(data[column])

# Swap the raw gender column for its one-hot indicator columns
data = pd.concat([data.drop(columns=['gender']), gender_encoded], axis=1)

data.head()

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male
0,0,1,0,1,0,1,0,0,2,0,0,0,0,0,1,2,29.85,29.85,0,1.0,0.0
1,0,0,0,34,1,0,0,2,0,2,0,0,0,1,0,3,56.95,1889.5,0,0.0,1.0
2,0,0,0,2,1,0,0,2,2,0,0,0,0,0,1,3,53.85,108.15,1,0.0,1.0
3,0,0,0,45,0,1,0,2,0,2,2,0,0,1,0,0,42.3,1840.75,0,0.0,1.0
4,0,0,0,2,1,0,1,0,0,0,0,0,0,0,1,2,70.7,151.65,1,1.0,0.0


In [201]:
# Summary statistics for every column; include='all' also reports
# unique/top/freq for non-numeric columns
data.describe(include='all')

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male
count,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0
unique,,,,,,,,,,,,,,,,,,6531.0,,,
top,,,,,,,,,,,,,,,,,,,,,
freq,,,,,,,,,,,,,,,,,,11.0,,,
mean,0.162147,0.483033,0.299588,32.371149,0.903166,0.940508,0.872923,0.790004,0.906432,0.904444,0.797104,0.985376,0.992475,0.690473,0.592219,1.574329,64.761692,,0.26537,0.495244,0.504756
std,0.368612,0.499748,0.45811,24.559481,0.295752,0.948554,0.737796,0.859848,0.880162,0.879949,0.861551,0.885002,0.885091,0.833755,0.491457,1.068104,30.090047,,0.441561,0.500013,0.500013
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.25,,0.0,0.0,0.0
25%,0.0,0.0,0.0,9.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,35.5,,0.0,0.0,0.0
50%,0.0,0.0,0.0,29.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,2.0,70.35,,0.0,0.0,1.0
75%,0.0,1.0,1.0,55.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0,89.85,,1.0,1.0,1.0


In [202]:
# The CSV has a few minor errors (empty strings where a number is expected).
# Coerce every column to numeric so unparseable values become NaN, then
# replace those NaN values with 0.
data = data.apply(pd.to_numeric, errors='coerce').fillna(0)

# Splitting the data

In [203]:
# Separate the predictor columns from the target column
features = data.drop(columns=['Churn'])
target = data['Churn']

# Hold out 40% of the rows for testing; the fixed random_state makes the
# split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.4,
    random_state=42,
)


In [204]:
# Show the first rows of the fully numeric frame
data.head() 

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male
0,0,1,0,1,0,1,0,0,2,0,0,0,0,0,1,2,29.85,29.85,0,1.0,0.0
1,0,0,0,34,1,0,0,2,0,2,0,0,0,1,0,3,56.95,1889.5,0,0.0,1.0
2,0,0,0,2,1,0,0,2,2,0,0,0,0,0,1,3,53.85,108.15,1,0.0,1.0
3,0,0,0,45,0,1,0,2,0,2,2,0,0,1,0,0,42.3,1840.75,0,0.0,1.0
4,0,0,0,2,1,0,1,0,0,0,0,0,0,0,1,2,70.7,151.65,1,1.0,0.0


# Training and predicting using Logistic regression

In [205]:
from sklearn.linear_model import LinearRegression, LogisticRegression

# Build the classifier with a raised iteration cap (max_iter=10000) and fit it
# on the training split; fit() returns the estimator itself
logistic_regression = LogisticRegression(max_iter=10000).fit(X_train, y_train)

# Predict the churn label for every row of the held-out test split
predicted_churn_logistic = logistic_regression.predict(X_test)

# Training and predicting using Naive Bayes models
We will use Naive Bayes models to predict whether a customer will leave.

In [206]:
# Create instances 
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
# Instantiate each Naive Bayes variant with default settings and fit it on
# the same training split; fit() returns the estimator itself
gaussian_bayes = GaussianNB().fit(X_train, y_train)
multinomial_bayes = MultinomialNB().fit(X_train, y_train)
bernoulli_bayes = BernoulliNB().fit(X_train, y_train)

# Predict the churn label of the held-out test rows with each fitted model
predicted_churn_gaussian_bayes = gaussian_bayes.predict(X_test)
predicted_churn_multinomial_bayes = multinomial_bayes.predict(X_test)
predicted_churn_bernoulli_bayes = bernoulli_bayes.predict(X_test)


## Getting the accuracy (F1 score) of our models

In [207]:
from sklearn.metrics import f1_score

# NOTE: despite the "accuracy" naming used below, the metric computed for
# every model is the F1 score of its test-set predictions.
gaussian_bayes_accuracy = f1_score(y_test, predicted_churn_gaussian_bayes)
multinomial_bayes_accuracy = f1_score(y_test, predicted_churn_multinomial_bayes)
bernoulli_bayes_accuracy = f1_score(y_test, predicted_churn_bernoulli_bayes)
logistic_regression_accuracy = f1_score(y_test, predicted_churn_logistic)

# One (model name, score) row per classifier
score_rows = [
    ('GaussianNB', gaussian_bayes_accuracy),
    ('MultinomialNB', multinomial_bayes_accuracy),
    ('BernoulliNB', bernoulli_bayes_accuracy),
    ('Logistic', logistic_regression_accuracy),
]
accuracy_table = pd.DataFrame(score_rows, columns=['Model', 'Accuracy'])

accuracy_table.head()

Unnamed: 0,Model,Accuracy
0,GaussianNB,0.62891
1,MultinomialNB,0.557564
2,BernoulliNB,0.606295
3,Logistic,0.627143


# Conclusion
As we can see, the models achieve similar scores (note that the "accuracy" reported above is the F1 score). The best-performing model was Gaussian Naive Bayes, while the worst-performing was Multinomial Naive Bayes. The comparison may be slightly unfair, because we trained every model on all columns instead of picking the columns that suit each algorithm, and each algorithm works best with a specific kind of data:
    
- **Bernoulli** - for binary data [columns that have "Yes"/"No" values fit best for this algorithm]
- **Multinomial** - for discrete count data (for example, word counts), which is likely why it performs the worst of all: our features are not counts
- **Gaussian** - for continuous numeric data [columns like Monthly Charges and Total Charges fit best for this algorithm]

We also see that the best-performing Naive Bayes model (Gaussian) achieves almost the same score as Logistic Regression.