In [1]:
# libraries 
import pandas as pd
import numpy as np

# encoding
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, FunctionTransformer
from sklearn.feature_extraction.text import CountVectorizer

# data splitting
from sklearn.model_selection import train_test_split

# model algorithms
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

# metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# pipeline
from sklearn.pipeline import Pipeline

# preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

# model storage
import pickle
import joblib

# visualizaiton
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# relative path and reading the data
datapath = "../../data/processed/cleaned_data.csv"
mobile_df = pd.read_csv(datapath)

In [3]:
mobile_df.head()

Unnamed: 0,Telephone_Number,Service_Provider
0,8059224942,GLO
1,8142358593,MTN
2,8142358593,MTN
3,8148252331,MTN
4,8142358593,MTN


In [4]:
mobile_df.columns

Index(['Telephone_Number', 'Service_Provider'], dtype='object')

In [5]:
mobile_df.shape

(23154, 2)

## preprocessing

#### y-variable

In [6]:
le = LabelEncoder()
y = le.fit_transform(mobile_df['Service_Provider'])

In [7]:
le.classes_

array(['9MOBILE', 'AIRTEL', 'GLO', 'MTN'], dtype=object)

#### X-variable

In [8]:
mobile_df['Phone_Prefixes'] = mobile_df['Telephone_Number'].str[:4]
X = mobile_df.drop('Service_Provider', axis=1)
# X = mobile_df

In [11]:
X

Unnamed: 0,Telephone_Number,Phone_Prefixes
0,08059224942,0805
1,08142358593,0814
2,08142358593,0814
3,08148252331,0814
4,08142358593,0814
...,...,...
23149,08034268185,0803
23150,08064995656,0806
23151,08058595159,0805
23152,07039069136,0703


In [12]:
# # custom transformer to extract prefixes from phone number
# # BaseEstimator, TransformerMixin
# class PrefixExtractor():
#     def fit(self, X, y=None):
#         return self

#     def transform(self, X):
#         X = pd.DataFrame(X)
#         X['Phone_Prefixes'] = X['Telephone_Number'].str[:4]
#         return X[['Phone_Prefixes']]

**`OR`**

In [13]:
# # using a functiontransfomer
# def extract_prefixes(X):
#     X = pd.DataFrame(X)
#     X['Phone_Prefixes'] = X['Telephone_Number'].str[:4]
#     return X[['Phone_Prefixes']]

# prefix_extractor = FunctionTransformer(extract_prefixes)

In [14]:
# preprocessor to extract prefixes and encode them

preprocessor = ColumnTransformer(
    transformers = [
        ('Phone_Prefixes', OneHotEncoder(), ['Phone_Prefixes'])
    ],
    remainder = 'drop'
)

#### splitting

In [15]:
# splitting data into train and test sets (80% to 20% split)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

## model pipeline

In [16]:
# Preprocessing and classificatiion Pipeline

pipe = Pipeline([
    # ('prefix_extractor', prefix_extractor),
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

# Train the mode
pipe.fit(X_train, y_train)

In [19]:
# model prediction and evaluation

y_pred = pipe.predict(X_test)
ac = accuracy_score(y_test, y_pred)
print("Accuracy is: ", ac)

Accuracy is:  1.0


## testing

In [20]:
# Testing prediction on phone numbers

number = pd.DataFrame({'Phone_Prefixes': ["08156441653"]})
y = pipe.predict(number)
le.classes_[y[0]]

ValueError: Found unknown categories ['08156441653'] in column 0 during transform

In [17]:
number = pd.DataFrame({'Telephone_Number': ["07012622068"]})
y = pipe.predict(number)
le.classes_[y[0]]

'AIRTEL'

In [18]:
number = pd.DataFrame({'Telephone_Number': ["08099556498"]})
y = pipe.predict(number)
le.classes_[y[0]]

'9MOBILE'

In [19]:
number = pd.DataFrame({'Telephone_Number': ["08109475645"]})
y = pipe.predict(number)
le.classes_[y[0]]

'MTN'

## model storage

In [20]:
# saving the entire pipeline
with open('../../model/trained_pipeline-0.1.0.pk1', 'wb') as f:
    pickle.dump(pipe, f)

In [21]:
# saving the classifier model
with open('../../model/model.pk1', 'wb') as f:
    pickle.dump(pipe.named_steps['classifier'], f)

In [22]:
joblib.dump(pipe.named_steps['classifier'], '../../model/model.joblib')

['../../model/model.joblib']

In [30]:
# saving the classifier model
with open('../../model/preprocessing.pk1', 'wb') as f:
    pickle.dump((pipe.named_steps['preprocessor'], pipe.named_steps['prefix_extractor']), f)

In [24]:
prefix_extractor = pipe.named_steps['prefix_extractor']
preprocessor = pipe.named_steps['preprocessor']

joblib.dump((prefix_extractor, preprocessor), '../../model/preprocessing.joblib')

['../../model/preprocessing.joblib']

In [25]:
# saving the label encoder seperately
with open('../../model/label_encoder.pk1', 'wb') as f:
    pickle.dump(le, f)

In [26]:
joblib.dump(le, '../../model/label_encoder.joblib')

['../../model/label_encoder.joblib']

In [27]:
!zip -r ./trained_pipeline=0.1.0.pk1.zip ./trained_pipeline=0.1.0.pk1


zip error: Nothing to do! (try: zip -r ./trained_pipeline=0.1.0.pk1.zip . -i ./trained_pipeline=0.1.0.pk1)


In [28]:
joblib.dump(pipe, '../../model/model_pipeline-0.1.0.joblib')

['../../model/model_pipeline-0.1.0.joblib']