In [None]:
# hide
# default_exp L2B_geo_model_explore
# from nbdev.showdoc import *

# 02 geolocation churn 

> Combines the data at geolocation level. Because the current calculation is done per `planning_area` (far too few points to model on), the outputs are only used for visualisation in Power BI.

## Library

In [None]:
# Library
#exports
import pandas as pd
import numpy as np
import os
from zipfile import ZipFile
from scipy import spatial
import matplotlib.pyplot as plt

from tsfresh import extract_features
from tsfresh.feature_selection.relevance import calculate_relevance_table
import tsfresh

In [None]:
#exports
from sklearn.cluster import AgglomerativeClustering
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.inspection import plot_partial_dependence
from sklearn.impute import SimpleImputer

# Shared NaN imputers, built from the same template: one per fill strategy
imp_mean, imp_med = (
    SimpleImputer(missing_values=np.nan, strategy=fill_strategy)
    for fill_strategy in ('mean', 'median')
)

from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import balanced_accuracy_score, accuracy_score, classification_report
from sklearn.inspection import permutation_importance

from collections import defaultdict

## Functions

In [None]:
#exports
def read_tsv(file:str)->pd.DataFrame:
    'Parse a gzip-compressed, tab-separated file (path or binary file object) into a DataFrame'
    parser_options = {'sep': '\t', 'compression': 'gzip'}
    frame = pd.read_csv(file, **parser_options)
    return frame

def gzip_reading(gzip_file)->dict:
    """Read every `.gz` member of a zip archive into a dict of DataFrames.

    gzip_file: path (or file object) of the zip archive, passed straight to `ZipFile`.
    Returns: dict mapping the member name up to its first '.' to the DataFrame
    produced by `read_tsv` for that member.

    Members whose name starts with '_' are skipped.
    """
    dt = {}
    # context managers guarantee the archive and each member handle are closed,
    # even when parsing one of the members fails
    with ZipFile(gzip_file, 'r') as archive:
        for member in archive.namelist():
            if not member.endswith('.gz') or member.startswith('_'):
                continue
            with archive.open(member) as handle:
                dt[member.split('.')[0]] = read_tsv(handle)
    return dt

def load_directory_files_dict(dir_path)->dict:
    """Load every `.pkl` file found directly in `dir_path` into a dict.

    The key is the first two underscore-separated tokens of the file name with the
    `.pkl` suffix removed (`geo_train.pkl` -> `geo_train`,
    `user_train_v2.pkl` -> `user_train`). A file name without an underscore is
    keyed by its name minus `.pkl`.

    dir_path: directory to scan (not recursive).
    Returns: dict mapping key -> object returned by `pd.read_pickle`.

    Hidden files (leading '.') and files not ending in `.pkl` are skipped.
    """
    dt = {}
    # read from the directory that was passed in, not from a notebook-level global;
    # sorted because os.listdir order is arbitrary and two files can map to one key
    for file_name in sorted(os.listdir(dir_path)):
        if file_name.startswith(".") or not file_name.endswith(".pkl"):
            continue
        tokens = file_name.split("_")
        if len(tokens) > 1:
            key = tokens[0] + "_" + tokens[1].replace(".pkl", "")
        else:
            # no underscore in the name: fall back to the bare file stem
            key = tokens[0].replace(".pkl", "")
        # NOTE(review): unpickling can execute arbitrary code - only use on trusted directories
        dt[key] = pd.read_pickle(os.path.join(dir_path, file_name))
    return dt


## Data

In [None]:
# data
path_load = os.path.join("Data","L1")  # directory the input pickles are read from
path_save = os.path.join("Data","L2")  # directory the output cells write to

# the output cells write into path_save without creating it, so make sure it
# exists up front; otherwise a fresh checkout fails at the first to_pickle call
os.makedirs(path_save, exist_ok=True)

dt = load_directory_files_dict(path_load)
raw = gzip_reading('telco_demo_datasets.zip')

## geo profile

In [None]:
# tables taken as-is from the loaded dict
geo_train = dt['geo_train']
geo_census = dt['geo_census']
geo_school = dt['geo_school']
geo_coor = dt['geo_coor']

# row count of geo_location per planning_area, exposed as a `visits` column
geo_loc = (
    dt['geo_location']
    .groupby('planning_area', as_index=False)
    .size()
    .rename(columns={'size': 'visits'})
)

In [None]:
# combining data
# fold each table into the training profile in turn; every merge uses the pandas
# defaults (inner join on whatever columns the two frames have in common)
geo_dt = geo_train
for area_table in (geo_coor, geo_loc, geo_census, geo_school):
    geo_dt = geo_dt.merge(area_table)

# print data
geo_dt.head()

Unnamed: 0,planning_area,age,contract,internet_service,account_start_year,month_delta,churn,lat,lng,users_nb,visits,med_income,avg_income,gini_coef,pop,working_pop,number_school,integrated_schools,primary_schools,secondary_schools
0,ANG MO KIO,38.967391,0.531621,0.443676,2014.26581,65.870553,0.093874,1.371236,103.847778,1012,373,2500,5254.007202,0.427458,59705,51238,17,1.0,8.0,8.0
1,BEDOK,39.557133,0.555105,0.465855,2014.260987,65.876944,0.077755,1.331222,103.928134,1479,575,2500,6066.907469,0.38353,91224,80081,24,1.0,12.0,11.0
2,BISHAN,38.417323,0.511811,0.527559,2014.070866,67.992126,0.19685,1.355431,103.839107,127,406,3500,7303.412733,0.302858,27457,24602,15,3.0,4.0,8.0
3,BUKIT BATOK,40.53605,0.530564,0.472571,2014.325235,65.303292,0.06348,1.351252,103.750406,1276,146,3500,6627.432987,0.328079,44133,40681,11,0.0,6.0,5.0
4,BUKIT MERAH,39.228438,0.543124,0.51049,2014.235431,66.386946,0.118881,1.279427,103.822536,429,4404,1500,4930.564294,0.469347,55627,45316,13,0.0,8.0,5.0


In [None]:
# (rows, columns) of the merged geo_dt frame
geo_dt.shape

(21, 20)

With only 21 records and 20 features... I should perhaps have calculated it on a finer lat/lon grid instead of per planning area...

### output

In [None]:
# write the merged profile to path_save twice: once as pickle, once as CSV
for out_name, writer in (("geo_profile.pkl", geo_dt.to_pickle),
                         ("geo_profile.csv", geo_dt.to_csv)):
    writer(os.path.join(path_save, out_name))

## geo visit Location
Clustering of the locations visited by users:
- hierarchical clustering (hclust) on lat, lon with 60 clusters
- calculate the mean churn rate within each cluster

In [None]:
# location rows joined with the churn column of user_train; `msisdn` is the only
# column the two slices share, and the default inner join drops unmatched rows
visited_points = raw['telco_locations'][['latitude','longitude','msisdn']]
churn_labels = dt['user_train'][['msisdn','churn']]
X_cluster_dt = visited_points.merge(churn_labels, on='msisdn')
X_cluster_dt.head()

Unnamed: 0,latitude,longitude,msisdn,churn
0,1.326087,103.89846,6048764759382,0
1,1.292531,103.825648,6048764759382,0
2,1.301823,103.904991,1948924115781,0
3,1.301866,103.837118,1948924115781,0
4,1.301894,103.904761,5938778408016,0


In [None]:
# Ward linkage only accepts Euclidean distance, which is also the estimator default,
# so the metric is left implicit: the `affinity` keyword was deprecated and later
# removed from scikit-learn, and omitting it keeps this cell working on old and new versions.
cluster = AgglomerativeClustering(n_clusters=60, linkage='ward')

# fit_predict returns the per-row labels (also kept on cluster.labels_)
X_cluster_dt['cluster'] = cluster.fit_predict(X_cluster_dt[['latitude','longitude']])

# one row per cluster: median coordinates as the cluster position, mean of `churn`
X_cluster_agg_dt = (
    X_cluster_dt
    .groupby('cluster', as_index=False)
    .agg({'latitude':'median','longitude':'median','churn':'mean'})
)
X_cluster_agg_dt.head()

Unnamed: 0,cluster,latitude,longitude,churn
0,0,1.280635,103.848045,0.116236
1,1,1.326771,103.846719,0.102916
2,2,1.30737,103.789107,0.128079
3,3,1.360945,103.893404,0.103226
4,4,1.293947,103.784791,0.135472


### output

In [None]:
# write the per-cluster aggregate to path_save twice: once as pickle, once as CSV
for out_name, writer in (("geo_visit.pkl", X_cluster_agg_dt.to_pickle),
                         ("geo_visit.csv", X_cluster_agg_dt.to_csv)):
    writer(os.path.join(path_save, out_name))

In [None]:
# Optional quick look at the clusters (left disabled):
# plt.scatter(X_cluster_dt['latitude'], X_cluster_dt['longitude'], c=cluster.labels_, cmap='rainbow')