# Using this notebook to check the data and get an initial understanding of it

# Notes on the meaning of some columns:

In the calls log:

    - call types: 1:incoming, 2:outgoing, 3:missed
    - is_read: whether the user has interacted with the entry
    
In the contacts log:

    - times_contacted: just the count of calls for the specific contact
    
In the sms logs:
    
    - sms_type: 0:all, 1:inbox, 2:sent, 3:draft, 4:outbox, 5:failed, 6:queued

In [1]:
import pandas as pd
import numpy as np
import glob

from collections import Counter

In [2]:
# Load the user table from both the CSV and the JSON export so the following
# cells can check whether the two files contain the same data.
df_from_csv = pd.read_csv('../logs/users.csv')
df_from_json = pd.read_json('../logs/users.json')

In [3]:
# preview the first three rows as parsed from users.csv
df_from_csv.head(3)

Unnamed: 0,user_id,disbursed_at,loan_status
0,1,2017-03-02T00:00:00.000Z,repaid
1,2,2017-03-01T00:00:00.000Z,defaulted
2,3,2017-03-02T00:00:00.000Z,repaid


In [4]:
# preview the first three rows as parsed from users.json
df_from_json.head(3)

Unnamed: 0,disbursed_at,loan_status,user_id
0,2017-03-02,repaid,1
1,2017-03-01,defaulted,2
2,2017-03-02,repaid,3


In [7]:
# Compare the distribution of disbursement dates in both files.
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(Counter(df_from_json.disbursed_at))

print(Counter(df_from_csv.disbursed_at))

Counter({Timestamp('2017-03-02 00:00:00'): 333, Timestamp('2017-03-01 00:00:00'): 67})
Counter({'2017-03-02T00:00:00.000Z': 333, '2017-03-01T00:00:00.000Z': 67})


In [8]:
# number of rows on which the CSV and JSON frames agree, per column
(df_from_csv.user_id == df_from_json.user_id).sum(), (df_from_csv.loan_status == df_from_json.loan_status).sum()

(400, 400)

It looks like users.csv and users.json contain the same data: the loan status and disbursement time of the users with ids 1-400. I will therefore go ahead and use users.csv.

In [9]:
# how many users defaulted vs. repaid
Counter(df_from_csv['loan_status'])

Counter({'defaulted': 200, 'repaid': 200})

# Looking at the data of user 164

Looking at the data from a specific user to get a sense of what the data from one user looks like

In [10]:
uid = 164  # picked this user because it has 2 devices
# glob.glob returns paths in arbitrary (filesystem-dependent) order, but the
# cells below index devices_data positionally ([device][0] = call log,
# [1] = contact list, [2] = sms log). Sorting both levels makes that
# positional indexing deterministic.
devices = sorted(glob.glob('../logs/user-%d/*' % uid))
devices_data = [sorted(glob.glob(device_dir + '/*.txt')) for device_dir in devices]
devices_data

[['../logs/user-164/device-1/collated_call_log.txt',
  '../logs/user-164/device-1/collated_contact_list.txt',
  '../logs/user-164/device-1/collated_sms_log.txt'],
 ['../logs/user-164/device-2/collated_call_log.txt',
  '../logs/user-164/device-2/collated_contact_list.txt',
  '../logs/user-164/device-2/collated_sms_log.txt']]

In [11]:
# loan record of the user being inspected; use `uid` rather than repeating the
# literal id so this cell cannot drift from the device listing
df_from_csv[df_from_csv['user_id'] == uid]

Unnamed: 0,user_id,disbursed_at,loan_status
163,164,2017-03-02T00:00:00.000Z,repaid


# Device 1 of uid 164

In [12]:
# call log
# devices_data[0] is the first device's file list; its element 0 is the
# collated_call_log.txt path in the listing printed for devices_data.
# NOTE(review): that position comes from glob ordering - confirm it is stable.
dv1_call = pd.read_json(devices_data[0][0])
dv1_call.head(5)

Unnamed: 0,cached_name,call_type,country_iso,data_usage,datetime,duration,features_video,geocoded_location,is_read,item_id,phone_number
0,Mum,2,KE,0,2017-04-26 17:27:26.907,201,False,Kenya,False,31,710461743
1,Ma Reu,2,KE,0,2017-04-26 17:22:56.508,172,False,Kenya,False,30,254650162978
2,Akiteles,2,KE,0,2017-04-26 17:22:01.934,0,False,Kenya,False,29,718742916
3,Akiteles,2,KE,0,2017-04-26 17:12:04.462,0,False,Kenya,False,28,718742916
4,Akiteles,2,KE,0,2017-04-26 17:10:03.791,0,False,Kenya,False,27,718742916


In [13]:
# number of call-log entries per cached contact name (NaN = no cached name)
Counter(dv1_call['cached_name'])

Counter({nan: 327,
         u'*128*0772031155': 2,
         u'0790 659030': 79,
         u'128772031155': 1,
         u'AMENELE G': 2,
         u'ARYNE K': 1,
         u'Abu': 1,
         u'Achichi': 1,
         u'Akine': 2,
         u'Akiteles': 5,
         u'Aledi': 28,
         u'Alex!': 7,
         u'Alex-aitel': 1,
         u'Alicia': 3,
         u'Amnyatap!': 2,
         u'Andrew Kapelo': 12,
         u'Anyik!': 3,
         u'Apostle Ben': 2,
         u'Arot': 4,
         u'Arotic': 1,
         u'Arshley': 4,
         u'Ashley!': 1,
         u'Atabo Emar': 16,
         u'Atash': 19,
         u'Athkon': 1,
         u"BA'ryan": 1,
         u'BARNABAS K': 1,
         u'BETTY L': 2,
         u'Ba Chero': 1,
         u'Baba Ipae': 7,
         u'Baba!': 2,
         u'Babaikimat': 1,
         u'Becky': 4,
         u'Benja': 2,
         u'Benny': 1,
         u'Bentar siz': 5,
         u'Bony!': 2,
         u'Brian-ikuku': 2,
         u'Bro Davi': 3,
         u'Bro Eli': 4,
         u'Bro

In [14]:
# Value counts of the two boolean flags in the call log.
# print(...) with a single argument works on both Python 2 and Python 3.
print(Counter(dv1_call.features_video))
print(Counter(dv1_call.is_read))

Counter({False: 2098})
Counter({False: 1343, True: 755})


In [15]:
# estimating how long the user has been on this device
# by the time difference between the first call and last
# call in the call logs (it can also be done with sms log)
dv1_call['datetime'] = pd.to_datetime(dv1_call['datetime'])
# Series.min()/.max() are vectorized and skip NaT; the builtin min()/max()
# compare element by element and give order-dependent results if any
# timestamp is missing.
dv1_call['datetime'].min(), dv1_call['datetime'].max()

(Timestamp('2017-02-16 09:52:19.049000'),
 Timestamp('2017-04-26 17:27:26.907000'))

In [17]:
# call duration in seconds: (shortest, mean, longest)
# pandas reductions skip NaN, whereas the builtin min()/max() return
# order-dependent results when a NaN is present
dv1_call.duration.min(), dv1_call.duration.mean(), dv1_call.duration.max()

(0, 44.785033365109626, 2263)

In [18]:
# contact log
# element 1 of the first device's file list is the collated_contact_list.txt
# path in the listing printed for devices_data.
# NOTE(review): that position comes from glob ordering - confirm it is stable.
dv1_contact = pd.read_json(devices_data[0][1])
dv1_contact.head(3)

Unnamed: 0,display_name,item_id,last_time_contacted,phone_numbers,photo_id,times_contacted
0,Akiteles,1271,1493227362262,"[{u'item_id': 2595, u'normalized_phone_number'...",,6
1,Robert Ekeno,1272,0,"[{u'item_id': 2546, u'normalized_phone_number'...",,0
2,Bentar siz,1273,1493225220050,"[{u'item_id': 5485, u'normalized_phone_number'...",,12


In [19]:
# Show the full (untruncated) phone_numbers entries of the first three contacts.
# print(...) with a single argument works on both Python 2 and Python 3.
for numbers in dv1_contact.phone_numbers.head(3).values:
    print(numbers)

[{u'item_id': 2595, u'normalized_phone_number': u'+254054247283', u'phone_number': u'0718742916'}]
[{u'item_id': 2546, u'normalized_phone_number': u'+254711346911', u'phone_number': u'+254711346911'}]
[{u'item_id': 5485, u'normalized_phone_number': u'+254323976446', u'phone_number': u'0707379138'}]


In [20]:
# sms log
# element 2 of the first device's file list is the collated_sms_log.txt
# path in the listing printed for devices_data.
# NOTE(review): that position comes from glob ordering - confirm it is stable.
dv1_sms = pd.read_json(devices_data[0][2])
dv1_sms.head(3)

Unnamed: 0,contact_id,datetime,item_id,message_body,sms_address,sms_type,thread_id
0,0,2017-04-26 18:45:55.595,72,"Hi, gd nyt.",254281586716,1,17
1,0,2017-04-26 17:17:16.897,71,Sawa uncle..thank you,254933575166,2,15
2,0,2017-04-26 17:16:22.497,70,Ekwar had also hinted to send something may by...,254933575166,1,15


In [21]:
# number of messages per sms_type code
Counter(dv1_sms['sms_type'])

Counter({1: 6606, 2: 5692, 3: 1, 5: 40})

In [22]:
# number of messages per sms_address
Counter(dv1_sms['sms_address'])

Counter({nan: 1,
         u'+181': 1,
         u'+254000685997': 3,
         u'+254002094601': 1,
         u'+254003232744': 1,
         u'+254007510931': 1,
         u'+254008674978': 2,
         u'+254010790839': 21,
         u'+254011502989': 8,
         u'+254016386269': 1,
         u'+254016436479': 9,
         u'+254018807942': 4,
         u'+254021719067': 1,
         u'+254022167497': 68,
         u'+254026616983': 4,
         u'+254028063367': 2,
         u'+254030142705': 11,
         u'+254033514402': 1,
         u'+254040404372': 1,
         u'+254040669378': 18,
         u'+254041468860': 21,
         u'+254041646229': 1,
         u'+254042789785': 3,
         u'+254043334791': 6,
         u'+254043724477': 15,
         u'+254047392031': 89,
         u'+254047820042': 16,
         u'+254049888677': 14,
         u'+254054247283': 15,
         u'+254068660795': 1,
         u'+254071349735': 15,
         u'+254072684297': 56,
         u'+254072976684': 1,
         u'+25408187

# Device 2 of uid 164

In [23]:
# call log of the second device (devices_data[1]); element 0 is the
# collated_call_log.txt path in the listing printed for devices_data.
# NOTE(review): that position comes from glob ordering - confirm it is stable.
dv2_call = pd.read_json(devices_data[1][0])
dv2_call.head(3)

Unnamed: 0,cached_name,call_type,country_iso,data_usage,datetime,duration,features_video,geocoded_location,is_read,item_id,phone_number
0,Honey!,2,KE,0,2017-05-22 10:47:24.679,0,False,Kenya,False,6781,254458302406
1,Honey!,2,KE,0,2017-05-22 10:47:16.054,0,False,Kenya,False,6780,254458302406
2,Robat-platnum,3,KE,0,2017-05-22 06:55:05.352,0,False,Kenya,True,6779,718902128


In [24]:
# Note: there's an overlap of usage between device 1 and 2
dv2_call['datetime'] = pd.to_datetime(dv2_call['datetime'])
# Series.min()/.max() are vectorized and skip NaT; the builtin min()/max()
# give order-dependent results if any timestamp is missing.
dv2_call['datetime'].min(), dv2_call['datetime'].max()

(Timestamp('2017-03-12 15:47:14.905000'),
 Timestamp('2017-05-22 10:47:24.679000'))

In [25]:
# call duration: (shortest, mean, longest)
# pandas reductions skip NaN, whereas the builtin min()/max() return
# order-dependent results when a NaN is present
dv2_call.duration.min(), dv2_call.duration.mean(), dv2_call.duration.max()

(0, 29.083166999334665, 1908)

In [26]:
# contact list of the second device (element 1 of its file list)
# NOTE(review): that position comes from glob ordering - confirm it is stable.
dv2_contact = pd.read_json(devices_data[1][1])
dv2_contact.head(3)

Unnamed: 0,display_name,item_id,last_time_contacted,phone_numbers,photo_id,times_contacted
0,Stevo,423,1470677235362,"[{u'item_id': 495222, u'normalized_phone_numbe...",,5
1,Petzz,1188,1486666837411,"[{u'item_id': 495937, u'normalized_phone_numbe...",,28
2,Adhiambo,1514,0,"[{u'item_id': 10620, u'phone_number': u'441'}]",,0


In [27]:
# sms log of the second device (element 2 of its file list)
# NOTE(review): that position comes from glob ordering - confirm it is stable.
dv2_sms = pd.read_json(devices_data[1][2])
dv2_sms.head(5)

Unnamed: 0,contact_id,datetime,item_id,message_body,sms_address,sms_type,thread_id
0,0,2017-05-22 10:50:59.602,21381,"Hi Ekuwom, your next Branch repayment of Ksh35...",Branch-Co,1,524
1,0,2017-05-22 10:45:48.087,21380,You have successfully received Ksh 45.00 on P...,Orange,1,561
2,0,2017-05-22 10:45:43.436,21379,Congratulations you have received FREE Holla P...,Orange,1,561
3,0,2017-05-22 10:18:57.252,21378,Hello!,+254458302406,2,225
4,0,2017-05-22 08:20:30.234,21376,You are invited to the Safaricom Deaf Athletic...,SAFARICOM,1,564


In [28]:
# there are sms from Branch-Co in the preview, so take a closer look at them
dv2_sms.loc[dv2_sms['sms_address'] == 'Branch-Co'].head(5)

Unnamed: 0,contact_id,datetime,item_id,message_body,sms_address,sms_type,thread_id
0,0,2017-05-22 10:50:59.602,21381,"Hi Ekuwom, your next Branch repayment of Ksh35...",Branch-Co,1,524
50,0,2017-05-20 10:36:33.079,21330,"Hi Ekuwom, your next Branch repayment of Ksh35...",Branch-Co,1,524
524,0,2017-04-26 19:24:41.516,20027,Your Branch loan of Ksh 3100.0 has been sent! ...,Branch-Co,1,524
800,0,2017-04-03 11:11:28.619,18757,"Hi Ekuwom, your next Branch repayment of Ksh56...",Branch-Co,1,524
1107,0,2017-03-31 11:39:26.447,18427,Your Branch loan of Ksh 2000.0 has been sent! ...,Branch-Co,1,524


In [29]:
# number of messages per sms_type code
Counter(dv2_sms['sms_type'])

Counter({1: 5027, 2: 3724, 5: 63})

In [30]:
# number of messages per sms_address
Counter(dv2_sms['sms_address'])

Counter({u'+211863012343': 1,
         u'+254005735214': 2,
         u'+254007510931': 13,
         u'+254008393217': 1,
         u'+254020740405': 6,
         u'+254024948558': 52,
         u'+254025669654': 2,
         u'+254027054132': 2,
         u'+254028991682': 1,
         u'+254032194814': 14,
         u'+254040669378': 51,
         u'+254042255566': 27,
         u'+254047392031': 1,
         u'+254047820042': 25,
         u'+254049403345': 1,
         u'+254049888677': 33,
         u'+254054247283': 8,
         u'+254057552783': 1,
         u'+254060432444': 3,
         u'+254071349735': 32,
         u'+254072684297': 23,
         u'+254078310919': 7,
         u'+254083973428': 1,
         u'+254087134351': 3,
         u'+254092593154': 15,
         u'+254093546874': 18,
         u'+254100705546': 1,
         u'+254106294864': 15,
         u'+254111548588': 1,
         u'+254112189269': 2,
         u'+254117514912': 28,
         u'+254119024677': 27,
         u'+254121570830':

# let's look at data from another user (uid 232)

In [31]:
uid = 232 # a defaulted user
# glob.glob returns paths in arbitrary (filesystem-dependent) order, but the
# cells below index devices_data positionally ([device][0] = call log,
# [1] = contact list, [2] = sms log). Sorting both levels makes that
# positional indexing deterministic.
devices = sorted(glob.glob('../logs/user-%d/*' % uid))
devices_data = [sorted(glob.glob(device_dir + '/*.txt')) for device_dir in devices]
devices_data

[['../logs/user-232/device-1/collated_call_log.txt',
  '../logs/user-232/device-1/collated_contact_list.txt',
  '../logs/user-232/device-1/collated_sms_log.txt']]

In [32]:
# loan record of the user being inspected; use `uid` rather than repeating the
# literal id so this cell cannot drift from the device listing
df_from_csv[df_from_csv['user_id'] == uid]

Unnamed: 0,user_id,disbursed_at,loan_status
231,232,2017-03-01T00:00:00.000Z,defaulted


# Device 1 of uid 232

In [33]:
# call log of this user's only device
# NOTE: this rebinds dv1_call, which earlier held the device-1 call log of
# user 164; devices_data was reassigned for the new uid before this cell.
dv1_call = pd.read_json(devices_data[0][0])
dv1_call.head(3)

Unnamed: 0,cached_name,call_type,datetime,duration,is_read,item_id,phone_number
0,,1,2017-04-10 19:23:38.183,0,False,51734,Safaricom
1,Ngomongo Machakachue,1,2017-04-10 19:12:06.127,0,False,51733,0722252112
2,Mzee Sagini Majimoto,1,2017-04-10 18:55:14.980,0,False,51732,+254068542642


In [34]:
# first and last call timestamps in this device's call log
dv1_call['datetime'] = pd.to_datetime(dv1_call['datetime'])
# Series.min()/.max() are vectorized and skip NaT; the builtin min()/max()
# give order-dependent results if any timestamp is missing.
dv1_call['datetime'].min(), dv1_call['datetime'].max()

(Timestamp('2017-01-21 16:40:17.285000'),
 Timestamp('2017-04-10 19:23:38.183000'))

In [35]:
# call duration: (shortest, mean, longest)
# pandas reductions skip NaN, whereas the builtin min()/max() return
# order-dependent results when a NaN is present
dv1_call.duration.min(), dv1_call.duration.mean(), dv1_call.duration.max()

(0, 23.537665562913908, 1994)

In [36]:
# noticed that many more calls here come from people who are not in the contact list
Counter(dv1_call['cached_name'])

Counter({nan: 788,
         u'+254719833858': 6,
         u'0727777760': 17,
         u'Abel': 111,
         u'Abough': 1,
         u'Aesha': 10,
         u'Anta': 3,
         u'Baba Sumaya': 36,
         u'Babbu': 105,
         u'Bnz.': 3,
         u'Brooohhh': 7,
         u'Clyde': 6,
         u'Dada': 30,
         u'Fadhili': 2,
         u'Free Call': 13,
         u'Halima': 5,
         u'Hassan': 39,
         u'Herb': 42,
         u'JIMMY JAMAL': 12,
         u'Joan': 1,
         u'Jude': 2,
         u'Jumbe': 1,
         u'Kimani Bildad': 27,
         u'Kirago': 39,
         u'Ma Khadija': 369,
         u'Maahad': 5,
         u'Makhanu': 7,
         u'Meddy': 2,
         u'Mgustavo': 77,
         u'Muha': 35,
         u'Mzee Sagini Majimoto': 73,
         u'Nancy Mafuta': 1,
         u'Ngomongo Machakachue': 349,
         u'Omar12': 15,
         u'Panda': 2,
         u'Peter': 136,
         u'Peterr': 3,
         u'Qassim2': 6,
         u'Sportpesa': 9,
         u'Tonny': 7,
     

In [37]:
# contact list of this user's device (element 1 of its file list)
# NOTE: this rebinds dv1_contact, which earlier held user 164's contact list.
dv1_contact = pd.read_json(devices_data[0][1])
dv1_contact.head(3)

Unnamed: 0,display_name,item_id,last_time_contacted,phone_numbers,photo_id,times_contacted
0,Baba,2056,1461738812364,"[{u'item_id': 2467, u'normalized_phone_number'...",,16
1,Mama Suleiman,6372,0,,,0
2,Muha,6941,1489142087743,"[{u'item_id': 12882, u'normalized_phone_number...",,162


In [38]:
# sms log of this user's device (element 2 of its file list)
# NOTE: this rebinds dv1_sms, which earlier held user 164's sms log.
dv1_sms = pd.read_json(devices_data[0][2])
dv1_sms.head(3)

Unnamed: 0,contact_id,datetime,item_id,message_body,sms_address,sms_type,thread_id
0,0,2017-04-10 19:23:38.183,28841,Congratulations! You have received 60.00 KSH ...,Safaricom,1,4034
1,0,2017-04-10 19:12:06.127,28840,I tried calling you at 21:29 on 10Apr Please ...,0722252112,1,3963
2,0,2017-04-10 18:55:14.980,28839,Huku njeve kaka,+254068542642,1,3993


In [39]:
# number of messages per sms_type code
Counter(dv1_sms['sms_type'])

Counter({1: 1133, 2: 725, 5: 9})

In [40]:
# number of messages per sms_address
Counter(dv1_sms['sms_address'])

Counter({u'+254018678736': 8,
         u'+254019610769': 2,
         u'+254040858930': 2,
         u'+254064603459': 130,
         u'+254068542642': 60,
         u'+254094120967': 3,
         u'+254114737896': 90,
         u'+254143913710': 4,
         u'+254187686411': 12,
         u'+254239703771': 3,
         u'+254246932653': 3,
         u'+254275145453': 2,
         u'+254294618681': 2,
         u'+254320142173': 20,
         u'+254339916286': 1,
         u'+254462379307': 8,
         u'+254498363441': 2,
         u'+254498932642': 230,
         u'+254525302683': 3,
         u'+254526557522': 4,
         u'+254554142237': 3,
         u'+254602353546': 2,
         u'+254609264863': 20,
         u'+254623639289': 1,
         u'+254659306671': 2,
         u'+254673666459': 6,
         u'+254699736129': 3,
         u'+254712306481': 12,
         u'+254718270610': 2,
         u'+254745852644': 64,
         u'+254757189128': 140,
         u'+254761014110': 8,
         u'+254808049297': 

In [41]:
# What do the sms from Branch look like for this user?
# It looks like this particular user received some texts from
# Branch about a late payment.
dv1_sms.loc[dv1_sms['sms_address'] == 'Branch-Co'].head(5)

Unnamed: 0,contact_id,datetime,item_id,message_body,sms_address,sms_type,thread_id
41,0,2017-04-09 11:10:46.416,28798,"Hi Abubakar Siddique, Branch reports any non-p...",Branch-Co,1,4030
74,0,2017-04-06 10:15:43.489,28763,"Hi Abubakar Siddique, you are very late on you...",Branch-Co,1,4030
93,0,2017-04-03 10:23:39.639,28744,"Hi Abubakar Siddique, you are very late on you...",Branch-Co,1,4030
134,0,2017-03-31 10:41:28.476,28700,"Hi Abubakar Siddique, you are very late on you...",Branch-Co,1,4030
149,0,2017-03-28 11:06:58.839,28682,"Hi Abubakar Siddique, you are very late on you...",Branch-Co,1,4030
