In [1]:
import os
import nltk
import pandas as pd
import numpy as np
import re
import codecs
import lime

In [2]:
def build_df(origin_root="../data/user_logs/"):
    """
    Once the files have been unzipped, build one dataframe for each of the
    three types of data we have, i.e. phone calls, contacts, and text messages.

    The directory tree under ``origin_root`` is walked, and every file whose
    name ends with ``collated_call_log.txt``, ``collated_contact_list.txt`` or
    ``collated_sms_log.txt`` is read with ``pandas.read_json``. Each resulting
    frame gets a ``user_id`` and a ``device_id`` column taken from the first
    two directory levels below ``origin_root``
    (``<origin_root>/<user_id>/<device_id>/...``).

    :param origin_root:
    Top-level directory holding the per-user log folders. Defaults to the
    location used throughout this notebook.

    :return:
    Three pandas.DataFrames [calls, contacts, sms]. The per-file frames are
    concatenated row-wise and keep their original row labels. A log type for
    which no file was found yields an empty DataFrame.

    :raises IndexError:
    If a matching log file is not nested at least two directory levels
    (user, then device) below ``origin_root``.
    """

    # One bucket of per-file frames for each log type, keyed by file-name suffix.
    # Order matters: it is the order in which the frames are returned.
    frames_by_suffix = {
        "collated_call_log.txt": [],
        "collated_contact_list.txt": [],
        "collated_sms_log.txt": [],
    }

    # TODO: research about writing on the fly when unzipping the files
    for root, _dirs, files in os.walk(origin_root):
        for file in files:
            bucket = next(
                (
                    frames
                    for suffix, frames in frames_by_suffix.items()
                    if file.endswith(suffix)
                ),
                None,
            )
            # `is None` (not falsiness): a matching bucket may still be an empty list
            if bucket is None:
                continue

            # Derive the ids from the path relative to the root rather than from
            # a fixed character offset, so any root (with or without a trailing
            # separator, on any platform) yields the same ids.
            rel_parts = os.path.relpath(root, origin_root).split(os.sep)
            if len(rel_parts) < 2:
                raise IndexError(
                    "Expected <user_id>/<device_id>/ below {!r}, found log file at {!r}".format(
                        origin_root, os.path.join(root, file)
                    )
                )
            user_id, device_id = rel_parts[0], rel_parts[1]

            temp_df = pd.read_json(os.path.join(root, file))
            temp_df["user_id"] = user_id
            temp_df["device_id"] = device_id
            bucket.append(temp_df)

    # Combine the respective data in their own groupings; pd.concat refuses an
    # empty list, so fall back to an empty frame when a log type has no files.
    call_df, contact_df, sms_df = (
        pd.concat(frames, axis=0) if frames else pd.DataFrame()
        for frames in frames_by_suffix.values()
    )

    return call_df, contact_df, sms_df

# Load the call, contact and SMS tables from the unzipped user logs
call_df, contact_df, sms_df = build_df()

# Loan status per user, with the disbursement date parsed as a datetime
loan_df = pd.read_csv(
    "../data/user_logs/user_status.csv",
    parse_dates=["disbursement_date"],
)
# Prefix the ids with "user-" (presumably to line up with the user_id values
# in the log tables; verify before joining)
loan_df["user_id"] = "user-" + loan_df["user_id"].map(str)
# One indicator column per distinct loan status, appended next to the originals
dummy_loans = pd.get_dummies(loan_df["status"])
loan_df = pd.concat([loan_df, dummy_loans], axis="columns")

In [3]:
# How many messages carry each sms_type value
sms_df["sms_type"].value_counts()

 1.0     818112
 2.0     507881
 5.0       7442
 3.0        936
 4.0         68
 6.0         11
 13.0         7
-1.0          6
 7.0          1
Name: sms_type, dtype: int64

In [4]:
# Users with the most messages in thread_id 1.0 (top 33)
sms_df.loc[sms_df["thread_id"] == 1.0, "user_id"].value_counts().head(33)

user-358    5335
user-15     1755
user-176    1051
user-269     976
user-200     886
user-227     668
user-229     585
user-371     524
user-13      492
user-327     406
user-302     251
user-69      244
user-114     186
user-333     178
user-108     150
user-137     150
user-86      142
user-286     116
user-298     109
user-40       97
user-151      96
user-320      92
user-128      79
user-30       66
user-258      54
user-148      50
user-272      40
user-97       36
user-344      35
user-131      34
user-252      34
user-132      28
user-87       25
Name: user_id, dtype: int64

In [5]:
# Messages of user-132 in thread_id 1.0, in chronological order
sms_df.loc[
    sms_df["thread_id"].eq(1.0) & sms_df["user_id"].eq("user-132")
].sort_values("datetime")

Unnamed: 0,contact_id,datetime,device_id,item_id,message_body,sms_address,sms_type,thread_id,user_id
10772.0,0.0,2017-02-04 04:30:08.833000,device-1,1.0,I tried to call you at 14:43 on 20 Apr Please ...,723118530,1.0,1.0,user-132
10771.0,0.0,2017-04-20 11:52:30.755000,device-1,2.0,We can talk now this sato,254957059531,1.0,1.0,user-132
10719.0,0.0,2017-04-21 18:03:28.839000,device-1,65.0,Hi dia,254957059531,1.0,1.0,user-132
10718.0,0.0,2017-04-21 18:09:03.239000,device-1,66.0,Hi,723118530,2.0,1.0,user-132
10717.0,0.0,2017-04-21 18:09:35.490000,device-1,67.0,Nimerudi Dia job,254957059531,1.0,1.0,user-132
10715.0,0.0,2017-04-21 18:10:04.199000,device-1,69.0,Ooh how's mum and everyone,723118530,2.0,1.0,user-132
10714.0,0.0,2017-04-21 18:10:54.139000,device-1,70.0,Not bad recovering pole pole,254957059531,1.0,1.0,user-132
10713.0,0.0,2017-04-21 18:11:07.842000,device-1,71.0,N u hny mko aje,254957059531,1.0,1.0,user-132
10710.0,0.0,2017-04-21 18:16:36.109000,device-1,74.0,Yukon poa,723118530,2.0,1.0,user-132
10709.0,0.0,2017-04-21 18:18:25.751000,device-1,76.0,"Ooh poa dia,so umejipanga for tomorrow?",254957059531,1.0,1.0,user-132
