In [20]:
# IMPORTING LIBRARIES
import re
import numpy as np
import pandas as pd
import datetime

In [21]:
# SEPARATES THE DATE AND TIME
def datetime_divider(data):
    """Split raw date-time stamps into ``[date, time]`` digit strings, in place.

    Each element is converted with ``str()``. If it starts with a digit, its
    runs of up to 8 consecutive digits are extracted: the first run is taken
    as the date part and the second as the time part (any further runs, such
    as fractional seconds, are ignored). Elements that do not start with a
    digit become ``[nan, nan]``.

    Parameters
    ----------
    data : list
        Raw stamps such as ``"20190620032717.906"``. The list is modified in
        place.

    Returns
    -------
    list
        The same list object, each element replaced by a two-item list
        ``[date_digits, time_digits]``; a missing part is ``np.nan``.
    """
    # Compiled once instead of on every iteration; raw strings avoid the
    # invalid "\d" escape sequence warning.
    digit_runs = re.compile(r"\d{1,8}")
    starts_with_digit = re.compile(r"\d")

    for index, value in enumerate(data):
        text = str(value)
        if starts_with_digit.match(text):
            parts = digit_runs.findall(text)
            # A stamp with a single digit run (e.g. "20190620") has no time
            # part; report it as missing rather than raising IndexError.
            time_part = parts[1] if len(parts) > 1 else np.nan
            data[index] = [parts[0], time_part]
        else:
            data[index] = [np.nan, np.nan]

    return data


# Sample raw stamps (including an empty string) used to demonstrate datetime_divider.
data = ["20190620032717.906", "20190620052652.52", '', "20190620052735.207"]
print("\n\n.............SEPARATING DATE AND TIME FROM RAW DATA...........\n\n")
# datetime_divider rewrites `data` in place and returns that same list.
print(datetime_divider(data))



.............SEPARATING DATE AND TIME FROM RAW DATA...........


[['20190620', '032717'], ['20190620', '052652'], [nan, nan], ['20190620', '052735']]


In [None]:
# IT MODIFIES THE DATE IN YYYY-MM-DD FORMAT
def date_modifier(data):
    """Reformat ``YYYYMMDD`` digit strings as ``YYYY-MM-DD``, in place.

    Parameters
    ----------
    data : list
        Date values; each is converted with ``str()`` before inspection. The
        list is modified in place.

    Returns
    -------
    list
        The same list object. Elements consisting of exactly 8 digits become
        ``"YYYY-MM-DD"``; everything else (NaN, empty, wrong length) becomes
        ``np.nan``.
    """
    for i, value in enumerate(data):
        text = str(value)
        # Require exactly 8 digits: a shorter or longer value would otherwise
        # produce a malformed string such as "2019-06-2" that breaks later
        # date parsing. The raw string also avoids the invalid "\d" escape.
        if re.fullmatch(r"\d{8}", text):
            data[i] = "-".join([text[:4], text[4:6], text[6:]])
        else:
            data[i] = np.nan
    return data


# Sample date digits (with one missing value) used to demonstrate date_modifier.
data = ['20190620', '20190620', np.nan, '20190620']
print("\n\n.............IN STANDARD DATE FORMAT...........\n\n")
# date_modifier rewrites `data` in place and returns that same list.
print(date_modifier(data))

In [None]:
# IT MODIFIES TIME INTO 12-HOUR "H-MM-SS AM/PM" FORMAT
def time_modifier(data):
    """Convert 24-hour ``HHMMSS`` digit strings to ``"H-MM-SS AM/PM"``, in place.

    Parameters
    ----------
    data : list
        Time values; each is converted with ``str()`` before inspection. The
        list is modified in place.

    Returns
    -------
    list
        The same list object. Elements consisting of exactly 6 digits become
        a 12-hour string such as ``"3-27-17 AM"`` (the hour is not
        zero-padded; hour 00 maps to 12 AM and hour 12 to 12 PM). Everything
        else becomes ``np.nan``. The digits are not range-checked.
    """
    for i, value in enumerate(data):
        text = str(value)
        # Exactly 6 digits are required so that the hour/minute/second slices
        # are all populated; anything else is reported as missing.
        if not re.fullmatch(r"\d{6}", text):
            data[i] = np.nan
            continue

        hour_24 = int(text[:2])
        minute = text[2:4]  # not named `min`, to avoid shadowing the builtin
        second = text[4:]

        if hour_24 >= 12:
            meridian = "PM"
            hour_12 = hour_24 if hour_24 == 12 else hour_24 - 12
        else:
            meridian = "AM"
            hour_12 = 12 if hour_24 == 0 else hour_24

        data[i] = f"{hour_12}-{minute}-{second} {meridian}"

    return data

# Sample 24-hour time digits (with one missing value) used to demonstrate time_modifier.
time_data = ['032717', '202652', np.nan, '052735', '003419']
print("\n\n.............IN STANDARD TIME FORMAT...........\n\n")
# time_modifier rewrites `time_data` in place and returns that same list.
print(time_modifier(time_data))

In [None]:
# NOW CLEANING THE DATA FROM THE CDR SHEET

# header=None: the file is read without a header row, so columns are addressed
# by integer position (5, 147, 267, 312, ...) in the cells below.
dataset_name = "raw_cdr_data.csv"
dataframe = pd.read_csv(dataset_name, header=None, low_memory=False)

In [None]:
# REPLACING SIMPLE TERMINOLOGY WITH STANDARD TERMS

def replacing_simple_teminology_with_standard(dataframe):
    """Swap selected raw terms in columns 5, 267 and 312 for their standard names.

    The passed frame's columns are reassigned, and the same frame object is
    returned.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with integer column labels 5, 267 and 312.

    Returns
    -------
    pandas.DataFrame
        ``dataframe`` itself, with the replacements applied.
    """
    # (column label, raw term, standard term), applied in this order.
    substitutions = (
        (5, 'Originating', 'Outgoing'),
        (5, 'Terminating', 'Incoming'),
        (267, 'Success', 'Voice Portal'),
        (312, 'Shared Call Appearance', 'Secondary Device'),
    )
    for column, raw_term, standard_term in substitutions:
        dataframe[column] = dataframe[column].replace(raw_term, standard_term)

    return dataframe


# Apply the term replacements to the demo frame, then list the distinct values
# left in each touched column to confirm the result.
dataframe = replacing_simple_teminology_with_standard(dataframe)
print("\n\n.............REPLACING SIMPLE TERMINOLOGY WITH STANDARD TERMS...........\n\n")
print(dataframe[5].unique())
print(dataframe[267].unique())
print(dataframe[312].unique())

In [None]:
# REMOVING UNWANTED DATA FROM CDR SHEET

def remove_unwanted_data(datacolumn):
    """Blank out every entry that is not a recognised device label, in place.

    Entries equal to 'Secondary Device' or 'Primary Device' are kept; every
    other entry is overwritten with ``np.nan``.

    Parameters
    ----------
    datacolumn : list
        Column values; modified in place.

    Returns
    -------
    list
        The same list object.
    """
    for position, entry in enumerate(datacolumn):
        is_device_label = entry == ('Secondary Device') or entry == ('Primary Device')
        if not is_device_label:
            datacolumn[position] = np.nan
    return datacolumn


# The helper works on a plain list, so the column is converted with .tolist()
# and the cleaned list is assigned back as the new column 312.
dataframe[312] = remove_unwanted_data(dataframe[312].tolist())
print("\n\n.............REMOVING UNWANTED TERMS FROM COLUMN 312...........\n\n")
print(dataframe[312].unique())

In [None]:
# COMBINING ALL THE SERVICES

def combine_all_services(datacolumn147, datacolumn312, datacolumn267):
    """Fill missing entries of column 147 from columns 312 and 267, in place.

    For every position where ``datacolumn147`` is missing:

    * if both other columns have a value, store ``"<312> , <267>"``;
    * if only column 312 has a value, store it;
    * otherwise store the column 267 entry (which may itself be missing).

    Positions where ``datacolumn147`` already has a value are left untouched.

    Parameters
    ----------
    datacolumn147, datacolumn312, datacolumn267 : list
        Equal-length column values. ``datacolumn147`` is modified in place.

    Returns
    -------
    list
        The same ``datacolumn147`` list object.
    """
    for index, current in enumerate(datacolumn147):
        # pd.isna instead of `is np.nan`: NaN floats produced by e.g.
        # `.tolist()` on a float64 column are not the np.nan singleton, so an
        # identity check silently treats them as present.
        if not pd.isna(current):
            continue

        value312 = datacolumn312[index]
        value267 = datacolumn267[index]
        has312 = not pd.isna(value312)
        has267 = not pd.isna(value267)

        if has312 and has267:
            datacolumn147[index] = str(value312) + " , " + str(value267)
        elif has312:
            datacolumn147[index] = value312
        else:
            datacolumn147[index] = value267

    return datacolumn147


# Each column is handed over as a plain list; the filled list for column 147
# is assigned back to the frame.
dataframe[147] = combine_all_services(dataframe[147].tolist(), dataframe[312].tolist(), dataframe[267].tolist())
print("\n\n.............COMBINING ALL THE SERVICES IN COL 312,147,267...........\n\n")
print(dataframe[147].unique())

In [None]:
# CONVERT DATE-TIME DATA INTO SPECIFIC FORMAT
def call_time_fetcher(data):
    """Convert ``YYYYMMDDhhmmss[.fff]`` stamps to ``"YYYY-MM-DD  hh:mm:ss"``, in place.

    The seconds (including any fractional part) are rounded to the nearest
    whole second with ``round()``. A rounded value of 60 carries correctly
    into the minute, hour and date.

    Parameters
    ----------
    data : list
        Raw stamps; each is converted with ``str()`` before inspection. The
        list is modified in place.

    Returns
    -------
    list
        The same list object. Elements that do not start with a digit
        (including ``"nan"`` / NaN) become ``np.nan``; all others become the
        formatted, zero-padded timestamp string (two spaces separate date and
        time).

    Raises
    ------
    ValueError
        If a value starts with a digit but is not a valid stamp.
    """
    for index, value in enumerate(data):
        text = str(value)
        # Anything not starting with a digit is treated as missing, which is
        # how datetime_divider treats the same kind of raw stamp.
        if not re.match(r"\d", text):
            data[index] = np.nan
            continue

        # Parse down to the minute, then add the rounded seconds as a
        # timedelta so that overflow (e.g. 59.6 s -> 60 s) rolls over into
        # minute/hour/day instead of being patched up by hand.
        base = datetime.datetime.strptime(text[:12], "%Y%m%d%H%M")
        seconds_text = text[12:]
        whole_seconds = round(float(seconds_text)) if seconds_text else 0
        moment = base + datetime.timedelta(seconds=whole_seconds)

        data[index] = moment.strftime("%Y-%m-%d  %H:%M:%S")
    return data


# Sample raw stamps (with a 'nan' placeholder) used to demonstrate call_time_fetcher.
data = ["20190620032717.906", "20190621132819.68", 'nan', "20190625192352.293"]
print("\n\n...........CONVERT DATE-TIME DATA INTO SPCIFIC FORMAT............\n\n")
# call_time_fetcher rewrites `data` in place and returns that same list.
print(call_time_fetcher(data))

In [None]:
# FINDING THE HOURLY RANGE
def hourly_range(data):
    """Map 12-hour time strings to an ``"H:00 - H:59"`` 24-hour bucket, in place.

    The hour is the first run of digits in the string. The string is a PM
    time if it contains ``"PM"``, otherwise it is treated as AM:

    * PM, hour != 12  -> hour + 12
    * PM, hour == 12  -> 12
    * AM, hour == 12  -> 0
    * AM, otherwise   -> hour as written

    Parameters
    ----------
    data : list
        Time strings such as ``'3:27:17 AM'``; each element is converted with
        ``str()`` before inspection. The list is modified in place.

    Returns
    -------
    list
        The same list object. Elements without any digits (including
        ``"nan"`` / NaN) become ``np.nan``.
    """
    for index, value in enumerate(data):
        text = str(value)
        hour_match = re.search(r"\d+", text)
        # No digits at all ("nan", "", ...) means there is no hour to bucket.
        # Previously a digit-free string other than "nan" raised IndexError.
        if hour_match is None:
            data[index] = np.nan
            continue

        hour = hour_match.group()
        if "PM" in text:
            if hour != "12":
                hour = int(hour) + 12
        elif hour == "12":
            hour = 0  # 12 AM is the first hour of the day

        data[index] = f"{hour}:00 - {hour}:59"
    return data


# Sample 12-hour times (covering 12 AM, 12 PM and a 'nan' placeholder) used to
# demonstrate hourly_range.
data = ['3:27:17 AM', '1:28:19 PM', 'nan', '7:23:52 PM', '12:20:45 AM', '12:56:27 PM']
print("\n\n...........FINDING HOURLY RANGE............\n\n")
# hourly_range rewrites `data` in place and returns that same list.
print(hourly_range(data))

In [None]:
# FINDING THE WEEKLY RANGE
def weekly_range(data):
    """Replace each ``YYYY-MM-DD`` date string with its weekday name, in place.

    Parameters
    ----------
    data : list
        Date strings; each element is converted with ``str()`` first. The
        list is modified in place.

    Returns
    -------
    list
        The same list object. Elements whose string form is ``"nan"`` become
        ``np.nan``; all others become ``strftime("%A")`` of the parsed date.
    """
    for position, entry in enumerate(data):
        text = str(entry)
        data[position] = text
        if text == "nan":
            data[position] = np.nan
            continue

        year, month, day = [int(piece) for piece in text.split("-")]
        weekday = datetime.date(year, month, day).strftime("%A")
        data[position] = weekday

    return data


# Sample ISO dates (with a 'nan' placeholder) used to demonstrate weekly_range.
data = ['2019-06-20', '2019-06-21', 'nan', '2019-06-25', '2020-11-06']
print("\n\n...........FINDING WEEKLY RANGE............\n\n")
# weekly_range rewrites `data` in place and returns that same list.
print(weekly_range(data))

In [None]:
# Summary of the helper functions defined in the cells above. This is a bare
# string expression, not an assignment; it has no effect on later cells.
"""
Number of functions made
1. datetime_divider()
2. date_modifier()
3. time_modifier()
4. replacing_simple_teminology_with_standard()
5. remove_unwanted_data()
6. combine_all_services()
7. call_time_fetcher()
8. hourly_range()
9. weekly_range()
"""

In [None]:
# Reload the raw CSV into a fresh frame (`raw_cdr_data`) for the real cleaning
# pass; the earlier `dataframe` was already altered by the demo cells above.
print("\n\n...........NOW CREATING NEW COLUMNS WITH CLEANED DATA............\n\n")
dataset_name = "raw_cdr_data.csv"
# header=None: columns are addressed by integer position below.
raw_cdr_data = pd.read_csv(dataset_name, header=None, low_memory=False)

In [None]:
# CREATE 2 COLUMNS TO STORE DATE AND TIME
# datetime_divider yields one [date, time] pair per row of column 9;
# zip(*...) transposes those pairs into two sequences, one per new column.
raw_cdr_data['date'], raw_cdr_data['time'] = zip(*datetime_divider(raw_cdr_data[9].tolist()))

# First row before reformatting.
print(raw_cdr_data['date'].tolist()[0])
print(raw_cdr_data['time'].tolist()[0])

# Reformat the digit strings with date_modifier / time_modifier.
raw_cdr_data['date'] = date_modifier(raw_cdr_data['date'].tolist())
raw_cdr_data['time'] = time_modifier(raw_cdr_data['time'].tolist())

# First row after reformatting.
print(raw_cdr_data['date'].tolist()[0])
print(raw_cdr_data['time'].tolist()[0])

In [None]:
# MAKING A NEW COLUMN WITH STANDARD TERMINOLOGIES
# Apply the term replacements to columns 5, 267 and 312 of the working frame,
# then print the affected columns for inspection.
raw_cdr_data = replacing_simple_teminology_with_standard(raw_cdr_data)
print(raw_cdr_data[5])
print(raw_cdr_data[267].unique())
print(raw_cdr_data[312])

In [19]:
# NOW REMOVING UNWANTED DATA FROM 312
# Keep only the 'Primary Device' / 'Secondary Device' labels in column 312;
# every other entry is replaced with NaN by remove_unwanted_data.
raw_cdr_data[312] = remove_unwanted_data(raw_cdr_data[312].tolist())
print(raw_cdr_data[312].unique())

[nan 'Primary Device']


In [None]:
# NOW COMBINING DATA OF COLUMN 312 , 267 , 147
# Fill missing entries of column 147 from columns 312 and 267 (passed as plain
# lists) and assign the result back to column 147.
raw_cdr_data[147] = combine_all_services(raw_cdr_data[147].tolist(), raw_cdr_data[312].tolist(), raw_cdr_data[267].tolist())
print(raw_cdr_data[147].unique())

In [None]:
# NOW MAKING 2 TEMPORARY COLUMNS FOR CALCULATING THE CALL DURATION
# Column 9 is parsed as the call start timestamp and column 13 as the end
# timestamp; call_time_fetcher produces the strings that pd.to_datetime parses.
raw_cdr_data['starttime'] = pd.to_datetime(call_time_fetcher(raw_cdr_data[9].tolist()))
print(raw_cdr_data['starttime'])
raw_cdr_data['endtime'] = pd.to_datetime(call_time_fetcher(raw_cdr_data[13].tolist()))
print(raw_cdr_data['endtime'])

# Call duration in whole minutes, as float (NaN where a timestamp is missing).
# `.astype("timedelta64[m]")` is rejected by pandas >= 2.0, which only supports
# the s/ms/us/ns resolutions, so the minutes are derived from total_seconds().
raw_cdr_data["duration"] = (raw_cdr_data["endtime"] - raw_cdr_data["starttime"]).dt.total_seconds() // 60
print(raw_cdr_data["duration"])

In [None]:
# NOW CREATING 2 COLUMNS FOR HOURLY RANGE AND WEEKLY RANGE
# Hour-of-day bucket derived from the 12-hour 'time' strings.
raw_cdr_data['hourly_range'] = hourly_range(raw_cdr_data['time'].tolist())
print(raw_cdr_data['hourly_range'])

# Weekday name derived from the 'date' strings.
raw_cdr_data['weekly_range'] = weekly_range(raw_cdr_data['date'].tolist())
print(raw_cdr_data['weekly_range'])

In [None]:
# REMOVE THE COLUMNS WHICH ARE NOT REQUIRED
# 'time' was only needed to derive 'hourly_range'.
raw_cdr_data = raw_cdr_data.drop('time', axis=1)

# Persist the cleaned frame. The following cell loads "cdr_data.csv" and
# selects columns by the header names written here ("4", "5", ..., "date",
# "starttime", ...); without this write the notebook cannot be re-run from a
# fresh kernel unless that file already exists.
raw_cdr_data.to_csv("cdr_data.csv", index=None)

In [None]:
# Load only the columns needed for the three output datasets from the cleaned CSV.
# NOTE(review): this expects "cdr_data.csv" to have a header row containing the
# names listed below — confirm the file is produced before this cell runs.
dataset_name = "cdr_data.csv"

# Required columns
call_columns = ["4", "5", "14", "31", "120", "147", "267", "312", "345", \
                "date", "starttime", "endtime", "duration", "hourly_range", "weekly_range"]

call_dataset = pd.read_csv(dataset_name, usecols=call_columns, low_memory=False)

In [None]:
# COLUMNS FOR SERVICE DATA
# Subset of call_dataset columns that make up the service dataset.
service_columns = ['31', '120', '147', '345', 'date', 'starttime', 'endtime', 'duration']
service_dataset = call_dataset[service_columns]

In [None]:
# COLUMNS FOR DEVICE DATASET
# Subset of call_dataset columns that make up the device dataset.
device_columns = ['5', '31', '120', '312', '345', 'date', 'starttime', 'endtime', 'duration']
device_dataset = call_dataset[device_columns]

In [None]:
# RENAMING COLUMN NAMES AS PER CLIENT REQUIREMENT
# Rename the numeric column labels to descriptive names. Columns not listed in
# a mapping keep their current names.
# NOTE(review): " vpDialingfacResult" has a leading space in the new name —
# confirm whether that is intentional before changing it.
call_dataset = call_dataset.rename(columns={"4": "Group", "5": "Call_Direction", "14": "Missed Calls",
                                            "31": "GroupID", "120": "UserID", "147": "Features",
                                            "267": " vpDialingfacResult",
                                            "312": "UsageDeviceType",
                                            "345": "UserDeviceType"})

service_dataset = service_dataset.rename(columns={"120": "UserID",
                                                  "31": "GroupID", "147": "FeatureName",
                                                  "345": "UserDeviceType", "date": "FeatureEventDate"
                                                  })

device_dataset = device_dataset.rename(columns={"5": "DeviceEventTypeDirection",
                                                "120": "UserID", "31": "GroupID",
                                                "345": "UserDeviceType", "date": "DeviceEventDate",
                                                "312": "UsageDeviceType"})

# Write the three datasets to CSV; index=None is passed so that no index
# column is written.
call_dataset.to_csv("Call_data.csv", index=None)
service_dataset.to_csv("Service_data.csv", index=None)
device_dataset.to_csv("Device_data.csv", index=None)
print("ALL THE CSV FILES ARE CREATED SUCCESSFULLY..........!!!!!")