In [1]:
import pandas as pd
import numpy as np
import re
import dateparser
from collections import Counter
import matplotlib.pyplot as plt
plt.style.use('ggplot') 

In [2]:
def read_file(file):
    x = open(file,'r', encoding = 'utf-8') #Opens the text file into variable x but the variable cannot be explored yet
    y = x.read() #By now it becomes a huge chunk of string that we need to separate line by line
    content = y.splitlines() #The splitline method converts the chunk of string into a list of strings
    return content 

In [3]:
chat = read_file('chat.txt')
len(chat) 

2149

In [4]:
join = [line for line in chat if  "joined using this" in line]
join

["7/23/20, 6:25 PM - +234 803 664 4810 joined using this group's invite link",
 "7/23/20, 8:04 PM - +234 703 586 0004 joined using this group's invite link"]

In [5]:
#Remove new lines
chat = [line.strip() for line in chat]
print("length of chat is:")
print(len(chat))

length of chat is:
2149


In [6]:
#Clean out the join notification lines
clean_chat = [line for line in chat if not "joined using this" in line]

In [7]:
clean_chat = [line for line in clean_chat if len(line) > 1]
print("length of clean_chat is:")
print(len(clean_chat))

length of clean_chat is:
1706


In [8]:
added = [line for line in clean_chat if line.endswith("added")]
added

['1/20/20, 11:05 PM - You were added']

In [9]:
left = [line for line in clean_chat if line.endswith("left")]
left

[]

In [10]:
msgs = []
pos = 0

for line in clean_chat:
    if re.findall("\A\d+[/]", line):
        msgs.append(line)
        pos +=1
    else:
        take = msgs[pos-1] + ". " + line
        msgs.append(take)
        msgs.pop(pos-1)
len(msgs)

967

In [11]:
msgs[0:10]

['1/20/20, 11:05 PM - Progress tech created group "RCCG WOGP Family Alumni🥰"',
 '1/20/20, 11:05 PM - You were added',
 '7/20/20, 10:52 PM - Skillful joe: Attention please:. Good evening All. I thank God for the lives of everyone here.. Please if you are a prayer warrior or prayer champion here, we need to gather at the back of Gen at delta Park to pray for Nigeria. This time specifically for Niger Delta. Because the revelations coming out from the NDDC saga is about to wake up abacha from the grave. We are on a sweet cruise. Just floating.',
 '7/20/20, 10:54 PM - Dotman: Lol',
 '7/20/20, 10:54 PM - Dotman: <Media omitted>',
 '7/20/20, 10:56 PM - Skillful joe: @447721937330 please do you have visa that you are no more using?',
 "7/20/20, 10:58 PM - Skillful joe: @16822488823 please prepare to pick me up at airport when I land. Don't disappear. You are a child of God that year o. Just behave.",
 '7/20/20, 11:27 PM - +44 7721 937330: Na government de issue visa oh',
 '7/20/20, 11:29 PM - 

In [12]:
time = [msgs[i].split(',')[1].split('-')[0] for i in range(len(msgs))]
time = [s.strip(' ') for s in time]
print("length of time is:")
print(len(time))

length of time is:
967


In [13]:
date = [msgs[i].split(',')[0] for i in range(len(msgs))]
len(date)

967

In [14]:
name = [msgs[i].split('-')[1].split(':')[0] for i in range(len(msgs))]
len(name)

967

In [15]:
content = []
for i in range(len(msgs)):
    try:
        content.append(msgs[i].split(':')[2])
    except IndexError:
        content.append('Missing Text')
len(content)

967

In [16]:
df = pd.DataFrame(list(zip(date, time, name, content)), columns = ['Date', 'Time', 'Name', 'Content'])
df

Unnamed: 0,Date,Time,Name,Content
0,1/20/20,11:05 PM,"Progress tech created group ""RCCG WOGP Family...",Missing Text
1,1/20/20,11:05 PM,You were added,Missing Text
2,7/20/20,10:52 PM,Skillful joe,Attention please
3,7/20/20,10:54 PM,Dotman,Lol
4,7/20/20,10:54 PM,Dotman,<Media omitted>
...,...,...,...,...
962,8/5/20,2:37 PM,+234 703 576 8876,🤣🤣🤣🤣Aggressive dude like his mentor @23470324...
963,8/5/20,2:38 PM,Dotman,That's true my fellow Nurse
964,8/5/20,2:43 PM,ayo adebayo,@2347035768876 you have joined them😕
965,8/5/20,3:05 PM,Michael Wog,Thank youuu


In [17]:
df.drop(df.index[[0,1]], inplace=True)

In [18]:
df = df[df["Content"]!='Missing Text']
df.reset_index(inplace=True, drop=True)
df

Unnamed: 0,Date,Time,Name,Content
0,7/20/20,10:52 PM,Skillful joe,Attention please
1,7/20/20,10:54 PM,Dotman,Lol
2,7/20/20,10:54 PM,Dotman,<Media omitted>
3,7/20/20,10:56 PM,Skillful joe,@447721937330 please do you have visa that yo...
4,7/20/20,10:58 PM,Skillful joe,@16822488823 please prepare to pick me up at ...
...,...,...,...,...
960,8/5/20,2:37 PM,+234 703 576 8876,🤣🤣🤣🤣Aggressive dude like his mentor @23470324...
961,8/5/20,2:38 PM,Dotman,That's true my fellow Nurse
962,8/5/20,2:43 PM,ayo adebayo,@2347035768876 you have joined them😕
963,8/5/20,3:05 PM,Michael Wog,Thank youuu


In [19]:
df['Date'] =  pd.to_datetime(df['Date'])
df.head()

Unnamed: 0,Date,Time,Name,Content
0,2020-07-20,10:52 PM,Skillful joe,Attention please
1,2020-07-20,10:54 PM,Dotman,Lol
2,2020-07-20,10:54 PM,Dotman,<Media omitted>
3,2020-07-20,10:56 PM,Skillful joe,@447721937330 please do you have visa that yo...
4,2020-07-20,10:58 PM,Skillful joe,@16822488823 please prepare to pick me up at ...


In [23]:
df['Time']= df['Time'].apply(lambda x: str(datetime.timedelta(x))

SyntaxError: unexpected EOF while parsing (<ipython-input-23-9e96894d87e4>, line 1)

In [21]:
df['DateTime'] = pd.to_datetime(df['Date'] + ' ' + df['Time'])
df['DateTime']

SyntaxError: unexpected EOF while parsing (<ipython-input-21-109d401b7cab>, line 1)

In [38]:
df['weekday'] = df['DateTime'].apply(lambda x: x.day_name())

In [39]:
df['Letter_Count'] = df['Content'].apply(lambda s : len(s))
df['Word_Count'] = df['Content'].apply(lambda s : len(s.split(' ')))

In [40]:
df['Hour'] = df['Time'].apply(lambda x : x.split(':')[0]) 

In [41]:
df.head()

Unnamed: 0,Date,Time,Name,Content,DateTime,weekday,Letter_Count,Word_Count,Hour
0,7/20/20,10:52 PM,Skillful joe,Attention please,2020-07-20 22:52:00,Monday,17,3,10
1,7/20/20,10:54 PM,Dotman,Lol,2020-07-20 22:54:00,Monday,4,2,10
2,7/20/20,10:54 PM,Dotman,<Media omitted>,2020-07-20 22:54:00,Monday,16,3,10
3,7/20/20,10:56 PM,Skillful joe,@447721937330 please do you have visa that yo...,2020-07-20 22:56:00,Monday,66,13,10
4,7/20/20,10:58 PM,Skillful joe,@16822488823 please prepare to pick me up at ...,2020-07-20 22:58:00,Monday,132,26,10


In [42]:
df.to_csv("Whattsappchat.csv")