In [3]:
#data loading


import re
import pandas as pd

with open("WhatsApp Chat with Megha@sis.txt", encoding="utf-8") as f:
    data = f.read()

In [4]:
pattern = r'\d{1,2}/\d{1,2}/\d{2},\s\d{2}:\d{2}\s-\s'

messages = re.split(pattern, data)[1:]
dates = re.findall(pattern, data)

print(len(messages), len(dates))
print(messages[:3])
print(dates[:3])


17641 17641
['Messages and calls are end-to-end encrypted. Only people in this chat can read, listen to, or share them. Learn more.\n', 'N.Varalakshmi: Hello\n', 'N.Varalakshmi: Megha\n']
['8/15/22, 12:54 - ', '8/15/22, 12:54 - ', '8/15/22, 12:54 - ']


In [5]:
df = pd.DataFrame({
    'user_message': messages,
    'message_date': dates
})


In [6]:
# convert date to date and time
df['message_date'] = pd.to_datetime(
    df['message_date'],
    format='%m/%d/%y, %H:%M - '
)


In [7]:
users = []
messages = []

for message in df['user_message']:
    entry = re.split(r'([\w\W]+?):\s', message)

    if entry[1:]:
        users.append(entry[1])
        messages.append(entry[2])
    else:
        users.append('group_notification')
        messages.append(entry[0])

df['user'] = users
df['message'] = messages


In [8]:
df['message'] = df['message'].str.replace('\n', '', regex=False)
df['message'] = df['message'].str.strip()
df['message'] = (
    df['message']
    .str.replace('\n', '', regex=False)
    .str.strip()
)


In [9]:
df.head()

Unnamed: 0,user_message,message_date,user,message
0,Messages and calls are end-to-end encrypted. O...,2022-08-15 12:54:00,group_notification,Messages and calls are end-to-end encrypted. O...
1,N.Varalakshmi: Hello\n,2022-08-15 12:54:00,N.Varalakshmi,Hello
2,N.Varalakshmi: Megha\n,2022-08-15 12:54:00,N.Varalakshmi,Megha
3,N.Varalakshmi: I am vara\n,2022-08-15 12:54:00,N.Varalakshmi,I am vara
4,Megha@sis: Hlo vara\n,2022-08-15 13:24:00,Megha@sis,Hlo vara


In [10]:
df.shape[0]  #chat volume

17641

In [11]:
df['user'].nunique() # no of active users

3

In [12]:
# most active user

df['user'].value_counts().head(10)

Unnamed: 0_level_0,count
user,Unnamed: 1_level_1
N.Varalakshmi,9465
Megha@sis,8175
group_notification,1


In [13]:
#data  anlaysis basic

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17641 entries, 0 to 17640
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   user_message  17641 non-null  object        
 1   message_date  17641 non-null  datetime64[ns]
 2   user          17641 non-null  object        
 3   message       17641 non-null  object        
dtypes: datetime64[ns](1), object(3)
memory usage: 551.4+ KB


In [14]:

df.groupby('user').count()['message']


Unnamed: 0_level_0,message
user,Unnamed: 1_level_1
Megha@sis,8175
N.Varalakshmi,9465
group_notification,1


In [15]:
# message start date

start_date=df['message_date'].min()
start_date

Timestamp('2022-08-15 12:54:00')

In [16]:
end_date=df['message_date'].max()
end_date

Timestamp('2026-01-12 21:43:00')

In [17]:
df['user'].value_counts(normalize=True)*100

Unnamed: 0_level_0,proportion
user,Unnamed: 1_level_1
N.Varalakshmi,53.653421
Megha@sis,46.34091
group_notification,0.005669


In [18]:
df['hour'] = df['message_date'].dt.hour
df

Unnamed: 0,user_message,message_date,user,message,hour
0,Messages and calls are end-to-end encrypted. O...,2022-08-15 12:54:00,group_notification,Messages and calls are end-to-end encrypted. O...,12
1,N.Varalakshmi: Hello\n,2022-08-15 12:54:00,N.Varalakshmi,Hello,12
2,N.Varalakshmi: Megha\n,2022-08-15 12:54:00,N.Varalakshmi,Megha,12
3,N.Varalakshmi: I am vara\n,2022-08-15 12:54:00,N.Varalakshmi,I am vara,12
4,Megha@sis: Hlo vara\n,2022-08-15 13:24:00,Megha@sis,Hlo vara,13
...,...,...,...,...,...
17636,Megha@sis: Hmm. Ok\n,2026-01-11 21:23:00,Megha@sis,Hmm. Ok,21
17637,N.Varalakshmi: Haa\n,2026-01-11 21:23:00,N.Varalakshmi,Haa,21
17638,N.Varalakshmi: \n,2026-01-12 20:30:00,N.Varalakshmi,,20
17639,Megha@sis: \n,2026-01-12 20:46:00,Megha@sis,,20


In [19]:
df['weekday'] = df['message_date'].dt.day_name()
df

Unnamed: 0,user_message,message_date,user,message,hour,weekday
0,Messages and calls are end-to-end encrypted. O...,2022-08-15 12:54:00,group_notification,Messages and calls are end-to-end encrypted. O...,12,Monday
1,N.Varalakshmi: Hello\n,2022-08-15 12:54:00,N.Varalakshmi,Hello,12,Monday
2,N.Varalakshmi: Megha\n,2022-08-15 12:54:00,N.Varalakshmi,Megha,12,Monday
3,N.Varalakshmi: I am vara\n,2022-08-15 12:54:00,N.Varalakshmi,I am vara,12,Monday
4,Megha@sis: Hlo vara\n,2022-08-15 13:24:00,Megha@sis,Hlo vara,13,Monday
...,...,...,...,...,...,...
17636,Megha@sis: Hmm. Ok\n,2026-01-11 21:23:00,Megha@sis,Hmm. Ok,21,Sunday
17637,N.Varalakshmi: Haa\n,2026-01-11 21:23:00,N.Varalakshmi,Haa,21,Sunday
17638,N.Varalakshmi: \n,2026-01-12 20:30:00,N.Varalakshmi,,20,Monday
17639,Megha@sis: \n,2026-01-12 20:46:00,Megha@sis,,20,Monday


In [20]:
df['month'] = df['message_date'].dt.month_name()
df


Unnamed: 0,user_message,message_date,user,message,hour,weekday,month
0,Messages and calls are end-to-end encrypted. O...,2022-08-15 12:54:00,group_notification,Messages and calls are end-to-end encrypted. O...,12,Monday,August
1,N.Varalakshmi: Hello\n,2022-08-15 12:54:00,N.Varalakshmi,Hello,12,Monday,August
2,N.Varalakshmi: Megha\n,2022-08-15 12:54:00,N.Varalakshmi,Megha,12,Monday,August
3,N.Varalakshmi: I am vara\n,2022-08-15 12:54:00,N.Varalakshmi,I am vara,12,Monday,August
4,Megha@sis: Hlo vara\n,2022-08-15 13:24:00,Megha@sis,Hlo vara,13,Monday,August
...,...,...,...,...,...,...,...
17636,Megha@sis: Hmm. Ok\n,2026-01-11 21:23:00,Megha@sis,Hmm. Ok,21,Sunday,January
17637,N.Varalakshmi: Haa\n,2026-01-11 21:23:00,N.Varalakshmi,Haa,21,Sunday,January
17638,N.Varalakshmi: \n,2026-01-12 20:30:00,N.Varalakshmi,,20,Monday,January
17639,Megha@sis: \n,2026-01-12 20:46:00,Megha@sis,,20,Monday,January


In [21]:
df['user'].value_counts()

Unnamed: 0_level_0,count
user,Unnamed: 1_level_1
N.Varalakshmi,9465
Megha@sis,8175
group_notification,1


In [22]:
df['message_length']=df['message'].str.len()
df.groupby('user')['message_length'].mean()  #average message length per user

Unnamed: 0_level_0,message_length
user,Unnamed: 1_level_1
Megha@sis,19.899083
N.Varalakshmi,14.150343
group_notification,117.0


In [23]:
# time based analysis
df['hour'].value_counts().sort_index()


Unnamed: 0_level_0,count
hour,Unnamed: 1_level_1
0,298
1,52
2,44
3,7
4,13
5,21
6,103
7,298
8,585
9,718


In [39]:
df['date'] = df['message_date'].dt.date
df.groupby('date').size()
#messages per day

Unnamed: 0_level_0,0
date,Unnamed: 1_level_1
2022-08-15,62
2022-08-16,135
2022-08-17,50
2022-08-18,56
2022-08-19,74
...,...
2026-01-08,7
2026-01-09,4
2026-01-10,18
2026-01-11,43


In [40]:
df['mon'] = df['message_date'].dt.to_period('M')
df.groupby('mon').size()
# messages per month

Unnamed: 0_level_0,0
mon,Unnamed: 1_level_1
2022-08,1275
2022-09,911
2022-10,778
2022-11,670
2022-12,410
2023-01,289
2023-02,222
2023-03,328
2023-04,1031
2023-05,433


In [41]:
df['hour'].value_counts().sort_index()
# messages per hour

Unnamed: 0_level_0,count
hour,Unnamed: 1_level_1
0,298
1,52
2,44
3,7
4,13
5,21
6,103
7,298
8,585
9,718


In [42]:
# weekday vs weekend activity

df['is_weekend'] = df['message_date'].dt.weekday >= 5
df['is_weekend'].value_counts()


Unnamed: 0_level_0,count
is_weekend,Unnamed: 1_level_1
False,12024
True,5617


In [24]:
df['weekday'].value_counts()


Unnamed: 0_level_0,count
weekday,Unnamed: 1_level_1
Saturday,3253
Friday,3072
Thursday,2538
Tuesday,2364
Sunday,2364
Wednesday,2087
Monday,1963


In [25]:
df

Unnamed: 0,user_message,message_date,user,message,hour,weekday,month,message_length
0,Messages and calls are end-to-end encrypted. O...,2022-08-15 12:54:00,group_notification,Messages and calls are end-to-end encrypted. O...,12,Monday,August,117
1,N.Varalakshmi: Hello\n,2022-08-15 12:54:00,N.Varalakshmi,Hello,12,Monday,August,5
2,N.Varalakshmi: Megha\n,2022-08-15 12:54:00,N.Varalakshmi,Megha,12,Monday,August,5
3,N.Varalakshmi: I am vara\n,2022-08-15 12:54:00,N.Varalakshmi,I am vara,12,Monday,August,9
4,Megha@sis: Hlo vara\n,2022-08-15 13:24:00,Megha@sis,Hlo vara,13,Monday,August,8
...,...,...,...,...,...,...,...,...
17636,Megha@sis: Hmm. Ok\n,2026-01-11 21:23:00,Megha@sis,Hmm. Ok,21,Sunday,January,7
17637,N.Varalakshmi: Haa\n,2026-01-11 21:23:00,N.Varalakshmi,Haa,21,Sunday,January,3
17638,N.Varalakshmi: \n,2026-01-12 20:30:00,N.Varalakshmi,,20,Monday,January,0
17639,Megha@sis: \n,2026-01-12 20:46:00,Megha@sis,,20,Monday,January,0


In [26]:
df['word_count'] = df['message'].apply(lambda x: len(x.split()))
df['word_count'].describe()


Unnamed: 0,word_count
count,17641.0
mean,2.414035
std,5.981509
min,0.0
25%,1.0
50%,2.0
75%,2.0
max,501.0


In [27]:
# media links and system messages

df['is_media'] = df['message'].str.contains('<Media omitted>')
df['is_media'].sum()


np.int64(2632)

In [28]:
df['is_link']=df['message'].str.contains('http')
df['is_link'].sum()

np.int64(223)

In [30]:
# media count of each user

df.groupby('user')['is_media'].sum()

Unnamed: 0_level_0,is_media
user,Unnamed: 1_level_1
Megha@sis,1297
N.Varalakshmi,1335
group_notification,0


In [31]:
df.groupby('user')['is_link'].sum()

Unnamed: 0_level_0,is_link
user,Unnamed: 1_level_1
Megha@sis,112
N.Varalakshmi,111
group_notification,0


In [32]:
from collections import Counter

words=' '.join(df['message']).lower().split()
Counter(words).most_common(10)

[('<media', 2632),
 ('omitted>', 2632),
 ('vara', 730),
 ('ok', 696),
 ('ha', 636),
 ('ki', 361),
 ('lo', 350),
 ('haa', 323),
 ('inka', 243),
 ('sare', 214)]

In [36]:
# most used emojis
!pip install emoji
import emoji

Collecting emoji
  Downloading emoji-2.15.0-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.15.0-py3-none-any.whl (608 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/608.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m276.5/608.4 kB[0m [31m8.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.15.0


In [38]:
emojis=[]

for msg in df['message']:
  emojis.extend([c for c in msg if c in emoji.EMOJI_DATA])
pd.Series(emojis).value_counts().head(10)

Unnamed: 0,count
😁,28
🥰,22
👍,19
❤,19
👋,16
😂,14
😊,8
😭,8
🥲,8
✅,7


In [43]:
# behaviour based questions

# average messages per user

df.groupby('user').value_counts().mean()

np.float64(1.1155305425572277)

In [44]:
# who replies fastest

df['response_time'] = df['message_date'].diff().dt.total_seconds()
df.groupby('user')['response_time'].mean().idxmin()


'Megha@sis'

In [None]:
# is chat activity increasing or decrasing

df.groupby(df['message_date'].dt.to_period('M')).size()