## Imports

In [11]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re

## Preprocessing Data

In [49]:
test_path = './data/test_data.txt'
# data_path = './data/wc_marya.txt'
# data_path = './data/wc_adeel.txt'
data_path = './data/wc_amalik.txt'
pf_path = './data/wc_p.csv'
ff_path = './data/by_pd.csv'

In [50]:
# create the list of all the messages pre-processed
msg_lists = list()
i=0
with open(data_path, 'r') as f:
    for line in f.readlines():
        if re.match(r'./.*/.*, .*- .*:', line) is None:
            msg_lists[i-1].append(line)
            continue
        else:
            r2 = line.replace(' - ', ',').replace(': ',',').replace('\n',' ')
            ftrs = r2.split(',')
            msg_lists.append(ftrs)
            i+=1

In [51]:
# write the msgs in csv file
with open(pf_path, 'w') as nf:
    nf.write('date,time,user,msg\n')
    for li in msg_lists:
        # print(len(li))
        f_str = "".join(li[3:]).strip().replace('\n',' ').replace(',',':')
        nr = f'{li[0]},{li[1].strip()},{li[2]},"{f_str}"\n'
        # print(nr)
        nf.write(nr)

In [52]:
# read that csv
chat_data = pd.read_csv(pf_path,
                        skipinitialspace=True,
                        parse_dates={'datetime': [0, 1]})

In [53]:
chat_data.head()

Unnamed: 0,datetime,user,msg
0,2019-08-28 02:35:00,Malik,t ap isy glt adt khty
1,2019-08-28 02:35:00,SubTain Malik,Mtlb mnany ka easy hal
2,2019-08-28 02:36:00,SubTain Malik,Kxh b ho
3,2019-08-28 02:36:00,Malik,sahe sahe
4,2019-08-28 02:36:00,SubTain Malik,Thora hot kryn thora seduce kryn r adha ghnta ...


In [54]:
# chat_data
# chat_data.loc[18].msg

In [55]:
# save the csv by pandas
chat_data.to_csv(ff_path, index=False)

## Explore the data

In [56]:
# read the new_data 
ff_path = './data/by_pd.csv'
chat_data = pd.read_csv(ff_path, parse_dates=['datetime'])

In [57]:
chat_data.head()
# chat_data.loc[98]

Unnamed: 0,datetime,user,msg
0,2019-08-28 02:35:00,Malik,t ap isy glt adt khty
1,2019-08-28 02:35:00,SubTain Malik,Mtlb mnany ka easy hal
2,2019-08-28 02:36:00,SubTain Malik,Kxh b ho
3,2019-08-28 02:36:00,Malik,sahe sahe
4,2019-08-28 02:36:00,SubTain Malik,Thora hot kryn thora seduce kryn r adha ghnta ...


In [58]:
# check col infos
chat_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35683 entries, 0 to 35682
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   datetime  35683 non-null  datetime64[ns]
 1   user      35683 non-null  object        
 2   msg       35683 non-null  object        
dtypes: datetime64[ns](1), object(2)
memory usage: 836.4+ KB


In [59]:
# check useful insights of data
chat_data.describe()

  


Unnamed: 0,datetime,user,msg
count,35683,35683,35683
unique,8318,2,25294
top,2020-05-07 12:30:00,Malik,G
freq,31,19516,488
first,2019-08-28 02:35:00,,
last,2020-08-29 18:28:00,,


## Calculate Statistics

In [60]:
# msgs by the person
chat_data.user.value_counts()
# function for time conversion
c_time_format = lambda x: x.to_pydatetime().strftime('%d %B %Y, %I:%M %p')

In [61]:
details = chat_data.describe()
users = list(chat_data.user.unique())
user_msgs = chat_data.user.value_counts()

total_msgs = details.loc['count'][0]
total_msgs_u1 = user_msgs.loc[users[0]]
total_msgs_u2 = user_msgs.loc[users[1]]

conv_start = c_time_format(details.loc['first'][0])
conv_end = c_time_format(details.loc['last'][0])
most_msgs_at = c_time_format(details.loc['top'][0])

most_sent_msg = details.loc['top'][2]
if most_sent_msg.lower() == '<media omitted>':
    most_sent_msg = 'Media Files'

  """Entry point for launching an IPython kernel.


In [62]:
print(f'Total Messages: {total_msgs}\nMessages by {users[0]}: {total_msgs_u1}\nMessages by {users[1]}: {total_msgs_u2}')
print(f'Conversation Started: {conv_start}\nConversation Ended: {conv_end}')
print(f'Most Sent Messsage: {most_sent_msg}\nMost Messages Sent at: {most_msgs_at}')

Total Messages: 35683
Messages by Malik: 19516
Messages by SubTain Malik: 16167
Conversation Started: 28 August 2019, 02:35 AM
Conversation Ended: 29 August 2020, 06:28 PM
Most Sent Messsage: G
Most Messages Sent at: 07 May 2020, 12:30 PM


In [63]:
chat_data['msg_len'] = chat_data['msg'].agg(len)
longest_msgs = chat_data.groupby('user').msg_len.max()
avg_msgs = chat_data.groupby('user').msg_len.mean()

In [64]:
print(f'Longest message by {users[0]}: {longest_msgs[users[0]]} Characters\
\nLongest message by {users[1]}: {longest_msgs[users[1]]} Characters')
print(f'Average message length by {users[0]}: {avg_msgs[users[0]]:.1f} Characters\
\nAverage message length by {users[1]}: {avg_msgs[users[1]]:.1f} Characters')
print(f'Overall average message length {(avg_msgs[users[0]] + avg_msgs[users[1]])/ 2 :.2f} Characters')
avg_msgs

Longest message by Malik: 2058 Characters
Longest message by SubTain Malik: 215496 Characters
Average message length by Malik: 20.8 Characters
Average message length by SubTain Malik: 32.7 Characters
Overall average message length 26.71 Characters


user
Malik            20.767729
SubTain Malik    32.658378
Name: msg_len, dtype: float64

In [65]:
chat_data['day'] = chat_data['datetime'].dt.day_name()
chat_data['week_num'] = chat_data['datetime'].dt.isocalendar().week
chat_data['year'] = chat_data['datetime'].dt.year
chat_data['time'] = chat_data.datetime - chat_data.datetime.dt.normalize()
chat_data['time_spans'] = pd.cut(chat_data.time,
                                 bins=5,
                                 labels=['Late Night', 'Early Morning', 'Day Time', 'Evening', 'Night'])

In [66]:
chat_data.head()

Unnamed: 0,datetime,user,msg,msg_len,day,week_num,year,time,time_spans
0,2019-08-28 02:35:00,Malik,t ap isy glt adt khty,21,Wednesday,35,2019,0 days 02:35:00,Late Night
1,2019-08-28 02:35:00,SubTain Malik,Mtlb mnany ka easy hal,22,Wednesday,35,2019,0 days 02:35:00,Late Night
2,2019-08-28 02:36:00,SubTain Malik,Kxh b ho,8,Wednesday,35,2019,0 days 02:36:00,Late Night
3,2019-08-28 02:36:00,Malik,sahe sahe,9,Wednesday,35,2019,0 days 02:36:00,Late Night
4,2019-08-28 02:36:00,SubTain Malik,Thora hot kryn thora seduce kryn r adha ghnta ...,73,Wednesday,35,2019,0 days 02:36:00,Late Night


In [67]:
msgs_u_day = chat_data.groupby('user').day.value_counts()
for user in users:
    print(f'Messages per Day by {user}:')
    u_msgs_day = msgs_u_day.loc[user]
    for d, num_m in zip(u_msgs_day.index, u_msgs_day):
        print(f'  {d}:{num_m}')
    print('')

Messages per Day by Malik:
  Thursday:4569
  Monday:3313
  Wednesday:3226
  Sunday:2858
  Friday:2076
  Saturday:1959
  Tuesday:1515

Messages per Day by SubTain Malik:
  Thursday:3907
  Wednesday:3032
  Monday:2731
  Sunday:2177
  Saturday:1593
  Friday:1535
  Tuesday:1192



In [68]:
mostly_tm = chat_data.time_spans.value_counts(ascending=False)
most_msgs_time = mostly_tm.index[0]
most_time_msgs = mostly_tm.iloc[0]
print(f'Mostly talked at: {most_msgs_time} "{most_time_msgs}"')
print('\n')

msgs_u_time = chat_data.groupby('user').time_spans.value_counts()
for user in users:
    print(f'Messages per Day by {user}:')
    u_msgs_time = msgs_u_time.loc[user]
    for t, num_m in zip(u_msgs_time.index, u_msgs_time):
        print(f'  {t}:{num_m}')
    print('')

Mostly talked at: Day Time "9998"


Messages per Day by Malik:
  Day Time:5296
  Early Morning:4418
  Night:4102
  Late Night:3051
  Evening:2649

Messages per Day by SubTain Malik:
  Day Time:4702
  Early Morning:3675
  Night:3077
  Late Night:2755
  Evening:1958



In [69]:
avg_msgs_day = chat_data.groupby(['year']).day.value_counts() / 34
print(f'Average messages per day: {avg_msgs_day.mean():.2f}')
l_msg = chat_data.sort_values(by='msg_len', ascending=False).iloc[0]
print(f'Longest message at: {l_msg.day} '\
      f'{l_msg.time_spans} {l_msg.datetime.strftime("%I %p")}')

Average messages per day: 74.96
Longest message at: Sunday Night 07 PM


**Features to add**
* TONE IN MSGS (happy sad angry (bar chart of users) NLP {Sentiment Analysis})