## Imports

In [226]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re

## Preprocessing Data

In [227]:
test_path = './data/test_data.txt'
data_path = './data/wc_marya.txt'
# data_path = './data/wc_adeel.txt'
# data_path = './data/wc_amalik.txt'
pf_path = './data/wc_p.csv'
ff_path = './data/by_pd.csv'

In [228]:
# create the list of all the messages pre-processed
msg_lists = list()
i=0
with open(data_path, 'r') as f:
    for line in f.readlines():
        if re.match(r'./.*/.*, .*- .*:', line) is None:
            msg_lists[i-1].append(line)
            continue
        else:
            r2 = line.replace(' - ', ',').replace(': ',',').replace('\n',' ')
            ftrs = r2.split(',')
            msg_lists.append(ftrs)
            i+=1

In [229]:
# write the msgs in csv file
with open(pf_path, 'w') as nf:
    nf.write('date,time,user,msg\n')
    for li in msg_lists:
        # print(len(li))
        f_str = "".join(li[3:]).strip().replace('\n',' ').replace(',',':')
        nr = f'{li[0]},{li[1].strip()},{li[2]},"{f_str}"\n'
        # print(nr)
        nf.write(nr)

In [230]:
# read that csv
chat_data = pd.read_csv(pf_path,
                        skipinitialspace=True,
                        parse_dates={'datetime': [0, 1]})

In [231]:
chat_data.head()

Unnamed: 0,datetime,user,msg
0,2020-01-06 14:46:00,Marya NUST-Uni,Poochti hoon
1,2020-01-06 14:46:00,Marya NUST-Uni,Poochke tumhe bataun?
2,2020-01-06 14:46:00,SubTain Malik,R diltaj se b
3,2020-01-06 14:46:00,SubTain Malik,G
4,2020-01-06 14:46:00,Marya NUST-Uni,Okay


In [232]:
# chat_data
# chat_data.loc[18].msg

In [233]:
# save the csv by pandas
chat_data.to_csv(ff_path, index=False)

## Explore the data

In [234]:
# read the new_data 
ff_path = './data/by_pd.csv'
chat_data = pd.read_csv(ff_path, parse_dates=['datetime'])

In [235]:
chat_data.head()
# chat_data.loc[98]

Unnamed: 0,datetime,user,msg
0,2020-01-06 14:46:00,Marya NUST-Uni,Poochti hoon
1,2020-01-06 14:46:00,Marya NUST-Uni,Poochke tumhe bataun?
2,2020-01-06 14:46:00,SubTain Malik,R diltaj se b
3,2020-01-06 14:46:00,SubTain Malik,G
4,2020-01-06 14:46:00,Marya NUST-Uni,Okay


In [236]:
# check col infos
chat_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39999 entries, 0 to 39998
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   datetime  39999 non-null  datetime64[ns]
 1   user      39999 non-null  object        
 2   msg       39999 non-null  object        
dtypes: datetime64[ns](1), object(2)
memory usage: 937.6+ KB


In [237]:
# check useful insights of data
chat_data.describe()

  


Unnamed: 0,datetime,user,msg
count,39999,39999,39999
unique,11615,2,27138
top,2020-03-29 06:47:00,SubTain Malik,Muuuuuwwwaaaaahhhhhhhhhhhhhh
freq,23,23365,603
first,2020-01-06 14:46:00,,
last,2020-08-22 17:32:00,,


## Calculate Statistics

In [238]:
# msgs by the person
chat_data.user.value_counts()
# function for time conversion
c_time_format = lambda x: x.to_pydatetime().strftime('%d %B %Y, %I:%M %p')

In [239]:
details = chat_data.describe()
users = list(chat_data.user.unique())
user_msgs = chat_data.user.value_counts()

total_msgs = details.loc['count'][0]
total_msgs_u1 = user_msgs.loc[users[0]]
total_msgs_u2 = user_msgs.loc[users[1]]

conv_start = c_time_format(details.loc['first'][0])
conv_end = c_time_format(details.loc['last'][0])
most_msgs_at = c_time_format(details.loc['top'][0])

most_sent_msg = details.loc['top'][2]
if most_sent_msg.lower() == '<media omitted>':
    most_sent_msg = 'Media Files'

  """Entry point for launching an IPython kernel.


In [240]:
print(f'Total Messages: {total_msgs}\nMessages by {users[0]}: {total_msgs_u1}\nMessages by {users[1]}: {total_msgs_u2}')
print(f'Conversation Started: {conv_start}\nConversation Ended: {conv_end}')
print(f'Most Sent Messsage: {most_sent_msg}\nMost Messages Sent at: {most_msgs_at}')

Total Messages: 39999
Messages by Marya NUST-Uni: 16634
Messages by SubTain Malik: 23365
Conversation Started: 06 January 2020, 02:46 PM
Conversation Ended: 22 August 2020, 05:32 PM
Most Sent Messsage: Muuuuuwwwaaaaahhhhhhhhhhhhhh
Most Messages Sent at: 29 March 2020, 06:47 AM


In [241]:
chat_data['msg_len'] = chat_data['msg'].agg(len)
longest_msgs = chat_data.groupby('user').msg_len.max()
avg_msgs = chat_data.groupby('user').msg_len.mean()

In [242]:
print(f'Longest message by {users[0]}: {longest_msgs[users[0]]} Characters\
\nLongest message by {users[1]}: {longest_msgs[users[1]]} Characters')
print(f'Average message length by {users[0]}: {avg_msgs[users[0]]:.1f} Characters\
\nAverage message length by {users[1]}: {avg_msgs[users[1]]:.1f} Characters')
print(f'Overall average message length {(avg_msgs[users[0]] + avg_msgs[users[1]])/ 2 :.2f} Characters')
avg_msgs

Longest message by Marya NUST-Uni: 226 Characters
Longest message by SubTain Malik: 2220 Characters
Average message length by Marya NUST-Uni: 21.5 Characters
Average message length by SubTain Malik: 23.9 Characters
Overall average message length 22.74 Characters


user
Marya NUST-Uni    21.547673
SubTain Malik     23.933661
Name: msg_len, dtype: float64

In [243]:
chat_data['day'] = chat_data['datetime'].dt.day_name()
chat_data['week_num'] = chat_data['datetime'].dt.isocalendar().week
chat_data['year'] = chat_data['datetime'].dt.year
chat_data['time'] = chat_data.datetime - chat_data.datetime.dt.normalize()
chat_data['time_spans'] = pd.cut(chat_data.time,
                                 bins=5,
                                 labels=['Late Night', 'Early Morning', 'Day Time', 'Evening', 'Night'])

In [244]:
chat_data.head()

Unnamed: 0,datetime,user,msg,msg_len,day,week_num,year,time,time_spans
0,2020-01-06 14:46:00,Marya NUST-Uni,Poochti hoon,12,Monday,2,2020,0 days 14:46:00,Evening
1,2020-01-06 14:46:00,Marya NUST-Uni,Poochke tumhe bataun?,21,Monday,2,2020,0 days 14:46:00,Evening
2,2020-01-06 14:46:00,SubTain Malik,R diltaj se b,13,Monday,2,2020,0 days 14:46:00,Evening
3,2020-01-06 14:46:00,SubTain Malik,G,1,Monday,2,2020,0 days 14:46:00,Evening
4,2020-01-06 14:46:00,Marya NUST-Uni,Okay,4,Monday,2,2020,0 days 14:46:00,Evening


In [249]:
msgs_u_day = chat_data.groupby('user').day.value_counts()
for user in users:
    print(f'Messages per Day by {user}:')
    u_msgs_day = msgs_u_day.loc[user]
    for d, num_m in zip(u_msgs_day.index, u_msgs_day):
        print(f'  {d}:{num_m}')
    print('')

Messages per Day by Marya NUST-Uni:
  Friday:3087
  Monday:2816
  Saturday:2774
  Sunday:2448
  Thursday:2134
  Wednesday:1760
  Tuesday:1615

Messages per Day by SubTain Malik:
  Friday:4230
  Monday:4059
  Saturday:3981
  Sunday:3401
  Thursday:3056
  Wednesday:2379
  Tuesday:2259



In [250]:
mostly_tm = chat_data.time_spans.value_counts(ascending=False)
most_msgs_time = mostly_tm.index[0]
most_time_msgs = mostly_tm.iloc[0]
print(f'Mostly talked at: {most_msgs_time} "{most_time_msgs}"')
print('\n')
msgs_u_time = chat_data.groupby('user').time_spans.value_counts()
for user in users:
    print(f'Messages per Day by {user}:')
    u_msgs_time = msgs_u_time.loc[user]
    for t, num_m in zip(u_msgs_time.index, u_msgs_time):
        print(f'  {t}:{num_m}')
    print('')

Mostly talked at: Early Morning "11712"


Messages per Day by Marya NUST-Uni:
  Early Morning:4866
  Day Time:4629
  Night:3154
  Evening:2935
  Late Night:1050

Messages per Day by SubTain Malik:
  Early Morning:6846
  Day Time:6528
  Evening:4119
  Night:4119
  Late Night:1753



In [251]:
avg_msgs_day = chat_data.groupby(['year']).day.value_counts() / 34
print(f'Average messages per day: {avg_msgs_day.mean():.2f}')
l_msg = chat_data.sort_values(by='msg_len', ascending=False).iloc[0]
print(f'Longest message at: {l_msg.day} '\
      f'{l_msg.time_spans} {l_msg.datetime.strftime("%I %p")}')

Average messages per day: 168.06
Longest message at: Friday Late Night 04 AM


**Features to add**
* TONE IN MSGS (happy sad angry (bar chart of users) NLP {Sentiment Analysis})