# Whats app analysis

In [61]:
import pandas as pd
import numpy as np
import re
import dateparser
from collections import Counter
import matplotlib.pyplot as plt
plt.style.use('ggplot') 

def read_file(file):
    '''Reads Whatsapp text file into a list of strings''' 

    x = open(file,'r', encoding = 'utf-8') #Opens the text file into variable x but the variable cannot be explored yet
    y = x.read() #By now it becomes a huge chunk of string that we need to separate line by line
    content = y.splitlines() #The splitline method converts the chunk of string into a list of strings
    return content

chat = read_file('Sachin Sonavane.txt')
print(len(chat))


join = [line for line in chat if  "joined using this" in line]


chat

9946


['8/10/18, 11:17 - Messages to this chat and calls are now secured with end-to-end encryption. Tap for more info.',
 '8/12/18, 20:15 - @mahesh23_s: <Media omitted>',
 '8/12/18, 20:15 - @mahesh23_s: Accenture madhe Astana',
 '8/13/18, 01:12 - @mahesh23_s: send me',
 '8/13/18, 01:12 - @mahesh23_s: manisha',
 '8/13/18, 01:12 - @mahesh23_s: kundlai',
 '8/13/18, 09:15 - Sachin Sonavane: <Media omitted>',
 '8/13/18, 09:16 - Sachin Sonavane: <Media omitted>',
 '8/13/18, 09:24 - @mahesh23_s: thnx',
 '8/13/18, 12:31 - @mahesh23_s: pls pay 7lack',
 '8/13/18, 12:31 - @mahesh23_s: as prepayment',
 '8/13/18, 12:31 - @mahesh23_s: at earliest',
 '8/13/18, 12:31 - Sachin Sonavane: Aaj karu ka',
 '8/13/18, 12:32 - @mahesh23_s: yes',
 '8/13/18, 12:32 - @mahesh23_s: kar',
 '8/13/18, 12:32 - Sachin Sonavane: Ok',
 '8/13/18, 12:32 - @mahesh23_s: ani kiti',
 '8/13/18, 12:32 - @mahesh23_s: mahine',
 '8/13/18, 12:32 - @mahesh23_s: rahatata',
 '8/13/18, 12:32 - @mahesh23_s: te vichar',
 '8/13/18, 12:33 - Sachi

In [62]:
 #Remove new lines
chat = [line.strip() for line in chat]
print("length of chat is:")
print(len(chat))

length of chat is:
9946


In [63]:
#Clean out the join notification lines
clean_chat = [line for line in chat if not "joined using this" in line]

#Further cleaning
#Remove empty lines
clean_chat = [line for line in clean_chat if len(line) > 1]
print("length of clean_chat is:")
print(len(clean_chat)) 

length of clean_chat is:
9706


In [64]:
left = [line for line in clean_chat if line.endswith("left")]
left 

['8. Turn left']

In [65]:
#Clean out the left notification lines
clean_chat = [line for line in clean_chat if not line.endswith("left")]
print(len(clean_chat))

9705


In [66]:
#Merge messages that belong together
msgs = [] #message container
pos = 0 #counter for position of msgs in the container
"""
Flow:
For every line, see if it matches the expression which is starting with the format "number(s)+slash" eg "12/"
If it does, it is a new line of conversion as they begin with dates, add it to msgs container
Else, it is a continuation of the previous line, add it to the previous line and append to msgs, then pop previous line.
"""
for line in clean_chat:
    if re.findall("\A\d+[/]", line):
        msgs.append(line)
        pos += 1
    else:
        take = msgs[pos-1] + ". " + line
        msgs.append(take)
        msgs.pop(pos-1)


In [68]:
time = [msgs[i].split(',')[1].split('-')[0] for i in range(len(msgs))]
time = [s.strip(' ') for s in time] # Remove spacing
time

IndexError: list index out of range

In [78]:
msgs[i].split('-')[1].split(':')[0]

' @mahesh23_s'

In [80]:
date = [msgs[i].split(',')[0] for i in range(len(msgs))]
len(date) 

name = [msgs[i].split('-')[1].split(':')[0] for i in range(len(msgs))]
len(name) 

content = []
for i in range(len(msgs)):
    try:
        content.append(msgs[i].split(':')[2])
    except IndexError:
        content.append('Missing Text')
len(content)

IndexError: list index out of range

In [87]:
msgs[500].split('-')[1].split(':')[0]

' @mahesh23_s'

In [52]:
df = pd.DataFrame(list(zip(date, time, name, content)), columns = ['Date', 'Time', 'Name', 'Content'])

In [16]:
df = df[df["Content"]!='Missing Text']
df.reset_index(inplace=True, drop=True)
df

Unnamed: 0,Date,Time,Name,Content
0,5/16/20,14:41,+971 52 766 2655,Thanks for adding me the group @919791449719....
1,5/16/20,14:41,+91 97914 49719,Follow this link to join my WhatsApp group
2,5/16/20,14:42,+91 97914 49719,You are welcome👍🏻
3,5/16/20,14:43,+91 97914 49719,If you want to add any of your friends kindly...
4,5/16/20,14:43,+971 52 766 2655,Surely
...,...,...,...,...
208,5/29/20,15:53,+91 88852 33238,Requirement is not to change to field data ty...
209,5/29/20,16:27,+91 97914 49719,Are you opening the exported file in excel or...
210,5/29/20,16:28,+91 88852 33238,Its exported as csv
211,5/29/20,16:29,+91 82973 34445,Can you please share the test data here then ...


In [17]:
df['DateTime'] = pd.to_datetime(df['Date'] + ' ' + df['Time'])
df['DateTime']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['DateTime'] = pd.to_datetime(df['Date'] + ' ' + df['Time'])


0     2020-05-16 14:41:00
1     2020-05-16 14:41:00
2     2020-05-16 14:42:00
3     2020-05-16 14:43:00
4     2020-05-16 14:43:00
              ...        
208   2020-05-29 15:53:00
209   2020-05-29 16:27:00
210   2020-05-29 16:28:00
211   2020-05-29 16:29:00
212   2020-05-29 16:54:00
Name: DateTime, Length: 213, dtype: datetime64[ns]

In [18]:
df['weekday'] = df['DateTime'].apply(lambda x: x.day_name()) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['weekday'] = df['DateTime'].apply(lambda x: x.day_name())


In [19]:
df['Letter_Count'] = df['Content'].apply(lambda s : len(s))
df['Word_Count'] = df['Content'].apply(lambda s : len(s.split(' ')))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Letter_Count'] = df['Content'].apply(lambda s : len(s))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Word_Count'] = df['Content'].apply(lambda s : len(s.split(' ')))


In [20]:
df['Hour'] = df['Time'].apply(lambda x : x.split(':')[0]) 
# The first token of a value in the Time Column contains the hour (Eg., "12" in "12:15")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Hour'] = df['Time'].apply(lambda x : x.split(':')[0])


In [21]:
#print first five rows of our dataframe
df.head()

Unnamed: 0,Date,Time,Name,Content,DateTime,weekday,Letter_Count,Word_Count,Hour
0,5/16/20,14:41,+971 52 766 2655,Thanks for adding me the group @919791449719....,2020-05-16 14:41:00,Saturday,70,12,14
1,5/16/20,14:41,+91 97914 49719,Follow this link to join my WhatsApp group,2020-05-16 14:41:00,Saturday,43,9,14
2,5/16/20,14:42,+91 97914 49719,You are welcome👍🏻,2020-05-16 14:42:00,Saturday,18,4,14
3,5/16/20,14:43,+91 97914 49719,If you want to add any of your friends kindly...,2020-05-16 14:43:00,Saturday,75,16,14
4,5/16/20,14:43,+971 52 766 2655,Surely,2020-05-16 14:43:00,Saturday,7,2,14


In [22]:
fileName = "Power BI Developers.txt"
fileName

'Power BI Developers.txt'

In [23]:
filePath = 'C:\\Users\\msonavane\\Downloads\whatsapp\Power BI Developers.txt'

SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape (<ipython-input-23-7c32e38078d2>, line 1)

In [32]:
import os
path= os.getcwd()
path

'C:\\Users\\msonavane\\Downloads\\whatsapp'

In [33]:
print(os.path)

<module 'ntpath' from 'c:\\users\\msonavane\\appdata\\local\\programs\\python\\python38\\lib\\ntpath.py'>


In [35]:
print(os.path.join(path,"file.txt")) 

C:\Users\msonavane\Downloads\whatsapp\file.txt


In [55]:
filename = "abc.txt"

In [60]:
filename.split('.')[0]+'.csv'

'abc.csv'