## Parsing WhatsApp and Telegram messages

1) Add WhatsApp messages to chats folder

In [1]:
# concat all files in chats directory [WhatsApp]

import os

path = 'raw_chats/whatsapp_raw'
files = os.listdir(path)
with open("raw_chats/output_file.txt", "w") as fo: 
    for infile in files:
        with open(os.path.join(path, infile)) as fin:
            for line in fin:
                fo.write(line)

In [None]:
# concat certain files in directory [WhatsApp]
# only if you want to use specific chats from WhatsApp

filenames = ['1.txt', '2.txt', '3.txt', '4.txt', "5.txt" ]

path = 'raw_chats/whatsapp_raw'

with open("raw_chats/output_file.txt", "w") as outfile:

    for filename in filenames:

        with open(path + filename) as infile:

            contents = infile.read()

            outfile.write(contents)

In [2]:
# To clean and convert a whatsapp txt file export to a CSV file

import pandas as pd

# read file by lines
file_path = "raw_chats/output_file.txt"
f = open(file_path, 'r')
data = f.readlines()
f.close()

# sanity stats
print('num lines: %s' %(len(data)))

# parse text and create list of lists structure
# remove first whatsapp info message
dataset = data[1:]
cleaned_data = []
for line in dataset:
    # grab the info and cut it out
    date = line.split(",")[0]
    line2 = line[len(date):]
    time = line2.split("-")[0][2:]
    line3 = line2[len(time):]
    name = line3.split(":")[0][4:]
    line4 = line3[len(name):]
    message = line4[6:-1] # strip newline charactor

    #print(date, time, name, message)
    cleaned_data.append([date, time, name, message])

    
# Create the DataFrame 
df = pd.DataFrame(cleaned_data, columns = ['Date', 'Time', 'Name', 'Message']) 

# check formatting 
if 0:
    print(df.head())
    print(df.tail())


# Save it
df.to_csv('raw_chats/converted_messages.csv', index=False)

num lines: 10


In [3]:
df = pd.read_csv ('raw_chats/converted_messages.csv')
print(df)

      Date    Time                Name  \
0  8/22/21  19:54   Alexander John Lee   
1  9/18/21  07:48   Alexander John Lee   
2  9/18/21  07:48   Alexander John Lee   
3  9/18/21  09:57             Vladimir   
4  9/19/21  22:11   Alexander John Lee   
5  9/19/21  22:13   Alexander John Lee   
6  9/19/21  22:19             Vladimir   
7  9/19/21  22:20             Vladimir   
8  9/19/21  22:34   Alexander John Lee   

                                             Message  
0                                                Hey  
1                                    <Media omitted>  
2                          check out this bot I made  
3                                         impressive  
4  I have a legal question: could an agreement be...  
5  I don't think it can be because an internation...  
6                                  Missed video call  
7       I have some ideas. Call me when you’re free.  
8                               Let's talk tomorrow.  


In [4]:
df.drop(['Date', 'Time'], axis=1, inplace = True)

In [5]:
df.columns = ['name', 'line']

In [6]:
df

Unnamed: 0,name,line
0,Alexander John Lee,Hey
1,Alexander John Lee,<Media omitted>
2,Alexander John Lee,check out this bot I made
3,Vladimir,impressive
4,Alexander John Lee,I have a legal question: could an agreement be...
5,Alexander John Lee,I don't think it can be because an internation...
6,Vladimir,Missed video call
7,Vladimir,I have some ideas. Call me when you’re free.
8,Alexander John Lee,Let's talk tomorrow.


In [7]:
df.to_csv("raw_chats/messages.csv", index = False)

In [8]:
# remove empty lines (if any exist)
import csv


with open('raw_chats/messages.csv') as in_file:
    with open('raw_chats/messages1.csv', 'w') as out_file:
        writer = csv.writer(out_file)
        for row in csv.reader(in_file):
            if any(row):
                writer.writerow(row)


## cleaning WhatsApp messages - remove "Media ommitted" lines

In [9]:
# delete anyline that has this phrase in it: <Media ommitted> (this message was an image)
# We open the source file and get its lines
with open('raw_chats/messages1.csv', 'r') as inp:
    lines = inp.readlines()

# We open the target file in write-mode
with open('csv/parsed/wa-messages.csv', 'w') as out:
    # We go line by line writing in the target file
    # if the original line does not include the
    # strings 'py-board' or 'coffee'
    for line in lines:
        if not '<Media omitted>' in line and not '/' in line:
            out.write(line)

## 2) Add result.json to the raw_chats/telegram_raw directory

In [10]:
!python3 telegramParser.py raw_chats/telegram_raw/result.json dump.csv

Parsing Completed


In [11]:
import pandas as pd
import glob

path = "csv" 
all_files = glob.glob(path + "/*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

df = pd.concat(li, axis=0, ignore_index=True)

In [12]:
df = df.filter(['sender', 'msg_content'])
df.columns = ['name', 'line']

In [13]:
df.to_csv("csv/parsed/telegram-messages.csv", index = False)

In [14]:
df

Unnamed: 0,name,line
0,Alexander Lee,Hey Vlad!!
1,Vladimir Putin,Hello Alexander
2,Alexander Lee,How's Moscow?!
3,Vladimir Putin,It's nice this time of year.
4,Vladimir Putin,very nice
5,Vladimir Putin,How are you?
6,Alexander Lee,I'm doing well. Looking forward to being back ...
7,Vladimir Putin,Looking forward to see you again my friend.
8,Alexander Lee,"Thanks man, you too"


In [15]:
import pandas as pd
import glob

path = "csv/parsed" 
all_files = glob.glob(path + "/*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

df = pd.concat(li, axis=0, ignore_index=True)

In [16]:
df

Unnamed: 0,name,line
0,Alexander John Lee,Hey
1,Alexander John Lee,check out this bot I made
2,Vladimir,impressive
3,Alexander John Lee,I have a legal question: could an agreement be...
4,Alexander John Lee,I don't think it can be because an internation...
5,Vladimir,Missed video call
6,Vladimir,I have some ideas. Call me when you’re free.
7,Alexander John Lee,Let's talk tomorrow.
8,Alexander Lee,Hey
9,Alexander Lee,check out this bot I made


In [17]:
df.to_csv("csv/parsed/all-messages.csv", index = False)

In [18]:
with open('csv/parsed/all-messages.csv', 'r') as inp:
    lines = inp.readlines()

# We open the target file in write-mode
with open('csv/parsed/all-messages1.csv', 'w') as out:
    # We go line by line writing in the target file
    # if the original line does not include the
    # strings 'py-board' or 'coffee'
    for line in lines:
        if not '(File not included. Change data exporting settings to download.)' in line and not '/' in line:
            out.write(line)

In [19]:
df = pd.read_csv('csv/parsed/all-messages1.csv')
df.to_csv("csv/parsed/all-messages.csv", index = False)

In [20]:
with open('csv/parsed/all-messages.csv', 'r') as inp:
    lines = inp.readlines()

# We open the target file in write-mode
with open('csv/parsed/all-messages1.csv', 'w') as out:
    # We go line by line writing in the target file
    # if the original line does not include the
    # strings 'py-board' or 'coffee'
    for line in lines:
        if not 'Messages and calls are end-to-end encrypted' in line and not '/' in line:
            out.write(line)

## Removing Emojis (this is optional)

In [21]:
import re

with open('csv/parsed/all-messages.csv', 'r') as file :
  filedata = file.read()

# Replace the target string
filedata = filedata.replace('+1 (865) 804-1446', 'Mom')
filedata = filedata.replace('Alexander John Lee', 'Alexander Lee')


emoji_pattern = re.compile("["
   "\U0001F600-\U0001F64F"
   "\U0001F300-\U0001F5FF"
   "\U0001F680-\U0001F6FF"
   "\U0001F1E0-\U0001F1FF"
   # flags(iOS)
   "]+", flags = re.UNICODE)

filedata = emoji_pattern.sub('haha', filedata)

# Write the file out again
with open('csv/parsed/all-messages.csv', 'w') as file:
  file.write(filedata)

## Removing lines with less than 6 characters (formatting)

In [22]:
text = ''

with open('csv/parsed/all-messages.csv', 'r') as f:
    strip = str.rstrip
    text += '\n'.join([line for line in f if len(strip(line, '\n')) >= 6]).rstrip('\n')

with open('csv/parsed/all-messages1.csv', 'w') as file:
    file.write(text)

## Removing empty lines

In [23]:
with open('csv/parsed/all-messages1.csv') as in_file:
    with open('csv/parsed/all-messages.csv', 'w') as out_file:
        writer = csv.writer(out_file)
        for row in csv.reader(in_file):
            if any(row):
                writer.writerow(row)


## Convert csv to txt

In [24]:
import csv
csv_file = "csv/parsed/all-messages.csv"
txt_file = "csv/parsed/all-messages.txt"

with open(txt_file, "w") as my_output_file:
   with open(csv_file, "r") as my_input_file: [my_output_file.write(": ".join(row) + '\n') for row in csv.reader(my_input_file)]
my_output_file.close()


In [25]:
# @dev

# 1)
# clean up txt file: change phone numbers to names
# make telegram and whatsapp usernames the same 

# 2) 
# remove emojis
# remove foreign languages or translate text. 

In [26]:
# translate foreign language to English if you are bi or trilingual

In [27]:
import re
import pandas as pd

pattern = r'([a-zA-Zа-яА-Я\s]+):(.+)'

data = {
    'name': [],
    'line': []
}

with open('csv/parsed/all-messages.txt', 'rt') as file:
    for line in file.readlines():
        match = re.findall(pattern,line)
        if match:
            name, line = match[0]
            data['name'].append(name)
            data['line'].append(line)
            
df = pd.DataFrame(data)


In [28]:
df

Unnamed: 0,name,line
0,name,line
1,Alexander Lee,Hey
2,Alexander Lee,check out this bot I made
3,Vladimir,impressive
4,Alexander Lee,I have a legal question: could an agreement b...
5,Alexander Lee,I don't think it can be because an internatio...
6,Vladimir,Missed video call
7,Vladimir,I have some ideas. Call me when you’re free.
8,Alexander Lee,Let's talk tomorrow.
9,Alexander Lee,Hey


In [29]:
df.to_csv("all-messages.csv", index = False)

In [30]:
# convert csv to txt
import csv
csv_file = "all-messages.csv"
txt_file = "all-messages.txt"

with open(txt_file, "w") as my_output_file:
   with open(csv_file, "r") as my_input_file: [my_output_file.write(": ".join(row) + '\n') for row in csv.reader(my_input_file)]
my_output_file.close()

## Translate messages that are not in English

In [31]:
!pip install deep_translator

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [32]:
from tqdm import tqdm
from deep_translator import GoogleTranslator

def isEnglish(s):
    return s.isascii()

file = open('all-messages.txt', 'r')
lines = file.readlines()

with open('all-messages1.txt', 'a') as f:
     
    for line in tqdm(lines):
        
        result = isEnglish(line)
        
        if result == False:        
            translated = GoogleTranslator(source='auto', target='en').translate(line)
            f.write(translated + '\n') 
        else:
            f.write(line + '\n')


100%|██████████| 53/53 [00:00<00:00, 113.69it/s]


# Formating

In [33]:
import re
import pandas as pd

pattern = r'([a-zA-Z\s]+):(.+)'

data = {
    'name': [],
    'line': []
}

with open('all-messages1.txt', 'rt') as file:
    for line in file.readlines():
        match = re.findall(pattern,line)
        if match:
            name, line = match[0]
            data['name'].append(name)
            data['line'].append(line)
            
df = pd.DataFrame(data)

df

Unnamed: 0,name,line
0,name,line
1,name,line
2,Alexander Lee,Hey
3,Alexander Lee,check out this bot I made
4,Vladimir,impressive
5,Alexander Lee,I have a legal question: could an agreement ...
6,Alexander Lee,I don't think it can be because an internati...
7,Vladimir,Missed video call
8,Vladimir,I have some ideas. Call me when you’re free.
9,Alexander Lee,Let's talk tomorrow.


In [34]:
df.to_csv("all-messages.csv", index = False)

In [35]:
!rm all-messages.txt
!rm all-messages1.txt

## All done! Now you can train your model! 
Open trainModel.ipynb in Google Colab and upload all-messages.csv