# Whatsapp Chat Analyzer

This notebook generates some interesting data visualizations from a WhatsApp group chat history.

It takes the exported group chat history text file as input.

To export a group chat history in WhatsApp, go to the group chat > Settings > More > Export Chat > Without Media.

You need the following libraries in your Python environment:
- nltk (also download the nltk data packages `stopwords` and `vader_lexicon`)
- pandas
- numpy
- matplotlib


Reference: https://towardsdatascience.com/build-your-own-whatsapp-chat-analyzer-9590acca9014

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from pprint import pprint
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import RegexpTokenizer
%matplotlib inline

#### Defining some static variables for regular expression

In [None]:
# fig, axs = plt.subplots(60,1)
# #size in inches for the plot
# plt_size = [12,480]


# Placeholder text that stands in for an attachment in a chat exported without media.
mediaPattern = r"(\<Media omitted\>)"
regexMedia = re.compile(mediaPattern, re.MULTILINE)

# Message header: date, comma, time, one word (e.g. AM/PM), dash, optional author words, colon.
dateAndTimepattern = r"(\d+\/\d+\/\d+)(,)(\s)(\d+:\d+)(\s)(\w+)(\s)(-)(\s\w+)*(:)"
regexDate = re.compile(dateAndTimepattern, re.MULTILINE)

#### Define path to the input text file & read the file

In [None]:
def readFile(filename):
    """Read a UTF-8 chat export and return its non-blank lines, stripped.

    Args:
        filename: Path of the exported chat text file.

    Returns:
        A list of strings, one per line of the file, with surrounding
        whitespace removed and blank / whitespace-only lines left out.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the file even if reading fails.
    with open(filename, 'r', encoding='utf-8') as chat:
        chatText = chat.read()

    stripped_lines = (line.strip() for line in chatText.splitlines())
    # Truthiness test instead of `is not ""`: identity comparison with a
    # string literal is implementation-dependent and raises a SyntaxWarning.
    return [line for line in stripped_lines if line]

# Load the exported chat; replace the placeholder with the real path of the export.
lines = readFile("<path to text file>")

print(f'Number of lines in the file: {len(lines)}')

Parse all lines and convert each message into a list of date, time, author and message.

Each line which starts with a date is the start of a new message, followed by time, author and a single / multi line message.

Join multi line messages into a single string.



In [None]:
def get_data(lines):
    """Group raw chat lines into one ``[date, time, author, message]`` row per message.

    A line for which ``starts_with_date`` is true opens a new message; every
    other line is treated as a continuation of the current message. The parts
    of a multi-line message are joined with a single space.

    Lines seen before the first dated line are still collected, into a row
    whose date, time and author are ``None``.

    Args:
        lines: Iterable of chat lines (already stripped, no blank lines).

    Returns:
        A list of ``[date, time, author, message]`` lists in file order.
    """
    parsed_data = []
    message_buffer = []
    date, time, author = None, None, None

    for line in lines:
        if starts_with_date(line):
            # A new message begins: store the one collected so far, if any.
            if message_buffer:
                parsed_data.append([date, time, author, ' '.join(message_buffer)])
            date, time, author, message = get_data_point(line)
            message_buffer = [message]
        else:
            message_buffer.append(line)

    # Flush the final message; without this the last message of the chat
    # would never be stored, because no later dated line triggers the append.
    if message_buffer:
        parsed_data.append([date, time, author, ' '.join(message_buffer)])

    return parsed_data


def starts_with_date(s):
    """Tell whether *s* opens with a ``D/M/YY, H:MM AM -`` style timestamp.

    Day, month, year, hour and minute are each one or two digits, and the
    time must be followed by ``AM`` or ``PM`` and a dash.

    Returns:
        True if the timestamp is found at the very start of *s*, else False.
    """
    timestamp = '[0-9]{1,2}/[0-9]{1,2}/[0-9]{1,2}, [0-9]{1,2}:[0-9]{1,2} [AP]M -'
    return re.match(timestamp, s) is not None


def starts_with_author(s):
    """Tell whether *s* begins with an author label followed by a colon.

    Recognised labels are one to three whitespace-separated words, or a phone
    number in one of three fixed digit groupings.

    Args:
        s: Message text with the date/time prefix already removed.

    Returns:
        True if an author label is found at the start of *s*, else False.
    """
    # Raw strings: '\w', '\s' and '\d' are invalid escapes in normal literals.
    patterns = [
        r'\w+:',                          # First Name
        r'\w+\s+\w+:',                    # First Name + Last Name
        r'\w+\s+\w+\s+\w+:',              # First Name + Middle Name + Last Name
        r'[+]\d{2} \d{5} \d{5}:',         # Mobile Number (India)
        r'[+]\d{2} \d{3} \d{3} \d{4}:',   # Mobile Number (US)
        r'[+]\d{2} \d{4} \d{7}:',         # Mobile Number (Europe); colon required like the others
    ]
    # Group the alternatives so the '^' anchor applies to every one of them.
    pattern = '^(?:' + '|'.join(patterns) + ')'
    return re.match(pattern, s) is not None

def get_data_point(line):
    """Split one message-start line into ``(date, time, author, message)``.

    Example:
        ``'18/06/17, 10:47 PM - Loki: Why do you have 2 numbers, Banner?'``
        yields ``('18/06/17', '10:47 PM', 'Loki',
        'Why do you have 2 numbers, Banner?')``.

    Args:
        line: A chat line that begins with the date/time prefix.

    Returns:
        A 4-tuple ``(date, time, author, message)``. ``author`` is ``None``
        when the text after the dash does not start with an author label,
        in which case ``message`` is the whole text after the dash.

    Raises:
        ValueError: If the part before ``' - '`` does not contain ``', '``.
    """
    # Cut at the FIRST ' - ' only, so dashes inside the message text survive
    # (splitting on every ' - ' and re-joining with spaces deleted them).
    date_time, _, message = line.partition(' - ')

    date, time = date_time.split(', ', 1)

    author = None
    if starts_with_author(message):
        # Likewise cut at the first ': ' only, keeping later colons in the text.
        author, _, message = message.partition(': ')

    return date, time, author, message

# Turn the raw lines into one row per message.
data = get_data(lines)
print(f'Number of messages {len(data)}')


Parse all lines and convert into a dataframe with 4 columns - 'Date', 'Time', 'Author', 'Message'

In [None]:
def get_data_frame(data, csv_path='C:/temp/out.csv'):
    """Build the message DataFrame and optionally save it as CSV.

    Args:
        data: List of ``[date, time, author, message]`` rows.
        csv_path: Where to write the frame as CSV (without the index column).
            Pass ``None`` to skip writing. Defaults to the path this function
            has always written to.

    Returns:
        A DataFrame with columns 'Date', 'Time', 'Author' and 'Message'. The
        first parsed row is removed and the remaining rows keep their
        original index labels.

    Raises:
        OSError: If ``csv_path`` cannot be written.
    """
    df = pd.DataFrame(data, columns=['Date', 'Time', 'Author', 'Message'])
    # NOTE(review): the first row is discarded unconditionally - presumably it
    # is WhatsApp's notice at the top of an export rather than a real message;
    # confirm against an actual export.
    df.drop(df.index[:1], inplace=True)
    if csv_path is not None:
        df.to_csv(index=False, path_or_buf=csv_path)
    return df

# Build the message table from the parsed rows (get_data_frame also writes it to CSV).
df = get_data_frame(data)
# Preview the first rows as the cell output.
df.head()


Print last few messages

In [None]:
# Show the last rows of the message table as the cell output.
df.tail()

Describe dataframe

In [None]:
# Summary statistics for the columns of the message table, shown as the cell output.
df.describe()

Export data frame to CSV

In [None]:
# Write the message table to a fixed CSV path.
# NOTE(review): 'C:/temp/df.csv' is machine-specific and the directory presumably
# has to exist already - confirm before running on another machine.
df.to_csv(path_or_buf='C:/temp/df.csv')

#### Plot horizontal bar graph of Author vs Number of messages

In [None]:
def print_messages_by_author(df, top_n=18):
    """Plot a horizontal bar chart of message counts for the most active authors.

    Args:
        df: Message DataFrame with an 'Author' column.
        top_n: How many of the most active authors to include. Defaults to
            18, the number this chart has always shown.

    Returns:
        None. The chart is drawn through pandas' plotting API.
    """
    # value_counts() returns authors ordered by descending message count.
    author_value_counts = df['Author'].value_counts()
    most_active_authors = author_value_counts.head(top_n)
    # The returned axes are intentionally not bound to a local called `plt`,
    # which would shadow the matplotlib.pyplot alias inside this function.
    most_active_authors.plot.barh(figsize=[12, 8],
                                  title='Most Messages (Highly Talkative)')


print_messages_by_author(df)

#### Plot horizontal bar graph of Author vs Number of media messages

In [None]:
     
def print_media_messages_by_author(df):
    """Chart media placeholders per author and hand back the media rows.

    A media message is a row whose 'Message' is exactly '<Media omitted>'.

    Args:
        df: Message DataFrame with 'Author' and 'Message' columns.

    Returns:
        The subset of *df* holding only media messages.
    """
    is_media = df['Message'] == '<Media omitted>'
    media_rows = df[is_media]
    # One horizontal bar per author, sized by number of media messages.
    media_rows['Author'].value_counts().plot.barh(figsize=[12, 8], title='Most Photos')
    return media_rows


media_messages_df = print_media_messages_by_author(df)
print(f'Total number of media messages: {len(media_messages_df.index)}')

Remove media messages for further analysis of text messages

In [None]:

def remove_media_messages(df, media_messages_df):
    """Return a copy of *df* without the rows listed in *media_messages_df*.

    Rows are matched by index label, so *media_messages_df* is expected to be
    a subset of *df* that kept its original index. *df* is left untouched.

    Args:
        df: Full message DataFrame.
        media_messages_df: Rows of *df* that are media messages.

    Returns:
        A new DataFrame containing only the remaining messages.
    """
    media_row_labels = media_messages_df.index
    return df.drop(index=media_row_labels)

# Keep only the text messages for the analysis that follows.
messages_df = remove_media_messages(df, media_messages_df)
print(f'Total number of non media messages: {len(messages_df.index)}')

Add new columns word count and letter counts

In [None]:
def add_word_count(messages_df):
    """Add 'Letter_Count' and 'Word_Count' columns and print their totals.

    'Letter_Count' is the number of characters in each message (spaces and
    punctuation included). 'Word_Count' is the number of whitespace-separated
    words.

    Args:
        messages_df: DataFrame with a 'Message' column of strings. It is
            modified in place.

    Returns:
        The same DataFrame object, with the two columns added.
    """
    messages_df['Letter_Count'] = messages_df['Message'].apply(len)
    # split() with no argument splits on runs of whitespace, so repeated
    # spaces do not create phantom words and an empty message counts as 0
    # (split(' ') counted '' as one word and every extra space as another).
    messages_df['Word_Count'] = messages_df['Message'].apply(lambda s: len(s.split()))
    print('Total Number of words: ' + str(messages_df['Word_Count'].sum()))
    print('Total Number of letters: ' + str(messages_df['Letter_Count'].sum()))
    return messages_df


# add_word_count adds the 'Letter_Count' and 'Word_Count' columns and returns the frame.
messages_df = add_word_count(messages_df)
# Preview the first rows, now including the new columns, as the cell output.
messages_df.head()


In [None]:
# Summary statistics for the text-message table, shown as the cell output.
messages_df.describe()

#### Plot horizontal bar graph of Author vs word count 

In [None]:
def print_word_count_by_author(messages_df):
    """Chart the total number of words written by each of the 20 wordiest authors.

    Args:
        messages_df: DataFrame with 'Author' and 'Word_Count' columns.

    Returns:
        None. The chart is drawn through pandas' plotting API.
    """
    words_per_author = messages_df[['Author', 'Word_Count']].groupby('Author').sum()
    ranked = words_per_author.sort_values('Word_Count', ascending=False)
    ranked.head(20).plot.barh(figsize=[12, 8], title='Most words (Truly Talkative)')


print_word_count_by_author(messages_df)


#### Plot horizontal bar graph of Author vs Letter count

In [None]:
def print_letter_count_by_author(messages_df):
    """Chart the total number of characters typed by each of the 16 top authors.

    Args:
        messages_df: DataFrame with 'Author' and 'Letter_Count' columns.

    Returns:
        None. The chart is drawn through pandas' plotting API.
    """
    letters_per_author = (
        messages_df[['Author', 'Letter_Count']]
        .groupby('Author')
        .sum()
        .sort_values('Letter_Count', ascending=False)
    )
    letters_per_author.head(16).plot.barh(figsize=[12, 8], title='Most letters (Typist)')


print_letter_count_by_author(messages_df)

#### Plot most active dates for the group

In [None]:
def print_most_active_dates(messages_df):
    """Chart the ten dates on which the most messages were sent.

    Args:
        messages_df: DataFrame with a 'Date' column.

    Returns:
        None. The chart is drawn through pandas' plotting API.
    """
    messages_per_date = messages_df['Date'].value_counts()
    busiest_dates = messages_per_date.sort_values(ascending=False).head(10)
    busiest_dates.plot.barh(figsize=[12, 8], title='Most active days ')


print_most_active_dates(messages_df)

#### Plot most active dates for each person in the group 

In terms of sending messages

In [None]:
def print_most_active_dates_by_author(messages_df):
    """Draw one chart per author showing that author's ten busiest dates.

    The figure and its axes are created here, sized to the number of authors
    actually plotted, so the function no longer depends on module-level
    ``axs`` / ``plot_size`` variables that had to match the data.

    Args:
        messages_df: DataFrame with 'Author' and 'Date' columns.

    Returns:
        None. Nothing is drawn when no message has an author.
    """
    # Rows without an author (Author is None/NaN) are left out: they cannot be
    # selected with `==` and have no name to put in the chart title.
    authors = sorted(messages_df['Author'].dropna().unique())
    if not authors:
        return

    # squeeze=False keeps `axs` two-dimensional even for a single author,
    # so the same indexing works for any number of charts.
    _fig, axs = plt.subplots(len(authors), 1,
                             figsize=[12, len(authors) * 8], squeeze=False)

    for ax, author in zip(axs[:, 0], authors):
        author_dates = messages_df.loc[messages_df['Author'] == author, 'Date']
        date_counts = author_dates.value_counts()
        date_counts.sort_values(ascending=False).head(10).plot.barh(
            ax=ax, title='Most active days ' + author)


print_most_active_dates_by_author(messages_df)

#### Plot horizontal bar graph for number of messages vs hour of the day

Create a new column hour from date

In [None]:
def _to_24_hour(time_text):
    """Convert 'H:MM AM' / 'H:MM PM' (or a bare 24-hour 'HH:MM') to an hour 0-23."""
    hour = int(time_text.split(':')[0])
    marker = time_text.upper()
    if 'AM' in marker:
        return hour % 12          # 12:xx AM is hour 0
    if 'PM' in marker:
        return hour % 12 + 12     # 12:xx PM stays 12, 1:xx PM becomes 13
    return hour                   # no AM/PM marker: already a 24-hour value


def print_most_active_hour(messages_df):
    """Add an 'Hour' column (0-23) and chart the number of messages per hour.

    Args:
        messages_df: DataFrame with a 'Time' column of strings such as
            '10:47 PM'. It is modified in place: the 'Hour' column is added.

    Returns:
        None. The chart is drawn through pandas' plotting API.
    """
    # Time with most messages
    messages_df['Hour'] = messages_df['Time'].apply(_to_24_hour)
    messages_df['Hour'].value_counts().sort_index(ascending=False).plot.barh(
        figsize=[12, 8], title='Most active time')
    
# Adds the 'Hour' column to messages_df and charts the number of messages per hour.
print_most_active_hour(messages_df)


#### Plot number of messages vs hour for each person in the group

In [None]:
def print_most_active_hour_by_author(messages_df):
    """Draw one chart per author showing that author's messages per hour.

    The figure and its axes are created here, sized to the number of authors
    actually plotted, so the function no longer depends on module-level
    ``axs`` / ``plot_size`` variables that had to match the data.

    Args:
        messages_df: DataFrame with 'Author' and 'Hour' columns.

    Returns:
        None. Nothing is drawn when no message has an author.
    """
    # Rows without an author (Author is None/NaN) are left out: they cannot be
    # selected with `==` and have no name to put in the chart title.
    authors = sorted(messages_df['Author'].dropna().unique())
    if not authors:
        return

    # squeeze=False keeps `axs` two-dimensional even for a single author,
    # so the same indexing works for any number of charts.
    _fig, axs = plt.subplots(len(authors), 1,
                             figsize=[12, len(authors) * 8], squeeze=False)

    for ax, author in zip(axs[:, 0], authors):
        author_hours = messages_df.loc[messages_df['Author'] == author, 'Hour']
        # Time with most messages
        author_hours.value_counts().sort_index(ascending=False).plot.barh(
            ax=ax, title='Most active time ' + author)


print_most_active_hour_by_author(messages_df)


Tokenize all messages.


In [None]:
# Tokens are the substrings matching \w+, so punctuation and spaces are left out.
tokenizer = RegexpTokenizer(r'\w+')
split_into_tokens = tokenizer.tokenize

# One list of tokens per message, stored in a new 'Tokens' column.
messages_df['Tokens'] = messages_df['Message'].apply(split_into_tokens)
messages_df.head()



Find all unique words.

Remove all stop words.

Plot frequency distribution of all words.

In [None]:
# Flatten the per-message token lists into one list of every word in the chat.
all_words = [word for tokens in messages_df.Tokens for word in tokens]
unique_words = set(all_words)
print("Total number of unique words "+ str(len(unique_words)))
# pprint(all_words)

# Load the stop word list once and test membership against a set; calling
# stopwords.words('english') inside the comprehension re-evaluated it for
# every single token.
sr = stopwords.words('english')
stop_word_set = set(sr)
# The stop word list is lowercase, so compare case-insensitively; otherwise
# capitalised stop words such as "The" or "I" are never removed.
clean_tokens = [t for t in all_words if t.lower() not in stop_word_set]
word_frequency_dist = nltk.FreqDist(clean_tokens)
word_frequency_dist.plot(20, cumulative=False, title='Most used words')

##### Find the most frequent trigrams (three-word sequences)

In [None]:
# min_len == max_len == 3 restricts everygrams to runs of exactly three
# consecutive tokens. clean_tokens is one flat list, so a trigram may span
# the boundary between two messages.
trigram_frequency_dist = nltk.FreqDist(nltk.everygrams(clean_tokens, min_len=3, max_len=3))


# These are sequences of three words (not letters), i.e. trigrams (not bigrams).
print('Top 50 three-word sequences: ')
print(trigram_frequency_dist.most_common(50))
trigram_frequency_dist.plot(20, cumulative=False, title='Most used trigrams')

##### Get sentiment of each message

Create a new column Sentiment

In [None]:
sentiment_analyzer = SentimentIntensityAnalyzer()
score_message = sentiment_analyzer.polarity_scores

# Store the analyzer's score dictionary for each message in a 'Sentiment' column.
messages_df['Sentiment'] = messages_df['Message'].apply(score_message)
messages_df.head()

#### Plot a pie chart for overall sentiment of the group chat

In [None]:
def print_sentiments(sentiment_scores):
    """Draw a pie chart of the share of positive, negative and neutral messages.

    Each message is placed in exactly one category by its VADER 'compound'
    score: positive at 0.05 or above, negative at -0.05 or below, neutral in
    between. Counting every message with a non-zero 'pos' as positive and
    every message with a non-zero 'neg' as negative put mixed messages in
    both groups, which could push the neutral share below zero and make the
    pie chart fail.

    Args:
        sentiment_scores: Sequence of score dictionaries, each with a
            'compound' entry (as produced by
            ``SentimentIntensityAnalyzer.polarity_scores``).

    Returns:
        None. Nothing is drawn when *sentiment_scores* is empty.
    """
    total = len(sentiment_scores)
    if total == 0:
        # Nothing to chart, and the percentages would divide by zero.
        return

    compound = np.array([score['compound'] for score in sentiment_scores])
    positive_count = int(np.count_nonzero(compound >= 0.05))
    negative_count = int(np.count_nonzero(compound <= -0.05))
    # Derive neutral from the counts so the three shares can never go negative
    # through floating-point rounding.
    neutral_count = total - positive_count - negative_count

    label = ['Positive', 'Negative', 'Neutral']
    shares = [count / total * 100
              for count in (positive_count, negative_count, neutral_count)]
    sentiment = pd.DataFrame({'Sentiment': shares}, index=label)

    sentiment.plot.pie(y='Sentiment', title='Sentiment of messages')
    
# Chart the per-message score dictionaries stored in the 'Sentiment' column.
print_sentiments(messages_df['Sentiment'].values)