In [None]:
import re
import pandas as pd
import matplotlib.pyplot as plt
import emoji
import json
from PIL import Image
import numpy as np
from wordcloud import WordCloud,STOPWORDS

def startsWithDate(s):
    """Return True when ``s`` begins with a chat-export timestamp header.

    The accepted prefix has the shape ``M/D/YY, H:MM XX - `` where the month
    is 0-12 (optional leading zero), the day is 0-31, the year has two or
    four digits, the hour is 0-12 and ``XX`` is any two word characters
    (for example ``AM`` or ``PM``).

    Parameters
    ----------
    s : str
        One line of the exported conversation.

    Returns
    -------
    bool
    """
    # Raw string: the regex escapes (backslash-slash, backslash-d, backslash-w)
    # are not valid Python string escapes, so a plain literal triggers a
    # warning on recent interpreters. The pattern text itself is unchanged.
    pattern = r'^(([0-9]|(0)[0-9])|((1)[0-2]))(\/)([0-9]|[0-2][0-9]|(3)[0-1])(\/)(\d{2}|\d{4}), (([0-9]:[0-9][0-9])|(1)[0-2]:[0-9][0-9]) (\w)(\w) - '
    return re.match(pattern, s) is not None

def startsWithAuthor(s):
    """Return True when ``s`` starts with something shaped like ``<author>:``.

    Recognised author shapes are one, two or three whitespace-separated
    words, or a phone number written as ``+NN NNNNN NNNNN``.

    Parameters
    ----------
    s : str
        The part of a chat line that follows the timestamp header.

    Returns
    -------
    bool
    """
    # Raw strings keep the regex escapes out of Python's own escape handling.
    patterns = [
        r'([\w]+):',                        # First Name
        r'([\w]+[\s]+[\w]+):',              # First Name + Last Name
        r'([\w]+[\s]+[\w]+[\s]+[\w]+):',    # First Name + Middle Name + Last Name
        r'([+]\d{2} \d{5} \d{5}):',         # Mobile Number (India)
    ]
    # Group the alternatives so the '^' anchor applies to all of them rather
    # than only to the first one.
    pattern = '^(?:' + '|'.join(patterns) + ')'
    return re.match(pattern, s) is not None

def getDataPoint(line):
    """Split one timestamped chat line into ``(date, time, author, message)``.

    Example: ``'12/13/19, 12:52 AM - Nirbhay Singh: Late ho gya'`` yields
    ``('12/13/19', '12:52 AM', 'Nirbhay Singh', 'Late ho gya')``.

    Parameters
    ----------
    line : str
        A line that starts with a ``date, time - `` header.

    Returns
    -------
    tuple
        ``(date, time, author, message)``; ``author`` is ``None`` when the
        text after the header does not satisfy ``startsWithAuthor``.

    Raises
    ------
    ValueError
        If the part before the first ``' - '`` does not split into exactly
        two pieces on ``', '``.
    """
    # Cut only at the FIRST ' - ': any later ' - ' belongs to the message
    # text and must be preserved (split + ' '.join used to delete it).
    dateTime, _, message = line.partition(' - ')

    date, time = dateTime.split(', ')  # '12/13/19', '12:52 AM'

    if startsWithAuthor(message):
        # Likewise cut only at the first ': ' so that a ': ' inside the
        # message body survives. When no ': ' is present the whole text
        # becomes the author and the message is empty.
        author, _, message = message.partition(': ')
    else:
        author = None
    return date, time, author, message

parsedData = [] # List to keep track of data so it can be used by a Pandas dataframe
conversationPath = 'Chats_m.txt'
with open(conversationPath, encoding="utf-8") as fp:
    fp.readline() # Skipping first line of the file (usually contains information about end-to-end encryption)

    messageBuffer = [] # Pieces of the message currently being assembled (multi-line messages span several lines)
    date, time, author = None, None, None # Header fields of the message currently being assembled

    for line in fp:
        line = line.strip() # Guarding against erroneous leading and trailing whitespaces
        if startsWithDate(line): # A timestamp header marks the beginning of a new message
            if messageBuffer: # Store the previous message before starting the next one
                parsedData.append([date, time, author, ' '.join(messageBuffer)])
            messageBuffer.clear()
            date, time, author, message = getDataPoint(line) # Identify and extract tokens from the line
            messageBuffer.append(message)
        else:
            # Continuation line of a multi-line message
            messageBuffer.append(line)

    # Flush the final message: no further header follows it, so without this
    # step the last message of the file would never reach parsedData.
    if messageBuffer:
        parsedData.append([date, time, author, ' '.join(messageBuffer)])

In [None]:
# Load the first JSON export. The encoding is stated explicitly (as for the
# text chat file) so the result does not depend on the platform default.
with open('message_1.json', encoding='utf-8') as jsonfile:
    data = json.load(jsonfile)

In [None]:
# Load the second JSON export. The encoding is stated explicitly (as for the
# text chat file) so the result does not depend on the platform default.
with open('message_2.json', encoding='utf-8') as jsonfile:
    data2 = json.load(jsonfile)

In [None]:
# Load the third JSON export. The encoding is stated explicitly (as for the
# text chat file) so the result does not depend on the platform default.
with open('message_3.json', encoding='utf-8') as jsonfile:
    data3 = json.load(jsonfile)

In [None]:
# Load the fourth JSON export. The encoding is stated explicitly (as for the
# text chat file) so the result does not depend on the platform default.
with open('message_4.json', encoding='utf-8') as jsonfile:
    data4 = json.load(jsonfile)

In [None]:
# Number of messages held by each of the four loaded exports, on one line.
print(*(len(export['messages']) for export in (data, data2, data3, data4)))

In [None]:
# Parallel lists: Author[k] sent Message[k].
Author = []
Message = []
# One nested loop over all exports replaces four copy-pasted loops.
for export in (data, data2, data3, data4):
    for entry in export['messages']:
        # Always append to BOTH lists so they stay the same length; the
        # 'None' string stands in for a missing field. Appending the author
        # first and the placeholder message only on KeyError let the lists
        # drift apart whenever 'sender_name' was the missing key.
        Author.append(entry.get('sender_name', 'None'))
        Message.append(entry.get('content', 'None'))

In [None]:
# Sanity check: number of collected sender names (should equal the message count).
len(Author)

In [None]:
# Sanity check: number of collected message bodies (should equal the author count).
len(Message)

In [None]:
# Turn the two parallel lists into a two-column frame and preview it.
df1 = pd.DataFrame(dict(Author=Author, Message=Message))
df1.head()

In [None]:
# Text emoticon -> Unicode emoji. Listed per emoji so that all spellings of
# the same face sit next to each other.
_emoticons_by_emoji = (
    ('\U0001F642', (':)',)),
    ('\U0001F618', (':*', ':-*')),
    ('\U0001F61C', (':p', ':-p', ':P', ':-P')),
    ('\U0001F600', (':D', ':-D')),
    ('\U0001F614', (':(', ':-(')),
    ('\U0001F62D', (':\'(', ':-\'(')),
)
emj_dict = {
    emoticon: glyph
    for glyph, emoticons in _emoticons_by_emoji
    for emoticon in emoticons
}

In [None]:
# Normalise every message body to lower case, then preview.
df1['Message'] = df1['Message'].map(str.lower)
df1.head()

In [None]:
# Text: keep only ASCII letters, digits and whitespace; newlines become spaces.
df1['Text'] = df1['Message'].apply(lambda x : re.sub(r'[^a-zA-Z0-9\s]','',x).strip().replace('\n',' '))

# Emoji: translate text emoticons into emoji characters.
# The Message column is lower-cased earlier in the notebook, so an emoticon
# typed as ':D' arrives here as ':d'. Matching on lower-cased keys keeps the
# upper-case entries of emj_dict reachable.
emoticon_lookup = {emoticon.lower(): glyph for emoticon, glyph in emj_dict.items()}
# split() (any whitespace) also finds emoticons that sit next to a newline.
# The mapped values are already emoji characters, so no emojize() pass is needed.
df1['Emoji'] = df1['Message'].apply(
    lambda x : ''.join(emoticon_lookup[token.lower()]
                       for token in x.split()
                       if token.lower() in emoticon_lookup))
df1.head()

In [None]:
# adding letter count and word count in new column
# Letter_Count is the length of the cleaned text (spaces included).
df1['Letter_Count'] = df1['Text'].apply(len)
# split() without an argument ignores repeated whitespace and returns [] for
# an empty string, so an empty message counts 0 words (split(' ') gave 1).
df1['Word_Count'] = df1['Text'].apply(lambda s : len(s.split()))
df1.head()

In [None]:
def _short_author_name(full_name):
    """Replace each participant's full name by the first name only."""
    for long_form, short_form in (('Naushad Alam', 'Naushad'), ('Nirbhay Singh', 'Nirbhay')):
        full_name = full_name.replace(long_form, short_form)
    return full_name

df1['Author'] = df1['Author'].map(_short_author_name)
df1.head()

In [None]:
# Frame built from the rows parsed out of the text chat export.
chat_columns = ['Date', 'Time', 'Author', 'Message']
df = pd.DataFrame(data=parsedData, columns=chat_columns)
df.head()

In [None]:
# Rows for which no author could be extracted from the line.
has_no_author = df['Author'].isna()
null_authors_df = df.loc[has_no_author]
null_authors_df.head()

In [None]:
# Rows whose body is the export's placeholder for an attachment.
is_media_placeholder = df['Message'].eq('<Media omitted>')
media_messages_df = df.loc[is_media_placeholder]
print(media_messages_df.head())

In [None]:
# Keep only real text messages: first remove the rows without an author,
# then the media-placeholder rows.
messages_df = (
    df
    .drop(index=null_authors_df.index)
    .drop(index=media_messages_df.index)
)
messages_df.head()

In [None]:
# extracting text: drop non-ASCII characters, then everything that is not a
# letter, digit or whitespace; lower-case and trim the result.
messages_df['Text'] = messages_df['Message'].apply(lambda x : re.sub(r'[^a-zA-Z0-9\s]','',x.encode('ascii', 'ignore').decode('ascii')).lower().strip())

# extracting emoji: keep only the characters found in the emoji package's table.
# The table's name differs between releases of the package: EMOJI_DATA (2.x),
# UNICODE_EMOJI_ENGLISH (1.x, where UNICODE_EMOJI is keyed by language and
# never contains an emoji character), UNICODE_EMOJI (older releases).
emoji_table = (getattr(emoji, 'EMOJI_DATA', None)
               or getattr(emoji, 'UNICODE_EMOJI_ENGLISH', None)
               or emoji.UNICODE_EMOJI)
messages_df['Emoji'] = messages_df['Message'].apply(lambda x : ''.join(c for c in x if c in emoji_table))
messages_df.head()

In [None]:
# adding letter count and word count in new column
# Letter_Count is the length of the cleaned text (spaces included).
messages_df['Letter_Count'] = messages_df['Text'].apply(len)
# split() without an argument ignores repeated whitespace and returns [] for
# an empty string, so an empty message counts 0 words (split(' ') gave 1).
messages_df['Word_Count'] = messages_df['Text'].apply(lambda s : len(s.split()))
messages_df.head()

In [None]:
# adding hour column

def _hour_of_day(time_text):
    """Return the hour (0-23) of a time such as '12:52 AM', '7:05 PM' or '20:15'.

    The leading number is the clock hour; an AM/PM marker after the first
    space, when present, is used to convert it to 24-hour form. Taking only
    the leading number would merge e.g. 1 AM and 1 PM into the same hour.
    """
    clock, _, meridiem = time_text.partition(' ')
    hour = int(clock.split(':')[0])
    meridiem = meridiem.strip().upper()
    if meridiem == 'PM' and hour != 12:
        hour += 12
    elif meridiem == 'AM' and hour == 12:
        hour = 0
    return hour

messages_df['Hour'] = messages_df['Time'].apply(_hour_of_day)
messages_df.head()

In [None]:
# (rows, columns) of the two frames that are combined in the next step.
print(messages_df.shape, df1.shape)

In [None]:
# Stack the two frames row-wise, keeping each frame's own index labels.
# pd.concat is used because DataFrame.append no longer exists in pandas 2.x.
df_final = pd.concat([messages_df, df1], sort=False)

In [None]:
# Preview the first rows of the combined frame.
df_final.head()

In [None]:
# (rows, columns) of the combined frame.
df_final.shape

In [None]:
# Everything from row position 25000 onwards.
df_final.iloc[25000:]

In [None]:
# Emoji lookup table; its name depends on the installed emoji release:
# EMOJI_DATA (2.x), UNICODE_EMOJI_ENGLISH (1.x), UNICODE_EMOJI (older).
emoji_table = (getattr(emoji, 'EMOJI_DATA', None)
               or getattr(emoji, 'UNICODE_EMOJI_ENGLISH', None)
               or emoji.UNICODE_EMOJI)

# Every emoji character each of the two authors sent.
emj = {"Naushad":[],"Nirbhay":[]}
for emoji_text, sender in zip(df_final.Emoji, df_final.Author):
    # The Emoji cell holds ALL emoji of one message as a single string, so it
    # is walked character by character. Testing the whole string for
    # membership only ever counted messages with exactly one emoji.
    if sender in emj and isinstance(emoji_text, str):
        emj[sender].extend(c for c in emoji_text if c in emoji_table)
print("Naushad sent {} emoji and Nirbhay sent {} emoji".format(len(emj["Naushad"]),len(emj["Nirbhay"])))

# One row per author, with the list of that author's emoji.
emj_df = pd.DataFrame(list(emj.items()), columns = ['Author','Emoji'])
emj_df.head()

In [None]:
# Count the media placeholders sent by each of the two authors.
# NOTE(review): df.Author comes straight from the text chat export, while the
# shortening to "Naushad"/"Nirbhay" is only applied to df1 - confirm that the
# names in df really match these keys.
media_count = {"Naushad":0,"Nirbhay":0}

for body, sender in zip(df.Message, df.Author):
    if body == "<Media omitted>" and sender in media_count:
        media_count[sender] += 1

lists = sorted(media_count.items()) # (author, count) pairs ordered by author
x = tuple(name for name, _ in lists)
y = tuple(total for _, total in lists)

fig, ax = plt.subplots()
ax.bar(x, y)
ax.set_xlabel("Authors")
ax.set_ylabel("Media Sent")
plt.show()

In [None]:
# Space-separated filler tokens, kept in three groups.
filler1 = 'a b c'
filler2 = 'd e f'
filler3 = 'x y z'
# Flatten the groups into one list, then add carriage return and empty string.
fillers = [token for group in (filler1, filler2, filler3) for token in group.split(' ')]
fillers += ['\r', '']
print(fillers)

In [None]:
def word_cloud(text, mask_path="sample.jpg", output_path="sample.png"):
    """Render a word cloud of ``text`` and save it as an image file.

    Parameters
    ----------
    text : str
        The text whose words are drawn.
    mask_path : str, optional
        Image used as the drawing mask. Defaults to ``"sample.jpg"``.
    output_path : str, optional
        Destination file for the rendered cloud. Defaults to ``"sample.png"``.

    Words listed in the module-level ``fillers`` are excluded. Prints
    ``PNG file created`` once the file has been written.
    """
    # Load the mask inside a context manager so the image file is closed as
    # soon as its pixels have been copied into the array.
    with Image.open(mask_path) as mask_image:
        mask = np.array(mask_image)

    wc = WordCloud(background_color = "black",
                   stopwords = set(fillers),  # WordCloud documents this argument as a set of strings
                   mask = mask,
                   max_words = 200,
                   width=1920,
                   height=1200)
    wc.generate(text)
    wc.to_file(output_path)
    print("PNG file created")

# Join the messages with a space: an empty separator glues the last word of
# one message onto the first word of the next and creates words nobody wrote.
text = " ".join(df_final['Text']).strip().lower()
word_cloud(text)

In [None]:
def count_words(df):
    """Count how often each word occurs in the ``Text`` column of ``df``.

    Words are obtained by splitting every text on single spaces. Emoji,
    the module-level ``fillers``, the pieces of the media placeholder, the
    words of the "this message was deleted" notice and empty strings are
    not counted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Text`` column.

    Returns
    -------
    dict
        Mapping ``word -> number of occurrences``.
    """
    # Emoji lookup table; its name depends on the installed emoji release:
    # EMOJI_DATA (2.x), UNICODE_EMOJI_ENGLISH (1.x), UNICODE_EMOJI (older).
    emoji_table = (getattr(emoji, 'EMOJI_DATA', None)
                   or getattr(emoji, 'UNICODE_EMOJI_ENGLISH', None)
                   or emoji.UNICODE_EMOJI)
    skipped = set(fillers) | {'<media', 'omitted>', 'this', 'message', 'was', 'deleted', ''}

    word_count = {}
    for text in df.Text:
        # Skip cells that are not text (e.g. missing values). Swallowing the
        # failed split left the previous row's words in place, so they were
        # counted a second time.
        if not isinstance(text, str):
            continue
        for word in text.split(" "):
            if word in skipped or word in emoji_table:
                continue
            word_count[word] = word_count.get(word, 0) + 1
    return word_count

# Tabulate the per-word totals as (Word, Count) rows and preview them.
cnt_wrd = count_words(df_final)
word_count_columns = ['Word', 'Count']
words_df = pd.DataFrame(cnt_wrd.items(), columns=word_count_columns)
words_df.head()

In [None]:
# Most frequent words first; show the top 50.
words_df = words_df.sort_values(by='Count', ascending=False)
words_df.head(50)

In [None]:
%matplotlib inline
words_df[words_df['Count']>1500].plot(x='Word', kind = 'bar')

In [None]:
# Summary (count / unique / top / freq) of the discrete columns.
discrete_columns = ['Date', 'Time', 'Author', 'Message']
discrete_view = messages_df[discrete_columns]
discrete_view.describe()

In [None]:
# Summary statistics of the continuous (numeric) columns.
continuous_columns = ['Letter_Count', 'Word_Count']
continuous_view = messages_df[continuous_columns]
continuous_view.describe()

In [None]:
# Total number of letters and of words sent, as a (letters, words) pair.
tuple(messages_df[column].sum() for column in ('Letter_Count', 'Word_Count'))

In [None]:
# Total words per author, largest total first, as a horizontal bar chart.

total_word_count_grouped_by_author = messages_df.groupby('Author')[['Word_Count']].sum()
sorted_total_word_count_grouped_by_author = total_word_count_grouped_by_author.sort_values(
    by='Word_Count', ascending=False)
ax = sorted_total_word_count_grouped_by_author.plot.barh()
ax.set_xlabel('Number of Words')
ax.set_ylabel('Authors')

In [None]:
# How often each message length (in words) occurs - the 40 most common lengths.

fig, ax = plt.subplots(figsize=(15, 2)) # Wide, flat figure so the bars fit in the output cell
word_count_value_counts = messages_df['Word_Count'].value_counts()
top_40_word_count_value_counts = word_count_value_counts.head(40)
top_40_word_count_value_counts.plot.bar(ax=ax)
ax.set_xlabel('Word Count')
ax.set_ylabel('Frequency')

In [None]:
# Total letters per author, largest total first (top 10), as a horizontal bar chart.

total_letter_count_grouped_by_author = messages_df.groupby('Author')[['Letter_Count']].sum()
sorted_total_letter_count_grouped_by_author = total_letter_count_grouped_by_author.sort_values(
    by='Letter_Count', ascending=False)
top_10_sorted_total_letter_count_grouped_by_author = sorted_total_letter_count_grouped_by_author.head(10)
ax = top_10_sorted_total_letter_count_grouped_by_author.plot.barh()
ax.set_xlabel('Number of Letters')
ax.set_ylabel('Authors')

In [None]:
# How often each message length (in letters) occurs - the 40 most common lengths.

fig, ax = plt.subplots(figsize=(15, 2))
letter_count_value_counts = messages_df['Letter_Count'].value_counts()
top_40_letter_count_value_counts = letter_count_value_counts.head(40)
top_40_letter_count_value_counts.plot.bar(ax=ax)
ax.set_xlabel('Letter count')
ax.set_ylabel('Frequency')

In [None]:
# The 10 dates on which the most messages were sent.

messages_per_date = messages_df['Date'].value_counts()
ax = messages_per_date.head(10).plot.barh()
ax.set_xlabel('Number of Messages')
ax.set_ylabel('Date')

In [None]:
# The 10 times of day at which the most messages were sent.

messages_per_time = messages_df['Time'].value_counts()
ax = messages_per_time.head(10).plot.barh()
ax.set_xlabel('Number of messages')
ax.set_ylabel('Time')

In [None]:
# The 10 busiest hours of the day, drawn in descending order of the hour label.

messages_per_hour = messages_df['Hour'].value_counts()
busiest_hours = messages_per_hour.head(10).sort_index(ascending=False)
ax = busiest_hours.plot.barh()
ax.set_xlabel('Number of messages')
ax.set_ylabel('Hour of Day')