## 0. Importing libraries and raw data

In [None]:
#Libraries for data manipulation
import pandas as pd
import re
import datetime as dt

#Library for sentiment analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

#Libraries for generating the word cloud
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [None]:
#Reading txt file as text
file_path = r'<Insert txt file path here>'

#The encoding is given explicitly because the chat text contains non-ASCII
#characters (e.g. the curly quotes matched later when finding the group name)
#and the platform default encoding is not guaranteed to decode them.
#NOTE(review): assumes the export is UTF-8 - adjust if your file differs.
with open(file_path, "r", encoding="utf-8") as file:
    file_contents = file.read()

print(file_contents)

## 1. Preparation of file for analysis

In [None]:
#End goal is to have a dataframe with the columns date, time, participant and message

### 1.1. Initial dataframe cleaning 

In [None]:
#Breaking the raw text into a list with one entry per line
file_contents = file_contents.split('\n')

#Building a single-column dataframe (column label 0) from those lines
df = pd.DataFrame(file_contents)
df

In [None]:
#Giving the single column a meaningful name
df = df.rename(columns={0: 'Text'})

#Dropping rows whose text is empty
df = df[df['Text'] != ''].copy()

#Dropping lines that contain any of WhatsApp's standard notice phrases.
#The phrases are joined into one alternation so a single mask covers them all.
system_phrases = [' was added',
                  'changed the group name',
                  'calls are end-to-end encrypted',
                  ' omitted']
is_system_row = df['Text'].str.contains('|'.join(system_phrases))
df = df[~is_system_row].reset_index(drop=True)


### 1.2. Parsing text into columns

In [None]:
#Patterns for pulling each field out of a raw chat line, compiled once.
#Raw strings stop the regex escapes (\[, \d, \s) from being read as
#(invalid) string escape sequences.
_FIELD_PATTERNS = {
    'Timestamp': re.compile(r'\[([^\]]+)\] '),
    'Participant': re.compile(r'\]\s*(.*?):'),
    'Message': re.compile(r'\d{2}:\d{2}:\d{2}\] .*?:\s*(.*)$'),
}


#Defining function to parse text
def extract_timestamp(column, text):
    """Extract one field from a raw chat line.

    Despite its name, the function extracts whichever field ``column`` names.
    The patterns expect lines shaped like
    ``[<timestamp ending in HH:MM:SS>] <participant>: <message>``.

    Args:
        column: Field to extract: 'Timestamp', 'Participant' or 'Message'.
        text: One raw line of the chat export.

    Returns:
        The captured field. When the line does not match, the whole line is
        returned for 'Message' and an empty string for the other fields.

    Raises:
        ValueError: If ``column`` is not one of the supported field names.
    """
    try:
        pattern = _FIELD_PATTERNS[column]
    except KeyError:
        raise ValueError(
            f"Unknown column {column!r}; expected one of {sorted(_FIELD_PATTERNS)}"
        ) from None

    match = pattern.search(text)
    if match:
        return match.group(1)

    #No match: keep the full text as the message, leave other fields blank
    return text if column == 'Message' else ''

#Building one column per field by running the parser over every raw line.
#The field name is bound as a lambda default so each pass uses its own value.
for _field in ('Timestamp', 'Participant', 'Message'):
    df[_field] = df['Text'].apply(lambda line, field=_field: extract_timestamp(field, line))
df

### 1.3. Exclusion of invalid rows

In [None]:
#Finding group name: it is taken from the text between curly quotes in the
#first row's Message. When there is no such text (or no rows at all), nothing
#is excluded and the exceptions table starts empty.
exceptions_df = pd.DataFrame()

first_message = df['Message'].iloc[0] if not df.empty else ''
grp_match = re.search(r'“([^“”]+)”', first_message)

if grp_match:
    grp_name = grp_match.group(1)
    is_group_row = df['Participant'] == grp_name

    #Excluding rows with Participant as group name
    exceptions_df = df[is_group_row].reset_index(drop=True)
    exceptions_df['Flag'] = 'Rows automatically created by WhatsApp'

    df = df[~is_group_row].reset_index(drop=True)

In [None]:
#Excluding rows with Participant as WhatsApp
is_whatsapp = df['Participant'].str.contains('WhatsApp')

#assign() returns a new frame, so the flag is never written into a slice of df
exc = df[is_whatsapp].assign(Flag='Invalid rows')
exceptions_df = pd.concat([exceptions_df, exc], ignore_index=True)

#Keeping every other row and renumbering the index
df = df[~is_whatsapp].reset_index(drop=True)

In [None]:
#Excluding invalid rows based on Timestamp length. These are due to forwarded messages.
#The length is measured on the part of the timestamp before the first comma and
#kept in a standalone Series, so no temporary column is added to either frame.
date_part_len = df['Timestamp'].str.split(',').str[0].str.len()
is_forwarded = date_part_len == 5

#assign() returns a new frame, so the flag is never written into a slice of df
exc = df[is_forwarded].assign(Flag='Forwarded messages')
exceptions_df = pd.concat([exceptions_df, exc], ignore_index=True)

#Keeping every other row and renumbering the index
df = df[~is_forwarded].reset_index(drop=True)

### 1.4. Final cleaning of dataframe

In [None]:
#Trimming surrounding whitespace from the raw timestamp text
df['Timestamp'] = df['Timestamp'].str.strip()

#Splitting the timestamp on commas; the first two pieces become Date and Time
stamp_parts = df['Timestamp'].str.split(',',expand=True).rename(columns={0:'Date',1:'Time'})
df.loc[:,['Date','Time']] = stamp_parts

#Keeping only the working columns in their final order (Timestamp is dropped here)
df = df[['Text','Participant','Date','Time','Message']]

#Rows with a blank date get their Participant, Date and Time cleared...
fill_cols = ['Participant','Date','Time']
df.loc[df['Date']=='',fill_cols] = None

#...and then take those values from the closest preceding row that has them
df.loc[:,fill_cols] = df.loc[:,fill_cols].ffill()

In [None]:
#Converting the date and time text into a single timestamp column.
#Both parts are parsed together with an explicit format, instead of being
#parsed separately, stringified and parsed again with an inferred format.
#A missing date or time gives NaT here rather than an unparseable string.
stamp_text = df['Date'].str.strip() + ' ' + df['Time'].str.strip()
df['Timestamp'] = pd.to_datetime(stamp_text, format='%d/%m/%Y %H:%M:%S')

#Date (datetime at midnight) and Time (datetime.time objects) are derived
#from the parsed timestamp so all three columns always agree
df['Date'] = df['Timestamp'].dt.normalize()
df['Time'] = df['Timestamp'].dt.time

### 1.5. Transforming dataframe to concatenate messages

In [None]:
#Grouping rows that share a timestamp and participant, then joining each
#group's messages into one comma-separated string
message_groups = df.groupby(['Timestamp','Participant'])['Message']
final_df = message_groups.agg(', '.join).reset_index()

In [None]:
#Deriving calendar descriptors from the timestamp column:
#the date, a "Month Year" label, and the name of the weekday
stamp = final_df['Timestamp'].dt
final_df['Date'] = stamp.date
final_df['Month-year'] = stamp.strftime("%B %Y")
final_df['Day of Week'] = stamp.day_name()

## 2. Analysis

### 2.1. Sentiment analysis

In [None]:
#Creating VADER sentiment analysis classification
analyzer = SentimentIntensityAnalyzer()

#Creating function to get sentiment scores
def get_sentiment_score(sentence):
    """Classify a piece of text as 'Negative', 'Positive' or 'Neutral'.

    The label is based on the 'compound' value returned by
    ``analyzer.polarity_scores``. The cut-offs are inclusive, following the
    thresholds in the VADER documentation: compound <= -0.05 is negative,
    compound >= 0.05 is positive, anything in between is neutral.

    Args:
        sentence: Text to score.

    Returns:
        One of the strings 'Negative', 'Positive' or 'Neutral'.
    """
    score = analyzer.polarity_scores(sentence)['compound']
    if score <= -0.05:
        return 'Negative'
    if score >= 0.05:
        return 'Positive'
    return 'Neutral'

#Labelling every grouped message with its sentiment class
final_df['Sentiment'] = final_df['Message'].map(get_sentiment_score)
final_df

In [None]:
#Displaying the rows set aside during cleaning; the Flag column records why each was excluded
exceptions_df

### 2.2. Generating a Word Cloud

In [None]:
#Joining every grouped message into one space-separated string
text = ' '.join(final_df['Message'])

#Configuring the cloud (800x400, white background, at most 30 words)
#and generating it from the combined text
cloud_builder = WordCloud(width=800, height=400, background_color='white', max_words=30)
wordcloud = cloud_builder.generate(text)

#Drawing the cloud on a 10x5 figure with the axes hidden
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()