In [1]:
import re
import json
from datetime import datetime
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.collocations import BigramCollocationFinder
from collections import Counter, defaultdict
import matplotlib.pyplot as plt



In [10]:

class WhatsAppChatAnalyzer:
    def __init__(self, file_path):
        # Download required NLTK data
        nltk.download('punkt')
        nltk.download('stopwords')
        nltk.download('wordnet')
        nltk.download('averaged_perceptron_tagger')
        nltk.download('punkt_tab')
        
        # Initialize lemmatizer
        self.lemmatizer = WordNetLemmatizer()
        
        # Initialize Spanish and English stopwords
        self.stop_words = set(stopwords.words('spanish') + stopwords.words('english'))
        
        # Custom words to ignore
        self.ignore_words = {'image', 'video', 'omitted', 'audio', 'document'}
        
        # Read and parse the chat file
        with open(file_path, 'r', encoding='utf-8') as file:
            self.raw_text = file.read()

        # Emotional words dictionary with weights
        self.emotional_words = {
            'love': 1.0, 'hate': -1.0, 'miss': 0.5, 'happy': 0.8,
            'sad': -0.8, 'excited': 0.9, 'angry': -0.9,
            'awesome': 0.7, 'terrible': -0.7, 'good': 0.6,
            'bad': -0.6, 'great': 0.8, 'worst': -0.8
        }
    
    def parse_messages(self):
        # Regular expression for WhatsApp message format
        pattern = r'\[(\d{1,2}/\d{1,2}/\d{2},\s\d{1,2}:\d{2}:\d{2}\s[AP]M)\]\s(.*?):\s(.*?)(?=\n\[\d{1,2}/\d{1,2}/\d{2}|\Z)'
        
        messages = []
        for match in re.finditer(pattern, self.raw_text, re.DOTALL):
            timestamp_str, sender, content = match.groups()
            
            # Parse timestamp
            timestamp = datetime.strptime(timestamp_str, '%m/%d/%y, %I:%M:%S %p')
            
            # Skip system messages
            if 'Messages and calls are end-to-end encrypted' in content:
                continue
                
            messages.append({
                'timestamp': timestamp,
                'sender': sender.strip(),
                'content': content.strip(),
                'is_media': any(word in content.lower() for word in self.ignore_words)
            })
        
        return pd.DataFrame(messages)

    def preprocess_text(self, text):
        # Consolidate laughter expressions before tokenization
        text = re.sub(r'(?i)(ha|ja|he|je){2,}|lol|lmao', 'LAUGHTER', text)
        
        # Tokenize
        tokens = word_tokenize(text.lower())
        
        # Remove stopwords, ignored words, and lemmatize
        processed_tokens = [
            self.lemmatizer.lemmatize(token)
            for token in tokens
            if token.isalnum() and 
            token not in self.stop_words and 
            token not in self.ignore_words
        ]
        
        return processed_tokens

    def analyze_text(self, df):
        word_timeline = defaultdict(list)
        word_frequencies = Counter()
        co_occurrences = defaultdict(Counter)
        
        # Process each message
        for _, row in df.iterrows():
            if not row['is_media']:
                tokens = self.preprocess_text(row['content'])
                
                # Update word frequencies and timeline
                for token in tokens:
                    word_frequencies[token] += 1
                    word_timeline[token].append(row['timestamp'])
                
                # Update co-occurrences
                for i, token1 in enumerate(tokens):
                    for token2 in tokens[i+1:]:
                        co_occurrences[token1][token2] += 1
                        co_occurrences[token2][token1] += 1

        # Create the final analysis structure
        analysis_results = {}
        
        for word, freq in word_frequencies.items():
            timestamps = word_timeline[word]
            if timestamps:
                analysis_results[word] = {
                    "frequency": freq,
                    "first_appearance": min(timestamps).isoformat(),
                    "peak_usage": max(timestamps).isoformat(),
                    "co_occurring_words": [
                        {"word": co_word, "count": count}
                        for co_word, count in co_occurrences[word].most_common(5)
                    ],
                    "emotional_weight": self.calculate_emotional_weight(word)
                }

        return analysis_results

    def calculate_emotional_weight(self, word):
        # Direct emotional weight
        weight = self.emotional_words.get(word, 0)
        
        # Check for partial matches (e.g., "loving" matches "love")
        if weight == 0:
            for emotional_word, emotional_weight in self.emotional_words.items():
                if emotional_word in word or word in emotional_word:
                    weight = emotional_weight * 0.8  # Slightly reduced weight for partial matches
                    break
        
        return weight

    def analyze_temporal_patterns(self, df):
        # Group messages by hour and day
        df['hour'] = df['timestamp'].dt.hour
        df['day'] = df['timestamp'].dt.day_name()
        
        temporal_patterns = {
            'hourly_activity': df['hour'].value_counts().to_dict(),
            'daily_activity': df['day'].value_counts().to_dict(),
            'message_density': {
                str(date): count
                for date, count in df.groupby(df['timestamp'].dt.date).size().items()
            }
        }
        
        return temporal_patterns


In [11]:
# Initialize analyzer
input_file = '_chat.txt'
analyzer = WhatsAppChatAnalyzer(input_file)

# Parse and analyze messages
messages_df = analyzer.parse_messages()
word_analysis = analyzer.analyze_text(messages_df)
temporal_patterns = analyzer.analyze_temporal_patterns(messages_df)

# Combine results
analysis_results = {
    'word_analysis': word_analysis,
    'temporal_patterns': temporal_patterns
}

# Convert the nested dictionary to a DataFrame
word_analysis_df = pd.DataFrame.from_dict(word_analysis, orient='index')

# Expand the co_occurring_words column into separate columns
co_occurring_expanded = pd.json_normalize(word_analysis_df['co_occurring_words'].apply(lambda x: {f"co_word_{i+1}": item['word'] for i, item in enumerate(x)}))
co_occurring_counts = pd.json_normalize(word_analysis_df['co_occurring_words'].apply(lambda x: {f"co_count_{i+1}": item['count'] for i, item in enumerate(x)}))

# Combine all DataFrames
analysis_df = pd.concat([
    word_analysis_df.drop('co_occurring_words', axis=1),
    co_occurring_expanded,
    co_occurring_counts
], axis=1)

# Convert temporal patterns to DataFrames
hourly_df = pd.DataFrame.from_dict(temporal_patterns['hourly_activity'], orient='index', columns=['message_count'])
hourly_df.index.name = 'hour'

daily_df = pd.DataFrame.from_dict(temporal_patterns['daily_activity'], orient='index', columns=['message_count'])
daily_df.index.name = 'day'

density_df = pd.DataFrame.from_dict(temporal_patterns['message_density'], orient='index', columns=['message_count'])
density_df.index = pd.to_datetime(density_df.index)
density_df.index.name = 'date'


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/luz.calero/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/luz.calero/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/luz.calero/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/luz.calero/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/luz.calero/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [12]:
# Print out the top 400 most frequent words in a pretty way
# Sort and get top 400 words
top_400_words = word_analysis_df.sort_values('frequency', ascending=False).head(400)

print("Top 400 Most Frequent Words:")
print("-" * 80)
print(f"{'Word':<20} {'Frequency':<10} {'Emotional Weight':<15} {'First Appearance':<25}")
print("-" * 80)

for word, row in top_400_words.iterrows():
    print(f"{word:<20} {row['frequency']:<10} {row['emotional_weight']:<15.2f} {row['first_appearance'][:10]}")

Top 400 Most Frequent Words:
--------------------------------------------------------------------------------
Word                 Frequency  Emotional Weight First Appearance         
--------------------------------------------------------------------------------
q                    4639       0.00            2020-10-17
laughter             4630       0.00            2020-10-15
u                    3118       0.00            2020-10-15
im                   2804       0.00            2020-11-02
si                   2325       0.00            2020-10-16
http                 2219       0.00            2020-10-17
omg                  2153       0.00            2020-10-18
like                 1572       0.00            2020-11-02
ok                   1565       0.00            2020-10-15
sorry                1192       0.00            2020-10-20
beba                 1163       0.00            2020-12-27
ay                   1127       0.00            2020-10-20
oki                  1015 