In [1]:
import copy

import pandas as pd

from src.data.telegram import TelegramReader

In [2]:
reader = TelegramReader()

# EDA

## Structure of data

Raw Telegram JSON file is read as a DataFrame. 

In [3]:
reader.df.head()

Unnamed: 0,name,type,id,messages
0,AIAP12,private_group,828903753,"{'id': 136009, 'type': 'service', 'date': '202..."
1,AIAP12,private_group,828903753,"{'id': 136010, 'type': 'service', 'date': '202..."
2,AIAP12,private_group,828903753,"{'id': 136011, 'type': 'service', 'date': '202..."
3,AIAP12,private_group,828903753,"{'id': 136013, 'type': 'service', 'date': '202..."
4,AIAP12,private_group,828903753,"{'id': 136014, 'type': 'service', 'date': '202..."


Messages column contains a dictionary with various pieces of information.

In [54]:
def get_dict_keys(series: pd.Series, is_list=False) -> set:
    """Get set of unique keys from a Series of dictionaries
    (or a series of lists of dictionaries)."""
    keys = set()
    for row in series:
        # Added to handle nested lists of dictionaries (e.g. text_entities)
        if is_list:
            for item in row:
                keys.update(item.keys())
        else:
            keys.update(row.keys())
    return keys


get_dict_keys(reader.df["messages"])

{'action',
 'actor',
 'actor_id',
 'date',
 'date_unixtime',
 'edited',
 'edited_unixtime',
 'file',
 'forwarded_from',
 'from',
 'from_id',
 'height',
 'id',
 'inviter',
 'members',
 'message_id',
 'mime_type',
 'photo',
 'reply_to_message_id',
 'text',
 'text_entities',
 'thumbnail',
 'type',
 'via_bot',
 'width'}

In [18]:
for row in reader.df["messages"].sample(5):
    print(row)

{'id': 136143, 'type': 'message', 'date': '2023-03-03T15:32:32', 'date_unixtime': '1677828752', 'from': 'paul', 'from_id': 'user1139710027', 'text': 'Thx', 'text_entities': [{'type': 'plain', 'text': 'Thx'}]}
{'id': 136047, 'type': 'message', 'date': '2023-02-23T20:32:20', 'date_unixtime': '1677155540', 'from': 'Mayank Soni', 'from_id': 'user565286338', 'text': [{'type': 'bot_command', 'text': '/in'}], 'text_entities': [{'type': 'bot_command', 'text': '/in'}]}
{'id': 136200, 'type': 'message', 'date': '2023-03-07T13:31:57', 'date_unixtime': '1678167117', 'from': 'Walter Teng', 'from_id': 'user299206190', 'text': 'say too easy lol', 'text_entities': [{'type': 'plain', 'text': 'say too easy lol'}]}
{'id': 136125, 'type': 'message', 'date': '2023-03-02T12:38:36', 'date_unixtime': '1677731916', 'from': 'Yan Liong Tan', 'from_id': 'user1904912399', 'text': 'lower cafe in guild house', 'text_entities': [{'type': 'plain', 'text': 'lower cafe in guild house'}]}
{'id': 136279, 'type': 'message'

## Compare the text and text_entities fields

In [29]:
def text_compare_text_entities(messages: pd.Series):
    for row in messages:
        print(row["text"])
        print(row["text_entities"])
        print("----")


text_compare_text_entities(reader.df["messages"].sample(5))

our master ng
[{'type': 'plain', 'text': 'our master ng'}]
----
Where u ah
[{'type': 'plain', 'text': 'Where u ah'}]
----
i think lately i've become a stanford salesman 😂
[{'type': 'plain', 'text': "i think lately i've become a stanford salesman 😂"}]
----
What we doing?
[{'type': 'plain', 'text': 'What we doing?'}]
----
['liong said this was helpful for him \n', {'type': 'link', 'text': 'https://keras.io/examples/vision/visualizing_what_convnets_learn/'}]
[{'type': 'plain', 'text': 'liong said this was helpful for him \n'}, {'type': 'link', 'text': 'https://keras.io/examples/vision/visualizing_what_convnets_learn/'}]
----


Looks like text is the same as text_entities, except that it represents text_entities of type 'plain' as simple plain text. Let's confirm

Ok, so another difference is that empty messages are represented as an empty string in text and an empty list in text_entities

In [56]:
def text_equals_text_entities(row: dict) -> bool:
    """Check if text and text_entities are the same."""
    return row["text"] == row["text_entities"]


# Boolean filter for rows where text and text_entities are the same
same = reader.df["messages"].apply(text_equals_text_entities)


def get_text_entity_types(messages: pd.Series, is_text_entities: bool = False) -> set:
    """Get set of values of 'type' in text entities.
    If is_text_entities is True, then text_entities have already been extracted from messages and are directly passed in
    """
    types = set()
    for row in messages:
        # Added to handle if text_entities already extracted before being passed in
        row = row if is_text_entities else row["text_entities"]
        for item in row:
            types.add(item.get("type"))
    return types


# Check if plain exists in messages where text_entities and text are the same.
get_text_entity_types(reader.df["messages"][same])

{'bot_command', 'link', 'mention_name'}

In [45]:
def get_text_entity_type_exists_in_all(messages: pd.Series) -> set:
    """Get set of values of 'type' that exist in all messages."""
    types = get_text_entity_types(messages)
    for row in messages:
        individual_types = get_text_entity_types(pd.Series([row]))
        # Remove empty sets (i.e. messages with no text_entities)
        if len(individual_types) > 0:
            types.intersection_update(individual_types)
    return types


# Check if plain is the only text entity type that exists in all messages where text_entities and text are not the same.
get_text_entity_type_exists_in_all(reader.df["messages"][~same])

{'plain'}

Let's use text entities as it has a more consistent format (all lists of dictionaries). 

In [51]:
# Confirm types of text_entities are the same for all messages
for row in reader.df["messages"]:
    assert type(row["text_entities"]) == list
    for item in row["text_entities"]:
        assert type(item) == dict

## Structure of text entities

In [5]:
reader.text_entities.sample(5)

286         [{'type': 'plain', 'text': 'our master ng'}]
136    [{'type': 'plain', 'text': 'if wan to kill the...
98     [{'type': 'plain', 'text': 'i set the interpre...
257    [{'type': 'plain', 'text': 'elon musk sign so ...
14                                                    []
Name: text_entities, dtype: object

Most of the lists have a single element, but some have 0 and some have > 1

In [14]:
reader.text_entities.apply(len).value_counts().sort_index()

text_entities
0      40
1     242
2      41
3      14
4       3
5       1
6       3
7       3
9       2
10      1
Name: count, dtype: int64

## Text entities with 0 items

These are either people joining/leaving the group or photos or files i.e. there's no text 

In [37]:
for row in reader.df[(reader.text_entities.apply(len) == 0)]["messages"]:
    temp = copy.copy(row)
    # Removing a few elements to make it easier to read
    del temp["id"]
    del temp["date"]
    del temp["date_unixtime"]
    print(temp)

{'type': 'service', 'actor': 'Mayank Soni', 'actor_id': 'user565286338', 'action': 'join_group_by_link', 'inviter': 'Group', 'text': '', 'text_entities': []}
{'type': 'service', 'actor': 'Loi Xue Zheng', 'actor_id': 'user234580032', 'action': 'join_group_by_link', 'inviter': 'Group', 'text': '', 'text_entities': []}
{'type': 'service', 'actor': 'Shu Ying', 'actor_id': 'user5362962200', 'action': 'join_group_by_link', 'inviter': 'Group', 'text': '', 'text_entities': []}
{'type': 'service', 'actor': 'Walter Teng', 'actor_id': 'user299206190', 'action': 'join_group_by_link', 'inviter': 'Group', 'text': '', 'text_entities': []}
{'type': 'service', 'actor': 'Marvin Ng', 'actor_id': 'user1371801874', 'action': 'join_group_by_link', 'inviter': 'Group', 'text': '', 'text_entities': []}
{'type': 'service', 'actor': 'Zhi Qiang Quek', 'actor_id': 'user752030617', 'action': 'join_group_by_link', 'inviter': 'Group', 'text': '', 'text_entities': []}
{'type': 'service', 'actor': 'Bryan AIAP', 'actor_

## Structure of text entities

In [55]:
get_dict_keys(reader.text_entities, is_list=True)

{'text', 'type', 'user_id'}

In [57]:
get_text_entity_types(reader.text_entities, is_text_entities=True)

{'bold',
 'bot_command',
 'code',
 'email',
 'link',
 'mention',
 'mention_name',
 'plain',
 'spoiler'}

In [9]:
def get_text_entity_type_examples(
    text_entities: pd.Series, which_type: str, to_print: bool = True
) -> pd.Series:
    """Display text entities which contain an element of a certain type."""

    def contains_type(text_entities, which_type):
        "Filtering function. See if any of the text entities in message are of desired type"
        return any(
            text_entity.get("type") == which_type for text_entity in text_entities
        )

    # Filter messages using filter function
    filtered_text_entities = text_entities[
        text_entities.apply(contains_type, which_type=which_type)
    ]
    # Prettily print the messages containing the desired type
    if to_print:
        for text_entity_row in filtered_text_entities:
            for text_entity in text_entity_row:
                print(text_entity)
            print("----")
    # Get the text entities and return
    return filtered_text_entities


# Print out text entities of each type (other than plain, which has too many entries)
for entity_type in get_text_entity_types(reader.text_entities):
    if entity_type != "plain":
        print("\n\n" + entity_type)
        get_text_entity_type_examples(reader.text_entities, entity_type)



email
{'type': 'plain', 'text': '(REF: CREATION OF AISG PROJECT/PACKAGE, LINKING WITH GITLAB)\n\n\nFOR THOSE LOST AFTER THE DOCKER NOTEBOOK, LOOK AT THE README IN ASSIGNMENT 2. BASICALLY THE GIST OF IT IS AS FOLLOWS:\n\n# SET UP CRUFT\n# CRUFT IS AN APP THAT AUTOMATES CREATION OF PROJECT FOLDERS BASED ON COOKIECUTTER TEMPLATES\npip install cruft\n\n# RUN THIS TO BEGIN PROJECT CREATION WIZARD\ncruft create '}
{'type': 'link', 'text': 'https://github.com/aisingapore/ml-project-cookiecutter-gcp'}
{'type': 'plain', 'text': "\n\n# FILL IN AS FOLLOWS:\nproject_name: AIAP DSP A2 MLOps\ndescription: For AIAP 11's Deep Skilling Phase Assignment 2.\nrepo_name: aiap-dsp-mlops\nsrc_package_name: aiap_dsp_mlops\nsrc_package_name_short: amlo\ngcp_project_id: aiap-11-ds\ngcr_personal_subdir: 1 - Yes\nauthor_name: <YOUR_AIAP_EMAIL_WITHOUT_DOMAIN>\nFor example, if your AIAP email is "}
{'type': 'email', 'text': 'chief_montgomery_scott@aiap.org'}
{'type': 'plain', 'text': " the value shall be chief_mo

To keep things simple, will just add the text from all the entity types into the message text

## Look at metadata

In [59]:
reader.metadata

Unnamed: 0,date,from
0,2023-02-21T15:27:19,
1,2023-02-21T16:04:03,
2,2023-02-21T17:15:25,
3,2023-02-21T17:15:35,
4,2023-02-21T17:16:36,
...,...,...
345,2023-04-12T11:55:02,Hanafi Haffidz
346,2023-04-12T11:57:27,Wayne Lau
347,2023-04-12T11:58:00,paul
348,2023-04-12T12:04:39,Wayne Lau


# Check messages with just a full stop

These no longer exist because I've now incorporated all the types of text into the plaintext. 

In [58]:
reader.plaintext["plaintext"].apply(len).value_counts().sort_index()

AttributeError: 'TelegramReader' object has no attribute 'plaintext'

In [9]:
reader.plaintext[reader.plaintext["plaintext"].apply(len) < 5]

Unnamed: 0,plaintext
5,Okie
6,Thx
24,/in
78,thx
84,Thx
126,haha
158,😈
249,LOL
280,Yea
