Skip to content

Commit

Permalink
Add script to import data from desktop client json dumps. Implement #10.
Browse files Browse the repository at this point in the history
  • Loading branch information
mkdryden committed Dec 19, 2022
1 parent 7d80fc8 commit b8b8eff
Show file tree
Hide file tree
Showing 6 changed files with 159 additions and 15 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.rst
Expand Up @@ -13,6 +13,10 @@ Fixed
-----
- Sticker pack names save correctly now

Added
-----
- Add script to import data from desktop client json dumps

-------------
`0.6.4`_ - 2022-02-27
-------------
Expand Down
22 changes: 22 additions & 0 deletions README.rst
Expand Up @@ -39,6 +39,8 @@ Table of contents

- `Setup`_

- `Importing Data`_

- `Fetching Stats`_

- `counts`_
Expand Down Expand Up @@ -191,6 +193,26 @@ you've sent a message to trigger the update).
You can see if messages are being logged correctly by reviewing the terminal output.
You should see a line like ``2020-06-04 02:08:39,212 - __main__ - INFO - 8``, whenever a message is logged.

--------------
Importing Data
--------------
Data can be imported from JSON dumps exported by the desktop client.
Open the desired group, click the three-dot menu button, and select "Export chat history".
Make sure you select JSON as the output format.
You can also limit the date range, as desired.
Messages already in the database are left untouched and only missing ones are added, so you can use this feature to fill in gaps when the bot was not running.

To import data, simply call:

.. code:: shell

    $ python -m telegram_stats_bot.json_dump_parser "/some/path/to/dump.json" "postgresql://telegram:CoolPassword@localhost/telegram_bot" --tz="America/Toronto"

Where the first argument is the path to the JSON dump, the second is the database connection string, as above, and the optional ``tz`` argument should be the time zone of the system used to dump the JSON.

This can be run without stopping a running bot. Note that it also attempts to set the user id to user name mapping, so it will add an extra entry for every user in the dump (this currently only affects the user stats related to user name changes).
Before you run this, make sure your database connection string is correct, or you might accidentally modify other databases on the same server.

--------------
Fetching Stats
--------------
Expand Down
41 changes: 40 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Expand Up @@ -19,6 +19,7 @@ numpy = "^1.22.0"
matplotlib = "^3.2.1"
appdirs = "^1.4.4"
single-source = "^0.2.0"
typer = "^0.7.0"

[tool.poetry.dev-dependencies]

Expand Down
4 changes: 2 additions & 2 deletions telegram_stats_bot/db.py
Expand Up @@ -74,8 +74,8 @@ def init_dbs(engine: Engine):
create table if not exists user_events
(
message_id bigint,
user_id bigint,
date timestamp with time zone,
user_id text,
date timestamptz,
event text
);
Expand Down
102 changes: 90 additions & 12 deletions telegram_stats_bot/json_dump_parser.py
Expand Up @@ -23,6 +23,11 @@
import typing

import pandas as pd
import sqlalchemy.engine
import typer
from sqlalchemy import create_engine

from .stats import StatsRunner

media_dict = {'sticker': 'sticker',
'animation': 'animation',
Expand All @@ -31,8 +36,8 @@
'audio_file': 'audio',
'video_message': 'video_note'}

user_event_cat = pd.Categorical(['left', 'joined'])
message_type_cat = pd.Categorical(['migrate_from_group', 'text', 'pinned_message', 'photo', 'sticker',
user_event_cat = pd.CategoricalDtype(['left', 'joined'])
message_type_cat = pd.CategoricalDtype(['migrate_from_group', 'text', 'pinned_message', 'photo', 'sticker',
'new_chat_members', 'left_chat_member', 'animation', 'video',
'location', 'new_chat_title', 'voice', 'audio',
'new_chat_photo', 'video_note', 'poll'])
Expand All @@ -50,9 +55,10 @@ def text_list_parser(text: typing.Union[str, typing.Sequence]) -> str:
return out


def convert_messages(df: pd.DataFrame) -> typing.Tuple[typing.List[dict], typing.List[dict]]:
def convert_messages(df: pd.DataFrame) -> typing.Tuple[typing.List[dict], typing.List[dict], dict]:
messages_out = []
users_out = []

for message in df.itertuples():
message_dict = {'message_id': message.id,
'date': message.date,
Expand All @@ -71,16 +77,18 @@ def convert_messages(df: pd.DataFrame) -> typing.Tuple[typing.List[dict], typing
user_event_dict = {}
if message.type == 'message':
if pd.notnull(message.from_id):
message_dict['from_user'] = message.from_id
if not message.from_id.startswith('user'):
continue
message_dict['from_user'] = int(message.from_id[4:]) # remove 'user' from id

if pd.notnull(message.forwarded_from):
try:
message_dict['forward_from'] = int(message.forwarded_from)
message_dict['forward_from'] = int(message.from_id[4:]) # username is used in forwarded_from
except ValueError:
pass

if pd.notnull(message.reply_to_message_id):
message_dict['reply_to_message'] = message.reply_to_message_id
message_dict['reply_to_message'] = int(message.reply_to_message_id)

if pd.notnull(message.photo):
message_dict['type'] = 'photo'
Expand All @@ -97,12 +105,11 @@ def convert_messages(df: pd.DataFrame) -> typing.Tuple[typing.List[dict], typing
message_dict['text'] = text_list_parser(message.text)
elif pd.notnull(message.poll):
message_dict['type'] = 'poll'
elif pd.notnull(message.location_information):
message_dict['type'] = 'location'

elif message.type == 'service':
if pd.notnull(message.actor_id):
message_dict['from_user'] = message.actor_id
if message.actor_id.startswith('user'):
message_dict['from_user'] = int(message.actor_id[4:])

if message.action == 'edit_group_title':
message_dict['type'] = 'new_chat_title'
Expand All @@ -118,12 +125,12 @@ def convert_messages(df: pd.DataFrame) -> typing.Tuple[typing.List[dict], typing
users_out.append({'message_id': message.id,
'user_id': i,
'date': message.date,
'event': 'join'})
'event': 'joined'})
except TypeError:
user_event_dict = {'message_id': message.id,
'user_id': message.actor_id,
'date': message.date,
'event': 'join'}
'event': 'joined'}
elif message.action == 'remove_members':
message_dict['type'] = 'left_chat_member'
for i in message.members:
Expand All @@ -136,11 +143,82 @@ def convert_messages(df: pd.DataFrame) -> typing.Tuple[typing.List[dict], typing
messages_out.append(message_dict)
if user_event_dict != {}:
users_out.append(user_event_dict)
return messages_out, users_out

user_map = {int(i[4:]): df.loc[df['from_id'] == i, 'from'].iloc[0]
for i in df['from_id'].unique()
if (df['from_id'] == i).any() and i.startswith('user')}

# Use long name for both name and long name since we can't fetch usernames
user_map = {k: (v, v) for k, v in user_map.items() if v}

return messages_out, users_out, user_map


def parse_json(path: str) -> pd.DataFrame:
    """
    Load the messages of one chat from a multi-chat JSON export.

    The file is expected to contain ``js['chats']['list']``; the ``messages``
    list of the entry at index 1 is turned into a DataFrame (one row per
    message dict).

    :param path: Path to the UTF-8 encoded JSON file.
    :return: DataFrame built from the message dicts of the selected chat.
    :raises KeyError, IndexError: If the file does not have the expected layout.
    """
    with open(path, encoding='utf-8') as f:
        js = json.load(f)
    chat = js['chats']['list'][1]['messages']
    # Hand the frame back to the caller; without the return the parsed data is lost.
    return pd.DataFrame(chat)


def fix_dtypes_m(df: pd.DataFrame, tz: str) -> pd.DataFrame:
    """
    Normalise the column dtypes of the converted messages frame.

    The id-like columns become nullable ``Int64``, ``date`` is parsed as naive
    timestamps and localised to ``tz``, and ``type`` is cast to the
    ``message_type_cat`` categorical dtype. The input frame is not modified.

    :param df: Frame of converted messages; must contain the id columns listed
               below plus ``date`` and ``type``.
    :param tz: Time zone name the naive timestamps are localised to.
    :return: A new frame with the converted dtypes, passed through ``convert_dtypes()``.
    :raises KeyError: If one of the expected columns is missing.
    """
    intcols = ['forward_from_message_id', 'forward_from', 'forward_from_chat',
               'from_user', 'reply_to_message']
    df_out = df.copy()
    # Replace whole columns instead of writing through ``.loc[:, col] = ...``:
    # a ``.loc`` assignment sets values into the existing column in place and
    # therefore keeps the old dtype rather than adopting the converted one.
    df_out = df_out.astype({col: 'Int64' for col in intcols})
    # ambiguous=True: wall times that occur twice at a DST change are taken as DST.
    df_out['date'] = pd.to_datetime(df_out['date'], utc=False).dt.tz_localize(tz=tz,
                                                                              ambiguous=True)
    df_out['type'] = df_out['type'].astype(message_type_cat)
    return df_out.convert_dtypes()


def fix_dtypes_u(df: pd.DataFrame, tz: str) -> pd.DataFrame:
    """
    Normalise the column dtypes of the converted user-events frame.

    ``date`` is parsed as naive timestamps and localised to ``tz``; any
    ``'join'`` event is renamed to ``'joined'`` and ``event`` is cast to the
    ``user_event_cat`` categorical dtype. The input frame is not modified.

    :param df: Frame of user events; must contain ``date`` and ``event``.
    :param tz: Time zone name the naive timestamps are localised to.
    :return: A new frame with the converted dtypes, passed through ``convert_dtypes()``.
    """
    df_out = df.copy()
    # Replace the whole column (as done for ``event`` below) rather than writing
    # through ``.loc[:, 'date'] = ...``, which sets values in place and keeps the
    # column's previous dtype.
    # ambiguous=True: wall times that occur twice at a DST change are taken as DST.
    df_out['date'] = pd.to_datetime(df_out['date'], utc=False).dt.tz_localize(tz=tz,
                                                                              ambiguous=True)
    # Row-masked update: only rows carrying the 'join' spelling are rewritten.
    df_out.loc[df_out.event == 'join', 'event'] = 'joined'
    df_out['event'] = df_out.event.astype(user_event_cat)

    return df_out.convert_dtypes()


def update_user_list(users: dict[int, tuple[str, str]], engine: sqlalchemy.engine.Engine, tz: str):
    """
    Hand a user-id to names mapping to ``StatsRunner.update_user_ids``.

    :param users: Mapping of user id to a pair of name strings.
    :param engine: SQLAlchemy engine the ``StatsRunner`` is constructed with.
    :param tz: Time zone name the ``StatsRunner`` is constructed with.
    """
    StatsRunner(engine, tz).update_user_ids(users)


def _append_new_rows(df: pd.DataFrame, table: str, engine: sqlalchemy.engine.Engine) -> None:
    """Append to ``table`` the rows of ``df`` whose ``message_id`` index is not stored yet."""
    # Only the id column is needed for the membership test, so fetch just that.
    known_ids = pd.read_sql_table(table, engine, columns=['message_id']).message_id
    df.loc[~df.index.isin(known_ids)].to_sql(table, engine, if_exists='append')


def main(json_path: str, db_url: str, tz: str = 'Etc/UTC'):
    """
    Parse backup json file and update database with contents.

    Rows whose ``message_id`` is already present in ``messages_utc`` /
    ``user_events`` are skipped; everything else is appended. Afterwards the
    user id to name mapping found in the dump is passed to ``update_user_list``.

    :param json_path: Path to the JSON dump; its top-level ``messages`` list is read.
    :param db_url: Database connection URL handed to ``sqlalchemy.create_engine``.
    :param tz: Time zone name used to localise the timestamps in the dump.
    :return: None
    """
    with open(json_path, encoding='utf-8') as f:
        js = json.load(f)

    messages, users, user_map = convert_messages(pd.DataFrame(js['messages']))

    # Build both frames before touching the database so a conversion error
    # cannot leave a half-finished import behind. An empty record list has no
    # 'message_id' column to index on, so such a table is simply skipped.
    df_m = fix_dtypes_m(pd.DataFrame(messages).set_index('message_id'), tz) if messages else None
    df_u = fix_dtypes_u(pd.DataFrame(users).set_index('message_id'), tz) if users else None

    engine = create_engine(db_url, echo=False)

    # Exclude existing messages; user events are written first, then messages.
    if df_u is not None:
        _append_new_rows(df_u, 'user_events', engine)
    if df_m is not None:
        _append_new_rows(df_m, 'messages_utc', engine)

    update_user_list(user_map, engine, tz)


if __name__ == '__main__':
    # Script entry point: hand main to typer, which runs it as the command line program.
    typer.run(main)

0 comments on commit b8b8eff

Please sign in to comment.