In [1]:
import os
import pandas as pd
import glob

In [2]:
DIALOGS_DATA_PATH = "../data/dialogs"
DIALOGS_META_DATA_PATH = "../data/dialogs_meta"

MERGED_DATA_PATH = "../data/merged_data"

In [3]:
!ls ../data/

[1m[36mdialogs[m[m      [1m[36mdialogs_meta[m[m [1m[36mmerged_data[m[m


In [4]:
dialogs_data_files = glob.glob(f"{DIALOGS_DATA_PATH}/*.csv")
dialogs_meta_data_files = glob.glob(f"{DIALOGS_META_DATA_PATH}/*.json")

## merging all dialogs_data (csv files) into one

In [5]:
df_array = []

for d in dialogs_data_files:
    local_df = pd.read_csv(d)
    local_df["dialog_id"] = os.path.basename(d).split(".")[0]
    
    df_array.append(local_df)
    
df = pd.concat(df_array, ignore_index=True)

In [6]:
df.shape

(315271, 11)

In [7]:
if not os.path.isdir(MERGED_DATA_PATH):
    os.mkdir(MERGED_DATA_PATH)

if "Unnamed: 0" in df:
    df = df.drop(["Unnamed: 0"], axis=True)
    
df.to_csv(f"{MERGED_DATA_PATH}/dialogs_data_all.csv", index=False)

In [8]:
df.shape

(315271, 10)

In [9]:
df.head(10)

Unnamed: 0,id,date,from_id,to_id,fwd_from,message,type,duration,reactions,dialog_id
0,58429.0,2020-09-23 15:06:51+00:00,PeerUser(user_id=540076029),823583949,,,text,,{},823583949
1,121603.0,2023-11-11 08:44:47+00:00,PeerUser(user_id=540076029),1221533037,,Ukraine 🇺🇦,text,,{},1221533037
2,121602.0,2023-11-11 08:44:22+00:00,PeerUser(user_id=540076029),1221533037,,"Ich bin nicht sicher, was du meinst",text,,{},1221533037
3,121601.0,2023-11-11 08:36:58+00:00,,540076029,,Und woher kommst du genau?,text,,{},1221533037
4,121600.0,2023-11-11 08:36:15+00:00,,540076029,,Was suchst du so?,text,,{},1221533037
5,121599.0,2023-11-11 08:35:34+00:00,,540076029,,Mir geht auch gut,text,,{},1221533037
6,121598.0,2023-11-11 06:44:19+00:00,PeerUser(user_id=540076029),1221533037,,Mir geht’s gut und dir?,text,,{},1221533037
7,121597.0,2023-11-11 06:37:38+00:00,,540076029,,Wie geht es dir so?,text,,{},1221533037
8,121596.0,2023-11-11 05:22:54+00:00,PeerUser(user_id=540076029),1221533037,,Hey?,text,,{},1221533037
9,121595.0,2023-11-11 03:37:46+00:00,,540076029,,Hey,text,,{},1221533037


In [10]:
min(df["date"]),max(df["date"])

('2016-12-27 16:15:24+00:00', '2024-10-17 19:07:24+00:00')

In [11]:
df.groupby(["type"])["type"].count()

type
photo       54889
sticker      9152
text       243603
video        5716
voice        1911
Name: type, dtype: int64

In [12]:
df.groupby(["type"])["duration"].sum()

type
photo           0.000000
sticker         0.000000
text            0.000000
video      190197.649068
voice       28004.000000
Name: duration, dtype: float64

## merging all dialogs_meta_data (json files) into one

In [13]:
df_array = []

for d in dialogs_meta_data_files:
    local_df = pd.read_json(d)
    local_df = local_df.rename({'id': 'dialog_id'}, axis=1)
    df_array.append(local_df)
    
df_meta = pd.concat(df_array, ignore_index=True)

In [14]:
df_meta.to_csv(f"{MERGED_DATA_PATH}/dialogs_users_all.csv", index=False)

In [15]:
df_meta.shape

(2385, 4)

In [16]:
df_meta.head(10)

Unnamed: 0,dialog_id,name,type,users
0,492434726,GuitarRSlavik,Private dialog,"{'user_id': 492434726, 'first_name': 'GuitarRS..."
1,629272826,Tetiana Dulina,Private dialog,"{'user_id': 629272826, 'first_name': 'Tetiana'..."
2,147963511,Yarik Ponomarenko,Private dialog,"{'user_id': 147963511, 'first_name': 'Yarik', ..."
3,396401840,Julia Kaliuzhka,Private dialog,"{'user_id': 396401840, 'first_name': 'Julia Ka..."
4,685852754,Tania,Private dialog,"{'user_id': 685852754, 'first_name': 'Tania', ..."
5,-1001624040028,Computer Vision,Group,"{'user_id': 7387421516, 'first_name': 'AI Bush..."
6,-1001624040028,Computer Vision,Group,"{'user_id': 2133360479, 'first_name': 'Taras',..."
7,-1001624040028,Computer Vision,Group,"{'user_id': 183894264, 'first_name': 'Yehor', ..."
8,429000,Stickers,Private dialog,"{'user_id': 429000, 'first_name': 'Stickers', ..."
9,-1001724599778,ФМ ❤️,Group,"{'user_id': 428524360, 'first_name': 'Yaroслав..."


In [17]:
df_meta.groupby(["type"])["type"].count()

type
Channel             15
Group             2139
Private dialog     231
Name: type, dtype: int64