#### Setup

In [1]:
import pandas as pd
import numpy as np
import regex as re

#### Load and transform data

In [12]:
df_original = pd.read_csv('data.csv')

In [13]:
df_original.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3672 entries, 0 to 3671
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   msg_id    3672 non-null   int64 
 1   time      3672 non-null   object
 2   sender    3672 non-null   object
 3   reply_id  3672 non-null   int64 
 4   msg       3549 non-null   object
dtypes: int64(2), object(3)
memory usage: 143.6+ KB


In [14]:
df = df_original.rename({
    "Message Id": "msg_id", "Time": "time", "Sender Name": "sender", 
    "Reply Id": "reply_id", "Message": "msg"
    }, axis=1)
df.reply_id = df.reply_id.fillna(-1).astype(np.int64)
df.time = df.time.astype(np.datetime64)
df.set_index('msg_id', inplace=True)
df.msg.fillna('', inplace=True)

### Anonymize

In [15]:
def gen_names_map(names):
    new_name = iter(["Michael","Christopher","Jessica","Matthew","Ashley","Jennifer","Joshua","Amanda","Daniel","David","James","Robert","John","Joseph","Andrew","Ryan","Brandon","Jason","Justin","Sarah","William","Jonathan","Stephanie","Brian","Nicole","Nicholas","Anthony","Heather","Eric","Elizabeth","Adam","Megan","Melissa","Kevin","Steven","Thomas","Timothy","Christina","Kyle","Rachel","Laura","Lauren","Amber","Brittany","Danielle","Richard","Kimberly","Jeffrey","Amy","Crystal","Michelle","Tiffany","Jeremy","Benjamin","Mark","Emily","Aaron","Charles","Rebecca","Jacob","Stephen","Patrick","Sean","Erin","Zachary","Jamie","Kelly","Samantha","Nathan","Sara","Dustin","Paul","Angela","Tyler","Scott","Katherine","Andrea","Gregory","Erica","Mary","Travis","Lisa","Kenneth","Bryan","Lindsey","Kristen","Jose","Alexander","Jesse","Katie","Lindsay","Shannon","Vanessa","Courtney","Christine","Alicia","Cody","Allison","Bradley","Samuel","Shawn","April","Derek","Kathryn","Kristin","Chad","Jenna","Tara","Maria","Krystal","Jared","Anna","Edward","Julie","Peter","Holly","Marcus","Kristina","Natalie","Jordan","Victoria","Jacqueline","Corey","Keith","Monica","Juan","Donald","Cassandra","Meghan","Joel","Shane","Phillip","Patricia","Brett","Ronald","Catherine","George","Antonio","Cynthia","Stacy","Kathleen","Raymond","Carlos","Brandi","Douglas","Nathaniel","Ian","Craig","Brandy","Alex","Valerie","Veronica","Cory","Whitney","Gary","Derrick","Philip","Luis","Diana","Chelsea","Leslie","Caitlin","Leah","Natasha","Erika","Casey","Latoya","Erik","Dana","Victor","Brent","Dominique","Frank","Brittney","Evan","Gabriel","Julia","Candice","Karen","Melanie","Adrian","Stacey","Margaret","Sheena","Wesley","Vincent","Alexandra","Katrina","Bethany","Nichole","Larry","Jeffery","Curtis","Carrie","Todd","Blake","Christian","Randy","Dennis","Alison","Trevor","Seth","Kara","Joanna","Rachael","Luke","Felicia","Brooke","Austin","Candace","Jasmine","Jesus","Alan","Susan","Sandra","Tracy","Kayla","Nancy","Tina","Krystle","Russell","Jeremiah","Carl","Miguel","Tony","Alexis","Gina","Jillian","Pamela","Mitchell","Hannah","Renee","Denise","Molly","Jerry","Misty","Mario","Johnathan","Jaclyn","Brenda","Terry","Lacey","Shaun","Devin","Heidi","Troy","Lucas","Desiree","Jorge","Andre","Morgan","Drew","Sabrina","Miranda","Alyssa","Alisha","Teresa","Johnny","Meagan","Allen","Krista","Marc","Tabitha","Lance","Ricardo","Martin","Chase","Theresa","Melinda","Monique","Tanya","Linda","Kristopher","Bobby","Caleb","Ashlee","Kelli","Henry","Garrett","Mallory","Jill","Jonathon","Kristy","Anne","Francisco","Danny","Robin","Lee","Tamara","Manuel","Meredith","Colleen","Lawrence","Christy","Ricky","Randall","Marissa","Ross","Mathew","Jimmy"])
    result = {}
    for name in names:
        result[name] = next(new_name)
    return result

In [16]:
names_map = gen_names_map(df.sender.unique())
df.sender.replace(names_map, inplace=True)
df.msg.replace(names_map, inplace=True)

In [17]:
df.msg = df.msg.str.replace(r'(?<!\w@)\b(?<=@)(\w+)(?<!bot)\b', flags=re.I, 
                   repl=lambda m: names_map[m[0]] if m[0] in names_map else re.sub('.', '*', m[0]))

In [18]:
df.msg = df.msg.str.replace(r'((?<![\d\=\-_]|(?<!\\)[A-z])(?:\(?\+?55\)?)? ?(?:\(?0?[2-9]\d\)?)? ?(?:9[ \.]?)?[1-9]\d{3}[ \-]?\d{4}\b)', flags=re.I, 
                   repl=lambda m: re.sub('\d', '*', m[0]))

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3672 entries, 1 to 3870
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   time      3672 non-null   datetime64[ns]
 1   sender    3672 non-null   object        
 2   reply_id  3672 non-null   int64         
 3   msg       3672 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 143.4+ KB


In [10]:
df.head()

Unnamed: 0_level_0,time,sender,reply_id,msg
msg_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2019-07-14 21:04:58,Michael,-1,
2,2019-07-14 21:17:05,Christopher,-1,
4,2019-07-14 21:18:48,Christopher,-1,✌️🏻
5,2019-07-14 21:20:37,Jessica,-1,
6,2019-07-14 21:20:53,Jessica,-1,"Opa, boa iniciativa!"


In [11]:
df.to_csv('out.csv')

### ...

In [None]:
msg_cnt = df.sender.value_counts() 
df = df[~df.sender.isin(msg_cnt[msg_cnt == 1].index)]

In [None]:
reply_df = df[df.reply_id != -1]
replied_df = df[df.index.isin(reply_df.reply_id)]
links_df = df[df.msg.str.contains(r'https?://[\w\-\.]+')]
has_emoji_df = df[df.msg.str.match(r"[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF]")]

In [None]:
members_df = pd.DataFrame(index=df.sender.unique())

In [None]:
members_df['reply_percent'] = (
    reply_df.sender.value_counts() / df.sender.value_counts()
    ).fillna(0).apply(
        lambda p: '< 1%' if p < 0.01 else '>= 1%')
members_df.reply_percent.value_counts()

>= 1%    67
< 1%     43
Name: reply_percent, dtype: int64

In [None]:
members_df['replied_percent'] = (
    replied_df.sender.value_counts() / df.sender.value_counts()
    ).fillna(0).apply(
        lambda p: '< 10%' if p < 0.10 else '>= 10%')
members_df.replied_percent.value_counts()

>= 10%    61
< 10%     49
Name: replied_percent, dtype: int64

In [None]:
members_df['link_percent'] = (
    links_df.sender.value_counts() / df.sender.value_counts()
    ).fillna(0).apply(
        lambda p: '< 5%' if p < 0.05 else '>= 5%')
# Removing the links so that it will not affect the size of the messages.
df.msg = df.msg.str.replace(r'https?://.+', '')
members_df.link_percent.value_counts()

>= 5%    57
< 5%     53
Name: link_percent, dtype: int64

In [None]:
df['msg_size'] = df.msg.str.len()
members_df['msg_mean_size'] = df.groupby('sender').msg_size.mean().apply(
    lambda s: '< 10 characters' if s < 10 else '< 200 characters' if s < 200 else '>= 200 characters')
members_df.msg_mean_size.value_counts()

< 200 characters     89
< 10 characters      15
>= 200 characters     6
Name: msg_mean_size, dtype: int64

In [None]:
members_df['msg_cnt'] = df.sender.value_counts().apply(
    lambda c: '< 5' if c < 5 else '>= 5')
members_df.msg_cnt.value_counts()

>= 5    63
< 5     47
Name: msg_cnt, dtype: int64

In [None]:
members_df['uses_emoji'] = (has_emoji_df.sender.value_counts() / df.sender.value_counts()).apply(
    lambda p: 'Yes' if p > 0 else "No")
members_df.uses_emoji.value_counts()

No     83
Yes    27
Name: uses_emoji, dtype: int64

In [None]:
import plotly.graph_objects as go

# Create dimensions
msg_cnt_dim = go.parcats.Dimension(
    values=members_df.msg_cnt,label="Number of messages"
)
msg_mean_size_dim = go.parcats.Dimension(
    values=members_df.msg_mean_size,label="Messages size"
)
replied_percent_dim = go.parcats.Dimension(
    values=members_df.replied_percent,label="Percent of replied messages"
)
reply_percent_dim = go.parcats.Dimension(
    values=members_df.reply_percent,label="Percent of reply messages"
)
link_percent_dim = go.parcats.Dimension(
    values=members_df.link_percent,label="Percent of messages with hyperlinks"
)
uses_emoji_dim = go.parcats.Dimension(
    values=members_df.uses_emoji,
    categoryorder='category ascending', label="Uses emoji"
)


In [None]:
group_one = np.int32(members_df.uses_emoji == 'Yes')
group_two = ((members_df.msg_cnt == '< 5') & 
             (members_df.replied_percent == '< 10%') &
             (members_df.reply_percent == '< 1%'))
color = np.int32(group_one) + np.int32(group_two)*2
colorscale = [[0, 'lightsteelblue'], [0.5, 'mediumseagreen'], [1, 'lightsalmon']];

fig = go.Figure(data = [
    go.Parcats(
        dimensions=[msg_cnt_dim, replied_percent_dim, 
                    reply_percent_dim, uses_emoji_dim],
        line={'color': color, 'colorscale': colorscale},
        labelfont={'size': 18, 'family': 'Times'},
        tickfont={'size': 16, 'family': 'Times'},
        arrangement='freeform')])

fig.layout = {
    'title': ('The relation between the use of emotes and the type of interaction' +
              ' over a Telegram Group')
    }

fig.show()

In [None]:
group_one = ((members_df.msg_cnt == '>= 5') & 
             (members_df.replied_percent == '>= 10%') &
             (members_df.reply_percent == '>= 1%'))
group_two = ((members_df.msg_cnt == '< 5') & 
             (members_df.replied_percent == '< 10%') &
             (members_df.reply_percent == '< 1%'))
color = np.int32(group_one) + np.int32(group_two)*2
colorscale = [[0, 'lightsteelblue'], [0.5, 'peru'], [1, 'lightsalmon']];

fig = go.Figure(data = [
    go.Parcats(
        dimensions=[msg_cnt_dim, replied_percent_dim,
                    reply_percent_dim],
        line={'color': color, 'colorscale': colorscale},
        labelfont={'size': 18, 'family': 'Times'},
        tickfont={'size': 16, 'family': 'Times'},
        arrangement='freeform'
        )])

fig.layout = {
    'title': ('The relation between the proportion of replies and the replied' +
              ' messages over a Telegram Group')
    }

fig.show()

In [None]:
fig = go.Figure(data = [
    go.Parcats(
        dimensions=[msg_cnt_dim, replied_percent_dim, 
                    reply_percent_dim, uses_emoji_dim, link_percent_dim, msg_mean_size_dim],
        labelfont={'size': 18, 'family': 'Times'},
        tickfont={'size': 16, 'family': 'Times'},
        arrangement='freeform')])

fig.layout = {
    'title': ('The interaction' +
              ' over a Telegram Group')
    }

fig.show()