In [2]:
from tika import parser
import re

In [3]:
pdf_path = "telegram_gate.pdf"

In [4]:
raw = parser.from_file(pdf_path)

In [9]:
# print(raw['content'])

The text contains a lot of new lines, time stamps, dates, 2 letter username accronyms, and the term "Telegram Web". Let's remove them.

In [10]:
pdf_content = raw['content']

In [11]:
pdf_content = pdf_content.strip("\n")

In [12]:
# Remove time stamps and dates

regex = re.compile(r'([0-9][0-9]?:[0-9][0-9]( )?:[0-9][0-9]( )?(A|P)M)|([0-9][0-9]?\/[0-9][0-9]?\/[0-9][0-9][0-9][0-9])')

pdf_content = re.sub(regex, "", pdf_content)

# Remove 'Telegram Web'

pdf_content = pdf_content.replace("Telegram Web", "")

In [14]:
print(pdf_content[0:200])

  

WRF 12 members 

Friday, November 30, 2018 

Ramon Rosario admin  
RR 

 

https://web .telegram .org/#/im?p=s1209505337 _ 15413785455230905639 1/889 



 

RR 

Edwin Miranda admin 

En PRINT! 




In [16]:
# Create a list of sentences from document

pdf_lines = pdf_content.split("\n")

In [17]:
pdf_lines[0:10]

['  ',
 '',
 'WRF 12 members ',
 '',
 'Friday, November 30, 2018 ',
 '',
 'Ramon Rosario admin  ',
 'RR ',
 '',
 ' ']

In [19]:
# Remove empty string elements

regex = re.compile(r'^( )*$')

pdf_lines = [line for line in pdf_lines if not regex.match(line)]

In [20]:
pdf_lines[0:10]

['WRF 12 members ',
 'Friday, November 30, 2018 ',
 'Ramon Rosario admin  ',
 'RR ',
 'https://web .telegram .org/#/im?p=s1209505337 _ 15413785455230905639 1/889 ',
 'RR ',
 'Edwin Miranda admin ',
 'En PRINT! ',
 'Ramon Rosario admin ',
 'R Rossello admin ']

In [21]:
len(pdf_lines)

31433

In [22]:
# Get admin chat members by keeping unique lines which contain the word 'admin'

admin_chat_members = set(line.strip() for line in pdf_lines if ' admin ' in line)

In [23]:
admin_chat_members

{'Alfonso Orona admin',
 'Alfonso Orona admin edited',
 'Carlos Bermudez admin',
 'Ch Sobri admin',
 'Ch Sobri via @gif admin',
 'Edwin Miranda admin',
 'Edwin Miranda admin  --~ Edwin Miranda',
 'Edwin Miranda admin --Super! Anthony O. Maceira layas Photo',
 'Edwin Miranda admin --~~ @ - Carlos Bermudez Photo',
 'Edwin Miranda admin -~-',
 'Edwin Miranda via @gif admin',
 'F do admin',
 'Fdo admin',
 'Fdo via@gif admin',
 'R Rossello admin',
 'R Rossello admin edited',
 'R Russello admin',
 "Rafael Cerame D'Acosta admin",
 'Ramon Rosario admin'}

In [24]:
# Clean the admin chat members list by removing ' admin'

admin_chat_members_cleaned = set()

for element in admin_chat_members:
    
    index = element.find(' admin')
    
    admin_chat_members_cleaned.add(element[:index])

In [25]:
admin_chat_members = admin_chat_members_cleaned

In [26]:
admin_chat_members

{'Alfonso Orona',
 'Carlos Bermudez',
 'Ch Sobri',
 'Ch Sobri via @gif',
 'Edwin Miranda',
 'Edwin Miranda via @gif',
 'F do',
 'Fdo',
 'Fdo via@gif',
 'R Rossello',
 'R Russello',
 "Rafael Cerame D'Acosta",
 'Ramon Rosario'}

In [27]:
# Remove the elements in admin_chat_members which contain the text 'gif', they are repeated

admin_chat_members = [member for member in admin_chat_members if not('gif' in member)]
admin_chat_members

['R Russello',
 'R Rossello',
 'Fdo',
 "Rafael Cerame D'Acosta",
 'F do',
 'Ch Sobri',
 'Alfonso Orona',
 'Edwin Miranda',
 'Carlos Bermudez',
 'Ramon Rosario']

In [28]:
# Two elements in the admin_chat_members list seem to be typos, remove them

admin_chat_members.remove("R Russello") # assuming it's 'R Rosello' read incorrectly by Tika
admin_chat_members.remove("Fdo") # assuming it's 'F do' read incorrectly by Tika

In [29]:
admin_chat_members

['R Rossello',
 "Rafael Cerame D'Acosta",
 'F do',
 'Ch Sobri',
 'Alfonso Orona',
 'Edwin Miranda',
 'Carlos Bermudez',
 'Ramon Rosario']

In [30]:
len(admin_chat_members)

8

According to https://www.latinorebels.com/2019/07/09/telegramgate/, some of the visible members in the chat were: 
1. Rosselló
* Chief of Staff Ricardo Llerandi 
* the Governor’s former legal advisor Alfonso Orona
* former Public Affairs Secretary for La Fortaleza Ramón Rosario
* government communications advisor Carlos Bermudez
* government communications advisor Rafael Cerame D’Acosta
* publicist and statehood party donor Edwin Miranda
* executive director of the Financial Advisory Authority and Fiscal Agency and representative to the Governor on the Financial Oversight Board Christian Sobrino
* A person identified only as “F do” (possibly Fernando).

From going through the PDF document manually, I found that the non-admin members in the chat are:
* Raul Maldonado
* Anthony O. Maceira Zayas
* Ricardo Llerandi
* LuisG
  
How do we find this without searching manually? Chat members appear in bold, could we have Tika keep text formatting?

Now that we know whom the admin chat members are, let's create their acronyms, search for them in pdf_lines, and remove them.

In [31]:
# Create admin member accronyms

accronyms = set()

for admin_chat_member in admin_chat_members:
    
    names = admin_chat_member.split(" ")
    
    accronym = ""
    for name in names:
        accronym += name[0].upper()
    
    accronyms.add(accronym)

In [32]:
accronyms

{'AO', 'CB', 'CS', 'EM', 'FD', 'RCD', 'RR'}

In [33]:
# Build regular expression for identifying text containing only an accronym and spaces

regular_exp = ""

for accronym in accronyms:
    
    regular_exp += accronym + "|"
    
regular_exp = regular_exp[:len(regular_exp)-1]

regular_exp

'CS|AO|CB|FD|EM|RR|RCD'

In [34]:
# Remove accronyms from pdf_lines

regex = re.compile(r'^( )*(%s)( )*$' % regular_exp)

pdf_lines = [line for line in pdf_lines if not regex.match(line)]

In [36]:
pdf_lines[0:10]

['WRF 12 members ',
 'Friday, November 30, 2018 ',
 'Ramon Rosario admin  ',
 'https://web .telegram .org/#/im?p=s1209505337 _ 15413785455230905639 1/889 ',
 'Edwin Miranda admin ',
 'En PRINT! ',
 'Ramon Rosario admin ',
 'R Rossello admin ',
 'lComo se col6 eso? ',
 'https ://web .telegram .org/#/im?p=s1209505337 _ 15413785455230905639 ']

In [37]:
# Store messages per user into admin_chat_member_messages dictionary, 
# and store the conversation in the list conversation, which will be 
# a list of dictionaries where the dictionary will contain the keys 
# 'chat_member' and 'message'

admin_chat_member_messages = dict()

for chat_member in admin_chat_members:
    admin_chat_member_messages[chat_member] = []
    
conversation = []

current_chat_member = None

new = False

for line in pdf_lines:
        
    for chat_member in admin_chat_members:
        if chat_member in line:
            # don't add chat member element to messages/conversation
            current_chat_member = chat_member
            new = True
            break
            
    if new:
        new = False
        continue
       
    if (current_chat_member != None) and not(current_chat_member in line):
        admin_chat_member_messages[current_chat_member].append(line)
        conversation.append({
            "chat_member": current_chat_member,
            "message": line
        })

In [38]:
admin_chat_member_messages.keys()

dict_keys(['R Rossello', "Rafael Cerame D'Acosta", 'F do', 'Ch Sobri', 'Alfonso Orona', 'Edwin Miranda', 'Carlos Bermudez', 'Ramon Rosario'])

In [39]:
admin_chat_member_messages["R Rossello"][:10]

['lComo se col6 eso? ',
 'https ://web .telegram .org/#/im?p=s1209505337 _ 15413785455230905639 ',
 '2/889 ',
 'https ://web .telegram .org/#/im?p=s1209505337 _ 15413785455230905639 ',
 '3/889 ',
 'https://twitter.com/davilacolon/status/1068463010967158784? ',
 '5=12 ',
 '(https://twitter .com/davilacolon/status/1068463010967158784? ',
 '5=12) ',
 'Twitter ']

In [41]:
conversation[0:10]

[{'chat_member': 'Ramon Rosario',
  'message': 'https://web .telegram .org/#/im?p=s1209505337 _ 15413785455230905639 1/889 '},
 {'chat_member': 'Edwin Miranda', 'message': 'En PRINT! '},
 {'chat_member': 'R Rossello', 'message': 'lComo se col6 eso? '},
 {'chat_member': 'R Rossello',
  'message': 'https ://web .telegram .org/#/im?p=s1209505337 _ 15413785455230905639 '},
 {'chat_member': 'R Rossello', 'message': '2/889 '},
 {'chat_member': 'Alfonso Orona', 'message': 'Buen dfa!!! '},
 {'chat_member': 'Alfonso Orona',
  'message': 'Cifra de Asesinatos para hoy viernes, 30 de noviembre de 2018 '},
 {'chat_member': 'Alfonso Orona', 'message': '2018-571 '},
 {'chat_member': 'Alfonso Orona', 'message': '2017-629 '},
 {'chat_member': 'Alfonso Orona', 'message': '-58 '}]

In [42]:
len(conversation)

25550

In [44]:
for k,v in admin_chat_member_messages.items():
    
    print("{user} sent {num_msg} messages".format(user=k,num_msg=len(v)))

R Rossello sent 3280 messages
Rafael Cerame D'Acosta sent 1606 messages
F do sent 936 messages
Ch Sobri sent 2384 messages
Alfonso Orona sent 2815 messages
Edwin Miranda sent 4848 messages
Carlos Bermudez sent 4911 messages
Ramon Rosario sent 4770 messages
