In [1]:
import pandas as pd
import numpy as np
import re, emoji, regex

from dateutil import parser
from sklearn.preprocessing import LabelEncoder
from urlextract import URLExtract

# Cleaning

In [2]:
df_original = pd.read_csv("belgavi_teachers.csv")
df_original.head(5)

Unnamed: 0,Date_Time,Author,Author_role,Message,Category,Code,Code.1,Comment,Comment.1
0,"03/01/2019, 7:13 pm",- Messages to this group are now secured with ...,,,,,,,
1,"03/01/2019, 7:26 pm",- +91 94484 20837:,Teacher,🙏🏼👌🏼,,,,,
2,"03/01/2019, 7:27 pm",- +91 99724 48016:,Teacher,Congratulations 💐💐💐,,,,,
3,"03/01/2019, 7:27 pm",- +91 99459 99466:,Teacher,<Media omitted>,,,,,
4,"03/01/2019, 7:29 pm",- Kush Desai:,Teacher,👌👌💐💐,,,,,


In [3]:
#Execute only once to get the cleaned Dataframe

df = df_original.copy()
df = df.drop(df.index[0]) #Removes the Encryption automated message
df = df.drop(['Comment.1'], axis = 1) #Removes empty column
df.columns = ['date_time','sender','sender_role',
            'message','category_type','code',
            'code_1', 'comments']

In [4]:
def convert_date(date_time):
    # Converts date_time into machine readable timestamps
    return parser.parse(date_time)

def demojize(text):
    # Removes Emojis from a given text
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text) # no emoji

def split_count(text):
    # Creates a list of emojis from a given text
    emoji_list = []
    data = regex.findall(r'\X', text)
    
    for word in data:
        if any(char in emoji.UNICODE_EMOJI for char in word):
            emoji_list.append(word)

    return emoji_list

def find_url(text):
    # Creates a list of URLs found in a given text
    extractor = URLExtract()
    return extractor.find_urls(text)

In [5]:
# Converts the date_time into machine readable time stamp
df.date_time = df.date_time.apply(convert_date)

In [6]:
# Converting senders to sender IDs
le = LabelEncoder()
le.fit(df['sender'])
df['sender_id'] = le.transform(df['sender'])

In [7]:
# Creates a dictionary of sender ids mapped to sender nums
sender_ids = le.transform(le.classes_)
sender_nums = le.inverse_transform(sender_ids)
le_name_mapping = dict(zip(sender_ids, sender_nums))
le_name_mapping[5]

'- +91 78994 45604:'

In [8]:
message_series = [] #Original messages with empty strings at group activities
em_series = [] #List of emojis
demojized_series = [] #Cleaned message devoid of emojis
url_series = [] #List of URLs found in the message
media_binary_series = [] #Binary encoding whether the message contains a media file

for i in range(len(df.message)):
    if pd.isnull(df.message.iloc[i]) == False:
        msg = df.message.iloc[i]
        em_list = split_count(df.message.iloc[i])
        dmj = demojize(df.message.iloc[i])
        url = find_url(df.message.iloc[i])
        if df.message.iloc[i] == "<Media omitted>":
            media = 1
        else:
            media = 0
        
    else:
        msg = ""
        em_list = []
        dmj = ""
        url = []
        media = 0
    
    message_series.append(msg)
    em_series.append(em_list)
    demojized_series.append(dmj)
    url_series.append(url)
    media_binary_series.append(media)

In [9]:
df['emoji'] = em_series
df['messages_clean'] = demojized_series
df['url'] = url_series
df['has_media'] = media_binary_series

In [10]:
df.head(5)

Unnamed: 0,date_time,sender,sender_role,message,category_type,code,code_1,comments,sender_id,emoji,messages_clean,url,has_media
1,2019-03-01 19:26:00,- +91 94484 20837:,Teacher,🙏🏼👌🏼,,,,,42,"[🙏🏼, 👌🏼]",,[],0
2,2019-03-01 19:27:00,- +91 99724 48016:,Teacher,Congratulations 💐💐💐,,,,,111,"[💐, 💐, 💐]",Congratulations,[],0
3,2019-03-01 19:27:00,- +91 99459 99466:,Teacher,<Media omitted>,,,,,103,[],<Media omitted>,[],1
4,2019-03-01 19:29:00,- Kush Desai:,Teacher,👌👌💐💐,,,,,131,"[👌, 👌, 💐, 💐]",,[],0
5,2019-03-01 19:31:00,- +91 99721 55199:,Teacher,[Translated] May God give Madam his ideal and ...,,,,,110,"[🙏🏻, 🙏🏻, 🙏🏻]",[Translated] May God give Madam his ideal and ...,[],0


# Translation

In [11]:
# Imports the Google Cloud client library
from google.cloud import translate

# Instantiates a client
translate_client = translate.Client()

# The text to translate
text = u'Hello, world!'
# The target language
target = 'ru'

# Translates some text into Russian
translation = translate_client.translate(
    text,
    target_language=target)

print(u'Text: {}'.format(text))
print(u'Translation: {}'.format(translation['translatedText']))

DefaultCredentialsError: Could not automatically determine credentials. Please set GOOGLE_APPLICATION_CREDENTIALS or explicitly create credentials and re-run the application. For more information, please see https://cloud.google.com/docs/authentication/getting-started

In [14]:
import os

print('Credendtials from environ: {}'.format(
    os.environ.get('GOOGLE_APPLICATION_CREDENTIALS')))

Credendtials from environ: None


In [13]:
from google.oauth2 import service_account

credentials = service_account.Credentials.from_service_account_file(
    'whatsapp-translate-824a6a72c9c3.json')

scoped_credentials = credentials.with_scopes(
    ['https://www.googleapis.com/auth/cloud-platform'])