In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from collections import Counter
from bs4 import BeautifulSoup
import seaborn as sns
from langdetect import DetectorFactory, detect

%matplotlib inline

In [5]:
wn = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [6]:
df_twcs = pd.read_csv('../Chatbot/dataset/twcs/twcs.csv', nrows=20000).set_index('tweet_id')

In [7]:
inbound_chat = df_twcs[df_twcs.inbound]

In [8]:
df_in_outs = pd.merge(inbound_chat, df_twcs, left_on='tweet_id', 
                                  right_on='in_response_to_tweet_id')

In [9]:
class Preprocess_Dataset:
    def __init__(self, dataframe):
        self.df = dataframe
        self.last_id = 0
        self.conv = []
        self.company_name = ''
        self.df_convs = pd.DataFrame(columns=['author_id', 'company_name', 'dialog'])
        
    def add_to_df(self, last_id, author_id, company_name, text_x, text_y):
        if (last_id == author_id):
            self.conv.append('participant1|'+ " ".join(filter(lambda x:x[0]!='@', text_x.split())))
            self.conv.append('participant2|'+ " ".join(filter(lambda x:x[0]!='@', text_y.split())))
        elif self.last_id != 0:
            if len(self.conv) > 0:
                id = len(self.df_convs)
                self.df_convs.loc[id, 'author_id'] = self.last_id
                self.df_convs.loc[id, 'company_name'] = self.company_name
                self.df_convs.loc[id, 'dialog'] = self.conv
                self.conv = []
            self.last_id = author_id
            
        else:
            self.conv.append('participant1|'+ text_x)
            self.conv.append('participant2|'+ text_y)
            self.last_id = author_id
            self.company_name = company_name
    def create_df(self):
        [self.add_to_df(self.last_id, row[0], row[1], row[2], row[3]) for row in self.df[['author_id_x', 'author_id_y','text_x', 'text_y']].values]
        return self.df_convs

In [15]:
def clean_text(text):
    clean_text = []
    text = re.sub("''", "", text)
    text = " ".join(filter(lambda x:x[0]!='@', text.split())) # Remove the words starts with '@'
    text = re.sub("(\\d|\\W)+"," ", text)
    clean_text = [wn.lemmatize(word, pos="v") for word in word_tokenize(text.lower()) 
                  if (word not in stop_words and word not in list(string.punctuation))]
    return clean_text
    #return " ".join([word for word in clean_text])

In [13]:
def split_participants(conversation):
    part1_dialog = []
    part2_dialog = []
    conv_token = []
    for conv in conversation:
        dialog = conv.split('|')
        if dialog[0] == 'participant1':
            part1_dialog.append(dialog[1])
        else:
            part2_dialog.append(dialog[1])
            
    if (len(part1_dialog) > 0):
        part1_str = " ".join([word for word in part1_dialog])
        conv_token.append(clean_text(part1_str))
    if (len(part2_dialog) > 0):
        part2_str = " ".join([word for word in part2_dialog])
        conv_token.append(clean_text(part2_str))
    return conv_token

In [10]:
cls_prep = Preprocess_Dataset(df_in_outs)

In [11]:
%%time
df_tw_convs = cls_prep.create_df()

Wall time: 10.6 s


In [22]:
(df_tw_convs['dialog'][52])

['participant1|I did this some time ago now but haven’t heard from you...',
 'participant2|Hi Richard, I apologize for the delayed response time. Please DM your conf # so I can review your trip details. *TLT... 1/2',
 "participant1|Actually, I just looked again and they disagree and say it's on you. Any idea what I should do? https://t.co/twIogKd3Kx",
 'participant2|Hello, Richard! Please DM your confirmation# and I will try to assist. *TCC',
 'participant1|Hi - and thanks. Think I just figured this out. Would it be you’re not running the MEX-MID flight so I need to check in via AeroMexico?',
 "participant2|Actually, I just looked again and they disagree and say it's on you. Any idea what I should do? https://t.co/twIogKd3Kx",
 "participant1|I'm flying JFK-MEX-MID tomorrow and you say I'm booked in, but it looks like it's just the first leg. Can you check, please?",
 'participant2|Hi, Richard. Can you pls DM your confirmation number so I may look into this matter for you? Thanks! *TJF 

In [16]:
df_tw_convs['conv_tokens'] = df_tw_convs['dialog'].apply(lambda x: split_participants(x))

In [17]:
def check_lang(conv):
    lng = 'en'
    if(len(conv)>1):
        text = " ".join([word for word in (conv[0] + conv[1])])
        lng = detect(text)
    return lng

In [18]:
df_tw_convs['lang'] = df_tw_convs['conv_tokens'].apply(lambda x: check_lang(x))

In [19]:
# Check the language used for tweet
df_tw_convs['lang'].value_counts()

en    2012
ja      27
es      19
fr      13
nl       9
de       6
it       5
af       5
pt       4
da       3
no       3
ro       2
et       2
cy       1
fi       1
ca       1
sv       1
Name: lang, dtype: int64

In [20]:
# Filter for English
df_tw_convs = df_tw_convs[(df_tw_convs['lang'] == 'en')]

In [21]:
splitted = split_participants(df_tw_convs['dialog'][65])
splitted

[['ready',
  'next',
  'iphone',
  'join',
  'iphone',
  'upgrade',
  'program',
  'free',
  'https',
  'co',
  'elb',
  'yqspf',
  'ready',
  'next',
  'iphone',
  'join',
  'iphone',
  'upgrade',
  'program',
  'free',
  'https',
  'co',
  'elb',
  'yqspf',
  'ready',
  'next',
  'iphone',
  'join',
  'iphone',
  'upgrade',
  'program',
  'free',
  'https',
  'co',
  'elb',
  'yqspf'],
 ['please',
  'let',
  'people',
  'pay',
  'higher',
  'monthly',
  'cost',
  'eliminate',
  'ridiculously',
  'high',
  'cost',
  'cost',
  'per',
  'year',
  'anyone',
  'else',
  'number',
  'unlimited',
  'service',
  'installment',
  'iphone',
  'x',
  'jump',
  'program',
  'go',
  'get',
  'iphone',
  'x']]