In [2]:
import os
import tarfile
import zipfile
import pandas as pd
import re
import numpy as np
import pickle

# Extracting data

In [10]:
if not os.path.exists('data/customer-support'):
    with zipfile.ZipFile('data/customer-support-on-twitter.zip') as f:
        f.extractall('data/customer-support')

In [2]:
if not os.path.exists('data/ubuntu-dialogue'):
    with zipfile.ZipFile('data/ubuntu-dialogue-corpus.zip') as f:
        f.extractall('data/ubuntu-dialogue')

# Loading data

## Twitter Customer Support data

In [90]:
twitter = pd.read_csv('data/customer-support/twcs/twcs.csv')
twitter = twitter.astype({'in_response_to_tweet_id': 'object', 'response_tweet_id': 'object'})

twitter.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,1,sprintcare,False,Tue Oct 31 22:10:47 +0000 2017,@115712 I understand. I would like to assist y...,2.0,3.0
1,2,115712,True,Tue Oct 31 22:11:45 +0000 2017,@sprintcare and how do you propose we do that,,1.0
2,3,115712,True,Tue Oct 31 22:08:27 +0000 2017,@sprintcare I have sent several private messag...,1.0,4.0
3,4,sprintcare,False,Tue Oct 31 21:54:49 +0000 2017,@115712 Please send us a Private Message so th...,3.0,5.0
4,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,@sprintcare I did.,4.0,6.0


In [91]:
twitter.drop(columns=['created_at'], inplace=True)

twitter['text'] = twitter['text'].apply(lambda text: re.sub(r'([^A-Za-z0-9]+@[A-Za-z0-9_]+)|(^@[A-Za-z0-9_]+)', '', text))
twitter['text'] = twitter['text'].apply(lambda text: text.strip())
#twitter['text'] = twitter['text'].apply(lambda text: text.replace('\n', ' '))
twitter = twitter.set_index('tweet_id')

twitter.head()

Unnamed: 0_level_0,author_id,inbound,text,response_tweet_id,in_response_to_tweet_id
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,sprintcare,False,I understand. I would like to assist you. We w...,2.0,3.0
2,115712,True,and how do you propose we do that,,1.0
3,115712,True,I have sent several private messages and no on...,1.0,4.0
4,sprintcare,False,Please send us a Private Message so that we ca...,3.0,5.0
5,115712,True,I did.,4.0,6.0


In [5]:
twitter_dict = twitter.to_dict('index')

In [41]:
twitter_ = {}

for idx, values in twitter_dict.items(): 
    twitter_[idx] = {
        'author_id': values['author_id'],
        'inbound': values['inbound'],
        'text': values['text'],
        'response_tweet_id': [int(i) for i in str(values['response_tweet_id']).split(',')] if not pd.isna(values['response_tweet_id']) else [],
        'in_response_to_tweet_id': int(values['in_response_to_tweet_id']) if not pd.isna(values['in_response_to_tweet_id']) else np.nan
    }

In [42]:
twitter_inbounds_dict = {k: v for k, v in twitter_.items() if v['inbound'] == True and pd.isna(v['in_response_to_tweet_id'])}

In [43]:
twitter_ = {k: v['response_tweet_id'] for k,v in twitter_.items()}

In [44]:
twitter_inbounds_dict = {k: v['response_tweet_id'] for k,v in twitter_inbounds_dict.items()}

In [66]:
def get_conversations():
    conversations = []
    for start_id in twitter_inbounds_dict.keys():
        visitedList = []

        def depthFirst(graph, currentVertex, visited):
            visited.append(currentVertex)
            if currentVertex in graph:
                for vertex in graph[currentVertex]:
                    if vertex not in visited:
                        depthFirst(graph, vertex, visited.copy())
                visitedList.append(visited)

        depthFirst(twitter_, start_id, [])

        conversations.extend([l for l in visitedList if len(l) > 1])

    return conversations

In [67]:
conversations_ids = get_conversations()

In [95]:
conversations = []

for c in conversations_ids:
    conv = []
    for idx in c:
        conv.append(twitter_dict[idx]['text'].strip())
    conversations.append(conv)

In [98]:
conversations_filtered = [c for c in conversations if len(c) ==4]

In [101]:
len(conversations_filtered)

262565

In [100]:
with open('data/twitter_data.pickle', 'wb') as f:
    pickle.dump(conversations_filtered, f)

## Ubuntu Dialogue

In [116]:
ubuntu = pd.read_csv('data/ubuntu-dialogue/Ubuntu-dialogue-corpus/dialogueText_301.csv', nrows=200000)
ubuntu = ubuntu.sort_values(['dialogueID', 'date'], ignore_index=True)
ubuntu['to'] = ubuntu['to'].replace({np.nan: ''})
ubuntu.head(10)

Unnamed: 0,folder,dialogueID,date,from,to,text
0,60,1.tsv,2004-09-07T09:18:00.000Z,fabbione,,FUCK
1,60,1.tsv,2004-09-07T09:18:00.000Z,fabbione,,they stolen my car!
2,60,1.tsv,2004-09-07T09:18:00.000Z,fabbione,,FUCK
3,60,1.tsv,2004-09-07T09:18:00.000Z,jdub,fabbione,!!!
4,60,1.tsv,2004-09-12T13:12:00.000Z,jdub,fabbione,your hackergotchi can be a bum :)
5,60,1.tsv,2004-09-13T06:41:00.000Z,fabbione,jdub,dude?
6,60,1.tsv,2004-09-13T12:33:00.000Z,fabbione,jdub,are you going to fix it???
7,60,1.tsv,2004-09-14T12:18:00.000Z,jdub,fabbione,fix on its way
8,60,1.tsv,2004-09-14T12:20:00.000Z,fabbione,jdub,thanks
9,60,1.tsv,2004-09-17T11:48:00.000Z,fabbione,jdub,mine is normal installation. i don't customize...


In [118]:
def merge(d):
    texts = []
    _temp = d['text'][0]
    last_from = d['from'][0]
    last_to = d['to'][0]

    for i in range(1, len(d['text'])):
        if last_from == d['from'][i] and last_to == d['to'][i]:
            _temp += " " + str(d['text'][i])

        else:
            last_from = d['from'][i]
            last_to = d['to'][i]
            texts.append(_temp)
            _temp = str(d['text'][i])

    return texts

ubuntu_dialogues = ubuntu['dialogueID'].unique()

total = []
for dialogue in ubuntu_dialogues:
    ubuntu_sub = ubuntu[ubuntu['dialogueID'] == dialogue]

    total.append(merge({'text': ubuntu_sub['text'].tolist(), 'from': ubuntu_sub['from'].tolist(), 'to': ubuntu_sub['to'].tolist()}))

In [119]:
len(total)

7736

In [120]:
total = [t[i:i+4] for t in total for i in range(0, len(t), 4) if len(t[i:i+4]) == 4 ]

In [121]:
len(total)

24251

In [122]:
with open('data/ubuntu_data.pickle', 'wb') as f:
    pickle.dump(total, f)

## Twitter Customer Support data

In [None]:
utterances_dict = {}

with open('data/cornell movie-dialogs corpus/movie_lines.txt', 'rt', encoding='iso-8859-1') as f:
    for line in f:
        l = line.split('+++$+++')

        utterances_dict[l[0].strip()] = l[-1].strip()

In [None]:
conversations = []

with open('data/cornell movie-dialogs corpus/movie_conversations.txt', 'rt', encoding='iso-8859-1') as f:
    for line in f:
        conv = line.split('+++$+++')[-1]
        conv = conv[2:-2].split(', ')

        conv = [c.strip('\'') for c in conv]

        conversations.append([utterances_dict[c] for c in conv])

In [None]:
sum([len(c) for c in conversations])/len(conversations)

In [None]:
ready_conversations = [list(reversed(c[:4])) for c in conversations if len(c) >= 4]
len(ready_conversations)

In [None]:
with open('data/movie_data.pickle', 'wb') as f:
    pickle.dump(ready_conversations, f)

In [None]:
tolokers = pd.read_json('data/data_tolokers.json')['dialog']

tolokers_conversations = []

for i in range(len(tolokers)):
    conv = []
    for j in range(0, len(tolokers[i])-3):
        conv.append(list(reversed([t['text'] for t in tolokers[i][j:j+4]] )))
    tolokers_conversations.extend(conv)

In [1]:
len(tolokers_conversations)

NameError: name 'tolokers_conversations' is not defined

In [None]:
with open('data/tolokers_data.pickle', 'wb') as f:
    pickle.dump(tolokers_conversations, f)