In [2]:
import numpy as np
import os
import operator
import pandas as pd
import plotly.graph_objs as go

from plotly.offline import init_notebook_mode, plot, iplot
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA

init_notebook_mode(connected=True)

## Loading data

In [3]:
# Loading the data into as a pandas data frame
data = pd.read_csv("./data/ubuntu_support_extract.csv")
data.head(10)

Unnamed: 0,conversation_id,datetime,from,to,text
0,10-10000,2010-04-17 20:15:00+00:00,fk91,,"Hello, I have a minimal linux system: how can ..."
1,10-10000,2010-04-17 20:15:00+00:00,fk91,,"@Maco: ip is there, thanks :)"
2,10-10000,2010-04-17 20:15:00+00:00,sometux,fk91,ifconfig
3,10-10000,2010-04-17 20:15:00+00:00,sometux,fk91,static or dhcp
4,10-10000,2010-04-17 20:16:00+00:00,fk91,,static
5,10-10000,2010-04-17 20:16:00+00:00,sometux,fk91,look at /etc/interface
6,10-10000,2010-04-17 20:17:00+00:00,fk91,sometux,"/etc/interfaces are not there, its a fli4l"
7,10-10000,2010-04-17 20:17:00+00:00,sometux,fk91,sorry look at /etc/network/interfaces
8,10-10000,2010-04-17 20:18:00+00:00,fk91,sometux,This file isnt there too.
9,10-10000,2010-04-17 20:19:00+00:00,sometux,fk91,i think you have to look in /proc


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3075574 entries, 0 to 3075573
Data columns (total 5 columns):
conversation_id    object
datetime           object
from               object
to                 object
text               object
dtypes: object(5)
memory usage: 117.3+ MB


In [4]:
# Checking number of rows with missing values in each of the columns
data.isnull().sum()

conversation_id          0
datetime                 0
from                    25
to                 1133776
text                     2
dtype: int64

In [5]:
# Removing rows which with missing values in 'text' and 'from' columns only,
# column 'to' can have valid null values in the first msg of the conversation.
data = data.dropna(subset=['text','from']) 
data.shape

(3075547, 5)

In [6]:
# Verifying if rows with null values in 'text' and 'from' columns are removed
data.isnull().sum()

conversation_id          0
datetime                 0
from                     0
to                 1133752
text                     0
dtype: int64

In [7]:
from_users = data['from'].tolist()
to_users = data['to'].tolist()
print(len(set(from_users)), ',', len(set(to_users)))

92495 , 89090


In [8]:
# Total number of unique users
print(len(set(from_users + to_users)))

92519


## Count based approach for selecting agents
A naive count based selection of agents. Rationale is that the users involved in high number of messages/conversations are:
1. either advisors who responded to an issue
2. or enquirers who gained knowledge after querying about an issue

Either way, users involved in both sides of conversation are assumed to have knowledge of the covered topics
'after' a conversation has taken place.

Minimal incremental improvements: 
1. Filter out users who were on the advising side most of the times, based on the initial inquiry in the conversation.
2. Give more weightage to the users who are currently active, based on date time. 
3. Ranking agents based on the average handle/response time based on the time stamps. 
4. The below agent selection mechanism does not use the language used in the conversations to filter out the conversations in a different language. But this can be done in a minimal way by using off the shelf language detectors.

In [4]:
# Computing total incoming and outgoing messages for each user. 
from_user_dist = data.drop(columns=['text','to','datetime']).groupby(['from'])\
                              .size().reset_index(name='outgoing')

from_user_dist = from_user_dist.rename(columns={'from': 'users'})

to_user_dist = data.drop(columns=['text','from','datetime']).groupby(['to'])\
                            .size().reset_index(name='incoming') 
to_user_dist = to_user_dist.rename(columns={'to': 'users'})

In [5]:
# Sorting the agents based on outgoing/incoming messages.
incoming_outgoing = pd.merge(from_user_dist, to_user_dist, on='users')\
                    .sort_values(ascending=False, by='outgoing')
incoming_outgoing.head(15)

Unnamed: 0,users,outgoing,incoming
349,ActionParsnip,55125,27053
6001,Dr_Willis,36626,16590
51341,ikonia,30715,13762
43183,edbian,21307,9511
54888,jrib,18491,11398
44083,erUSUL,17653,9789
33104,bazhang,17512,6701
38388,coz_,14329,5404
13215,Jordan_U,13537,8821
81069,theadmin,13047,6662


## Topic based approach for selecting agents

The above quantitative approach does not take into account the topics/knowledge coverage, 
and the distribution of topics in the conversations. If conversations cover 3 topics in a 
distribution of 60:20:20, a good approach will be that the top 15 agents should be distributed across 
the 3 topics in a similar way, i.e., 9:3:3

Topics can be seen as clusters, where algorithms like LDA (~soft clustering approach) can be utilized to uncover themes being discussed in the conversations.

This approach first identifies n number of topics in the full dataset, and calculate the coverage of each topic in each message, using LDA. LDA identifies topics as a cluster of tokens. N more columns are added to the dataset where each column represents a topic, and holds a % value which is the topic coverage of the message. This final dataset can be queried in a number of ways to make a topic based selection of users.

In [None]:
def plot_frequent_n_words(n, word_count_dict):
    sorted_word_count = sorted(word_count_dict.items(), key=operator.itemgetter(1), reverse=True)
    word_count = sorted_word_count[:n+1]
    trace1 = go.Bar(
      x = [i[0] for i in word_count], 
      y = [i[1] for i in word_count], 
      marker = dict(color='blue'))

    data = [trace1]
    layout = go.Layout(
      title= "Ngrams Frequency", 
      xaxis= dict(
      title= "Ngrams"), 
      yaxis=dict(title="Count")
    )
    
    fig = go.Figure(data=data, layout=layout)
    iplot(fig, filename='jupyter-styled_bar')

In [None]:
def print_topics(model, vectorizer, num_ngrams):
    ngrams = vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(", ".join([ngrams[i]
                        for i in topic.argsort()[:-num_ngrams - 1:-1]]))

In [None]:
data_sample = data.sample(n=10000)
data_sample.shape

In [None]:
# TODO
# Improvement: text pre-processing. Removal of stopwords, adjectives, adverbs etc. 
# which are less likely to represent the topic in a conversation. 

In [None]:
count_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 3))
ngrams_cv = count_vectorizer.fit_transform(data_sample['text'])
# Ngrams and their count
ngrams = count_vectorizer.get_feature_names()
counts = ngrams_cv.toarray().sum(axis=0)        

In [None]:
plot_frequent_n_words(20, dict(zip(ngrams, counts)))

In [None]:
# LDA parameters 
number_topics = 5
topic_ngrams = 20

# Learning a topic model on the message texts
lda = LDA(n_components=number_topics)
lda.fit(ngrams_cv)

# Print ngrams in each topic identified by LDA, each topic would have its own weights 
# over different ngrams.
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, topic_ngrams)

In [None]:
#transform text messages into their respective topic distributions
message_lda_topics_vectors = lda.transform(ngrams_cv).tolist()

# Print topic distribution for 5 sample messages.
for n in range(5):    
    topic_pr = message_lda_topics_vectors[n]    
    print("msg: {} topic: {}\n".format(n, topic_pr))

In [None]:
topics_df = pd.DataFrame(message_lda_topics_vectors, columns = ['Topic 1', 'Topic 2','Topic 3','Topic 4','Topic 5'])
topics_df.head(10)

In [None]:
# Adding a column 'message_id' to use it for join with the topic vectors
data_sample['message_id'] = range(1, len(data_sample) + 1)
data_sample.head(10)

In [None]:
# Assigning message ids to allow join with the data  
topics_df['message_id'] = range(1, len(topics_df) + 1)
topics_df.head(10)

In [None]:
#original dataset extended with topic columns
data_sample = pd.merge(data_sample, topics_df, on='message_id')
data_sample.head(10)

#### Choosing n agents for a given topic 

In [None]:
# View top texts for a topic
data_sample.sort_values(by=['Topic 4'], ascending=False)

In [None]:
# coverage of a given topic by different users across all messages
topic = 'Topic 1'
data_sample_topic = data_sample[['from', topic]].groupby(['from'])\
                    .sum().sort_values(by=[topic], ascending=False)

data_sample_topic.head(10)