In [3]:
import numpy as np
import os
import operator
import pandas as pd
import plotly.graph_objs as go

from plotly.offline import init_notebook_mode, plot, iplot
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA

init_notebook_mode(connected=True)

## Loading data

In [5]:
# Loading the data into as a pandas data frame
data = pd.read_csv("./data/ubuntu_support_extract.csv")
data.head()

Unnamed: 0,conversation_id,datetime,from,to,text
0,10-10000,2010-04-17 20:15:00+00:00,fk91,,"Hello, I have a minimal linux system: how can ..."
1,10-10000,2010-04-17 20:15:00+00:00,fk91,,"@Maco: ip is there, thanks :)"
2,10-10000,2010-04-17 20:15:00+00:00,sometux,fk91,ifconfig
3,10-10000,2010-04-17 20:15:00+00:00,sometux,fk91,static or dhcp
4,10-10000,2010-04-17 20:16:00+00:00,fk91,,static


In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3075574 entries, 0 to 3075573
Data columns (total 5 columns):
conversation_id    object
datetime           object
from               object
to                 object
text               object
dtypes: object(5)
memory usage: 117.3+ MB


In [17]:
# Checking number of rows with missing values in each of the columns
data.isnull().sum()

conversation_id          0
datetime                 0
from                    25
to                 1133776
text                     2
dtype: int64

In [18]:
# Removing rows which with missing values in 'text' and 'from' columns only,
# column 'to' can have valid null values in the first msg of the conversation.
data = data.dropna(subset=['text','from']) 
data.shape

(3075547, 5)

In [19]:
# Verifying if rows with null values in 'text' and 'from' columns are removed
data.isnull().sum()

conversation_id          0
datetime                 0
from                     0
to                 1133752
text                     0
dtype: int64

In [20]:
from_users = data['from'].tolist()
to_users = data['to'].tolist()
print(len(set(from_users)), ',', len(set(to_users)))

92495 , 89090


In [21]:
# Total number of unique users
print(len(set(from_users + to_users)))

92519


## Count based approach for selecting agents
A naive count based selection of agents. Rationale is that the users involved in high number of messages/conversations are:
1. either advisors who responded to an issue
2. or enquirers who gained knowledge after querying about an issue

Either way, users involved in both sides of conversation are assumed to have knowledge of the covered topics
'after' a conversation has taken place.

Minimal incremental improvements: 
1. Filter out users who were on the advising side most of the times, based on the initial inquiry in the conversation.
2. Give more weightage to the users who are currently active, based on date time. 
3. Ranking agents based on the average handle/response time based on the time stamps. 
4. The below agent selection mechanism does not use the language used in the conversations to filter out the conversations in a different language. But this can be done in a minimal way by using off the shelf language detectors.

In [73]:
# Computing total incoming and outgoing messages for each user. 
from_user_dist = data.drop(columns=['text','to','datetime']).groupby(['from'])\
                              .size().reset_index(name='outgoing')

from_user_dist = from_user_dist.rename(columns={'from': 'users'})

to_user_dist = data.drop(columns=['text','from','datetime']).groupby(['to'])\
                            .size().reset_index(name='incoming') 
to_user_dist = to_user_dist.rename(columns={'to': 'users'})

In [90]:
# Sorting the agents based on outgoing/incoming messages.
incoming_outgoing = pd.merge(from_user_conversation_dist, to_user_conversation_dist, on='users')\
                    .sort_values(ascending=False, by='outgoing')
incoming_outgoing.head(15)

Unnamed: 0,users,outgoing,incoming
349,ActionParsnip,55125,27053
6001,Dr_Willis,36626,16590
51341,ikonia,30715,13762
43183,edbian,21307,9511
54888,jrib,18491,11398
44083,erUSUL,17653,9789
33104,bazhang,17512,6701
38388,coz_,14329,5404
13215,Jordan_U,13537,8821
81069,theadmin,13047,6662


## Topic based approach for selecting agents

The above quantitative approach does not take into account the topics/knowledge coverage, 
and the distribution of topics in the conversations. If conversations cover 3 topics in a 
distribution of 60:20:20, a good approach will be that the top 15 agents should be distributed across 
the 3 topics in a similar way, i.e., 9:3:3

Topics can be seen as clusters, where algorithms like LDA (~soft clustering approach) can be utilized to uncover themes being discussed in the conversations.

This approach first identifies n number of topics in the full dataset, and calculate the coverage of each topic in each message, using LDA. LDA identifies topics as a cluster of tokens. N more columns are added to the dataset where each column represents a topic, and holds a % value which is the topic coverage of the message. This final dataset can be queried in a number of ways to make a topic based selection of users.

In [6]:
def plot_frequent_n_words(n, word_count_dict):
    sorted_word_count = sorted(word_count_dict.items(), key=operator.itemgetter(1), reverse=True)
    word_count = sorted_word_count[:n+1]
    trace1 = go.Bar(
      x = [i[0] for i in word_count], 
      y = [i[1] for i in word_count], 
      marker = dict(color='blue'))

    data = [trace1]
    layout = go.Layout(
      title= "Ngrams Frequency", 
      xaxis= dict(
      title= "Ngrams"), 
      yaxis=dict(title="Count")
    )
    
    fig = go.Figure(data=data, layout=layout)
    iplot(fig, filename='jupyter-styled_bar')

In [10]:
def print_topics(model, vectorizer, num_ngrams):
    ngrams = vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(", ".join([ngrams[i]
                        for i in topic.argsort()[:-num_ngrams - 1:-1]]))

In [7]:
data_sample = data.sample(n=10000)
data_sample.shape

(10000, 5)

In [126]:
# TODO
# Improvement: text pre-processing. Removal of stopwords, adjectives, adverbs etc. 
# which are less likely to represent the topic in a conversation. 

In [8]:
count_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 3))
ngrams_cv = count_vectorizer.fit_transform(data_sample['text'])
# Ngrams and their count
ngrams = count_vectorizer.get_feature_names()
counts = ngrams_cv.toarray().sum(axis=0)        

In [9]:
plot_frequent_n_words(20, dict(zip(ngrams, counts)))

In [156]:
# LDA parameters 
number_topics = 5
topic_ngrams = 20

# Learning a topic model on the message texts
lda = LDA(n_components=number_topics)
lda.fit(ngrams_cv)

# Print ngrams in each topic identified by LDA, each topic would have its own weights 
# over different ngrams.
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, topic_ngrams)

Topics found via LDA:

Topic #0:
ubuntu, install, use, just, need, like, problem, know, using, does, help, think, 10, file, installed, error, boot, sudo, linux, terminal

Topic #1:
ubuntu, just, use, install, sudo, like, want, file, 10, apt, need, command, help, http, work, sudo apt, try, 04, installed, run

Topic #2:
don, yes, ubuntu, use, paste, know, just, good, http, 10, com, file, ubuntu com, http paste, paste ubuntu, paste ubuntu com, http paste ubuntu, right, install, using

Topic #3:
ubuntu, thanks, just, did, work, know, install, windows, like, try, want, need, use, does, help, hello, kernel, installed, got, wireless

Topic #4:
ubuntu, just, 10, install, ok, help, try, 04, hi, know, version, http, want, windows, use, does, linux, com, using, sorry


In [166]:
#transform text messages into their respective topic distributions
message_lda_topics_vectors = lda.transform(ngrams_cv).tolist()

# Print topic distribution for 5 sample messages.
for n in range(5):    
    topic_pr = message_lda_topics_vectors[n]    
    print("msg: {} topic: {}\n".format(n, topic_pr))

msg: 0 topic: [0.9181300775065853, 0.02052566915588772, 0.020343327921143427, 0.020245327556367237, 0.02075559786001648]

msg: 1 topic: [0.0038677920532152223, 0.0038847804172280178, 0.003868724054411985, 0.9845072055409038, 0.003871497934240878]

msg: 2 topic: [0.7973447692873161, 0.05074649496463424, 0.050684552999278674, 0.0506045808495457, 0.05061960189922517]

msg: 3 topic: [0.2, 0.2, 0.2, 0.2, 0.2]

msg: 4 topic: [0.003319293173789529, 0.0033502062415268863, 0.0033047064978815295, 0.9867073179284446, 0.0033184761583574204]



In [167]:
topics_df = pd.DataFrame(message_lda_topics_vectors, columns = ['Topic 1', 'Topic 2','Topic 3','Topic 4','Topic 5'])
topics_df.head(10)

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5
0,0.91813,0.020526,0.020343,0.020245,0.020756
1,0.003868,0.003885,0.003869,0.984507,0.003871
2,0.797345,0.050746,0.050685,0.050605,0.05062
3,0.2,0.2,0.2,0.2,0.2
4,0.003319,0.00335,0.003305,0.986707,0.003318
5,0.05031,0.050262,0.79888,0.050304,0.050244
6,0.012563,0.012557,0.012598,0.012553,0.94973
7,0.949647,0.012602,0.012539,0.012576,0.012637
8,0.592818,0.101435,0.101327,0.102204,0.102216
9,0.02891,0.028807,0.02893,0.884636,0.028717


In [171]:
# Adding a column 'message_id' to use it for join with the topic vectors
data_sample['message_id'] = range(1, len(data_sample) + 1)
data_sample.head(10)

Unnamed: 0,conversation_id,datetime,from,to,text,message_id
466543,12-8340,2010-03-23 02:49:00+00:00,Maranatha,,I rebooted my computer and I am still having t...,1
2736300,8-10957,2011-07-11 07:56:00+00:00,Bipul,,svn co http://svn.asterisk.org/svn/asterisk/br...,2
2552438,7-14641,2010-09-03 16:06:00+00:00,iceroot,logan_wolf,boot what? ubuntu?,3
2410262,6-40992,2010-10-04 20:29:00+00:00,azertyuio,,where i have to find it ?,4
1341548,3-108934,2010-03-03 07:10:00+00:00,stooj,,Maybe someone in here can tell me. Updates are...,5
2860799,8-67156,2010-01-12 17:04:00+00:00,erUSUL,tcr,about the description? sure.,6
440743,12-5840,2011-10-08 13:04:00+00:00,martin_PL,,/dev/sda1 * 1 26 204...,7
939884,18-6446,2010-01-11 04:17:00+00:00,Izinucs,LinuX2half,when you first start the computer watch the sc...,8
146366,11-10137,2010-07-13 19:35:00+00:00,icebreaker,,ubuntu,9
1526138,3-80116,2010-08-31 05:46:00+00:00,ZykoticK9,VinnyParker,that would reset your panels to default,10


In [172]:
# Assigning message ids to allow join with the data  
topics_df['message_id'] = range(1, len(topics_df) + 1)
topics_df.head(10)

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,message_id
0,0.91813,0.020526,0.020343,0.020245,0.020756,1
1,0.003868,0.003885,0.003869,0.984507,0.003871,2
2,0.797345,0.050746,0.050685,0.050605,0.05062,3
3,0.2,0.2,0.2,0.2,0.2,4
4,0.003319,0.00335,0.003305,0.986707,0.003318,5
5,0.05031,0.050262,0.79888,0.050304,0.050244,6
6,0.012563,0.012557,0.012598,0.012553,0.94973,7
7,0.949647,0.012602,0.012539,0.012576,0.012637,8
8,0.592818,0.101435,0.101327,0.102204,0.102216,9
9,0.02891,0.028807,0.02893,0.884636,0.028717,10


In [185]:
#original dataset extended with topic columns
data_sample = pd.merge(data_sample, topics_df, on='message_id')
data_sample.head(10)

Unnamed: 0,conversation_id,datetime,from,to,text,message_id,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5
0,12-8340,2010-03-23 02:49:00+00:00,Maranatha,,I rebooted my computer and I am still having t...,1,0.91813,0.020526,0.020343,0.020245,0.020756
1,8-10957,2011-07-11 07:56:00+00:00,Bipul,,svn co http://svn.asterisk.org/svn/asterisk/br...,2,0.003868,0.003885,0.003869,0.984507,0.003871
2,7-14641,2010-09-03 16:06:00+00:00,iceroot,logan_wolf,boot what? ubuntu?,3,0.797345,0.050746,0.050685,0.050605,0.05062
3,6-40992,2010-10-04 20:29:00+00:00,azertyuio,,where i have to find it ?,4,0.2,0.2,0.2,0.2,0.2
4,3-108934,2010-03-03 07:10:00+00:00,stooj,,Maybe someone in here can tell me. Updates are...,5,0.003319,0.00335,0.003305,0.986707,0.003318
5,8-67156,2010-01-12 17:04:00+00:00,erUSUL,tcr,about the description? sure.,6,0.05031,0.050262,0.79888,0.050304,0.050244
6,12-5840,2011-10-08 13:04:00+00:00,martin_PL,,/dev/sda1 * 1 26 204...,7,0.012563,0.012557,0.012598,0.012553,0.94973
7,18-6446,2010-01-11 04:17:00+00:00,Izinucs,LinuX2half,when you first start the computer watch the sc...,8,0.949647,0.012602,0.012539,0.012576,0.012637
8,11-10137,2010-07-13 19:35:00+00:00,icebreaker,,ubuntu,9,0.592818,0.101435,0.101327,0.102204,0.102216
9,3-80116,2010-08-31 05:46:00+00:00,ZykoticK9,VinnyParker,that would reset your panels to default,10,0.02891,0.028807,0.02893,0.884636,0.028717


#### Choosing n agents for a given topic 

In [189]:
# View top texts for a topic
data_sample.sort_values(by=['Topic 4'], ascending=False)

Unnamed: 0,conversation_id,datetime,from,to,text,message_id,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5
3934,6-32394,2010-01-03 08:32:00+00:00,meway,,Hello I am running ubuntu v 9.10 and the GNU G...,3935,0.001453,0.001449,0.001453,0.994189,0.001456
6587,7-29769,2010-05-02 16:29:00+00:00,solofight,,"i have windows home edition, i deleted a parti...",6588,0.001629,0.001629,0.001625,0.993486,0.001632
9103,67-58,2010-04-21 23:24:00+00:00,LzrdKing,,"when X is running (nvidia drivers), aplay -L r...",9104,0.001666,0.001666,0.001666,0.993334,0.001667
7516,48-120,2010-08-15 07:11:00+00:00,peepingtom,pmp6nl,You probably won't see any benefit from compil...,7517,0.002015,0.002015,0.002016,0.991930,0.002024
6259,6-16183,2011-11-08 20:19:00+00:00,redmage,,Question for the group. I'm running Ubuntu 11...,6260,0.002027,0.002014,0.002011,0.991913,0.002035
8974,21-6044,2010-07-29 05:58:00+00:00,maco,arrrghhh,"webchat: when you use gnome apps in kde, they ...",8975,0.002216,0.002235,0.002223,0.991092,0.002234
8911,15-2661,2011-02-11 04:29:00+00:00,Gizmo_the_Great,schnuffle,"mongy OK, I think I am there. Let me recap - C...",8912,0.002296,0.002302,0.002288,0.990811,0.002303
2454,6-66988,2011-10-19 21:46:00+00:00,Me2,,I have a question. I have a Pavilion dv6z with...,2455,0.002305,0.002303,0.002298,0.990780,0.002314
1239,20-1018,2011-04-07 17:24:00+00:00,Gucci,,Can anyone help me create a custom menu entry ...,1240,0.002379,0.002393,0.002394,0.990455,0.002380
2505,35-399,2010-05-08 17:01:00+00:00,hmw,krazykrivda_,Start in the System menu (top left) and look f...,2506,0.002459,0.002462,0.002460,0.990161,0.002458


In [199]:
# coverage of a given topic by different users across all messages
topic = 'Topic 1'
data_sample_topic = data_sample[['from', topic]].groupby(['from'])\
                    .sum().sort_values(by=[topic], ascending=False)

data_sample_topic.head(10)

Unnamed: 0_level_0,Topic 1
from,Unnamed: 1_level_1
ActionParsnip,30.62841
Dr_Willis,17.814588
ikonia,17.804104
edbian,14.973948
bazhang,14.104671
iceroot,11.153458
Jordan_U,10.873295
jrib,10.546092
coz_,9.122646
rww,7.474943
