## Sentiment analysis with Vader
Calculate a sentiment for all chats in parallel.

In [None]:
import codecs, os
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from nltk.sentiment import vader

data_folder = '/home/daniel/s2ds/Data/'
input_file = '03_doc2vecTrainingData.txt'

# Read one document per line, stripping surrounding whitespace. The context
# manager closes the handle as soon as reading is done (it used to stay open
# for the life of the kernel). The mode is plain 'r': with an encoding given,
# codecs.open always opens the underlying file in binary mode, so the old 'U'
# flag had no effect, and Python >= 3.11 rejects it outright.
with codecs.open(os.path.join(data_folder, input_file), 'r', 'utf-8') as chat_file:
    docs = [line.strip() for line in chat_file]

# Single shared analyzer; get_sentiment() below reads it as a global.
vaderize = vader.SentimentIntensityAnalyzer()

def get_sentiment(doc, i):
    """
    Score one document with the shared Vader analyzer (global `vaderize`).

    :param doc [str], text handed to vaderize.polarity_scores.
    :param i [int], position of the document in the input; accepted but
              not used by the function.
    :return numpy array of shape (1, 4) holding the 'compound', 'neg',
            'neu' and 'pos' scores, in that order.
    """
    scores = vaderize.polarity_scores(doc)
    ordered_keys = ('compound', 'neg', 'neu', 'pos')
    return np.array([[scores[key] for key in ordered_keys]], dtype=float)

# Score every document with joblib (n_jobs=-1); each task yields a (1, 4) row.
scoring_tasks = (delayed(get_sentiment)(doc, i) for i, doc in enumerate(docs))
results = np.array(Parallel(n_jobs=-1)(scoring_tasks))
# Flatten the stacked per-document rows into a plain (n_docs, 4) table.
n_docs = results.shape[0]
results = results.reshape((n_docs, 4))
cols = ['Compound', 'Negative', 'Neutral', 'Positive']
saveto = '~/s2ds/Data/sentiment.csv'
sentiment_table = pd.DataFrame(results, columns=cols)
sentiment_table.to_csv(saveto)

## Predict sentiment for each client and agent sentence


In [None]:
import codecs, os
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from nltk.sentiment import vader

input_files = ['03_clientMessages.txt', '03_agentMessages.txt']
save_tos = ['~/s2ds/Data/sentiment_client.csv', '~/s2ds/Data/sentiment_agent.csv']
data_folder = '/home/daniel/s2ds/Data/'
cols = ['Compound', 'Negative', 'Neutral', 'Positive']

# Build the analyzer once and reuse it for both files; it used to be
# re-created (together with the scoring function) on every loop iteration.
vaderize = vader.SentimentIntensityAnalyzer()


def get_sentiment(doc, i):
    """
    Score one message with the shared Vader analyzer (global `vaderize`).

    :param doc [str], text handed to vaderize.polarity_scores.
    :param i [int], position of the message in the input; accepted but
              not used by the function.
    :return numpy array of shape (1, 4) holding the 'compound', 'neg',
            'neu' and 'pos' scores, in that order.
    """
    sentiment = vaderize.polarity_scores(doc)
    result = np.zeros((1, 4))
    result[0, 0] = sentiment['compound']
    result[0, 1] = sentiment['neg']
    result[0, 2] = sentiment['neu']
    result[0, 3] = sentiment['pos']
    return result


# Each input file is paired with its own output path.
for input_file, save_to in zip(input_files, save_tos):
    # Plain 'r' instead of 'rU': codecs.open reads in binary mode when an
    # encoding is given, so 'U' had no effect and fails on Python >= 3.11.
    # The context manager also closes the handle, which was previously leaked.
    with codecs.open(os.path.join(data_folder, input_file), 'r', 'utf-8') as message_file:
        docs = [line.strip() for line in message_file]

    results = np.array(Parallel(n_jobs=-1)(delayed(get_sentiment)(doc, i) for i, doc in enumerate(docs)))
    results = results.reshape((results.shape[0], 4))
    # Bug fix: the original assigned `save_to` but then wrote to `saveto`,
    # a name this cell never defines. That either raised NameError or sent
    # both outputs to a stale path left over from an earlier cell.
    pd.DataFrame(results, columns=cols).to_csv(save_to)

These two files were then merged into one, together with the `convID` column from `client_agent_summary2.csv`; the result is `sentiment_client_agent.csv`, the file the cells below read.

## Calculate Spearman rho through the conversations

In [None]:
from scipy import stats
def delayed_spearman(df, min_num=10):
    """
    Calculates the Spearman correlation between +1 delayed pairs of
    sentiment values. I.e. it connects the first agent sentiment value
    with the 2nd of the client, the 2nd agent with the 3rd client, etc.
    If there aren't enough (min_num) pairs, it returns None.

    The input DataFrame is never modified.

    :param df [pandas DataFrame], holding sentiment for each message for
               a whole conversation, with agent and client columns.
    :param min_num [int], the minimum number of sentiment value pairs
                    that are needed to calculate a Spearman coef.
    :return the result of scipy.stats.spearmanr (correlation first,
            p-value second), or None when there are fewer than min_num
            pairs.
    """
    # Pair agent[k] with client[k + 1] by slicing the raw arrays. The old
    # implementation shifted the client values by writing into
    # `df.client.values`, which silently corrupted the caller's DataFrame
    # (and raises on read-only arrays under pandas copy-on-write).
    agent = df.agent.values[:-1]
    client = df.client.values[1:]
    if client.shape[0] < min_num:
        return None
    return stats.spearmanr(agent, client)

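Calculate Spearman rho in parallel for every conversation that has at least 10 delayed agent–client sentiment pairs. Conversations with fewer pairs get the placeholder values rho = 0, p = 1.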

In [None]:
import pandas as pd
import numpy as np
from joblib import Parallel, delayed

# Per-message sentiment table; the first CSV column becomes the index.
sentiment = pd.read_csv('/home/daniel/s2ds/Data/sentiment_client_agent.csv', index_col=0)
# Conversation IDs are iterated as 0..max below, hence max + 1 slots.
conv_num = sentiment['convID'].max() + 1
# Placeholder (conv_num, 2) array; it is reassigned by the parallel run below.
results = np.zeros(shape=(conv_num, 2))

def spearman_wrapper(i, sentiment):
    """
    Run delayed_spearman on the rows of one conversation.

    :param i [int], conversation ID to select (matched against convID).
    :param sentiment [pandas DataFrame], with convID, agent and client
                      columns.
    :return numpy array [rho, p-value]; the placeholder [0, 1] is returned
            when delayed_spearman yields None.
    """
    in_conversation = sentiment.convID == i
    chat = sentiment.loc[in_conversation, ['agent', 'client']]
    outcome = delayed_spearman(chat)
    if outcome is None:
        return np.array([0, 1])
    return np.array([outcome[0], outcome[1]])

# One task per conversation ID in 0..conv_num-1. `range` replaces the
# Python-2-only `xrange`, so the cell runs under both Python 2 and Python 3.
results = np.array(Parallel(n_jobs=-1)(delayed(spearman_wrapper)(i, sentiment) for i in range(conv_num)))

Save results

In [None]:
# Persist one (rho, p-val) row per conversation; the row index is written too.
spearman_csv = '/home/daniel/s2ds/Data/sentiment_client_agent_spearman.csv'
r = pd.DataFrame(data=results, columns=['rho', 'p-val'])
r.to_csv(spearman_csv)

## Plot conversations with significant p-values

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pylab as plt

# setup the plot
sns.set_context('poster', font_scale=1.5)
# Size used for every saved figure. The original created a single
# plt.figure(figsize=(20, 16)) up front, but DataFrame.plot() opened a fresh
# default-sized figure each time, so that size was never applied and the
# empty figure was left open.
FIGSIZE = (20, 16)

# find conversations with significant correlations between agent and client sentiment
sentiment = pd.read_csv('/home/daniel/s2ds/Data/sentiment_client_agent.csv', index_col=0)
results = pd.read_csv('/home/daniel/s2ds/Data/sentiment_client_agent_spearman.csv', index_col=0).values
significant = np.where(results[:, 1] < .05)[0]
for ID in significant:
    # build title of figure
    rho_text = "{0:.2f}".format(results[ID, 0])
    pval_text = "{0:.2f}".format(results[ID, 1])
    t = "Rho: %s, p-val: %s" % (rho_text, pval_text)
    # get sentiment values for the conversation
    chat = sentiment.loc[sentiment.convID == ID, ['agent', 'client']]
    # Pair agent[k] with client[k + 1], as delayed_spearman does. Building a
    # new frame from array slices replaces the chained assignment
    # `df.client[0:n-1] = df.client[1:n]`, which depends on pandas writing
    # through a temporary Series and does nothing under copy-on-write.
    df = pd.DataFrame(
        {'agent': chat['agent'].values[:-1], 'client': chat['client'].values[1:]},
        columns=['agent', 'client'],
    )
    # plot and save, closing exactly the figure that was created here
    fig, ax = plt.subplots(figsize=FIGSIZE)
    df.plot(ax=ax, title=t)
    fig.savefig('/home/daniel/Desktop/sentiment/conversation' + str(ID) + '.png')
    plt.close(fig)

To check the actual conversation corresponding to one of these images, run the following with the conversation ID as the line number.

In [None]:
%%script bash
# Print a single line of the training data and pipe it to xsel -b
# (the X clipboard selection). Change line_no to pick another line.
line_no=3079
cd ~/s2ds/Data/
sed -n "${line_no}p" 03_doc2vecTrainingData.txt | xsel -b

## plot boxplots by level in conversation

In [None]:
# Load the merged per-message sentiment table; the first CSV column is the index.
sentiment_csv = '/home/daniel/s2ds/Data/sentiment_client_agent.csv'
sentiment = pd.read_csv(sentiment_csv, index_col=0)

In [None]:
%matplotlib qt
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pylab as plt

# plot styling
sns.set_context('talk', font_scale=1.5)

sentiment = pd.read_csv('/home/daniel/s2ds/Data/sentiment_client_agent.csv', index_col=0)
# Keep only rows whose convPos is below 21 (this trims the tail of long
# conversations), and only the columns used below.
keep_cols = ['agent', 'client', 'convPos', 'convID', 'convLen']
sentiment = sentiment.loc[sentiment.convPos < 21, keep_cols]
# Mean, standard deviation and row count for every conversation position.
by_position = sentiment.groupby('convPos')
mu = by_position.mean()
sd = by_position.std()
count = by_position.count()

Old-school plotting with matplotlib error bars

In [None]:
# x positions 1..(number of convPos groups)
lengths = mu.shape[0] + 1
positions = range(1, lengths)
# Error bars: standard deviation divided by sqrt(count) per position.
agent_sem = sd.agent.values / np.sqrt(count.agent)

plt.figure()
plt.errorbar(positions, mu.agent, yerr=agent_sem)
plt.title("Average agent sentiment of 51000 conversations")
plt.xlabel("Line of conversation")
plt.ylabel("Sentiment: negative (-1) to positive (+1)")

Fancier plots with seaborn

In [None]:
# Reshape to long format for the seaborn plots: one row per (message, actor)
# holding that actor's sentiment, joined with the conversation metadata.
# The full table is used here (no subsampling).
s = sentiment
a = s[['agent', 'client']].stack(0).reset_index([1])
a.columns = ['actor', 'sentiment']
meta = s[['convPos', 'convID', 'convLen']]
df = a.join(meta)

In [None]:
# Preview only the first rows instead of dumping the whole long-format table.
df.head()

In [None]:
# we can only plot conversations of a given length
%matplotlib qt
length = 10
df = df[df.convLen == length]
sns.tsplot(data=df, time="convPos", unit="convID", condition="actor", value="sentiment")

In [None]:
# Regression of sentiment on conversation position: points are the mean
# sentiment per convPos, fitted with seaborn's logx option.
ax = sns.regplot(
    data=df,
    x="convPos",
    y="sentiment",
    x_estimator=np.mean,
    logx=True,
    truncate=True,
)

In [None]:
%matplotlib inline
import seaborn as sns
sns.set(style="darkgrid")

# One colour per actor (keys match the values of the `actor` column).
pal = dict(agent="#6495ED", client="#F08080")

# Sentiment against position in the conversation, one panel per actor.
g = sns.lmplot(x="convPos", y="sentiment", col="actor", hue="actor", data=df,
               palette=pal, y_jitter=.02)
# Axis limits follow the data. The previous limits, xlim=(0, 80) and
# ylim=(-.05, 1.05), were left over from the seaborn example this cell was
# adapted from. They clipped all negative sentiment (scores run from -1 to +1)
# and had no relation to the range of convPos.
g.set(xlim=(0, df["convPos"].max() + 1), ylim=(-1.05, 1.05))