### Load data

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import exploration.commons as pltutil
from exploration.comments_exploration import cloud


# Read the preprocessed utterances CSV into a DataFrame and show how many
# rows were loaded.
utterances_df = pd.read_csv('./temp_data/pp_utterances.csv')
utterances_df.shape[0]


In [None]:
# Count the non-null `id` values for each `author_role`, relabel the column
# and the index for display, and show the result as a single-row table.
df_grp = (
    utterances_df
    .groupby('author_role')[['id']]
    .count()
    .rename(columns={'id': 'Count'})
    .rename_axis('Author Role')
)
df_grp.T

In [None]:
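# Optional filter: uncomment the next line to restrict the rest of the analysis
# to rows whose author_role is 'assignee'.
# utterances_df = utterances_df[utterances_df['author_role'] == 'assignee']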

### Words exploration

#### Word cloud

In [None]:
# Draw a word cloud (project helper `cloud`) from the preprocessed utterance
# text, limited to 150 words.
fig = plt.figure(figsize=(10,6))
ax = fig.add_subplot(111)

# Keep the text plus author columns; reset_index() also adds the original row
# label as an `index` column.
cloud_df = utterances_df[['pp_actionbody','author','author_role']].reset_index()

# Rows without preprocessed text contribute nothing to the cloud.
cloud_df = cloud_df[cloud_df['pp_actionbody'].notna()]

# drop_duplicates() returns a new frame, so the result has to be assigned
# (the previous bare call was a no-op). Duplicates are judged on the content
# columns only: the `index` column added above is unique per row and would
# otherwise prevent any row from ever matching another.
cloud_df = cloud_df.drop_duplicates(subset=['pp_actionbody','author','author_role'])

cloud(cloud_df,ax,max_words=150)

In [None]:
# Summary statistics of the raw and preprocessed word counts over all rows,
# shown with one row per measure.
(
    utterances_df[['words_count', 'pp_words_count']]
    .describe()
    .rename(columns={
        'words_count': 'Words Count',
        'pp_words_count': 'Words Count (Preprocessed)',
    })
    .T
)

#### Words count per issue

In [None]:
# Total raw and preprocessed word counts for each `issueid`.
words_per_issue_df = (
    utterances_df
    .groupby('issueid')[['words_count', 'pp_words_count']]
    .sum()
)

In [None]:
# Summary statistics of the per-issue word totals, one row per measure.
(
    words_per_issue_df
    .describe()
    .rename(columns={
        'words_count': 'Words Count',
        'pp_words_count': 'Words Count (Preprocessed)',
    })
    .T
)

In [None]:
# Per-issue word counts: two side-by-side histograms (raw vs. preprocessed)
# followed by a second figure with the matching horizontal box plots.
# Options common to both panels of each figure are collected once; only the
# data column, the axis label and (for the histograms) the y-limit differ.
hist_opts = dict(
    bins=200,
    ylabel='Frequency',
    grid='both',
    xsteps=500,
    ysteps=100,
    xrotation=90,
)
box_opts = dict(
    vertical=False,
    steps=500,
    grid='x',
    rotation=90,
)

# --- Histograms ---
fig = plt.figure(figsize=(16, 6))

ax = fig.add_subplot(121)
pltutil.plot_hist(
    ax,
    words_per_issue_df['words_count'],
    xlabel='Words Count',
    max_y=1400,
    **hist_opts,
)

ax = fig.add_subplot(122)
pltutil.plot_hist(
    ax,
    words_per_issue_df['pp_words_count'],
    xlabel='Words Count (Preprocessed)',
    max_y=1500,
    **hist_opts,
)

# --- Box plots ---
fig = plt.figure(figsize=(16, 2))

ax = fig.add_subplot(121)
pltutil.plot_box(
    ax,
    words_per_issue_df['words_count'],
    label='Words Count',
    **box_opts,
)

ax = fig.add_subplot(122)
pltutil.plot_box(
    ax,
    words_per_issue_df['pp_words_count'],
    label='Words Count (Preprocessed)',
    **box_opts,
)

#### Words per comment

In [None]:
# Total raw and preprocessed word counts for each (`id`, `issueid`) pair,
# then preview the first three rows.
words_per_comment_df = (
    utterances_df
    .groupby(['id', 'issueid'])[['words_count', 'pp_words_count']]
    .sum()
)
words_per_comment_df.head(3)

In [None]:
# Summary statistics of the per-comment word totals, one row per measure.
(
    words_per_comment_df[['words_count', 'pp_words_count']]
    .describe()
    .rename(columns={
        'words_count': 'Words Count',
        'pp_words_count': 'Words Count (Preprocessed)',
    })
    .T
)

In [None]:
# Per-comment word counts: two side-by-side histograms (raw vs. preprocessed)
# followed by a second figure with the matching horizontal box plots.
# Options common to both panels of each figure are collected once; only the
# data column and the axis label differ between panels.
hist_opts = dict(
    bins=200,
    ylabel='Frequency',
    grid='both',
    max_y=35000,
    xsteps=500,
    ysteps=5000,
    xrotation=90,
)
box_opts = dict(
    vertical=False,
    steps=500,
    grid='x',
    rotation=90,
)

# --- Histograms ---
fig = plt.figure(figsize=(16, 6))

ax = fig.add_subplot(121)
pltutil.plot_hist(
    ax,
    words_per_comment_df['words_count'],
    xlabel='Words Count',
    **hist_opts,
)

ax = fig.add_subplot(122)
pltutil.plot_hist(
    ax,
    words_per_comment_df['pp_words_count'],
    xlabel='Words Count (Preprocessed)',
    **hist_opts,
)

# --- Box plots ---
fig = plt.figure(figsize=(16, 2))

ax = fig.add_subplot(121)
pltutil.plot_box(
    ax,
    words_per_comment_df['words_count'],
    label='Words Count',
    **box_opts,
)

ax = fig.add_subplot(122)
pltutil.plot_box(
    ax,
    words_per_comment_df['pp_words_count'],
    label='Words Count (Preprocessed)',
    **box_opts,
)

### Utterances exploration

In [None]:
# Count the non-null `utr_seq` values for each (`issueid`, `id`) pair and
# expose that count as `utterances_count`; preview the first ten rows.
comments_per_issue_df = (
    utterances_df
    .groupby(['issueid', 'id'])[['utr_seq']]
    .count()
    .rename(columns={'utr_seq': 'utterances_count'})
)
comments_per_issue_df.head(10)

In [None]:
# Summary statistics of the utterance count per (`issueid`, `id`) pair,
# one row per measure.
comments_per_issue_df.describe().T

In [None]:
# Total number of utterances per issue: add up `utterances_count` over every
# (`issueid`, `id`) row that shares the same `issueid` index level.
# The column is selected explicitly before aggregating. The first positional
# parameter of GroupBy.sum() is `numeric_only`, so passing the column name to
# sum() does not select a column; it is only interpreted as a truthy flag.
utter_per_issue_df = (
    comments_per_issue_df
    .groupby('issueid')[['utterances_count']]
    .sum()
)
utter_per_issue_df.head(5)

In [None]:
# Summary statistics of the utterance total per issue, one row per measure.
(
    utter_per_issue_df
    .describe()
    .rename(columns={'utterances_count': 'Utterances Count'})
    .T
)

In [None]:
# Histogram of the utterance total per issue, drawn with the project helper
# `pltutil.plot_hist` on a single-panel figure.
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111)

hist_opts = dict(
    bins=200,
    xlabel='Utterances Count',
    ylabel='Frequency',
    grid='both',
    max_y=1000,
    xsteps=50,
    ysteps=50,
    xrotation=90,
)
pltutil.plot_hist(ax, utter_per_issue_df['utterances_count'], **hist_opts)

In [None]:
# Horizontal box plot of the utterance total per issue, drawn with the
# project helper `pltutil.plot_box` on a single-panel figure.
fig = plt.figure(figsize=(8, 2))
ax = fig.add_subplot(111)

box_opts = dict(
    label='Utterances Count',
    vertical=False,
    steps=50,
    grid='x',
    rotation=90,
)
pltutil.plot_box(ax, utter_per_issue_df['utterances_count'], **box_opts)
