# Packages

##  For conversation threads

In [0]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import requests
from pandas.io.json import json_normalize

## For topic modeling

In [0]:
# Install gensim from inside the notebook (shell escape, runs on every execution of this cell).
!pip install gensim
import nltk
# Fetch the NLTK resources named below into the local NLTK data directory.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import gensim
# Shared lemmatizer instance; the cleaning() helper calls lemmatize.lemmatize(word).
lemmatize = WordNetLemmatizer()
# NOTE: this rebinds the name `stopwords` from the imported corpus reader to a
# plain set of English stopwords. The `from nltk.corpus import stopwords` line
# above has to run again before this statement can be re-executed.
stopwords = set(stopwords.words('english'))
import string
# Set of single punctuation characters, used for per-character filtering in cleaning().
punctuation = set(string.punctuation)
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk.data
#sid = SentimentIntensityAnalyzer()
# Loads the punkt resource for English (presumably a sentence tokenizer);
# not referenced again in the cells visible here.
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
from nltk.tokenize import RegexpTokenizer
# Tokenizer that keeps runs of word characters (\w+); not referenced again in the cells visible here.
tokenizer = RegexpTokenizer(r'\w+')

# List of documents

In [0]:
# Documents to analyse, keyed by a short session label.
# Each value is the URL whose hypothes.is annotations are downloaded.
FMlinkList = dict(
    Dec17='http://educatorinnovator.org/wp-content/uploads/2017/10/Critical-Literacy-And-Our-Students-Lives.pdf',
    Mar18='http://educatorinnovator.org/wp-content/uploads/2018/02/The-Stories-They-Tell-.pdf',
)

# Helper functions


In [0]:
# Build dataframe from hypothes.is API
def H_API(session,uri):
    """Download the annotations the hypothes.is search API reports for ``uri``.

    The search endpoint is queried page by page (using the ``offset``
    parameter) until as many rows as the reported ``total`` were collected,
    or until the API returns an empty page.

    Parameters
    ----------
    session
        Label written to the ``session`` column of every returned row.
    uri
        Document URL sent as the ``url`` search parameter.

    Returns
    -------
    pandas.DataFrame
        One row per annotation (flattened with ``pd.json_normalize``).  The
        expected API columns are always present, even when no annotation was
        found.  ``created`` and ``updated`` are converted with
        ``pd.to_datetime`` and ``created`` is additionally used as the index.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    endpoint = 'https://hypothes.is/api/search'
    expected = ['created', 'document.title', 'group', 'id', 'links.html',
       'links.incontext', 'links.json', 'permissions.admin',
       'permissions.delete', 'permissions.read', 'permissions.update',
       'references', 'tags', 'target', 'text', 'updated', 'uri', 'user']

    r = requests.get(endpoint,params={'url':uri})
    r.raise_for_status()
    body = r.json()
    total = body['total']
    print(session)
    print(uri)
    print('Number of Records Found for this Session: ',total)
    print('Scraped:')

    # Pages are collected in a list and concatenated once: DataFrame.append
    # was removed in pandas 2.0 and re-copies the frame on every call.
    pages = []
    n = 0
    while n < total:
        print(n)
        rows = body['rows']
        if not rows:
            # The API stopped returning rows before `total` was reached;
            # without this guard the loop would never terminate.
            break
        pages.append(pd.json_normalize(rows))
        n += len(rows)
        if n >= total:
            # Everything has been fetched; no need for another request.
            break
        r = requests.get(endpoint,params={'url':uri,'offset':n})
        r.raise_for_status()
        body = r.json()
    print(n)

    if pages:
        tmp = pd.concat(pages,ignore_index=True,sort=False)
    else:
        tmp = pd.DataFrame(columns=expected)
    # Keep the expected columns first (creating any the API did not send),
    # followed by whatever additional columns the API returned.
    extra = [c for c in tmp.columns if c not in expected]
    tmp = tmp.reindex(columns=expected + extra)

    tmp['session'] = session
    tmp['updated'] = pd.to_datetime(tmp['updated'])
    tmp['created'] = pd.to_datetime(tmp['created'])
    tmp = tmp.set_index(tmp['created'])
    return tmp

#returns the number of times a message was in a reference list of other messages
def countReplies(msgId):
    """Count the rows of the global ``df`` whose 'references' list contains ``msgId``.

    Only the 'references' column is inspected.  Rows without a reference
    list (NaN) never match.  Rows are not discarded because some other
    column is missing, and no index re-alignment is involved, so duplicate
    index labels are harmless.

    Returns
    -------
    int
        Number of messages that list ``msgId`` among their references.
    """
    mentions = df['references'].map(
        lambda refs: isinstance(refs, (list, tuple)) and msgId in refs)
    return int(mentions.sum())
def listOfIds(msgId):
    """Return the ids of the rows of the global ``df`` that reference ``msgId``.

    A row matches when its 'references' value is a list containing
    ``msgId``; rows without a reference list (NaN) never match.  Only the
    'references' column decides membership, so rows are not dropped because
    another column is missing.

    Returns
    -------
    numpy.ndarray
        The 'id' values of the matching rows, in frame order.
    """
    mentions = df['references'].map(
        lambda refs: isinstance(refs, (list, tuple)) and msgId in refs)
    # A positional boolean array avoids any index alignment (the index holds
    # timestamps, which are not guaranteed to be unique).
    return df.loc[mentions.astype(bool).values, 'id'].values
def replyTimeDelta(x):
    """Look up the 'created' value of the message whose 'id' equals ``x``.

    Reads the global ``df``.  Returns the 'created' entry of the first
    matching row, or None when no row has that id.
    """
    created = df[df['id'] == x]['created']
    if len(created):
        return created.iloc[0]
    return None

#For finding topics
def cleaning(article):
    """Normalise one text for topic modelling.

    Steps, in this order:
      1. lowercase the text and drop whitespace-separated tokens found in
         the global ``stopwords``;
      2. delete every character found in the global ``punctuation``;
      3. lemmatize each remaining token with the global ``lemmatize`` object.

    Returns the resulting tokens joined by single spaces.
    """
    kept = []
    for word in article.lower().split():
        if word not in stopwords:
            kept.append(word)
    without_stopwords = " ".join(kept)

    without_punctuation = "".join(
        filter(lambda ch: ch not in punctuation, without_stopwords))

    lemmas = [lemmatize.lemmatize(token) for token in without_punctuation.split()]
    return " ".join(lemmas)
  
def labelizer(x):
    """Build a short label list from the first two topics in ``x``.

    ``x`` is indexed as ``x[i][1]`` to get the description string of topic
    ``i`` (e.g. ``'0.03*"word" + 0.02*"other"'``).  The double-quoted words of
    each description are extracted and the first two words of topic 0 followed
    by the first two words of topic 1 are returned as one list.
    """
    def quoted_words(description):
        # Every second whitespace-separated token is a weight*"word" term
        # (the tokens in between are the '+' separators).
        terms = description.split()[::2]
        # After splitting on double quotes, the odd positions hold the words.
        pieces = ''.join(terms).split('"')
        return pieces[1::2]

    first_topic = quoted_words(x[0][1])
    second_topic = quoted_words(x[1][1])
    return [*first_topic[:2], *second_topic[:2]]

# Create dataFrames

## With hypothes.is data

In [0]:
# Download the annotations of every session listed in FMlinkList.
# The per-session frames are gathered in a list and concatenated once
# (DataFrame.append was removed in pandas 2.0).
sessionFrames = [H_API(session,link) for session,link in FMlinkList.items()]
df = pd.concat(sessionFrames,sort=False)

# The index holds the 'created' timestamps. tz_localize raises on an index
# that is already timezone-aware, so only a naive index is localized to UTC
# before everything is converted to US/Mountain.
if df.index.tz is None:
    df = df.tz_localize('UTC')
df = df.tz_convert('US/Mountain')
print('Total Records: ',len(df))

#Modify df for network analysis
# .copy() makes the column subset an independent frame, so the assignments
# below do not write into a slice of the full frame.
df = df[['created','id','references','session','tags','target', 'text','user']].copy()
# The last entry of the reference list is taken as the message replied to;
# messages without references keep NaN.
df['replyTo'] = df['references'].map(lambda x: x[-1],na_action='ignore')
# Drop the first 5 and the last 12 characters of the account string
# (presumably the 'acct:' prefix and '@hypothes.is' suffix).
df['user'] = df['user'].map(lambda x: x[5:-12])

#Time between message and reply
isReply = df['replyTo'].notnull()
df['replyDelay'] = df.loc[isReply,'created'] - df.loc[isReply,'replyTo'].map(replyTimeDelta)

#convert to minutes
df['replyDelay'] = df['replyDelay'].map(lambda x: x.total_seconds() / 60)

## Build network of threads (2 or more messages in conversation)

In [0]:
# Directed graph with one edge per message that has references, running from
# the message's 'id' to its 'replyTo' value; the listed columns are stored as
# edge attributes.
replies = df[df['references'].notnull()]
G = nx.from_pandas_edgelist(
    replies,
    source='id',
    target='replyTo',
    edge_attr=['tags','target','text','user','replyDelay'],
    create_using=nx.DiGraph(),
)

# Each connected component of the undirected view is one thread (a set of ids).
threads = list(nx.connected_components(nx.to_undirected(G)))

# For every thread, collect the texts, reply delays and users of its messages
# (in the row order of df).
textT, timesT, usersT = [], [], []
for thread in threads:
    members = df[df['id'].isin(thread)]
    textT.append(members['text'].tolist())
    timesT.append(members['replyDelay'].tolist())
    usersT.append(members['user'].tolist())

## Build 'threads' dataframe

In [0]:
# One row per thread: the set of message ids plus the parallel lists of
# texts, reply delays and users collected for that thread.
dfThreads = pd.DataFrame({
    'ids': pd.Series(threads),
    'texts': pd.Series(textT),
    'timeDelays': pd.Series(timesT),
    'users': pd.Series(usersT),
})

## Add topics and labels to thread df

In [0]:
# Fixed seed so the LDA topics (and the labels derived from them) are the
# same on every run of the notebook.
LDA_RANDOM_STATE = 42

# Fit a separate 2-topic LDA model on the messages of every thread.
topicsList = []
for threadTexts in dfThreads['texts']:
    text_list = [cleaning(message).split() for message in threadTexts]
    dictionary = gensim.corpora.Dictionary(text_list)
    if len(dictionary) == 0:
        # Nothing is left of this thread after cleaning and LDA cannot be
        # trained on an empty vocabulary. Two empty topic descriptions keep
        # the structure labelizer() indexes into and yield an empty label list.
        topicsList.append([(0, ''), (1, '')])
        continue
    corpus = [dictionary.doc2bow(tokens) for tokens in text_list]
    ldamodel = gensim.models.ldamodel.LdaModel(corpus,num_topics=2,id2word=dictionary,
                                               passes=20,random_state=LDA_RANDOM_STATE)
    topicsList.append(ldamodel.print_topics(num_topics=2,num_words=5))

dfThreads['topics'] = topicsList
dfThreads['labels'] = dfThreads['topics'].map(labelizer)
dfThreads['totalMsgs'] = dfThreads['ids'].map(len)
# Quartile bins of the thread size; duplicate bin edges are merged.
dfThreads['totalMsgsCatagories'] = pd.qcut(dfThreads['totalMsgs'],4,duplicates='drop')
# Largest threads first, with a fresh 0..n-1 index.
dfThreads = dfThreads.sort_values(by='totalMsgs',ascending=False).reset_index(drop=True)

# Text output

## Size and topics of individual threads

In [0]:
# Print a short summary of every thread: size, distinct participants,
# topic labels and the raw message texts.
summaryColumns = zip(dfThreads['totalMsgs'], dfThreads['users'],
                     dfThreads['labels'], dfThreads['texts'])
for msgCount, users, labels, texts in summaryColumns:
    participants = list(set(users))
    print('Message Count: ',msgCount)
    print('Participants :', participants)
    print('Topics: ',labels)
    print('Text: ', texts)
    print('***')

## Connected conversations as network graphs

In [0]:
# Draw one figure per thread, titled with the thread's topic labels.
for threadIds, threadLabels in zip(dfThreads['ids'], dfThreads['labels']):
    members = df[df['id'].isin(threadIds)]
    # Nodes are labelled with the username of the message author.
    # To show message ids instead:
    #   nx.draw_spring(G.subgraph(threadIds),with_labels=True)
    userById = dict(zip(members['id'], members['user']))
    nx.draw(G.subgraph(threadIds),labels=userById)
    plt.title(threadLabels)
    plt.show()

## Threads with 3 or more messages
### Prints text by participants


In [0]:
# Print the full conversation (user: text) of every thread that has at least
# three messages.
banner = "**************************************************"
threadColumns = zip(dfThreads['totalMsgs'], dfThreads['users'], dfThreads['texts'])
for msgCount, users, texts in threadColumns:
    if msgCount < 3:
        continue
    print(banner)
    print('Message Count: ',msgCount)
    print(banner)
    for user, text in zip(users, texts):
        print(user+":",text)
        print("-------------------")
    print()