In [1]:
import nltk
from nltk import corpus
from nltk.collocations import *
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk import wordnet
from nltk import punkt
from nltk.util import ngrams
#if these last 2 don't work might need to do nltk.download(package)
import re

from string import punctuation
from collections import Counter
import numpy as np

In [2]:
import gensim
from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel

import pandas as pd
from datetime import datetime

#Importing sklearn for bigram finder
from sklearn.feature_extraction.text import CountVectorizer



## Import and Clean Connect Report

In [3]:
report=pd.read_csv('connect_report.csv')

report['Created Date']=pd.to_datetime(report['Created Date'])
report=report[pd.notnull(report['Created Date'])]
report=report[pd.notnull(report['Body'])]

print(report.dtypes)

Network: Name                       object
Name                                object
Created By: Full Name               object
Created By: Company Name            object
Body                                object
Created Date                datetime64[ns]
Like Count                         float64
Comment Count                      float64
Feed Item Type                      object
dtype: object


## Report Generator

In [4]:
final_report=pd.DataFrame

#Use this function to create the final report
# Directions:
# 1. Set Group to a list of group names, a single name of a group, or All to use all groups
# 2. Set Type to a specific Feed Item Type or All
# 3. Set Start Date or leave blank(defaults to: )
# 4. Set End Date or leave blank(defaults to: )

def report_generator(Group,Type,Date_start=pd.to_datetime("2017-01-01"),Date_finish=pd.to_datetime('today')):
    if type(Group) == list:
        final_report=report.loc[report['Name'].isin(Group)] 
    elif (Group=='All'):
        final_report=report
    else:
        final_report=report.loc[report['Name']==Group]
                                
    if (Type=='All'):
        final_report=final_report
    else:
        final_report=final_report.loc[final_report['Feed Item Type']==Type]
    
    if(Date_start=="2017-01-01" and Date_finish== pd.to_datetime('today')):
        final_report=final_report
    else:
        Date_start=pd.to_datetime(Date_start, utc=False)
        Date_finish=pd.to_datetime(Date_finish, utc=False)
        #alt approach-- not working but not sure why-- it worked it worked it worked! had to change the data type to all be annoying ass datatime64... down the line that might be a pain but for now it's dope
        final_report=final_report[(final_report['Created Date'] > Date_start) & (final_report['Created Date']< Date_finish)]   
    return(final_report)

final_report=report_generator(['CPC+ All','NLT Internal Users'],'All',"2017-02-01","2017-03-01")

final_report.head(5)

Unnamed: 0,Network: Name,Name,Created By: Full Name,Created By: Company Name,Body,Created Date,Like Count,Comment Count,Feed Item Type
23,CPC Plus Connect,CPC+ All,Jon Regis,NJ,does everyone have portal access\n,2017-02-02,2.0,47.0,Text Post
24,CPC Plus Connect,CPC+ All,Stephanie Hardin,OH,I am trying to figure out how to document/trac...,2017-02-03,11.0,34.0,Text Post
25,CPC Plus Connect,CPC+ All,Sam Gottuso,BAH,We appreciate your participation in the Care M...,2017-02-03,2.0,4.0,Content Post
26,CPC Plus Connect,CPC+ All,Juliana Fritschel,CO,How is a group created within CPC+ Connect? I ...,2017-02-03,1.0,0.0,Text Post
27,CPC Plus Connect,NLT Internal Users,Bridget Lalley Ryder,,Attention all CPC+ practices! Are there other ...,2017-02-03,0.0,0.0,Link Post


## Tokenizer Function

In [5]:
##Stop word and lemmatizer creation (PLEASE add to stop where you see fit)
stop = stopwords.words('english')
stop.append("cpc+")
stop.append("hi")
stop.append("hello")
stop.append("/p")

wnl = WordNetLemmatizer() 

In [6]:
#this function tokenizes text, removes stop word and punctuation, and lemmatizes the tokens
def tokenizer(text):
 
    tokens_ = [word_tokenize(sent) for sent in sent_tokenize(text.lower())]
    
    tokens = []
    
    for token_by_sent in tokens_:
        tokens += token_by_sent
 
    tokens = list(filter(lambda t: t not in stop, tokens))
    tokens = list(filter(lambda t: t not in punctuation, tokens))
    tokens = list(filter(lambda t: t not in [u"'s", u"n't", u"...", u"''", u'``', u'\u2014', u'\u2026', u'\u2013'], tokens))
     
    filtered_tokens = []
    for token in tokens:
        token = wnl.lemmatize(token)
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
 
    filtered_tokens = list(map(lambda token: token.lower(), filtered_tokens))
 
    return filtered_tokens

In [7]:
#tokenizes final report
final_report['tokens'] = final_report['Body'].map(tokenizer)
final_report['tokens'][0:10]

23                           [everyone, portal, access]
24    [trying, figure, document/track, use, cmf, pay...
25    [appreciate, participation, care, management, ...
26    [group, created, within, connect, saw, 005t000...
27    [attention, practice, payer, region, currently...
28    [attention, practice, payer, region, currently...
29    [anyone, conducted, need, assessment, self-man...
30    [thinking, driver, requirement, inform, progra...
31                            [important, announcement]
32                      [also, important, announcement]
Name: tokens, dtype: object

In [8]:
#extends tokens list with bi-grams
for i,t in final_report['Body'].iteritems():
    t_list = t.split()
    filtered_words = [word for word in t_list if word not in stop]
    #filtered_words2 = [word for word in t_list if word not in punctuation]
    text=" ".join(filtered_words)
    
    ng = [ngrams(word_tokenize(sent),2) for sent in sent_tokenize(text.lower())]
    n_gram = []
    for n in ng:
            n_gram += n
    two_grams = []        
    for n in n_gram:
            new = ' '.join(n)
            two_grams.append(new)
    final_report['tokens'][i].extend(two_grams)

In [None]:
##Bi-Grams using Sci-Kit Learn
##Finding bigrams and frequencies across all data

unlisted_report=final_report['tokens'].apply(lambda x: ', '.join(x))

word_vectorizer_2=CountVectorizer(ngram_range=(2,2), analyzer='word')
word_vectorizer_fit_2=word_vectorizer_2.fit_transform(unlisted_report)

frequencies=sum(word_vectorizer_fit_2).toarray()[0]

bigrams_counter=pd.DataFrame(frequencies,index=word_vectorizer_2.get_feature_names(),columns=['frequency'])
f = bigrams_counter[bigrams_counter['frequency']>5]

#The tokenizer cuts it down significantly to 639


##how can we integrate bigrams into the keywords functions-- we have identified the most common ones, how do we match them up with the original body comments?
print(word_vectorizer_fit_2.shape)
len(final_report['Body'])
print(f[0:10])
#so they produce the same number of rows... but what's the 19354 columns?


## Keyword Retrieval

In [9]:
#this function returns n keywords from specfied token list
def get_keywords(tokens, num):
    return Counter(tokens).most_common(num)

In [10]:
#iterates through all text bodies and prints the raw text and the top 5 most common words
for i,t in final_report['tokens'].iteritems():
    print(final_report['Body'][i])
    print( get_keywords(t,10),"\n")

does everyone have portal access

[('everyone', 1), ('portal', 1), ('access', 1), ('everyone portal', 1), ('portal access', 1)] 

I am trying to figure out how to document/track our use of the CMF payments.  I was wondering if we will just be required to submit an excel document with our expenditures or will additional documentation be required?  I have looked everywhere to find clarification and am not seeing anything.  Any thoughts?  

[('required', 2), ('trying', 1), ('figure', 1), ('document/track', 1), ('use', 1), ('cmf', 1), ('payment', 1), ('wondering', 1), ('submit', 1), ('excel', 1)] 

We appreciate your participation in the Care Management Webinar. Click to download the webinar slides, post additional questions and continue the conversation with others here. A video recording will be forthcoming next week.

[('webinar', 2), ('appreciate', 1), ('participation', 1), ('care', 1), ('management', 1), ('click', 1), ('download', 1), ('slide', 1), ('post', 1), ('additional', 1)] 

Ho

In [11]:
#this functions prints the most common words by Feed Item Type
def keywords_bytype(feed_type):
    tokens = final_report[final_report['Feed Item Type'] == feed_type]['tokens']
    alltokens = []
    for token_list in tokens:
        alltokens += token_list
    counter = Counter(alltokens)
    return counter.most_common(10)

In [12]:
#iterates through all text bodies abd prints most common words by Feed Item Type
for t in set(final_report['Feed Item Type']):
    print('category :', t)
    print('top 10 keywords:', keywords_bytype(t))
    print('---')

category : Poll
top 10 keywords: [('better', 2), ('link', 2), ('file', 2), ("what 's", 2), ("'s better", 2), ('better -', 2), ('posting', 1), ('user', 1), ('- link', 1), ('link file', 1)]
---
category : Advanced Text Post
top 10 keywords: [('announcement', 3), ('important', 2), ('important announcement', 2), ('announcement !', 2), ('this important', 1), ('also', 1), ('i also', 1), ('also important', 1), ('test', 1), ('post', 1)]
---
category : Content Post
top 10 keywords: [('webinar', 4), ('check', 4), ('latest', 4), ('newsletter', 4), ('check latest', 4), ('care', 3), ('download', 3), ('slide', 3), ('recording', 3), ('week', 3)]
---
category : Text Post
top 10 keywords: [('practice', 24), ('anyone', 18), ('patient', 14), ('group', 13), ('care', 11), ('risk', 9), ('portal', 8), ('payment', 8), ('post', 8), ('would', 8)]
---
category : Link Post
top 10 keywords: [('> <', 15), ('group', 14), ('practice', 11), ('step', 10), ('health', 9), ('risk', 9), ('health it', 8), ('nbsp', 8), ('/i'

## TF-IDF

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# min_df is minimum number of documents that contain a term t
# max_features is maximum number of unique tokens (across documents) that we'd consider
# TfidfVectorizer preprocesses the descriptions using the tokenizer we defined above

In [14]:
#create a vectorizer object for our data with min docs. = 2, max feats. = 10K, and single and paired words
vectorizer = TfidfVectorizer(min_df=2, max_features=10000, tokenizer=tokenizer, ngram_range=(1, 2))
vz = vectorizer.fit_transform(list(final_report['Body']))

In [15]:
#create a dictionary mapping the tokens to their tfidf values
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
tfidf = pd.DataFrame(columns=['tfidf']).from_dict(dict(tfidf), orient='index')
tfidf.columns = ['tfidf']

In [16]:
#low tfidf scores
tfidf.sort_values(by=['tfidf'], ascending=True).head(15)

Unnamed: 0,tfidf
practice,1.965081
anyone,2.252763
care,2.504077
patient,2.745239
check,2.84055
would,2.94591
group,2.94591
share,2.94591
click,3.063693
available,3.063693


In [17]:
#high tfidf scores
tfidf.sort_values(by=['tfidf'], ascending=False).head(15)

Unnamed: 0,tfidf
instead,4.044522
message post,4.044522
need assessment,4.044522
type ehr,4.044522
navigation bar,4.044522
navigation,4.044522
name,4.044522
methodology,4.044522
message text,4.044522
message,4.044522
