# Goal: pre-process abstracts corpus for wordcloud

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime,re, string, timeit, nltk
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import sentiwordnet as swn
from nltk.corpus.reader.wordnet import WordNetError
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from  sklearn.externals import joblib
from nltk.tag import StanfordNERTagger
from nltk.tag.stanford import CoreNLPNERTagger
from nltk.tag.stanford import StanfordNERTagger
import polyglot
from SentimentTrendPlot import plot_sentiment_trends
from PubmedSentimentFunctions import abs_tagger,map_pos,tag_get_score
%matplotlib inline
from scipy.cluster.hierarchy import dendrogram, linkage

In [2]:
# Read the cleaned PubMed dataset (this export already carries the
# pre-computed scores) into a DataFrame.
df = pd.read_csv(filepath_or_buffer="pubmed_cleaned_scored.csv")

#### Create TF-IDF Matrix

In [None]:
# TfidfVectorizer settings:
#   max_df       - ignore terms found in more than this share of documents
#   min_df       - ignore terms found in fewer than this count / share of
#                  documents
#   max_features - upper bound on the size of the vocabulary
#   ngram_range  - (1, 3) extracts unigrams, bigrams and trigrams
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    stop_words='english',
    use_idf=True,
    max_features=200000,
    max_df=0.8,
    min_df=0.01,
)

# Learn the vocabulary and build the document-term matrix in one pass
# (prefix the call with %time in the notebook to measure it).
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Clean_Abstract'])

print(tfidf_matrix.shape)

In [None]:
# Collect the vocabulary terms, in matrix column order, as a plain list.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older releases.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    # get_feature_names_out() returns an ndarray; keep the historical list type
    terms = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    terms = tfidf_vectorizer.get_feature_names()

#### Melt and Reshape TF-IDF Matrix into tidy format

In [None]:
# Expand the sparse TF-IDF matrix into a dense DataFrame with one row per
# document (sharing df's index) and one column per vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on older releases.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()

# toarray() yields a plain ndarray; todense() returns the discouraged np.matrix
tfidf = pd.DataFrame(tfidf_matrix.toarray(), index=df.index,
                     columns=feature_names)

# add Pubmed ID for joining in future (assignment aligns on the shared index)
tfidf['PMID'] = df.PMID

In [None]:
# Melt the wide document-by-term frame into tidy (long) format: one row per
# (document, term) pair, keeping only pairs with a nonzero TF-IDF weight.
# The index is named explicitly so reset_index() always emits a 'level_0'
# column; left unnamed, pandas calls that column 'index' unless a term column
# named 'index' already exists, and the id_vars lookup below would then fail.
tfidf_tidy = pd.melt(tfidf.rename_axis('level_0').reset_index(),
                     id_vars=['level_0', 'PMID'],
                     value_name='tfidf').query('tfidf > 0')

#### Group Resulting Dataframe to get Mean Weight by word

In [None]:
# Average the TF-IDF weight of every term over the rows of tfidf_tidy,
# giving a one-column frame indexed by term ("variable").
weights_by_term = tfidf_tidy.groupby("variable", group_keys=False)['tfidf']
mean_tfidf = weights_by_term.mean().to_frame()

In [None]:
# Move the index back into a regular column, leaving a default integer index.
mean_tfidf = mean_tfidf.reset_index()

#### Export as csv

In [None]:
# Write the frame to disk; the row index is written too (pandas default),
# so the file starts with an unnamed index column.
mean_tfidf.to_csv(path_or_buf="mean_tfidf_corpus.csv", index=True)