In [1]:
import pandas as pd
import numpy as np
import sqlite3

In [2]:
filepath = '/Users/gracegupta/Downloads/project final with type 2_all unique_1_31_2019 (1).csv'

In [3]:
# Look at the first few rows of the CSV file
pd.read_csv(filepath, nrows=2).head()

Unnamed: 0,id,activity,administering_ic,application_id,application_type,arra_funded,award_notice_date,budget_end,budget_start,direct_cost_amt,...,project_start,study_section,study_section_name,subproject_id,suffix,support_year,total_cost,total_cost_sub_project,abstract_text,uni_type
0,0,A03,AH,2056338,1,,1994-07-01,1995-06-30,1994-07-01,,...,1994-07-01,NSS,,,,1,,,,
1,0,A03,AH,2056372,1,,1995-05-19,1996-06-30,1995-07-01,,...,1995-07-01,NSS,,,,1,,,,


In [4]:
appended_data = []
chunksize = 10 ** 6
for chunk in pd.read_csv(filepath, chunksize=chunksize):
    appended_data.append(chunk[['abstract_text', 'uni_type']])

In [5]:
appended_data = pd.concat(appended_data)

In [6]:
print(appended_data.head())

  abstract_text uni_type
0           NaN      NaN
1           NaN      NaN
2           NaN      NaN
3           NaN      NaN
4           NaN      NaN


In [7]:
print(appended_data.shape)

(2458227, 2)


# Get all abstracts from R1 and R2 schools.

In [8]:
isR1 = appended_data['uni_type'] == 'R1'

In [9]:
print(appended_data[isR1].head())

                                        abstract_text uni_type
20     DESCRIPTION (provided by applicant):    The...       R1
39  Project 2 - Project Summary/Abstract The proje...       R1
45  Abstract/Summary (Administrative Core; Core 1)...       R1
46  The analysis and visualization of high field m...       R1
49  Invasive cervical cancer (ICC) is the most com...       R1


In [10]:
print(appended_data[isR1].shape)

(250497, 2)


In [11]:
isR2 = appended_data['uni_type'] == 'R2'

In [12]:
print(appended_data[isR2].head())

                                         abstract_text uni_type
218  ?    DESCRIPTION (provided by applicant): Exec...       R2
322  PROJECT SUMMARY (See instructions):  African-A...       R2
327  To understand how signaling proteins function,...       R2
358  Innate immunity is an ancient system that prev...       R2
589  PROJECT SUMMARY  Fibrolamellar hepatocellular ...       R2


In [13]:
print(appended_data[isR2].shape)

(22241, 2)


# Get cancer-only abstracts for R1 and R2 schools.

In [14]:
abstracts_R2 = appended_data[isR2]['abstract_text']

In [15]:
abstracts_R2 = abstracts_R2.dropna()

In [16]:
print(abstracts_R2[:1])

218    ?    DESCRIPTION (provided by applicant): Exec...
Name: abstract_text, dtype: object


In [17]:
cancer_abstracts_R2 = []
for doc in abstracts_R2:
    if 'cancer' in doc:
        cancer_abstracts_R2.append(doc)

In [18]:
print(len(cancer_abstracts_R2))

2238


In [19]:
print(cancer_abstracts_R2[:1])

['To understand how signaling proteins function, it is crucial to know the timeordered sequence of events that lead to the signaling state. When the messenger is chemical, the time required to diffuse to and bind in the active site of a signaling protein is typically far longer than the timescale for protein conformational change [1]. For the structural determination of the kinetics of enzymatic reactions we will focus on small GTPases and their co-enzymes. Small GTPases are molecular switches that cycle between a GTP-bound active and a GDP-bound inactive form. The switch is catalyzed by Guanine nucleotide Exchange Factors (GEFs) and GTPase-Activating Proteins (GAPs), the latter catalyze the hydrolysis of GTP to GDP to deactivate the small GTPase. This system is of very high, general importance in cell biology with particular impact on disease processes, especially cancer, but also several infectious diseases. For proof-ofprinciple, we chose the Arl3-RP2 complex as GTPase-GAP pair [2].

In [20]:
abstracts_R1 = appended_data[isR1]['abstract_text']

In [21]:
abstracts_R1 = abstracts_R1.dropna()

In [22]:
print(abstracts_R1[:1])

20       DESCRIPTION (provided by applicant):    The...
Name: abstract_text, dtype: object


In [23]:
cancer_abstracts_R1 = []
for doc in abstracts_R1:
    if 'cancer' in doc:
        cancer_abstracts_R1.append(doc)

In [24]:
print(len(cancer_abstracts_R1))

36128


In [25]:
print(cancer_abstracts_R1[:10]);

['   DESCRIPTION (provided by applicant):    The major goal of the Biomedical Research Tower (BRT) is to create a multidisciplinary biomedical research and education center for The Ohio State University Medical Center (OSUMC) that will be a centerpiece of a dramatically enhanced health sciences campus. Integral to the University\'s Academic Plan for becoming a top public research institution, the BRT will greatly advance the academic mission of the University while bringing enormous value in improved health care, advanced technology, and economic growth to the State of Ohio. The specific renovation project being proposed is for the "buildout" of a floor of the BRT (approximately 24,000 asf) to centralize and support one of the fastest growing and developing areas of cancer research, the Experimental Therapeutics Program (ETP) of The Ohio State University Comprehensive Cancer Center (OSUCCC).       The ETP plays a critical role in the discovery and development of new cancer therapies. A

# Preprocessing with lemmatization.

In [26]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/gracegupta/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/gracegupta/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [27]:
from nltk.corpus import stopwords

In [28]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/gracegupta/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [29]:
import string

In [30]:
import re
re.compile('<title>(.*)</title>')

re.compile(r'<title>(.*)</title>', re.UNICODE)

In [31]:
from nltk.stem import WordNetLemmatizer

In [32]:
def preprocessing(text):
    #tokenize into words
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    
    #remove stopwords
    stopwords = set(nltk.corpus.stopwords.words('english') + ['reuter', '\x03'])
    tokens = [token for token in tokens if token not in stopwords]
    
    #lower capitalization
    tokens = [word.lower() for word in tokens]
    
    #lemmatize
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]
    preprocessed_text= ' '.join(tokens)

    return preprocessed_text

In [33]:
processed_abstracts_R1 = []
for item in cancer_abstracts_R1:
    processed_abstracts_R1.append(preprocessing(item))

In [34]:
processed_abstracts_R2 = []
for item in cancer_abstracts_R2:
    processed_abstracts_R2.append(preprocessing(item))

# Get top words.

In [35]:
def get_top_n_words(corpus, n=None):
    """
    List the top n words in a vocabulary according to occurrence in a text corpus.
    """
    vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in     vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

In [36]:
get_top_n_words(processed_abstracts_R1, n=40)

[('cancer', 153103),
 ('cell', 111942),
 ('the', 99742),
 ('research', 63670),
 ('study', 57103),
 ('tumor', 53767),
 ('aim', 42709),
 ('protein', 38524),
 ('clinical', 37077),
 ('we', 36015),
 ('specific', 35479),
 ('mechanism', 34077),
 ('gene', 33924),
 ('in', 33666),
 ('human', 31612),
 ('program', 31480),
 ('patient', 30986),
 ('new', 30946),
 ('development', 30365),
 ('disease', 29152),
 ('this', 28929),
 ('project', 28759),
 ('model', 27535),
 ('role', 25809),
 ('treatment', 24409),
 ('function', 24177),
 ('data', 24166),
 ('provided', 23840),
 ('molecular', 23342),
 ('breast', 22744),
 ('dna', 22401),
 ('also', 22345),
 ('determine', 22215),
 ('applicant', 22108),
 ('description', 22077),
 ('pathway', 21893),
 ('using', 21393),
 ('activity', 21054),
 ('provide', 20899),
 ('novel', 20861)]

In [37]:
get_top_n_words(processed_abstracts_R2, n=40)

[('cell', 8756),
 ('cancer', 7847),
 ('the', 5775),
 ('study', 3580),
 ('protein', 3574),
 ('research', 3370),
 ('tumor', 2805),
 ('mechanism', 2554),
 ('aim', 2541),
 ('we', 2335),
 ('specific', 2189),
 ('disease', 2111),
 ('gene', 2103),
 ('in', 2094),
 ('development', 2080),
 ('human', 2068),
 ('dna', 2031),
 ('role', 2027),
 ('new', 1856),
 ('function', 1786),
 ('activity', 1722),
 ('this', 1698),
 ('expression', 1643),
 ('project', 1636),
 ('also', 1555),
 ('pathway', 1544),
 ('health', 1522),
 ('molecular', 1478),
 ('provided', 1469),
 ('program', 1444),
 ('patient', 1415),
 ('factor', 1413),
 ('unreadable', 1368),
 ('determine', 1362),
 ('applicant', 1344),
 ('proposed', 1339),
 ('may', 1334),
 ('description', 1324),
 ('understanding', 1323),
 ('model', 1290)]

# Use Word2Vec.

In [38]:
import gensim
from gensim.models import Word2Vec
import multiprocessing

In [100]:
processed_abstracts_R1 = [doc.split() for doc in processed_abstracts_R1];

In [102]:
# build vocabulary and train model
model = gensim.models.Word2Vec(
        processed_abstracts_R1,
        size=150,
        window=10,
        min_count=2,
        workers=10,
        iter=10)

In [103]:
len(model.wv.vocab)

76556

In [104]:
type(model.wv.vocab)

dict

In [109]:
list(model.wv.vocab);

In [108]:
w1 = "research"
model.wv.most_similar(positive=w1, topn=3)

[('researcher', 0.62552410364151),
 ('scientific', 0.6052536964416504),
 ('science', 0.5511459112167358)]

In [111]:
w2 = "protein"
model.wv.most_similar(positive=w2, topn=3)

[('motif', 0.5320680737495422),
 ('substrate', 0.5166242718696594),
 ('subunit', 0.5162673592567444)]

In [112]:
w3 = "mechanism"
model.wv.most_similar(positive=w3, topn=3)

[('pathway', 0.5750380754470825),
 ('mode', 0.514131486415863),
 ('basis', 0.5070115327835083)]

# Tutorial that didn't work.

Train model on preprocessed R1 cancer abstracts.

In [39]:
EMB_DIM = 300

w2v = Word2Vec(processed_abstracts_R1, size=EMB_DIM, window=5, min_count=5, negative=15, iter=10, 
              workers=multiprocessing.cpu_count())

In [40]:
word_vectors = w2v.wv #get trained embeddings

Using embeddings in neural model.

In [41]:
from nltk.corpus import conll2000
import tensorflow as tf
from tensorflow.keras.layers import Dense, Embedding, Activation, Flatten
from tensorflow.keras import Sequential
from tensorflow.keras.utils import to_categorical
import collections

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  from ._conv import register_converters as _register_converters


In [43]:
import nltk
nltk.download('conll2000')

[nltk_data] Downloading package conll2000 to
[nltk_data]     /Users/gracegupta/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.


True

In [44]:
#getting tokenized and part-of-speech tagged data from the corpus
train_words = conll2000.tagged_words("train.txt")
test_words = conll2000.tagged_words("test.txt")
print(train_words[:10])

[('Confidence', 'NN'), ('in', 'IN'), ('the', 'DT'), ('pound', 'NN'), ('is', 'VBZ'), ('widely', 'RB'), ('expected', 'VBN'), ('to', 'TO'), ('take', 'VB'), ('another', 'DT')]


In [45]:
def get_tag_vocabulary(tagged_words):
    """
    Accepts text in the form of (word, pos) tuples and returns
    a dictionary mapping POS-tags to unique ids.
    """
    tag2id = {}
    for item in tagged_words:
        tag = item[1]
        tag2id.setdefault(tag,len(tag2id))
    return tag2id

In [46]:
#the word_vectors.vocab dictionary stores Vocab objects, rather than integers
#but we would like our dictionary to map words to ints
word2id = {k: v.index for k, v in word_vectors.vocab.items()}
tag2id = get_tag_vocabulary(train_words)

In [57]:
def add_new_word(new_word, new_vector, new_index, embedding_matrix, word2id):
    """
    Adds a new word to the existing matrix of word embeddings.
    """
    #inserting the vector before given index, along axis 0.
    embedding_matrix = np.insert(embedding_matrix, [new_index], [new_vector], axis=0)
    
    #updating the indexes of words that follow the new word.
    word2id = {word: (index + 1) if index >= new_index else index
              for word, index in word2id.items()}
    word2id[new_word] = new_index
    return embedding_matrix, word2id

UNK_INDEX = 0 
UNK_TOKEN = "UNK"

embedding_matrix = word_vectors.vectors
unk_vector = embedding_matrix.mean(0)
embedding_matrix, word2id = add_new_word(UNK_TOKEN, unk_vector,
                                        UNK_INDEX, embedding_matrix, word2id)

In [58]:
def get_int_data(tagged_words, word2id, tag2id):
    """
    Replaces all words and tags with their corresponding ids 
    and separates words (features) from the tags (labels).
    """
    X, Y = [], [] #X will hold word ids, Y will hold ids of their tags
    unk_count = 0 #to keep track of the number of unknown words
    
    for word, tag in tagged_words:
        Y.append(tag2id.get(tag))
        if word in word2id:
            X.append(word2id.get(word))
        else:
            X.append(UNK_INDEX)
            unk_count += 1
    print("Data created. Percentage of unknown words: %.3f" % (unk_count/len(tagged_words)))
    return np.array(X), np.array(Y)

In [59]:
X_train, Y_train = get_int_data(train_words, word2id, tag2id)
X_test, Y_test = get_int_data(test_words, word2id, tag2id)

Y_train, Y_test = to_categorical(Y_train), to_categorical(Y_test)

Data created. Percentage of unknown words: 0.866
Data created. Percentage of unknown words: 0.868


Defining and training the model.

In [61]:
HIDDEN_SIZE = 50
BATCH_SIZE = 128

def define_model(embedding_matrix, class_count):
    """
    Creates and returns a POS model, which only takes
    one word as input.
    """
    vocab_length = len(embedding_matrix)
    model = Sequential()
    
    #A layer which turns word indices into vectors
    model.add(Embedding(input_dim=vocab_length,
                       output_dim=EMB_DIM,
                       weights=[embedding_matrix],
                       input_length=1))
    model.add(Flatten())
    model.add(Dense(HIDDEN_SIZE))
    model.add(Activation("tanh"))
    model.add(Dense(class_count))
    model.add(Activation("softmax"))
    
    model.compile(optimizer=tf.train.AdamOptimizer(),
                 loss="categorical_crossentropy",
                 metrics=["accuracy"])
    return model

pos_model = define_model(embedding_matrix, len(tag2id))
pos_model.summary()

#Training the model
pos_model.fit(X_train,
             Y_train,
             batch_size=BATCH_SIZE,
             epochs=1,
             verbose=1)

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1, 300)            20700     
_________________________________________________________________
flatten (Flatten)            (None, 300)               0         
_________________________________________________________________
dense (Dense)                (None, 50)                15050     
_________________________________________________________________
activation (Activation)      (None, 50)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 44)                2244      
_________________________________________________________________
activation_1 (Activation)    (None, 44)                0         
Total params: 37,994
Trainable params: 37,994
Non-trainable params: 0


InvalidArgumentError: indices[120,0] = 70 is not in [0, 69)
	 [[{{node embedding/embedding_lookup}}]]

Evaluating the model

In [65]:
def evaluate_model(model, id2word, x_test, y_test):
    """
    Evaluates the given model by computing the accuracy of its predictions
    on the given test data and prints out 10 most mistagged words.
    """
    _, acc = model.evaluate(x_test, y_test) #get accuracy of the model
    print("Accuracy: %.2f" % acc)
    
    #the following lines are used to get most commonly mistagged words
    y_pred = model.predict_classes(x_test) #get model predictions
    error_counter = collections.Counter()
    
    for i in range(len(x_test)):
        correct_tag_id = np.argmax(y_test[i])
        if y_pred[i] != correct_tag_id:
            word = id2word[x_test[i]]
            error_counter[word] += 1
    print("Most common errors:\n", error_counter.most_common(10))
    
id2word = sorted(word2id, key=word2id.get)
evaluate_model(pos_model, id2word, X_test, Y_test)

Accuracy: 0.27
Most common errors:
 [('UNK', 34713), ('$', 5), ('7', 5), (';', 2)]


Building a context dependent model

In [66]:
EOS_INDEX = 1
EOS_TOKEN = "EOS"

#creating a random end-of-sequence vector
eos_vector = np.random.standard_normal(EMB_DIM)
embedding_matrix, word2id = add_new_word(EOS_TOKEN, eos_vector, EOS_INDEX,
                                        embedding_matrix, word2id)

Prepare data for context-dependent model.

In [67]:
CONTEXT_SIZE = 2 #define the size of the context window
def get_window_int_data(tagged_words, word2id, tag2id):
    """
    Replaces all words and tags with their corresponding ids and
    generates an array of label ids Y and the training data set X, which
    consists of arrays of word indexes (of tagged word and its context).
    """
    X, Y = [], []
    unk_count = 0
    
    span = 2*CONTEXT_SIZE + 1
    buffer = collections.deque(maxlen=span)
    padding = [(EOS_TOKEN, None)]*CONTEXT_SIZE
    buffer += padding + tagged_words[:CONTEXT_SIZE]
    
    for item in (tagged_words[CONTEXT_SIZE:] + padding):
        buffer.append(item)
        
        #the input to the model is the ids of all words in the window
        window_ids = np.array([word2id.get(word) if (word in word2id) else UNK_INDEX
                              for (word, _) in buffer])
        
        X.append(window_ids)
        
        #the label is the tag of the middle word
        middle_word, middle_tag = buffer[CONTEXT_SIZE]
        Y.append(tag2id.get(middle_tag))
        
        if middle_word not in word2id:
            unk_count +=1
        print("Data created. Percentage of unknown words: %.3f" % (unk_count/len(tagged_words)))
        return np.array(X), np.array(Y)

Define the model.

In [71]:
def define_context_sensitive_model(embedding_matrix, class_count):
    """
    Creates and returns a parts of speech model, which takes as
    input a tagged word and its context.
    """
    vocab_length = len(embedding_matrix)
    total_span = CONTEXT_SIZE * 2 + 1
    
    model = Sequential()
    model.add(Embedding(input_dim=vocab_length,
                       output_dim=EMB_DIM,
                       weights=[embedding_matrix],
                       input_length=total_span)),
    model.add(Flatten())
    model.add(Dense(HIDDEN_SIZE))
    model.add(Activation("tanh"))
    model.add(Dense(class_count))
    model.add(Activation("softmax"))
    
    model.compile(optimizer=tf.train.AdamOptimizer(),
                 loss="sparse_categorical_crossentropy",
                 metrics=["accuracy"])
    return model

In [69]:
def evaluate_model(model, id2word, x_test, y_test):
    """
    Evaluates the given model by computing the accuracy of its predictions
    on the given test data and prints out 10 most mistagged words.
    """
    _, acc = model.evaluate(x_test, y_test) #get accuracy of model
    print("Accuracy: %.2f" % acc)
    
    #get most commonly mistagged words
    y_pred = model.predict_classes(x_test)
    error_counter = collections.Counter()
    
    for i in range(len(x_test)):
        correct_tag_id = np.argmax(y_test[i])
        if y_new[i] != correct_tag_id:
            if isinstance(x_test[i], np.ndarray):
                word = id2word[x_test[i][CONTEXT_SIZE]]
            else:
                word = id2word[x_test[i]]
            error_counter[word] += 1
    
    print("Most common errors:\n", error_counter.most_common(10))

In [72]:
X_train2, Y_train2 = get_window_int_data(train_words, word2id, tag2id)
X_test2, Y_test2 = get_window_int_data(test_words, word2id, tag2id)
Y_train2, Y_test2 = to_categorical(Y_train2), to_categorical(Y_test2)

cs_pos_model = define_context_sensitive_model(embedding_matrix, len(tag2id))
cs_pos_model.fit(X_train2,
                Y_train2,
                batch_size=BATCH_SIZE,
                epochs=1,
                verbose=1)

evaluate_model(cs_pos_model, id2word, X_test2, Y_test2)

Data created. Percentage of unknown words: 0.000
Data created. Percentage of unknown words: 0.000


InvalidArgumentError: Can not squeeze dim[1], expected a dimension of 1, got 11
	 [[{{node metrics_2/acc/Squeeze}}]]

# Use LDA to extract main topics for R1 abstracts.

In [None]:
processed_abstracts_R1 = [doc.split() for doc in processed_abstracts_R1];

In [None]:
dictionary = gensim.corpora.Dictionary(processed_abstracts_R1);

In [None]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_abstracts_R1];

In [None]:
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                    num_topics = 3, 
                                   id2word = dictionary,                                    
                                   passes = 10,
                                   workers = 2)

In [None]:
lda_model.print_topics(num_topics=3, num_words=10)

# Use LDA to extract main topics from R2 abstracts

In [None]:
processed_abstracts_R2 = [doc.split() for doc in processed_abstracts_R2];

In [None]:
dictionary = gensim.corpora.Dictionary(processed_abstracts_R2);

In [None]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_abstracts_R2];

In [None]:
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                    num_topics = 3, 
                                   id2word = dictionary,                                    
                                   passes = 10,
                                   workers = 2)

In [None]:
lda_model.print_topics(num_topics=8, num_words=10)

# MISC

In [None]:
cancer_abstracts_R1= [text.lower() for text in cancer_abstracts_R1];

In [None]:
# Strip all punctuation from each article
# This uses str.translate to map all punctuation to the empty string
table = str.maketrans('', '', string.punctuation)
cancer_abstracts_R1 = [text.translate(table) for text in cancer_abstracts_R1]

In [None]:
# Convert all numbers in the article to the word 'num' using regular expressions
cancer_abstracts_R1 = [re.sub(r'\d+', 'num', text) for text in cancer_abstracts_R1]

In [None]:
# Create stopwords list, convert to a set for speed
stopwords = set(nltk.corpus.stopwords.words('english') + ['reuter', '\x03'])
cancer_abstracts_R1 = [[word for word in text.split() if word not in stopwords] for text in cancer_abstracts_R1]

In [None]:
from nltk.stem import WordNetLemmatizer

In [None]:
# lemmatize
lmtzr = WordNetLemmatizer()
cancer_abstracts_R1 = ["".join([lmtzr.lemmatize(word) for word in text]) for text in cancer_abstracts_R1];

In [None]:
vectorizer = CountVectorizer(max_features=10)
vectorizer.fit(cancer_abstracts_R1)

In [None]:
import sklearn
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
# Generate bag of words object with maximum vocab size of 1000
counter = sklearn.feature_extraction.text.CountVectorizer(max_features = 5)

In [None]:
# Get bag of words model as sparse matrix
bag_of_words = counter.fit_transform(cancer_abstracts_R1)

In [None]:
bag_of_words.shape

In [None]:
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(bag_of_words)

In [None]:
# print idf values
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=counter.get_feature_names(),columns=["idf_weights"]);
 
# sort ascending
df_idf.sort_values(by=['idf_weights']);

In [None]:
# Generate tf-idf object with maximum vocab size of 1000
tf_counter = sklearn.feature_extraction.text.TfidfVectorizer(max_features = 5)

In [None]:
# Get tf-idf matrix as sparse matrix
tfidf = tf_counter.fit_transform(cancer_abstracts_R1)