In [1]:
import pandas as pd
import numpy as np
import sqlite3

In [2]:
filepath = '/Users/gracegupta/Downloads/project final with type 2_all unique_1_31_2019 (1).csv'

In [3]:
# Look at the first few rows of the CSV file
pd.read_csv(filepath, nrows=2).head()

Unnamed: 0,id,activity,administering_ic,application_id,application_type,arra_funded,award_notice_date,budget_end,budget_start,direct_cost_amt,...,project_start,study_section,study_section_name,subproject_id,suffix,support_year,total_cost,total_cost_sub_project,abstract_text,uni_type
0,0,A03,AH,2056338,1,,1994-07-01,1995-06-30,1994-07-01,,...,1994-07-01,NSS,,,,1,,,,
1,0,A03,AH,2056372,1,,1995-05-19,1996-06-30,1995-07-01,,...,1995-07-01,NSS,,,,1,,,,


In [4]:
appended_data = []
chunksize = 10 ** 6
for chunk in pd.read_csv(filepath, chunksize=chunksize):
    appended_data.append(chunk[['abstract_text', 'uni_type']])

In [5]:
appended_data = pd.concat(appended_data)

In [6]:
print(appended_data.head())

  abstract_text uni_type
0           NaN      NaN
1           NaN      NaN
2           NaN      NaN
3           NaN      NaN
4           NaN      NaN


In [7]:
print(appended_data.shape)

(2458227, 2)


# Get all abstracts from R1 and R2 schools.

In [8]:
isR1 = appended_data['uni_type'] == 'R1'

In [9]:
print(appended_data[isR1].head())

                                        abstract_text uni_type
20     DESCRIPTION (provided by applicant):    The...       R1
39  Project 2 - Project Summary/Abstract The proje...       R1
45  Abstract/Summary (Administrative Core; Core 1)...       R1
46  The analysis and visualization of high field m...       R1
49  Invasive cervical cancer (ICC) is the most com...       R1


In [10]:
print(appended_data[isR1].shape)

(250497, 2)


In [11]:
isR2 = appended_data['uni_type'] == 'R2'

In [12]:
print(appended_data[isR2].head())

                                         abstract_text uni_type
218  ?    DESCRIPTION (provided by applicant): Exec...       R2
322  PROJECT SUMMARY (See instructions):  African-A...       R2
327  To understand how signaling proteins function,...       R2
358  Innate immunity is an ancient system that prev...       R2
589  PROJECT SUMMARY  Fibrolamellar hepatocellular ...       R2


In [13]:
print(appended_data[isR2].shape)

(22241, 2)


# Get cancer-only abstracts for R1 and R2 schools.

In [15]:
abstracts_R2 = appended_data[isR2]['abstract_text']

In [29]:
abstracts_R2 = abstracts_R2.dropna()

In [152]:
print(abstracts_R2[:1])

218    ?    DESCRIPTION (provided by applicant): Exec...
Name: abstract_text, dtype: object


In [155]:
cancer_abstracts_R2 = []
for doc in abstracts_R2:
    if 'cancer' in doc:
        cancer_abstracts_R2.append(doc)

In [156]:
print(len(cancer_abstracts_R2))

2238


In [157]:
print(cancer_abstracts_R2[:1])

['To understand how signaling proteins function, it is crucial to know the timeordered sequence of events that lead to the signaling state. When the messenger is chemical, the time required to diffuse to and bind in the active site of a signaling protein is typically far longer than the timescale for protein conformational change [1]. For the structural determination of the kinetics of enzymatic reactions we will focus on small GTPases and their co-enzymes. Small GTPases are molecular switches that cycle between a GTP-bound active and a GDP-bound inactive form. The switch is catalyzed by Guanine nucleotide Exchange Factors (GEFs) and GTPase-Activating Proteins (GAPs), the latter catalyze the hydrolysis of GTP to GDP to deactivate the small GTPase. This system is of very high, general importance in cell biology with particular impact on disease processes, especially cancer, but also several infectious diseases. For proof-ofprinciple, we chose the Arl3-RP2 complex as GTPase-GAP pair [2].

In [158]:
abstracts_R1 = appended_data[isR1]['abstract_text']

In [159]:
abstracts_R1 = abstracts_R1.dropna()

In [160]:
print(abstracts_R1[:1])

20       DESCRIPTION (provided by applicant):    The...
Name: abstract_text, dtype: object


In [161]:
cancer_abstracts_R1 = []
for doc in abstracts_R1:
    if 'cancer' in doc:
        cancer_abstracts_R1.append(doc)

In [162]:
print(len(cancer_abstracts_R1))

36128


In [163]:
print(cancer_abstracts_R1[:1])

['   DESCRIPTION (provided by applicant):    The major goal of the Biomedical Research Tower (BRT) is to create a multidisciplinary biomedical research and education center for The Ohio State University Medical Center (OSUMC) that will be a centerpiece of a dramatically enhanced health sciences campus. Integral to the University\'s Academic Plan for becoming a top public research institution, the BRT will greatly advance the academic mission of the University while bringing enormous value in improved health care, advanced technology, and economic growth to the State of Ohio. The specific renovation project being proposed is for the "buildout" of a floor of the BRT (approximately 24,000 asf) to centralize and support one of the fastest growing and developing areas of cancer research, the Experimental Therapeutics Program (ETP) of The Ohio State University Comprehensive Cancer Center (OSUCCC).       The ETP plays a critical role in the discovery and development of new cancer therapies. A

# Do preprocessing with lemmatization.

In [164]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/gracegupta/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/gracegupta/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [165]:
from nltk.corpus import stopwords

In [166]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/gracegupta/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [167]:
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize

In [168]:
def preprocessing(text):
    # tokenize into words
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]

    # remove stopwords
    stop = stopwords.words('english')
    tokens = [token for token in tokens if token not in stop]

    # remove words less than three letters
    tokens = [word for word in tokens if len(word) >= 3]

    # lower capitalization
    tokens = [word.lower() for word in tokens]

    # lemmatize
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]
    preprocessed_text= ' '.join(tokens)

    return preprocessed_text 

In [169]:
processed_abstracts_R1 = []
for item in cancer_abstracts_R1:
    processed_abstracts_R1.append(preprocessing(item))

In [170]:
def get_top_n_words(corpus, n=None):
    """
    List the top n words in a vocabulary according to occurrence in a text corpus.
    """
    vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in     vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

In [171]:
get_top_n_words(processed_abstracts_R1, n=20)

[('cancer', 153103),
 ('cell', 111942),
 ('the', 99742),
 ('research', 63670),
 ('study', 57103),
 ('tumor', 53767),
 ('aim', 42709),
 ('protein', 38524),
 ('clinical', 37077),
 ('specific', 35479),
 ('mechanism', 34077),
 ('gene', 33924),
 ('human', 31612),
 ('program', 31480),
 ('patient', 30986),
 ('new', 30946),
 ('development', 30365),
 ('disease', 29152),
 ('this', 28929),
 ('project', 28759)]

In [172]:
word_count_vector=cvec.fit_transform(processed_abstracts_R1)

In [173]:
list(cvec.vocabulary_.keys())[:20]

['description',
 'provided',
 'applicant',
 'goal',
 'research',
 'center',
 'university',
 'health',
 'growth',
 'specific',
 'project',
 'proposed',
 'support',
 'therapeutic',
 'program',
 'role',
 'development',
 'new',
 'therapy',
 'trial']

In [174]:
processed_abstracts_R2 = []
for item in cancer_abstracts_R2:
    processed_abstracts_R2.append(preprocessing(item))

In [143]:
get_top_n_words(processed_abstracts_R2, n=20)

[('cell', 8756),
 ('cancer', 7847),
 ('the', 5775),
 ('study', 3580),
 ('protein', 3574),
 ('research', 3370),
 ('tumor', 2805),
 ('mechanism', 2554),
 ('aim', 2541),
 ('specific', 2189),
 ('disease', 2111),
 ('gene', 2103),
 ('development', 2080),
 ('human', 2068),
 ('dna', 2031),
 ('role', 2027),
 ('new', 1856),
 ('function', 1786),
 ('activity', 1722),
 ('this', 1698)]

In [175]:
cvec2=CountVectorizer(ngram_range=(1,2), min_df=0.1, max_df=0.7, max_features=100)

In [176]:
word_count_vector2=cvec2.fit_transform(processed_abstracts_R2)

In [178]:
list(cvec2.vocabulary_.keys())[:20]

['signaling',
 'protein',
 'function',
 'lead',
 'molecular',
 'factor',
 'this',
 'system',
 'high',
 'biology',
 'disease',
 'process',
 'also',
 'complex',
 'gene',
 'patient',
 'result',
 'description',
 'provided',
 'applicant']