# Pubmed Pre-processing & EDA


#### TO DO For scraping script:
- check if has abstract
- check if title is duplicate
- clean titles (remove square brackets)
- clean dates - make sure year is greater than 1997 or not empty

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime,re, string, timeit, nltk
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import sentiwordnet as swn
from nltk.corpus.reader.wordnet import WordNetError
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from  sklearn.externals import joblib

In [None]:
# Toggle: True loads the smaller two-file dataset, False loads the large
# single-file export (64k rows per the original note).
USE_SMALL_DATASET = False

if USE_SMALL_DATASET:
    # Stack the two batch files and renumber the index from 0.
    df1 = pd.read_csv("pubmed_abstracts_batchall_1.csv")
    df2 = pd.read_csv("pubmed_abstracts_batchall_2.csv")
    df = pd.concat([df1, df2], ignore_index=True)
else:
    df = pd.read_csv("pubmed_abstracts_export_all_2018-07-09.csv")
    # Drop the lowercase 'article_title' / 'date' columns; the rest of the
    # notebook works with 'Article_title' and 'Date'.
    df = df.drop(['article_title', 'date'], axis=1)

# DataFrame.info() prints its own summary and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
df.info()

# Preprocessing

#### Checking for duplicates

In [None]:
# Summary of the rows whose title already appeared in an earlier row.
df.loc[df.duplicated(subset='Article_title')].info()

In [None]:
# Look up every row that carries this one specific title.
df.loc[df['Article_title'].eq('2017 ACC/AHA/AAPA/ABC/ACPM/AGS/APhA/ASH/ASPC/NMA/PCNA Guideline for the Prevention, Detection, Evaluation, and Management of High Blood Pressure in Adults: A Report of the American College of Cardiology/American Heart Association Task Force on Clinical Practice Guidelines.')]

In [None]:
# First collect the titles that occur more than once, then summarise every row
# carrying one of those titles (first occurrences included).
repeated_titles = df.loc[df.duplicated(subset='Article_title'), 'Article_title']
df.loc[df['Article_title'].isin(repeated_titles)].info()

There are 208 duplicate rows (using the title as the ID); in total, 403 rows share a title with at least one other row.

#### Dropping duplicates by Article Title

In [None]:
# Keep a single row per title (drop_duplicates retains the first occurrence by default).
df = df.drop_duplicates(subset='Article_title')

#### Dropping articles w/o Abstracts

In [None]:
# Keep only the rows that have an abstract. The explicit .copy() makes the
# result an independent frame, so the column assignments made later in the
# notebook do not raise SettingWithCopyWarning on a filtered slice.
df = df[df['abstract'].notnull()].copy()

In [None]:
# Show the rows that have no ISSN.
df.loc[df['ISSN'].isnull()]

#### Cleaning the article titles

In [None]:
# Peek at the title stored under index label 1000.
df.loc[1000, 'Article_title']

In [None]:
def remove_parentheses(x):
    """Strip square brackets from an article title.

    Despite its name, this function removes the characters '[' and ']';
    round parentheses are left untouched.

    Parameters
    ----------
    x : str or any
        The title to clean.

    Returns
    -------
    The title without '[' and ']' when ``x`` is a string. Any non-string
    value (e.g. None or NaN from a missing title) is returned unchanged
    instead of raising AttributeError.
    """
    if not isinstance(x, str):
        return x
    return x.replace("[", '').replace(']', '')

In [None]:
# Strip the square brackets from every title.
df['Article_title'] = df['Article_title'].apply(remove_parentheses)

#### Cleaning Dates

In [None]:
# type(df['Date']) is always the pandas Series class and says nothing about the
# values; the column's dtype shows how the dates are actually stored.
df['Date'].dtype

In [None]:
# Three-letter English month abbreviations mapped to their month number.
month_map = {
    'Jan': 1,
    'Feb': 2,
    'Mar': 3,
    'Apr': 4,
    'May': 5,
    'Jun': 6,
    "Jul": 7,
    'Aug': 8,
    'Sep': 9,
    'Oct': 10,
    'Nov': 11,
    'Dec': 12
}


def clean_month(x):
    """Normalise a 'year/month/day' date string so the month is numeric.

    The value is converted to ``str`` and split on '/'. A month given as a
    three-letter abbreviation found in ``month_map`` (e.g. 'Jan') is replaced
    by its number; any other month text is passed through unchanged.

    Missing or empty month/day components default to '01', so partial dates
    such as '2018' or '2018/Jan' no longer raise IndexError. Components after
    the third are ignored.

    Parameters
    ----------
    x : any
        Date value; expected to look like 'YYYY/Mon/DD' or 'YYYY/MM/DD'.

    Returns
    -------
    str
        'year/month/day' with the month abbreviation replaced by its number.
    """
    parts = str(x).split('/')
    # Pad to three components so year-only / year-month dates are accepted.
    parts += [''] * (3 - len(parts))

    year = parts[0]
    month = parts[1] or '01'
    day = parts[2] or '01'

    # Fall back to the raw month text when it is not a known abbreviation.
    month = month_map.get(month, month)

    return str(year) + '/' + str(month) + '/' + str(day)
    

In [None]:
# Missing dates get the sentinel '1900/01/01' (removed later by the year
# filter). The original `df['Date'].fillna(..., inplace=True)` is a chained
# in-place update: it acts on a temporary Series and does not update `df`
# under pandas Copy-on-Write, so the filled column is assigned back explicitly.
dates_filled = df['Date'].fillna('1900/01/01')

# Replace 'Date' and add 'Clean_Date' in one step on a new frame; month
# abbreviations are converted to numbers before parsing.
df = df.assign(
    Date=dates_filled,
    Clean_Date=pd.to_datetime(dates_filled.apply(clean_month)),
)

In [None]:
# Stringify each timestamp once, split on '-', and pull out the pieces.
# The third piece still has the time attached, so only its first two
# characters (the day) are kept.
clean_date_fields = df['Clean_Date'].apply(str).apply(lambda text: text.split('-'))

df['Month'] = clean_date_fields.apply(lambda fields: fields[1])
df['Year'] = clean_date_fields.apply(lambda fields: fields[0])
df['Day'] = clean_date_fields.apply(lambda fields: fields[2][:2])

## In which years were most of the papers published?

In [None]:
plt.style.use('dark_background')

# Number of titles per publication year (years are strings, so the x axis is categorical).
papers_per_year = df.groupby('Year')['Article_title'].count()

fig, ax = plt.subplots(figsize=(17, 8))
ax.plot(papers_per_year)

# Restrict the view to the 1999-2018 range.
ax.set_xlim('1999', '2018')

ax.set_title("Number of Papers in dataset by Year")
ax.set_ylabel('Count')
ax.set_xlabel('Year')

Most of our papers were published from 2009 onward.

#### Removing papers published more than 20 years ago (1997 or earlier)

In [None]:
# Keep only the papers whose publication year is later than 1997.
df = df.loc[df['Year'].astype(int) > 1997]

In [None]:
# Print a summary (columns, non-null counts, dtypes) of the current frame.
df.info()

## Cleaning the words in the abstract

In [None]:
# Lemmatizer, stop-word set and punctuation regex are built once on first use
# and then shared by every call, instead of being rebuilt for every abstract.
_TOKENIZER_RESOURCES = {}


def _get_tokenizer_resources():
    """Return the shared (lemmatizer, stop_words, punctuation_regex) triple, building it on first use."""
    if not _TOKENIZER_RESOURCES:
        _TOKENIZER_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _TOKENIZER_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TOKENIZER_RESOURCES['punctuation'] = re.compile('[%s]' % re.escape(string.punctuation))
    return (
        _TOKENIZER_RESOURCES['lemmatizer'],
        _TOKENIZER_RESOURCES['stop_words'],
        _TOKENIZER_RESOURCES['punctuation'],
    )


def _clean_text(text):
    """Clean one already-lowercased piece of text and return the kept lemmas joined by single spaces."""
    lemmatizer, stop_words, punctuation = _get_tokenizer_resources()

    # Punctuation becomes whitespace so it separates tokens.
    text = punctuation.sub(' ', text)

    kept = []
    for token in nltk.tokenize.word_tokenize(text):
        # Drop very short tokens, and drop stop words BEFORE lemmatizing:
        # the WordNet lemmatizer (noun mode by default) turns e.g. 'was'
        # into 'wa', which would no longer match the stop-word list.
        if len(token) <= 2 or token in stop_words:
            continue

        lemma = lemmatizer.lemmatize(token)

        # A lemma can itself be a stop word or a pure number; skip those too.
        if lemma in stop_words or lemma.isdigit():
            continue

        kept.append(lemma)

    return ' '.join(kept)


def my_tokenizer(x, sentences=False):
    '''
    Clean an abstract for text analysis.

    Steps applied to the text:
    1. lower-case all characters
    2. replace punctuation with spaces and tokenize with nltk
    3. drop tokens of one or two characters (e.g. "a", "I", "as")
    4. drop English stop words (checked before and after lemmatization)
    5. reduce each token to its base form with the WordNet lemmatizer,
       so "dogs" becomes "dog"
    6. drop tokens that consist only of digits

    Parameters
    ----------
    x : str
        The raw abstract.
    sentences : bool, default False
        If True, the text is split on "." and each piece is cleaned
        separately.

    Returns
    -------
    str
        When ``sentences`` is False: the cleaned tokens joined by spaces.
    list of str
        When ``sentences`` is True: one cleaned string per "."-separated
        piece, with pieces that end up empty left out.
    '''
    x = x.lower()

    if not sentences:
        return _clean_text(x)

    cleaned_pieces = (_clean_text(piece) for piece in x.split("."))
    return [piece for piece in cleaned_pieces if piece != '']

In [None]:
## Cleaning the abstracts

In [None]:
# Clean every abstract with my_tokenizer (returns one space-joined string per abstract).
df = df.assign(Clean_Abstract=df['abstract'].apply(my_tokenizer))

# my_tokenizer returns '' (never null) when nothing survives cleaning, so a
# null check alone removes nothing. Empty strings are dropped as well: they
# are written to the CSV as empty fields and come back as NaN when reloaded.
has_clean_text = df['Clean_Abstract'].notnull() & (df['Clean_Abstract'] != '')
df = df[has_clean_text]

df.to_csv("pubmed_cleaned.csv")