# Tuesday Code Challenge

Below is a code snippet that reads in a JSON file containing data extracted from academic papers as a Pandas DataFrame 

In [5]:
import pandas as pd
import json


with open('documents.json') as f:
    df = pd.DataFrame(json.load(f)).T.drop(columns = 'emails')

df.head()

Unnamed: 0,contents,filename,institutions,people,places
Navigation to Small Bodies,"See discussions, stats, and author profiles fo...",txt_files/Navigation to Small Bodies.txt,"[IEEE Aerospace Conference, Arizona State Univ...","[Raviteja Nallapu, Jekan Thangavelautham, gdek...","[Arizona, Tucson, AZ, Arizona, Tucson, AZ, AZ ..."
ASTRONOMICAL ENGINEERING,ASTRONOMICAL ENGINEERING: A STRATEGY FOR MODIF...,txt_files/ASTRONOMICAL ENGINEERING.txt,[ORBITSD.G. KORYCANSKY CODEP Dept Earth Scienc...,"[CA 95064, GREGORY LAUGHLIN, gpl@acetylene.arc...","[Santa Cruz, CA 94035, U.S.A., Ann Arbor, MI 4..."
Phase II of the Main Belt Asteroid Spectrosopic Survey,"Icarus 158, 146�177 (2002) doi:10.1006/icar.20...",txt_files/Phase II of the Main Belt Asteroid S...,"[Planetary Sciences, Massachusetts Institute o...","[Icarus 158, Richard P. BinzelDepartment, doi,...","[Cambridge, Massachusetts, Hilo, albedos, albe..."
Devlopment of Xenon Hall Thrusters,NASA/CR--2004-213099https://ntrs.nasa.gov/sear...,txt_files/Devlopment of Xenon Hall Thrusters.txt,"[NASA, Characterization of High-Efficiency, Mi...","[Richard R., Thermalized, Gaussmeter.............","[Ann Arbor, Michigan, Ann Arbor, MD, VA, Wien,..."
Mine planning for Asteroid Ore Bodies,Space Resources Roundtable II (2000)7030.pdfMI...,txt_files/Mine planning for Asteroid Ore Bodie...,"[Michigan Technological University, Mining Eng...","[L. S. Gertsch1, R. E. Gertsch2, L. S. Gertsch...","[Houghton, MI, Friable Rock, Hard Rock, Univ, ..."


## 1. Create a new column containing the tokenized contents of each paper

### Tokens should
- Be all lower case characters
- Contain only alpha numeric characters
- Be stored as a list

In [6]:
import re
    
def tokenize(x):
    text = x.lower()
    text = re.sub(r'[^a-zA-Z ^0-9]', '', str(text))
    return text.split()

df['tokens'] = df['contents'].apply(lambda x: tokenize(x))

In [7]:
df['tokens'][0][:20]

['see',
 'discussions',
 'stats',
 'and',
 'author',
 'profiles',
 'for',
 'this',
 'publication',
 'at',
 'httpswwwresearchgatenetpublication323600217navigating',
 'to',
 'smallbodies',
 'using',
 'small',
 'satellitesarticle',
 'in',
 'ieee',
 'aerospace',
 'conference']

## 2. Using regular expressions, create a new column containing the emails from each paper

In [9]:
def extract_emails(x):
    return re.findall('\S+@\S+', x)

df['emails'] = df['contents'].apply(lambda x: extract_emails(x))

In [10]:
df['emails'][0][:20]

['85721srs51@email.arizona.eduRavi',
 '85721rnallapu@email.arizona.eduPranay',
 'aravin11@asu.eduGraham',
 'gdektor@asu.eduVishnu',
 '85721reddy@lpl.arizona.eduErik',
 '85721easphaug@lpl.arizona.eduJekan',
 'jekan@email.arizona.eduAbstract--Small-satellites']

## 3. Using Spacy, create a new column containing the lemmas from each paper (this might take a second)

In [11]:
import spacy

# python -m spacy download en_core_web_lg
nlp = spacy.load("en_core_web_lg")

def spacy_lemmatize(x):
    doc = nlp.tokenizer(x)
    return [token.lemma_ for token in doc]

df['lemmas'] = df['tokens'].apply(lambda x: spacy_lemmatize(' '.join(x)))

In [12]:
df['lemmas'][0][:20]

['see',
 'discussion',
 'stats',
 'and',
 'author',
 'profile',
 'for',
 'this',
 'publication',
 'at',
 'httpswwwresearchgatenetpublication323600217navigating',
 'to',
 'smallbodies',
 'use',
 'small',
 'satellitesarticle',
 'in',
 'ieee',
 'aerospace',
 'conference']

## 4. Using Spacy, create a column containing the 10 most common words in each paper after stopwords are removed

### Stretch: Remove more common stopwords

In [14]:
from collections import Counter
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS


def remove_stopwords(tokens):
    cleaned_tokens = []
    
    for token in tokens:
        if token not in spacy_stopwords:
            cleaned_tokens.append(token)
    
    return cleaned_tokens


def word_counter(tokens):
    word_counts = Counter()
    word_counts.update(tokens)
    return word_counts

In [15]:
df['10_most_common'] = df['lemmas'].apply(lambda x: word_counter(remove_stopwords(x)).most_common(10))

In [16]:
df['10_most_common'][0][:20]

[('asteroid', 58),
 ('use', 57),
 ('spacecraft', 44),
 ('small', 39),
 ('target', 39),
 ('perform', 38),
 ('system', 32),
 ('mission', 29),
 ('body', 28),
 ('star', 25)]

In [17]:
df.head()

Unnamed: 0,contents,filename,institutions,people,places,tokens,emails,lemmas,10_most_common
Navigation to Small Bodies,"See discussions, stats, and author profiles fo...",txt_files/Navigation to Small Bodies.txt,"[IEEE Aerospace Conference, Arizona State Univ...","[Raviteja Nallapu, Jekan Thangavelautham, gdek...","[Arizona, Tucson, AZ, Arizona, Tucson, AZ, AZ ...","[see, discussions, stats, and, author, profile...","[85721srs51@email.arizona.eduRavi, 85721rnalla...","[see, discussion, stats, and, author, profile,...","[(asteroid, 58), (use, 57), (spacecraft, 44), ..."
ASTRONOMICAL ENGINEERING,ASTRONOMICAL ENGINEERING: A STRATEGY FOR MODIF...,txt_files/ASTRONOMICAL ENGINEERING.txt,[ORBITSD.G. KORYCANSKY CODEP Dept Earth Scienc...,"[CA 95064, GREGORY LAUGHLIN, gpl@acetylene.arc...","[Santa Cruz, CA 94035, U.S.A., Ann Arbor, MI 4...","[astronomical, engineering, a, strategy, for, ...","[kory@ucolick.org, gpl@acetylene.arc.nasa.govF...","[astronomical, engineer, a, strategy, for, mod...","[(earth, 110), (encounter, 109), (orbit, 77), ..."
Phase II of the Main Belt Asteroid Spectrosopic Survey,"Icarus 158, 146�177 (2002) doi:10.1006/icar.20...",txt_files/Phase II of the Main Belt Asteroid S...,"[Planetary Sciences, Massachusetts Institute o...","[Icarus 158, Richard P. BinzelDepartment, doi,...","[Cambridge, Massachusetts, Hilo, albedos, albe...","[icarus, 158, 146177, 2002, doi101006icar20026...",[sjb@ifa.hawaii.eduReceived],"[icarus, 158, 146177, 2002, doi101006icar20026...","[(s, 346), (asteroid, 325), (spectrum, 211), (..."
Devlopment of Xenon Hall Thrusters,NASA/CR--2004-213099https://ntrs.nasa.gov/sear...,txt_files/Devlopment of Xenon Hall Thrusters.txt,"[NASA, Characterization of High-Efficiency, Mi...","[Richard R., Thermalized, Gaussmeter.............","[Ann Arbor, Michigan, Ann Arbor, MD, VA, Wien,...",[nasacr2004213099httpsntrsnasagovsearchjspr200...,[help@sti.nasa.gov�],[nasacr2004213099httpsntrsnasagovsearchjspr200...,"[(thruster, 1121), (current, 793), (voltage, 7..."
Mine planning for Asteroid Ore Bodies,Space Resources Roundtable II (2000)7030.pdfMI...,txt_files/Mine planning for Asteroid Ore Bodie...,"[Michigan Technological University, Mining Eng...","[L. S. Gertsch1, R. E. Gertsch2, L. S. Gertsch...","[Houghton, MI, Friable Rock, Hard Rock, Univ, ...","[space, resources, roundtable, ii, 20007030pdf...","[1lgertsch@mtu.edu,, 2rgertsch@mtu.edu.Introdu...","[space, resource, roundtable, ii, 20007030pdfm...","[(asteroid, 32), (process, 15), (bag, 14), (co..."
