# Problem 10
# Mahtab Nejati
# 98209434
## Please download the data and the pickled results from the link below
### https://drive.google.com/drive/folders/1-IwpWHjtZDzpFXUHo5n7M_9ZmMpw_GG5?usp=sharing

In [1]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
MAX_MEMORY = "8g"
spark = SparkSession.builder.appName('App Name').master("local[*]").config("spark.executor.memory", MAX_MEMORY) \
    .config("spark.driver.memory", MAX_MEMORY).getOrCreate()
sqlc = SQLContext(sparkContext=spark.sparkContext, sparkSession=spark)

In [2]:
import pickle
import pandas as pd
import numpy as np
import networkx as nx

## Get data and Preprocessed 

In [3]:
def cleanDate(text):
    date = text
    parts = date.split('-')
    again = []
    for part in parts:
        again += part.split(' ')
    for part in again:
        try:
            val = int(part)
            if val>1000:
                return val
        except:
            pass

def countCitations(citations):
    return len(citations)
        
cleanDate_udf = udf(cleanDate)
countCitations_udf = udf(countCitations)

def getPreprocessedData(filePath='./HW3_P10_Data_MahtabNejati_98209434/hep_records.json'):
    df = spark.read.json(filePath)
    df = df.filter(size('references') > 0)
    df = df.filter(size('citations') > 0)
    df = df.filter(size('authors') > 0)
    df = df.filter(~df.title.contains('cancelled'))
    df = df.filter(~df.title.contains('Cancelled'))
    df = df.filter(~df.title.contains('CANCELLED'))
    df = df.filter(~df.title.contains('withdrawn'))
    df = df.filter(~df.title.contains('Withdrawn'))
    df = df.filter(~df.title.contains('WITHDRAWN'))
    df = df.filter(df.creation_date. isNotNull())
    df = df.withColumn('creation_date', cleanDate_udf(df.creation_date))
    df = df.withColumn('citation_count',countCitations_udf(df.citations))
    return df

df = getPreprocessedData()

minYear = df.agg({'creation_date': 'min'}).collect()[0]['min(creation_date)']
maxYear = df.agg({'creation_date': 'max'}).collect()[0]['max(creation_date)']

## Getting the PageRank of all papers
## Running the following cell will take some time. I have pickled the results. Skip this cell and run the next to load the data.

In [4]:
# def getEdges(row):
#     edges = []
#     for cited in row['citations']:
#         edges.append((cited,row['recid']))
#     return edges

# def makeNetwork(df):
#     cols = df.columns
#     cols.remove('recid')
#     cols.remove('citations')
#     nodes = df.drop(*cols)
#     edges = nodes.rdd.flatMap(getEdges).collect()
#     network = nx.DiGraph()
#     network.add_edges_from(edges)
#     return network

# network = makeNetwork(df)
# pagerank = pd.DataFrame(list(nx.pagerank(network).items()),columns=['recid','pagerank'])
# pagerank = pagerank.set_index('recid')
# pagerank = pagerank.sort_values(by=['pagerank'],ascending=False)
# with open('HW3_P10_MahtabNejati_98209434_Results_pagerank','wb') as f:
#     pickle.dump(pagerank,f)

In [5]:
with open('HW3_P10_MahtabNejati_98209434_Results_pagerank','rb') as f:
    pagerank = pickle.load(f)

## Getting the top papers in the specified time-span

In [6]:
def getTopInPeriod(df,pagerank):
    top_ten = pd.DataFrame([],columns=df.columns+['pagerank'])
    count = 0
    for index,pagerank in pagerank.iterrows():
        row = df.where(df.recid == index).collect()
        if len(row) == 1:
            entry = row[0].asDict()
            entry['pagerank'] = pagerank['pagerank']
            top_ten = top_ten.append(entry,ignore_index=True)
            count += 1
            print('Found you the top '+str(count)+' papers in your request time-span.')
            if count == 10:
                break
    top_ten.set_index('recid')
    top_ten = top_ten.drop([
        'free_keywords',
        'abstract',
        'citations',
        'references',
        'standardized_keywords'],axis=1)
    return top_ten

## Getting 10 most cited papers

In [7]:
def getTopTen(df):
    psdf = df.drop(*[
        'free_keywords',
        'abstract',
        'citations',
        'references',
        'standardized_keywords'])
    top_ten = psdf.toPandas()
    top_ten['citation_count'] = pd.to_numeric(top_ten['citation_count'])
    top_ten = top_ten.sort_values(by=['citation_count'],ascending=False)
    top_ten.set_index('recid')
    return top_ten.head(10).copy()

## Main

In [8]:
again = True
while(again):
    print('\n'+75*'*'+'\n')
    sYear = int(input("From year\t(minimum="+str(minYear)+", maximum="+str(maxYear)+"):\t"))
    fYear = int(input("To year\t\t(minimum="+str(minYear)+", maximum="+str(maxYear)+"):\t"))
    valid = df.filter(df['creation_date']>=sYear)
    valid = valid.filter(df['creation_date']<=fYear)
    print('\n'+75*'*'+'\n')
    print('\n'+20*'!'+'NOTICE'+20*'!')
    print('\nThis will take some time since the queries are running on SQLContext DataFrames.')
    print('Please be patient...\n')
    tops = getTopInPeriod(valid,pagerank)
    print(25*'#'+' PageRank-Based results '+25*'#')
    display(tops)
    print(25*'#'+' Most-Cited-Based results '+25*'#')
    tops = getTopTen(valid)
    display(tops)
    print('\n\n'+75*'#')
    again = (input("\nWish to continue (y/n): ").lower() == 'y')
    print('\n'+75*'#')


***************************************************************************

From year	(minimum=1816, maximum=2020):	1816
To year		(minimum=1816, maximum=2020):	2020

***************************************************************************


!!!!!!!!!!!!!!!!!!!!NOTICE!!!!!!!!!!!!!!!!!!!!

This will take some time since the queries are running on SQLContext DataFrames.
Please be patient...

Found you the top 1 papers in your request time-span.
Found you the top 2 papers in your request time-span.
Found you the top 3 papers in your request time-span.
Found you the top 4 papers in your request time-span.
Found you the top 5 papers in your request time-span.
Found you the top 6 papers in your request time-span.
Found you the top 7 papers in your request time-span.
Found you the top 8 papers in your request time-span.
Found you the top 9 papers in your request time-span.
Found you the top 10 papers in your request time-span.
######################### PageRank-Based results #############

Unnamed: 0,authors,co-authors,creation_date,recid,title,citation_count,pagerank
0,"[Weinberg, Steven]",[],1967,51188,A Model of Leptons,12139,0.000805
1,"[Agostinelli, S.]","[Allison, J., Amako, K., Apostolakis, J., Arau...",2002,593382,GEANT4: A Simulation toolkit,11747,0.000564
2,"[Wilson, Kenneth G.]",[],1974,89145,Confinement of Quarks,5094,0.000465
3,"[Hawking, S.W.]",[],1975,101338,Particle Creation by Black Holes,8091,0.00044
4,"[Cardelli, Jason A.]","[Clayton, Geoffrey C., Mathis, John S.]",1989,293079,"The relationship between infrared, optical, an...",4477,0.00043
5,"[Kobayashi, Makoto]","[Maskawa, Toshihide]",1973,81350,CP Violation in the Renormalizable Theory of W...,10181,0.000406
6,"[Anders, E.]","[Grevesse, N.]",1989,291099,Abundances of the elements: Meteroritic and solar,2635,0.000396
7,"[Schlegel, David J.]","[Finkbeiner, Douglas P., Davis, Marc]",1997,462477,Maps of dust IR emission for use in estimation...,7904,0.000391
8,"[Maldacena, Juan Martin]",[],1997,451647,The Large N limit of superconformal field theo...,15382,0.000382
9,"[Salam, Abdus]",[],1968,53083,Weak and Electromagnetic Interactions,4698,0.00038


######################### Most-Cited-Based results #########################


Unnamed: 0,authors,co-authors,creation_date,recid,title,citation_count
184665,"[Maldacena, Juan Martin]",[],1997,451647,The Large N limit of superconformal field theo...,15382
203431,"[Perlmutter, S.]","[Aldering, G., Goldhaber, G., Knop, R.A., Nuge...",1998,484837,Measurements of $\Omega$ and $\Lambda$ from 42...,12273
194211,"[Riess, Adam G.]","[Filippenko, Alexei V., Challis, Peter, Clocch...",1998,470671,Observational evidence from supernovae for an ...,12141
25459,"[Weinberg, Steven]",[],1967,51188,A Model of Leptons,12139
272946,"[Agostinelli, S.]","[Allison, J., Amako, K., Apostolakis, J., Arau...",2002,593382,GEANT4: A Simulation toolkit,11747
352976,"[Sjostrand, Torbjorn]","[Mrenna, Stephen, Skands, Peter Z.]",2006,712925,PYTHIA 6.4 Physics and Manual,10829
519336,"[Aad, Georges]","[Abajyan, Tatevik, Abbott, Brad, Abdallah, Jal...",2012,1124337,Observation of a new particle in the search fo...,10586
519337,"[Chatrchyan, Serguei]","[Khachatryan, Vardan, Sirunyan, Albert M, Tuma...",2012,1124338,Observation of a New Boson at a Mass of 125 Ge...,10327
37773,"[Kobayashi, Makoto]","[Maskawa, Toshihide]",1973,81350,CP Violation in the Renormalizable Theory of W...,10181
191575,"[Witten, Edward]",[],1998,467400,Anti-de Sitter space and holography,9941




###########################################################################

Wish to continue (y/n): n

###########################################################################
