In [0]:
import pickle
import boto3
import pandas as pd
import numpy as np
# import tensorflow as tf
# from transformers import RobertaTokenizer, TFRobertaModel, AlbertTokenizerFast, AlbertModel

In [0]:
# Spark imports are grouped first, followed by the handles derived from the session.
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import (
    ArrayType,
    DoubleType,
    FloatType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

# NOTE(review): `spark` is never created in the cells shown here; it is presumably
# injected by the hosting runtime -- confirm before running on a plain kernel.
sc = spark.sparkContext

# NOTE(review): `sqlContext` is not referenced by any cell in view -- confirm it is
# still needed.
sqlContext = SQLContext(sc)

In [0]:
# S3 prefix used for the val/test parquet splits, the derived outputs written by
# this notebook, and as the Redshift connector's `tempdir`.
base_save_path = "s3://mag-model-data/raw_mag_data/"
# S3 prefix for iteration 1; not referenced by any other cell shown in this notebook.
iteration_save_path = "s3://mag-model-data/iteration_1/"

## Getting all data

In [0]:
# SQL pushed down to Redshift: papers LEFT JOINed to their journal name and to
# their fields of study, producing one row per (paper, topic) pair.
# The outer WHERE drops rows whose topic_name is NULL, so papers without any
# field-of-study match do not appear in the result.
# The inner projections also pull book_title, year, online_date that the outer
# SELECT does not return.
journal_join_query = """
SELECT paper_id, doc_type, paper_title, journal_name, publication_date, topic_name, level
FROM (
SELECT  a.paper_id, a.doc_type, a.paper_title, a.year, 
        a.publication_date, b.normalized_name as journal_name,
        d.normalized_name as topic_name, d.level
FROM (SELECT paper_id, doc_type, paper_title, book_title, year, 
             publication_date, online_date, journal_id
      FROM mag_main_papers) a
LEFT JOIN (SELECT journal_id, normalized_name
           FROM mag_main_journals) b
ON a.journal_id=b.journal_id
LEFT JOIN (SELECT *
           FROM mag_advanced_paper_fields_of_study) c
ON a.paper_id=c.paper_id
LEFT JOIN (SELECT *
           FROM mag_advanced_fields_of_study) d
ON c.field_of_study=d.field_of_study_id )
WHERE topic_name IS NOT NULL
"""

In [0]:
# Execute `journal_join_query` on Redshift through the spark-redshift connector,
# using `base_save_path` as the S3 temp directory for the transfer.
# NOTE(review): `redshift_url` and `redshift_password` are not defined in any
# cell shown here -- confirm where they come from so Restart & Run All works.
all_data = (
    spark.read
    .format("com.databricks.spark.redshift")
    .option("url", redshift_url)
    .option("user", "app_user")
    .option("password", redshift_password)
    .option("query", journal_join_query)
    .option("tempdir", base_save_path)
    .option("forward_spark_s3_credentials", True)
    .load()
)

In [0]:
# Print the column names and types of the Redshift query result.
all_data.printSchema()

In [0]:
# Preview the first 5 rows.
# NOTE(review): `all_data` is not cached in the cells shown, so each action on it
# presumably re-runs the Redshift read -- confirm the cost is acceptable.
all_data.show(5)

## Getting the Supplemental Data for Sample of Training Data and Whole Test Set

In [0]:
# Load the validation and test splits from parquet under `base_save_path`.
# The train split read is left disabled below.
# train = spark.read.parquet(f"{base_save_path}train")
val = spark.read.parquet(f"{base_save_path}val")
test = spark.read.parquet(f"{base_save_path}test")

In [0]:
# Mark the test split for caching and run a count to materialize it; the cell
# output is the number of rows in `test`.
test.cache().count()

In [0]:
# val.cache().count()

#### Getting Data for Each Paper

##### Test

In [0]:
# Collapse the (paper, topic) rows of `all_data` into one row per
# (paper_id, paper_title, doc_type, publication_date) with the topic names
# gathered into a `topics` array; publication_date is parsed to a date first.
test_topics_by_paper = (
    all_data
    .select(
        'paper_id',
        'paper_title',
        'doc_type',
        F.to_date(F.col('publication_date'), 'yyyy-MM-dd').alias('publication_date'),
        'topic_name',
    )
    .groupby(['paper_id', 'paper_title', 'doc_type', 'publication_date'])
    .agg(F.collect_list(F.col('topic_name')).alias('topics'))
)

# Left join: every test paper_id is kept, including ids with no row in `all_data`.
extra_paper_data = test.select('paper_id').join(test_topics_by_paper, how='left', on='paper_id')

# Cache and force evaluation; the cell output is the joined row count.
extra_paper_data.cache().count()



In [0]:
# Derive the per-paper features written for the test split: title, publication
# year/month and the number of topics attached to the paper.
#
# `extra_paper_data` comes from a LEFT join, so `topics` is NULL for test ids with
# no match. F.size() on a NULL array yields -1 or NULL depending on
# spark.sql.legacy.sizeOfNull / ANSI mode, so the count is made explicit here:
# a paper with no topics gets topic_len = 0 regardless of Spark configuration.
final_extra_paper_data = extra_paper_data.select(
    'paper_id',
    'paper_title',
    F.year(F.col('publication_date')).alias('year'),
    F.month(F.col('publication_date')).alias('month'),
    F.when(F.col('topics').isNull(), F.lit(0))
     .otherwise(F.size(F.col('topics')))
     .alias('topic_len'),
)

# Write a single parquet file, replacing any previous output at this path.
(
    final_extra_paper_data
    .coalesce(1)
    .write
    .mode('overwrite')
    .parquet(f"{base_save_path}test_extra_data")
)

##### Val

In [0]:
# Collapse the (paper, topic) rows of `all_data` into one row per
# (paper_id, doc_type, publication_date) with the topic names gathered into a
# `topics` array. paper_title is not selected: it is neither grouped on nor
# written for this split.
# NOTE(review): the test split's output keeps paper_title while this one does
# not -- confirm the difference is intended.
val_topics_by_paper = (
    all_data
    .select(
        'paper_id',
        'doc_type',
        F.to_date(F.col('publication_date'), 'yyyy-MM-dd').alias('publication_date'),
        'topic_name',
    )
    .groupby(['paper_id', 'doc_type', 'publication_date'])
    .agg(F.collect_list(F.col('topic_name')).alias('topics'))
)

# Left join: every validation paper_id is kept, including ids with no row in `all_data`.
extra_paper_data = val.select('paper_id').join(val_topics_by_paper, how='left', on='paper_id')
extra_paper_data.cache().count()

# `topics` is NULL for unmatched ids. F.size() on a NULL array yields -1 or NULL
# depending on spark.sql.legacy.sizeOfNull / ANSI mode, so the count is made
# explicit: a paper with no topics gets topic_len = 0 regardless of configuration.
final_extra_paper_data = extra_paper_data.select(
    'paper_id',
    F.year(F.col('publication_date')).alias('year'),
    F.month(F.col('publication_date')).alias('month'),
    F.when(F.col('topics').isNull(), F.lit(0))
     .otherwise(F.size(F.col('topics')))
     .alias('topic_len'),
)

# Write at most 5 parquet files, replacing any previous output at this path.
(
    final_extra_paper_data
    .coalesce(5)
    .write
    .mode('overwrite')
    .parquet(f"{base_save_path}val_extra_data")
)

#### Just Get Topic to Level Mapping

In [0]:
# Topic-name -> level lookup, read straight from the fields-of-study table.
# The query has its own name so it does not overwrite `journal_join_query`;
# otherwise re-running the `all_data` cell after this one would silently load
# this two-column table instead of the paper/journal/topic join.
levels_query = """
SELECT normalized_name as topic_name, level
FROM mag_advanced_fields_of_study
"""

# NOTE(review): `redshift_url` and `redshift_password` are not defined in any
# cell shown here -- confirm where they come from so Restart & Run All works.
levels = (
    spark.read
    .format("com.databricks.spark.redshift")
    .option("url", redshift_url)
    .option("user", "app_user")
    .option("password", redshift_password)
    .option("query", levels_query)
    .option("tempdir", base_save_path)
    .option("forward_spark_s3_credentials", True)
    .load()
)

In [0]:
# Number of rows in the topic/level lookup.
levels.count()

In [0]:
# Preview the first 10 (topic_name, level) rows.
levels.show(10)

In [0]:
# Persist the lookup as a single parquet file under `base_save_path`, replacing
# any previous output at this path.
levels.coalesce(1).write.mode('overwrite').parquet(f"{base_save_path}levels_data")

**TODO:** Get the topics for each level by paper ID (one column for L1 topics, one column for L2 topics, etc.).

## Error Analysis

Things to look into for error analysis
* Metrics over time (especially looking at post-training data date range)
* Different doc types
* Different levels of topics
* Journal vs no journal
* Doc type vs no doc type
* The two variables above (journal, doc type) missing vs. not missing
* Out of the target topics (~400K), which are the worst
* Out of the L1 topics, which ones are the worst/best
* Out of the L2 topics, which ones are the worst/best
* Number of topics (documents with 1-2 topics vs documents with over 8 labels)