In [2]:
import numpy as np
import pyspark.sql.functions as F
from pyspark import SparkConf
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import (
    DateType,
    DoubleType,
    FloatType,
    IntegerType,
    LongType,
    StringType,
    StructField,
    StructType,
)

In [3]:
# Spark settings for this notebook's session.
# "spark.master" is deliberately NOT set to "yarn": nothing worked with it
# enabled, so the notebook does not run on the cluster.
conf = (
    SparkConf()
    .set("spark.task.maxFailures", "10")
    .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .set("spark.sql.execution.arrow.enabled", "true")
    .set("spark.shuffle.service.enabled", "true")
    .set("spark.dynamicAllocation.enabled", "true")
)

# Reuse an already-running session if one exists, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("FNMA Spark -  Notebook")
    .config(conf=conf)
    .getOrCreate()
)

In [4]:
# Only needed when "spark.master" is set to "yarn" (currently disabled)
# import os
# os.environ["HADOOP_USER_NAME"] = "hdfs"

In [5]:
# Columns of the acquisition file as [column name, type] pairs.
# `str` marks text columns; the other entries are NumPy dtypes built from
# the dtype name given in the spec below.
schema_acq = [
    [column, kind if kind is str else np.dtype(kind)]
    for column, kind in (
        ('LoanID', str),
        ('Channel', str),
        ('SellerName', str),
        ('OrInterestRate', 'float64'),
        ('OrUnpaidPrinc', 'int64'),
        ('OrLoanTerm', 'int64'),
        ('OrDate', str),
        ('FirstPayment', str),
        ('OrLTV', 'float64'),
        ('OrCLTV', 'float64'),
        ('NumBorrow', 'float64'),
        ('DTIRat', 'float64'),
        ('CreditScore', 'float64'),
        ('FTHomeBuyer', str),
        ('LoanPurpose', str),
        ('PropertyType', str),
        ('NumUnits', 'int64'),
        ('OccStatus', str),
        ('PropertyState', str),
        ('Zip', 'int64'),
        ('MortInsPerc', 'float64'),
        ('ProductType', str),
        ('CoCreditScore', 'float64'),
        ('MortInsType', 'float64'),
        ('RelMortInd', str),
    )
]

# Columns of the performance file as [column name, type] pairs.
# `str` marks text columns; the other entries are NumPy dtypes built from
# the dtype name given in the spec below.
schema_per = [
    [column, kind if kind is str else np.dtype(kind)]
    for column, kind in (
        ('LoanID', str),
        ('ReportingDate', str),
        ('Servicer', str),
        ('CurrInterestRate', 'float64'),
        ('CAUPB', 'float64'),
        ('LoanAge', 'int64'),
        ('MonthsToMaturity', 'float64'),
        ('AdMonthsToMaturity', 'float64'),
        ('MaturityDate', str),
        ('MSA', 'int64'),
        ('CurDelStatus', str),
        ('ModFlag', str),
        ('ZeroBalCode', 'float64'),
        ('ZeroBalEffDate', str),
        ('LastInstallDate', str),
        ('ForeclosureDate', str),
        ('DispositionDate', str),
        ('ForeclosureCost', 'float64'),
        ('RepairCost', 'float64'),
        ('AssetRecCost', 'float64'),
        ('MiscCostsPF', 'float64'),
        ('ATFHP', 'float64'),
        ('NetSaleProceeds', 'float64'),
        ('CreditEnhProceeds', 'float64'),
        ('RPMWP', 'float64'),
        ('OtherForePro', 'float64'),
        ('NonInterestUPB', 'float64'),
        ('PricipleForgiven', 'float64'),
        ('RMWPF', str),
        ('FPWA', 'float64'),
        ('ServicingIndicator', str),
    )
]

In [8]:
# Maps the type markers used in schema_acq / schema_per to Spark SQL types.
# The 64-bit NumPy dtypes map to Spark's 64-bit types (DoubleType, LongType)
# so no precision or range is lost: FloatType and IntegerType are only
# 32 bits wide, and single precision rounds large dollar amounts.
schemap = {
    str: StringType(),
    np.dtype('float64'): DoubleType(),
    np.dtype('int64'): LongType(),
}

In [None]:
%%time

# Turn the [name, type] pairs into Spark schemas; every column is nullable.
schema_acq_spark = StructType([StructField(k, schemap[v], True) for k, v in schema_acq])
schema_per_spark = StructType([StructField(k, schemap[v], True) for k, v in schema_per])

# Both files are read as pipe-separated CSV without a header row, so the
# explicit schemas above supply the column names and types.
acq = spark.read.load("/data/FNMA/Acquisition_2012Q4.txt", format="csv", header="false",
                      sep='|', schema=schema_acq_spark)

per = spark.read.load("/data/FNMA/Performance_2012Q4.txt", format="csv", header="false",
                      sep='|', schema=schema_per_spark)

# Replace the string ReportingDate with a parsed date column.  The
# add/drop/rename sequence leaves the parsed column last in the frame.
per = (
    per.withColumn('date', F.to_date(per.ReportingDate, 'MM/dd/yyyy'))
    .drop('ReportingDate')
    .withColumnRenamed('date', 'ReportingDate')
)

# Keep only the most recent report per loan.  orderBy(...) followed by
# dropDuplicates(["LoanID"]) does not guarantee WHICH duplicate is kept, so
# rank the rows inside each LoanID partition and keep rank 1 instead.
latest_first = Window.partitionBy("LoanID").orderBy(F.desc("ReportingDate"))
per = (
    per.withColumn("_report_rank", F.row_number().over(latest_first))
    .filter(F.col("_report_rank") == 1)
    .drop("_report_rank")
)

# Full outer join so loans present in only one of the two files are kept.
# persist() caches the result once the count below has materialised it.
df = acq.join(per, 'LoanID', 'outer').persist()

print(df.count())

In [None]:
# Number of columns in the joined frame (the join key LoanID appears once).
len(df.schema.fields)