In [1]:
import pandas as pd 
import json
import matplotlib.pyplot as plt 

#Graph network imports
from graphframes import *
from pyspark import *
from pyspark.sql import *
import numpy as np
from pyspark.ml.linalg import *
from pyspark.ml.linalg import *
from pyspark.sql.types import * 
from pyspark.sql.functions import *

from pyspark.sql.functions import udf #user defined function
from pyspark.sql.types import * #Import types == IntegerType, StringType etc.

import nltk

In [12]:
#import statements
from pyspark.sql import SparkSession

# Resource settings are handed to the builder so they are in place when the
# session is created. The original cell mutated the private
# spark.sparkContext._conf after getOrCreate() had already returned, which does
# not reconfigure the running context.
APP_NAME = 'Final_project_read_write'
spark_settings = [
    ('spark.executor.memory', '15g'),
    ('spark.app.name', APP_NAME),
    ('spark.executor.cores', '10'),
    ('spark.cores.max', '10'),
    ('spark.driver.memory', '20g'),
]

#create Spark session
builder = SparkSession.builder.enableHiveSupport().appName(APP_NAME)
for setting_key, setting_value in spark_settings:
    builder = builder.config(setting_key, setting_value)
# NOTE(review): if a session already exists in this kernel, getOrCreate()
# returns it and start-up settings such as memory are presumably not
# re-applied -- restart the kernel before changing them.
spark = builder.getOrCreate()

# `conf` is still exposed because the original cell defined it.
conf = spark.sparkContext.getConf()

#print spark configuration settings
#spark.sparkContext.getConf().getAll()

In [13]:
# Load the prepared modeling table from the relative 'modeling_data' parquet path.
modeling_data = spark.read.parquet('modeling_data')

In [4]:
# Show column names, types and nullability of the loaded table.
modeling_data.printSchema()

root
 |-- id: string (nullable = true)
 |-- year: long (nullable = true)
 |-- inCitations_count: integer (nullable = true)
 |-- outCitations_count: integer (nullable = true)
 |-- abstract_wcount: integer (nullable = true)
 |-- title_wcount: integer (nullable = true)
 |-- abstract_tfidf: vector (nullable = true)
 |-- title_tfidf: vector (nullable = true)
 |-- SJR: string (nullable = true)
 |-- author_count: integer (nullable = true)
 |-- fieldsOfStudyVec: vector (nullable = true)
 |-- sourcesVec: vector (nullable = true)



In [5]:
# Rows lost when every row containing a null is dropped, i.e. the number of
# rows with at least one missing value.
modeling_data.count() - modeling_data.na.drop().count()

192481

In [6]:
import pyspark.sql.functions as f
from functools import reduce

# One isNull() test per column, OR-ed together into a single row filter.
null_checks = (f.col(column_name).isNull() for column_name in modeling_data.columns)
any_column_null = reduce(lambda left, right: left | right, null_checks)

# Preview a few of the rows that have a null in any column.
modeling_data.where(any_column_null).show(5)

+--------------------+----+-----------------+------------------+---------------+------------+--------------+--------------------+---+------------+-----------------+-------------+
|                  id|year|inCitations_count|outCitations_count|abstract_wcount|title_wcount|abstract_tfidf|         title_tfidf|SJR|author_count| fieldsOfStudyVec|   sourcesVec|
+--------------------+----+-----------------+------------------+---------------+------------+--------------+--------------------+---+------------+-----------------+-------------+
|57d4efa8939189a64...|null|                0|                 0|              0|          12|(262144,[],[])|(262144,[2196,954...|124|           1|(2179,[12],[1.0])|(4,[0],[1.0])|
|5c1c3f390fd575dd1...|null|                0|                 0|              0|          12|(262144,[],[])|(262144,[13957,21...|124|           1| (2179,[9],[1.0])|(4,[0],[1.0])|
|c8351f487ab3c8caf...|null|                0|                 0|              0|           2|(262144,[],[

In [7]:
from pyspark.sql.functions import isnan, when, count, col

# count() ignores nulls and when() without otherwise() yields null for
# non-matching rows, so each aggregate counts the null cells of one column.
null_count_exprs = [
    count(when(col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in modeling_data.columns
]

# One row of counts, transposed so each column name becomes a row label.
modeling_data.select(null_count_exprs).toPandas().T

Unnamed: 0,0
id,0
year,192481
inCitations_count,0
outCitations_count,0
abstract_wcount,0
title_wcount,0
abstract_tfidf,0
title_tfidf,0
SJR,0
author_count,0


In [15]:
# Ask summary() for the median row only. The original read row index 5 of the
# default summary() table, which silently picks a different statistic if the
# set or order of default statistics ever differs.
year_stats = modeling_data.select('year').summary('50%')
# The original cell applied int() directly to this value, so it is string-like;
# going through float() also accepts a '2011.0'-style rendering.
median_year = int(float(year_stats.first()['year']))
print('the median year is = ', median_year)

#Impute median year
modeling_data = modeling_data.na.fill({'year': median_year})

#Convert to integer type column
modeling_data = modeling_data.withColumn("year", modeling_data["year"].cast(IntegerType()))
# NOTE(review): SJR is read as a string column; values that are not plain
# integers presumably become null after this cast -- confirm that is intended.
modeling_data = modeling_data.withColumn("SJR", modeling_data["SJR"].cast(IntegerType()))

In [16]:
from pyspark.ml.feature import VectorAssembler

# Scalar and vector columns that are concatenated, in this order, into the
# single 'features' vector consumed by the models.
feature_columns = [
    'year',
    'outCitations_count',
    'abstract_wcount',
    'title_wcount',
    'abstract_tfidf',
    'title_tfidf',
    'SJR',
    'author_count',
    'fieldsOfStudyVec',
    'sourcesVec',
]

# handleInvalid='skip' is passed so that rows with invalid inputs are filtered
# out instead of raising.
assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol='features',
    handleInvalid='skip',
)

modeling_data = assembler.transform(modeling_data)

In [17]:
# Keep only the identifier, the label and the assembled feature vector, then
# split 80/20 with a fixed seed so the partition is repeatable.
model_input = modeling_data.select('id', 'inCitations_count', 'features')
train_df, test_df = model_input.randomSplit([0.8, 0.2], seed=42)

# Peek at a single training row.
train_df.show(1)

+--------------------+-----------------+--------------------+
|                  id|inCitations_count|            features|
+--------------------+-----------------+--------------------+
|000011af6d4e69b95...|                0|(526477,[0,3,2949...|
+--------------------+-----------------+--------------------+
only showing top 1 row



In [18]:
# Row counts of each partition (each count() triggers a Spark job).
train_rows = train_df.count()
test_rows = test_df.count()
print('Train Length = ', train_rows)
print('Test Length = ', test_rows)

Train Length =  7470695
Test Length =  1864404


In [19]:
%%time
from pyspark.ml.regression import LinearRegression

#Elastic Net
lr = LinearRegression(featuresCol = 'features', labelCol='inCitations_count', regParam=0.3, maxIter=10)
lrm = lr.fit(train_df)

#coefficients
print("Coefficients: " + str(lrm.coefficients))
print("Intercept: " + str(lrm.intercept))

#model summary
print("RMSE: %f" % lrm.summary.rootMeanSquaredError)
print("r2: %f" % lrm.summary.r2)

#p-values are not provided in this model for the solver being used
#print("pValues: " + str(lrm.summary.pValues))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [25]:
# Training-set fit metrics taken from the fitted model's summary.
training_summary = lrm.summary
print("RMSE: %f" % training_summary.rootMeanSquaredError)
print("r2: %f" % training_summary.r2)

RMSE: 30.617903
r2: 0.067576


In [20]:
#make predictions
# Apply the fitted linear model (lrm) to the held-out test split.
predictions = lrm.transform(test_df)

In [21]:
from itertools import chain

# The features column carries ML attribute metadata grouped under
# "ml_attr" -> "attrs"; flatten every group into (index, name) pairs.
feature_field = predictions.schema[lrm.summary.featuresCol]
attr_groups = feature_field.metadata["ml_attr"]["attrs"]
attrs = [
    (entry["idx"], entry["name"])
    for entry in chain.from_iterable(attr_groups.values())
]
# Order by feature index (ties, if any, by name).
attrs.sort()

In [None]:
# attrs holds one (index, name) tuple per feature; show only the first few
# instead of rendering the entire list as cell output.
attrs[:10]

In [23]:
# Number of (index, name) feature attributes collected from the metadata.
len(attrs)

526477

In [24]:
#[(name, lrm.summary.pValues[idx]) for idx, name in attrs]

from pyspark.ml.evaluation import RegressionEvaluator

# Named `evaluator` rather than `eval` so the Python builtin eval() is not
# shadowed for the rest of the kernel session.
evaluator = RegressionEvaluator(labelCol="inCitations_count", predictionCol="prediction", metricName="rmse")

# Root Mean Square Error
rmse = evaluator.evaluate(predictions)
print("RMSE: %.3f" % rmse)

# Mean Square Error
mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"})
print("MSE: %.3f" % mse)

# Mean Absolute Error
mae = evaluator.evaluate(predictions, {evaluator.metricName: "mae"})
print("MAE: %.3f" % mae)

# r2 - coefficient of determination
r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})
print("r2: %.3f" %r2)

RMSE: 25.949
MSE: 673.376
MAE: 5.579
r2: -0.013


In [None]:
%%time
from pyspark.ml.regression import RandomForestRegressor

# Set parameters for the Random Forest.
rfr = RandomForestRegressor(maxDepth=5, numTrees=20, labelCol="inCitations_count", predictionCol="prediction")

# Fit the model to the data.
rfrm = rfr.fit(train_df)

In [None]:
# Given a dataset, predict each point's label, and show the results.
# The fitted random-forest model is `rfrm`; the original referenced the
# undefined name `rfcm`, which raises NameError.
# Note: this rebinds `predictions`, replacing the linear-regression predictions.
predictions = rfrm.transform(test_df)

Exercise: Build a feature importance selector
Reference:
https://www.timlrx.com/2018/06/19/feature-selection-using-feature-importance-score-creating-a-pyspark-estimator/