# Regression and Spark (42 points)

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

# Regression (16 points)
In this regression task, we will use the following dataset. Please take a look at the description output.

In [2]:
import sklearn.datasets as mldata
data_dict = mldata.load_boston()
print(data_dict['DESCR']) # output 1 point

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

Coding Question 1: Similarly as our classification task in workshop4, you are required to use the data to fit a linear regression model, conduct similar prediction, and show the performance (MSE). (15 points)

In [3]:
bostonFeatures = pd.DataFrame(data_dict['data'], columns=data_dict['feature_names'])

bostonFeatures.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [4]:
# bostonFeatures['CHAS'] = bostonFeatures['CHAS'].astype(bool)
# bostonFeatures['CHAS'] = bostonFeatures['CHAS'].astype(int)
# bostonFeatures['RAD'] = bostonFeatures['RAD'].astype(int)
# bostonFeatures['TAX'] = bostonFeatures['TAX'].astype(int)
bostonFeatures.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [5]:
bostonPredict = pd.DataFrame(data_dict['target'], columns=['MDEV'])
bostonPredict.head()

Unnamed: 0,MDEV
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2


In [6]:
boston = bostonFeatures.copy()
boston['MDEV'] = bostonPredict['MDEV'].copy()
boston.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MDEV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [7]:
boston.dtypes

CRIM       float64
ZN         float64
INDUS      float64
CHAS       float64
NOX        float64
RM         float64
AGE        float64
DIS        float64
RAD        float64
TAX        float64
PTRATIO    float64
B          float64
LSTAT      float64
MDEV       float64
dtype: object

In [8]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(boston, test_size=0.25, random_state=100)
x_train = train.drop('MDEV', axis=1).values
y_train = train['MDEV'].values
x_test = test.drop('MDEV', axis=1).values
y_test = test['MDEV'].values

print("Training Data Size: ", len(train))
print("Test Data Size: ", len(test))

Training Data Size:  379
Test Data Size:  127


In [9]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [10]:
# Show the average train accuracy of the model

from sklearn.metrics import mean_squared_error, r2_score

y_train_pred = model.predict(x_train)
print(mean_squared_error(y_train, y_train_pred))
print(r2_score(y_train, y_train_pred))

20.506781668028975
0.7421573768304615


In [11]:
# Show the average test accuracy of the model

y_test_pred = model.predict(x_test)
print(mean_squared_error(y_test, y_test_pred))
print(r2_score(y_test, y_test_pred))

27.17314417304355
0.7246154314616744


# Spark (25  points)

In workshop 4, we have learned how to write a word count task in Spark using both notebook and cluster node.

Now you will write your first Spark job to accomplish the following task (you are required to finish the task in both ways, that is, jupyter notebook and cluster node, similarly as workshop 4):

• Outputs the number of words that start with each letter (i.e., 52 letters as A, B, C, ... Z, and a, b, c, ..., z). This means that for every letter we want to count the total number of (non-unique) words that start with that letter. In your implementation, you need to ignore all non-alphabetic characters.

• Run your program over the same input data pg100.txt as in workshop 4.

What to hand-in: 
1. Jupyter notebook version as workshop 4 in the following. (15 points) 
2. Submit a zip file contains: 1) The output file containing results we want (e.g., print out or txt output by the application) [3 points]; 2) Your screen shot that finishes running your application [1 point]; 3) Your source code (.py) [5 points]; 4) Include your full name in the source code [1 point]. (10 points in total)

In [12]:
import os
import findspark
os.environ["PYSPARK_PYTHON"] = "python3"
findspark.init("spark-2.4.2-bin-hadoop2.7/",)

In [13]:
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
        .master("local[*]")
        .appName("LectureExample")
        .getOrCreate()
)
sc = spark.sparkContext

In [14]:
import re #regular expression used to split lines of text into words

lines = sc.textFile("./pg100.txt") # download pg100.txt from canvas in fold of Spark

#Split the lines into words (including all alphanumeric characters)
words = lines.flatMap(lambda line: re.split(r'[^\w]+', line))

#Mapper
pairs = words.map(lambda word: (word, 1))

#Reducer
word_counts = pairs.reduceByKey(lambda n1, n2: n1 + n2)

char_counts = word_counts.flatMap(lambda each: each[0]).filter(lambda char: char.isalpha()).map(lambda char: char).map(lambda c: (c, 1)).reduceByKey(lambda v1, v2: v1 + v2)

char_counts_DF = char_counts.toDF().toPandas()
char_counts_DF

Unnamed: 0,_1,_2
0,h,5074
1,r,14847
2,j,283
3,c,6900
4,S,1421
5,s,16157
6,p,5278
7,i,14645
8,y,2953
9,g,5403
