In [2]:
import findspark
findspark.init()
import pyspark

In [3]:
sc = pyspark.SparkContext()

In [5]:
from pyspark.sql import SQLContext
sqlctx = SQLContext(sc)

In [8]:
df = sqlctx.read.csv('../../../datasets/iowa_liquor/Iowa_Liquor_sales_sample_10pct.csv', header=True)

In [9]:
df.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Store Number: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zip Code: string (nullable = true)
 |-- County Number: string (nullable = true)
 |-- County: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Category Name: string (nullable = true)
 |-- Vendor Number: string (nullable = true)
 |-- Item Number: string (nullable = true)
 |-- Item Description: string (nullable = true)
 |-- Bottle Volume (ml): string (nullable = true)
 |-- State Bottle Cost: string (nullable = true)
 |-- State Bottle Retail: string (nullable = true)
 |-- Bottles Sold: string (nullable = true)
 |-- Sale (Dollars): string (nullable = true)
 |-- Volume Sold (Liters): string (nullable = true)
 |-- Volume Sold (Gallons): string (nullable = true)



In [13]:
df.select("Store Number", "Date", "Bottles Sold", "Sale (Dollars)").show(5)

+------------+----------+------------+--------------+
|Store Number|      Date|Bottles Sold|Sale (Dollars)|
+------------+----------+------------+--------------+
|        3717|11/04/2015|          12|        $81.00|
|        2614|03/02/2016|           2|        $41.26|
|        2106|02/11/2016|          24|       $453.36|
|        2501|02/03/2016|           6|        $85.50|
|        3654|08/18/2015|          12|       $129.60|
+------------+----------+------------+--------------+
only showing top 5 rows



In [14]:
df = df.withColumn("Store Number", df["Store Number"].cast("integer"))

In [17]:
from pyspark.sql.types import StringType, IntegerType, DoubleType
from pyspark.sql.functions import udf, regexp_replace

df = df.withColumn("Store Number", df["Store Number"].cast("integer"))\
.withColumn("Sale (Dollars)",        regexp_replace("Sale (Dollars)", "\\$", "").cast("double")) \
.withColumn("Zip Code",              df["Zip Code"].cast("integer")) \
.withColumn("County Number",         df["County Number"].cast("integer")) \
.withColumn("Vendor Number",         df["Vendor Number"].cast("integer")) \
.withColumn("Item Number",           df["Item Number"].cast("integer")) \
.withColumn("Bottle Volume (ml)",    df["Bottle Volume (ml)"].cast("integer")) \
.withColumn("State Bottle Cost",     regexp_replace("State Bottle Cost", "\\$", "")) \
.withColumn("State Bottle Retail",   regexp_replace("State Bottle Retail", "\\$", "")) \
.withColumn("Bottles Sold",          df["Bottles Sold"].cast("integer")) \
.withColumn("Volume Sold (Liters)",  df["Volume Sold (Liters)"].cast("double")) \
.withColumn("Volume Sold (Gallons)", df["Volume Sold (Gallons)"].cast("double")) \


In [18]:
df.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Store Number: integer (nullable = true)
 |-- City: string (nullable = true)
 |-- Zip Code: integer (nullable = true)
 |-- County Number: integer (nullable = true)
 |-- County: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Category Name: string (nullable = true)
 |-- Vendor Number: integer (nullable = true)
 |-- Item Number: integer (nullable = true)
 |-- Item Description: string (nullable = true)
 |-- Bottle Volume (ml): integer (nullable = true)
 |-- State Bottle Cost: string (nullable = true)
 |-- State Bottle Retail: string (nullable = true)
 |-- Bottles Sold: integer (nullable = true)
 |-- Sale (Dollars): double (nullable = true)
 |-- Volume Sold (Liters): double (nullable = true)
 |-- Volume Sold (Gallons): double (nullable = true)



In [20]:
df.select("Store Number", "Sale (Dollars)").show(5)

+------------+--------------+
|Store Number|Sale (Dollars)|
+------------+--------------+
|        3717|          81.0|
|        2614|         41.26|
|        2106|        453.36|
|        2501|          85.5|
|        3654|         129.6|
+------------+--------------+
only showing top 5 rows



In [22]:
df.select(["Bottle Volume (ml)", "Bottles Sold", "Sale (Dollars)", "Volume Sold (Liters)"]).describe().show()

+-------+------------------+-----------------+------------------+--------------------+
|summary|Bottle Volume (ml)|     Bottles Sold|    Sale (Dollars)|Volume Sold (Liters)|
+-------+------------------+-----------------+------------------+--------------------+
|  count|            270955|           270955|            270955|              270955|
|   mean| 924.8303408315033|9.871284899706593| 128.9023747485706|   8.981351183775748|
| stddev|493.08848860663403|24.04091157393874|383.02736884240466|  28.913690130072464|
|    min|                50|                1|              1.34|                 0.1|
|    max|              6000|             2508|           36392.4|              2508.0|
+-------+------------------+-----------------+------------------+--------------------+



In [23]:
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD, LinearRegressionModel

In [42]:
features = ["Bottles Sold", "Sale (Dollars)", "Bottle Volume (ml)"]
response = "Volume Sold (Liters)"

X = df.rdd.map(lambda row: LabeledPoint(row[response], 
                                        [row[feature] for feature in features]))

In [43]:
X.take(5)

[LabeledPoint(9.0, [12.0,81.0,750.0]),
 LabeledPoint(1.5, [2.0,41.26,750.0]),
 LabeledPoint(24.0, [24.0,453.36,1000.0]),
 LabeledPoint(10.5, [6.0,85.5,1750.0]),
 LabeledPoint(21.0, [12.0,129.6,1750.0])]

In [44]:
trainingData, testData = X.randomSplit([0.7, 0.3])

In [47]:
LinearModel = LinearRegressionWithSGD.train(trainingData, iterations=100, step=0.000001)

In [48]:
zip(features, LinearModel.weights.array)

[('Bottles Sold', 0.0025768025681086881),
 ('Sale (Dollars)', 0.036311185894537704),
 ('Bottle Volume (ml)', 0.0055465188167585977)]

In [50]:
from pyspark.mllib.evaluation import RegressionMetrics
predObserRDD = testData.map(lambda row: (float(LinearModel.predict(row.features)), row.label)).cache()

In [53]:
reg_metrics = RegressionMetrics(predObserRDD)
print "R2", reg_metrics.r2
print "Explained Variance", reg_metrics.explainedVariance

R2 0.607561304991
Explained Variance 251.152338334


In [58]:
## Logistic Regression 

# Let's give it the schema 

from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import DoubleType, IntegerType, StringType

schema = StructType([
    StructField("PassengerId", IntegerType()),
    StructField("Survived",    IntegerType()),
    StructField("Pclass",      IntegerType()),
    StructField("Name",        StringType()),
    StructField("Sex",         StringType()),
    StructField("Age",         DoubleType()),
    StructField("SibSp",       IntegerType()),
    StructField("Parch",       IntegerType()),
    StructField("Fare",        DoubleType()),
    StructField("Embarked",    StringType()) 
])

In [71]:
df = sqlctx.read.csv('../../../datasets/titanic/titanic_clean.csv', header=True, schema=schema)

In [72]:
df.printSchema()
df.show(5)

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Embarked: string (nullable = true)

+-----------+--------+------+--------------------+------+----+-----+-----+-------+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|   Fare|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+-------+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|   7.25|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|71.2833|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|  7.925|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.

In [75]:
features = ["Pclass", "Age", "SibSp", "Parch"]
response = "Survived"

X = df.rdd.map( 
    lambda row: LabeledPoint(row[response], [row[feature] for feature in features])
)

# Split the data into training and test sets (30% held out for testing)
trainingData, testData = X.randomSplit([0.7, 0.3])

In [66]:
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.evaluation import BinaryClassificationMetrics

In [77]:
logistic = LogisticRegressionWithSGD.train(trainingData)

In [78]:
zip(features, logistic.weights.array)

[('Pclass', -0.99063305227512943),
 ('Age', 0.94853124126974875),
 ('SibSp', 0.21820941775786651),
 ('Parch', 0.83089895065672759)]

In [80]:
predRDD = testData.map(lambda row: (float(logistic.predict(row.features)), row.label)).cache()

In [84]:
metrics = BinaryClassificationMetrics(predRDD)
print "Area PR Curve:", metrics.areaUnderPR
print "Area ROC Curve:", metrics.areaUnderROC

Area PR Curve: 0.69528311601
Area ROC Curve: 0.501957241795


In [86]:
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils
rf_model =RandomForest.trainClassifier(trainingData, numClasses=2, numTrees=3, 
                                       categoricalFeaturesInfo={}, impurity='gini', maxDepth=4, maxBins=32)

In [90]:
predictions = rf_model.predict(testData.map(lambda row: row.features))
labeslsandpreds = testData.map(lambda row: row.label).zip(predictions)

In [91]:
print rf_model.toDebugString()

TreeEnsembleModel classifier with 3 trees

  Tree 0:
    If (feature 0 <= 2.0)
     If (feature 3 <= 0.0)
      If (feature 2 <= 0.0)
       If (feature 0 <= 1.0)
        Predict: 1.0
       Else (feature 0 > 1.0)
        Predict: 0.0
      Else (feature 2 > 0.0)
       If (feature 2 <= 2.0)
        Predict: 1.0
       Else (feature 2 > 2.0)
        Predict: 1.0
     Else (feature 3 > 0.0)
      If (feature 2 <= 1.0)
       If (feature 3 <= 1.0)
        Predict: 1.0
       Else (feature 3 > 1.0)
        Predict: 1.0
      Else (feature 2 > 1.0)
       Predict: 1.0
    Else (feature 0 > 2.0)
     If (feature 1 <= 17.0)
      If (feature 2 <= 1.0)
       If (feature 2 <= 0.0)
        Predict: 1.0
       Else (feature 2 > 0.0)
        Predict: 0.0
      Else (feature 2 > 1.0)
       If (feature 2 <= 4.0)
        Predict: 0.0
       Else (feature 2 > 4.0)
        Predict: 0.0
     Else (feature 1 > 17.0)
      If (feature 2 <= 2.0)
       If (feature 3 <= 1.0)
        Predict: 0.0
       E