In [9]:
import csv
import os
import sys
# Spark imports
from pyspark.rdd import RDD
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql.functions import desc
# Dask imports
import dask.bag as db
import dask.dataframe as df  # you can use Dask bags or dataframes
from csv import reader
import pandas as pd

In [10]:
# Initialize a spark session.
def init_spark():
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    return spark

In [11]:
spark = init_spark()

# Extract the data and build the model

In [12]:
filename_train = "./dataset/train.csv"
filename_test = "./dataset/valid.csv"

### Using pandas

In [21]:
df_2 = pd.read_csv(filename_train)
len(df_2)

45000

In [25]:
df_2.head()

Unnamed: 0,Id,Title,Body,Tags,CreationDate,Y
0,34552656,Java: Repeat Task Every Random Seconds,<p>I'm already familiar with repeating tasks e...,<java><repeat>,2016-01-01 00:21:59,LQ_CLOSE
1,34553034,Why are Java Optionals immutable?,<p>I'd like to understand why Java 8 Optionals...,<java><optional>,2016-01-01 02:03:20,HQ
2,34553174,Text Overlay Image with Darkened Opacity React...,<p>I am attempting to overlay a title over an ...,<javascript><image><overlay><react-native><opa...,2016-01-01 02:48:24,HQ
3,34553318,Why ternary operator in swift is so picky?,"<p>The question is very simple, but I just cou...",<swift><operators><whitespace><ternary-operato...,2016-01-01 03:30:17,HQ
4,34553755,hide/show fab with scale animation,<p>I'm using custom floatingactionmenu. I need...,<android><material-design><floating-action-but...,2016-01-01 05:21:48,HQ


### Using spark dataframe

In [27]:
df_train = spark.read.csv(filename_train,  sep=',', header=True)

In [28]:
df_train.count()

869081

In [11]:
df_val = spark.read.csv(filename_test, header=True)

In [12]:
df_val.count()

287339

In [25]:
df_train_rdd = spark.sparkContext.textFile(filename)

In [15]:
df_train_rdd.map(lambda l: l.split(',')).take(5)

NameError: name 'df_train_rdd' is not defined

In [19]:
df_train.show(4)

+--------------------+--------------------+--------------------+----+------------+----+
|                  Id|               Title|                Body|Tags|CreationDate|   Y|
+--------------------+--------------------+--------------------+----+------------+----+
|            34552656|Java: Repeat Task...|"<p>I'm already f...|null|        null|null|
|,<java><repeat>,2...|                null|                null|null|        null|null|
|            34553034|Why are Java Opti...|<p>I'd like to un...|null|        null|null|
|,<java><optional>...|                null|                null|null|        null|null|
+--------------------+--------------------+--------------------+----+------------+----+
only showing top 4 rows



In [29]:
df_rdd = spark.read.csv(filename_train, header=True)

In [30]:
df_rdd.take(2)

[Row(Id='34552656', Title='Java: Repeat Task Every Random Seconds', Body='"<p>I\'m already familiar with repeating tasks every n seconds by using Java.util.Timer and Java.util.TimerTask. But lets say I want to print ""Hello World"" to the console every random seconds from 1-5. Unfortunately I\'m in a bit of a rush and don\'t have any code to show so far. Any help would be apriciated.  </p>', Tags=None, CreationDate=None, Y=None),
 Row(Id=',<java><repeat>,2016-01-01 00:21:59,LQ_CLOSE', Title=None, Body=None, Tags=None, CreationDate=None, Y=None)]

In [32]:
fixed_df_format = spark.read.csv(filename_train, header=True, multiLine=True, inferSchema=True, escape='"', quote='"')

In [33]:
fixed_df_format.show(4)


+--------+--------------------+--------------------+--------------------+-------------------+--------+
|      Id|               Title|                Body|                Tags|       CreationDate|       Y|
+--------+--------------------+--------------------+--------------------+-------------------+--------+
|34552656|Java: Repeat Task...|<p>I'm already fa...|      <java><repeat>|2016-01-01 00:21:59|LQ_CLOSE|
|34553034|Why are Java Opti...|<p>I'd like to un...|    <java><optional>|2016-01-01 02:03:20|      HQ|
|34553174|Text Overlay Imag...|<p>I am attemptin...|<javascript><imag...|2016-01-01 02:48:24|      HQ|
|34553318|Why ternary opera...|<p>The question i...|<swift><operators...|2016-01-01 03:30:17|      HQ|
+--------+--------------------+--------------------+--------------------+-------------------+--------+
only showing top 4 rows



In [34]:
fixed_df_format.count()

45000