This notebook reads the processed CSV file and converts it into Parquet files to reduce storage and speed up subsequent analysis.

In [1]:
#Set up environment

%%configure -f
{
    "conf": {
        "spark.pyspark.python": "python3",
        "spark.pyspark.virtualenv.enabled": "true",
        "spark.pyspark.virtualenv.type":"native",
        "spark.pyspark.virtualenv.bin.path":"/usr/bin/virtualenv"
    }
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
0,application_1715895756248_0002,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

An error was encountered:
unknown magic command '%configure'
UnknownMagic: unknown magic command '%configure'



In [4]:
# Load the processed submissions CSV from S3; the first row holds the column names.
# No schema is supplied, so every column is read as a string.
data_csv = (
    spark.read
    .option("header", True)
    .csv("s3://finalproject-nat-s3/processed_submissions.csv")
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
# Summarise the CSV-backed frame: column count, row count, then the schema.
csv_column_total = len(data_csv.columns)
print('Total Columns: %d' % csv_column_total)

# count() triggers a full pass over the CSV data.
csv_row_total = data_csv.count()
print('Total Rows: %d' % csv_row_total)

data_csv.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Total Columns: 7
Total Rows: 4357389
root
 |-- id: string (nullable = true)
 |-- created: string (nullable = true)
 |-- author: string (nullable = true)
 |-- score: string (nullable = true)
 |-- title: string (nullable = true)
 |-- selftext: string (nullable = true)
 |-- num_comments: string (nullable = true)

In [7]:
# Eyeball the first 100 rows to confirm the CSV fields landed in the right columns.
data_csv.show(n=100)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----+----------------+--------------------+-----+--------------------+--------------------+------------+
|   id|         created|              author|score|               title|            selftext|num_comments|
+-----+----------------+--------------------+-----+--------------------+--------------------+------------+
|8shht|2009-06-14 18:22|         u/[deleted]|    9|Introducing the R...|             deleted|           1|
|8si6m|2009-06-14 20:42|         u/[deleted]|   18|Potential Reddit ...|             deleted|          18|
|8sur3|2009-06-16 00:19|         u/[deleted]|    7|How do I get over...|             deleted|           8|
|8sw2f|2009-06-16 03:30|       u/notmyrealsn|    3|Long distance rel...|                null|           9|
|8swor|2009-06-16 05:01|               u/cr3|   10|Name your favouri...|                null|          20|
|8sx2g|2009-06-16 05:58|         u/[deleted]|    2|Dear Reddit need ...|             deleted|           7|
|8t0lu|2009-06-16 12:13|       u/roge

In [8]:
# The CSV preview looked fine, so persist the data as Parquet.
# Repartitioning to 2 partitions yields (at most) two output part files.
df_repartitioned = data_csv.repartition(numPartitions=2)

# Overwrite mode replaces any previous export at the target prefix.
(
    df_repartitioned.write
    .mode('overwrite')
    .parquet("s3://finalproject-nat-s3/submissions_parquet")
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [10]:
# Read back the Parquet part files that were just written to S3.
data_parquet = (
    spark.read
    .format('parquet')
    .load('s3://finalproject-nat-s3/submissions_parquet/*.parquet')
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [11]:
# Summarise the Parquet-backed frame: column count, row count, then the schema.
# These figures should match the ones reported for the CSV frame.
parquet_column_total = len(data_parquet.columns)
print('Total Columns: %d' % parquet_column_total)

parquet_row_total = data_parquet.count()
print('Total Rows: %d' % parquet_row_total)

data_parquet.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Total Columns: 7
Total Rows: 4357389
root
 |-- id: string (nullable = true)
 |-- created: string (nullable = true)
 |-- author: string (nullable = true)
 |-- score: string (nullable = true)
 |-- title: string (nullable = true)
 |-- selftext: string (nullable = true)
 |-- num_comments: string (nullable = true)

In [12]:
# Eyeball the first 100 rows of the Parquet copy to confirm the round trip kept the columns intact.
# Row order may differ from the CSV preview because the data was repartitioned before writing.
data_parquet.show(n=100)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------+----------------+--------------------+-----+--------------------+--------------------+------------+
|    id|         created|              author|score|               title|            selftext|num_comments|
+------+----------------+--------------------+-----+--------------------+--------------------+------------+
|1fceq2|2013-05-30 11:28|           u/rlh1271|   59|24m My girlfriend...|Weve been dating ...|         124|
| cw6d6|2010-08-01 16:23|u/longdistance_throw|    4|What are some goo...|Hi friends youve ...|          12|
|1j2am7|2013-07-25 18:54|         u/[deleted]|    2|24m My wife 23f C...|Hi Reddit Been co...|           7|
|16k90k|2013-01-14 12:04|        u/RoboDinner|    1|Need gift advice ...|Ive been seeing a...|          13|
| yvqn2|2012-08-26 20:45|   u/MaddenInGeneral|    1|21m Help dealing ...|I need some help ...|           6|
| bn3ib|2010-04-06 07:21|         u/[deleted]|    2|Unhealthy relatio...|Note sorry but to...|          15|
|14a0go|2012-12-04 14:00|   

In [13]:
# Sources
# Spark CSV data source documentation: https://spark.apache.org/docs/latest/sql-data-sources-csv.html


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…