#### `Return the number of distinct words`

In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F


In [3]:
# Create (or reuse) the Spark session that every later cell relies on.
spark = (
    SparkSession.builder
    .appName("Counting word occurences from a book.")
    .getOrCreate()
)

# Only surface warnings and errors from Spark's own logging.
spark.sparkContext.setLogLevel("WARN")

In [22]:
# Number of distinct words in the book.
# To process several text files at once, replace `1342-0` with `*` in the path.
results = (
    spark.read.text("./DataAnalysisPythonPySpark/data/data/gutenberg_books/1342-0.txt")
    # Split each line on single spaces, then emit one row per token.
    .select(F.split("value", " ").alias("line"))
    .select(F.explode("line").alias("word"))
    # Normalise case, then keep the first match of [a-z']* in each token.
    .select(F.lower("word").alias("word"))
    .select(F.regexp_extract("word", "[a-z']*", 0).alias("word"))
    # Tokens whose match is empty carry no word and are discarded.
    .filter(F.col("word") != "")
    .distinct()
    .count()
)
results
# results.orderBy("count", ascending=False).show(10)
# results.coalesce(1).write.csv("./results_single_partition.csv")

6595

##### ```Getting the words that appear only once.```

In [36]:
# Count how many times each word occurs in the book.
# If you need to read multiple text files, replace `1342-0` by `*`.
results = (
    spark.read.text("./DataAnalysisPythonPySpark/data/data/gutenberg_books/1342-0.txt")
    # One array of space-separated tokens per line, then one row per token.
    .select(F.split(F.col("value"), " ").alias("line"))
    .select(F.explode(F.col("line")).alias("word"))
    .select(F.lower(F.col("word")).alias("word"))
    # Keep the first match of [a-z']*; it may be empty and is filtered out below.
    .select(F.regexp_extract(F.col("word"), "[a-z']*", 0).alias("word"))
    .where(F.col("word") != "")
    .groupby(F.col("word"))
    .count()
)
# `results` deliberately keeps every word with its frequency. To list only the
# words that occur once, filter a copy, e.g.
#   results.where(F.col("count") == 1).orderBy("word", ascending=True)

# show() prints the table itself and returns None, so it must not be wrapped in
# print(): doing so emits a stray "None" line after the table.
results.show(10)
print(results.count())


+---------+-----+
|     word|count|
+---------+-----+
|   online|    4|
|     some|  203|
|    still|   72|
|      few|   72|
|     hope|  122|
|    those|   60|
| cautious|    4|
|   lady's|    8|
|imitation|    1|
|      art|    3|
+---------+-----+
only showing top 10 rows

None
6595


In [40]:
# Total word occurrences grouped by the first letter of the word, top 5 letters.
(
    results
    .withColumn("first_letter", F.substring(F.col("word"), 1, 1))
    .groupby(F.col("first_letter"))
    # `count` is the only numeric column, so naming it is equivalent to a bare sum().
    .sum("count")
    # The aggregate column is literally named "sum(count)"; use that exact
    # spelling so the sort does not rely on case-insensitive column resolution.
    .orderBy("sum(count)", ascending=False)
    .show(5)
)


+------------+----------+
|first_letter|sum(count)|
+------------+----------+
|           t|     16101|
|           a|     13684|
|           h|     10419|
|           w|      9091|
|           s|      8791|
+------------+----------+
only showing top 5 rows



In [41]:
# Total word occurrences split by whether the word starts with a vowel.
(
    results
    .withColumn(
        "First_letter_Vowel",
        F.substring("word", 1, 1).isin("a", "e", "i", "o", "u"),
    )
    .groupby("First_letter_Vowel")
    .sum()
    .show()
)

+------------------+----------+
|First_letter_Vowel|sum(count)|
+------------------+----------+
|              true|     33522|
|             false|     88653|
+------------------+----------+



In [11]:
# Load the pipe-delimited broadcast-log sample, letting Spark infer column types.
logs = spark.read.options(
    sep="|",
    header=True,
    inferSchema=True,
    timestampFormat="yyyy-MM-dd",
).csv("./DataAnalysisPythonPySpark/data/data/broadcast_logs/BroadcastLogs_2018_Q3_M8_sample.csv")

In [None]:
# Preview the first five records, one field per line (vertical layout).
logs.show(n=5, vertical=True)

In [9]:
# Read the sample CSV; `$` is the quote character, so commas inside $...$ stay in one field.
sample = spark.read.options(
    header=True,
    inferSchema=True,
    quote="$",
).csv("./sample.csv")

In [62]:
# printSchema() and show() write their output directly and return None, so
# wrapping them in print() only adds a stray "None" line after each.
sample.printSchema()
sample.show()

root
 |-- Item: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: double (nullable = true)

None
+---------------+--------+-----+
|           Item|Quantity|Price|
+---------------+--------+-----+
|Banana, organic|       1| 0.99|
|           Pear|       7| 1.24|
|Cake, chocolate|       1| 14.5|
+---------------+--------+-----+

None


In [13]:
# Preview three identifying columns without truncating cell contents.
# show() prints the table and returns None, so it is not wrapped in print().
logs.select('BroadcastLogID', 'LogServiceID', 'LogDate').show(5, truncate=False)

+--------------+------------+-------------------+
|BroadcastLogID|LogServiceID|LogDate            |
+--------------+------------+-------------------+
|1196192316    |3157        |2018-08-01 00:00:00|
|1196192317    |3157        |2018-08-01 00:00:00|
|1196192318    |3157        |2018-08-01 00:00:00|
|1196192319    |3157        |2018-08-01 00:00:00|
|1196192320    |3157        |2018-08-01 00:00:00|
+--------------+------------+-------------------+
only showing top 5 rows

None


In [14]:
# List the column names of the `logs` DataFrame.
logs.columns

['BroadcastLogID',
 'LogServiceID',
 'LogDate',
 'SequenceNO',
 'AudienceTargetAgeID',
 'AudienceTargetEthnicID',
 'CategoryID',
 'ClosedCaptionID',
 'CountryOfOriginID',
 'DubDramaCreditID',
 'EthnicProgramID',
 'ProductionSourceID',
 'ProgramClassID',
 'FilmClassificationID',
 'ExhibitionID',
 'Duration',
 'EndTime',
 'LogEntryDate',
 'ProductionNO',
 'ProgramTitle',
 'StartTime',
 'Subtitle',
 'NetworkAffiliationID',
 'SpecialAttentionID',
 'BroadcastOriginPointID',
 'CompositionID',
 'Producer1',
 'Producer2',
 'Language1',
 'Language2']

In [1]:
import numpy as np
import pandas as pd

In [18]:
# Partition the column names into len(columns) // 4 groups so that each group
# is narrow enough to display; array_split tolerates uneven group sizes.
col_split = np.array_split(np.array(logs.columns), len(logs.columns) // 4)

for column_group in col_split:
    print("Columns: ", column_group)

Columns:  ['BroadcastLogID' 'LogServiceID' 'LogDate' 'SequenceNO'
 'AudienceTargetAgeID']
Columns:  ['AudienceTargetEthnicID' 'CategoryID' 'ClosedCaptionID'
 'CountryOfOriginID' 'DubDramaCreditID']
Columns:  ['EthnicProgramID' 'ProductionSourceID' 'ProgramClassID'
 'FilmClassificationID']
Columns:  ['ExhibitionID' 'Duration' 'EndTime' 'LogEntryDate']
Columns:  ['ProductionNO' 'ProgramTitle' 'StartTime' 'Subtitle']
Columns:  ['NetworkAffiliationID' 'SpecialAttentionID' 'BroadcastOriginPointID'
 'CompositionID']
Columns:  ['Producer1' 'Producer2' 'Language1' 'Language2']


In [19]:
# Show the first five rows of every column group, untruncated.
for column_group in col_split:
    logs.select(*column_group).show(n=5, truncate=False)

+--------------+------------+-------------------+----------+-------------------+
|BroadcastLogID|LogServiceID|LogDate            |SequenceNO|AudienceTargetAgeID|
+--------------+------------+-------------------+----------+-------------------+
|1196192316    |3157        |2018-08-01 00:00:00|1         |4                  |
|1196192317    |3157        |2018-08-01 00:00:00|2         |null               |
|1196192318    |3157        |2018-08-01 00:00:00|3         |null               |
|1196192319    |3157        |2018-08-01 00:00:00|4         |null               |
|1196192320    |3157        |2018-08-01 00:00:00|5         |null               |
+--------------+------------+-------------------+----------+-------------------+
only showing top 5 rows

+----------------------+----------+---------------+-----------------+----------------+
|AudienceTargetEthnicID|CategoryID|ClosedCaptionID|CountryOfOriginID|DubDramaCreditID|
+----------------------+----------+---------------+-----------------+---

In [87]:
# Display the DataFrame object itself; the output is its column name/type summary, not rows.
display(logs)

DataFrame[BroadcastLogID: int, LogServiceID: int, LogDate: timestamp, SequenceNO: int, AudienceTargetAgeID: int, AudienceTargetEthnicID: int, CategoryID: int, ClosedCaptionID: int, CountryOfOriginID: int, DubDramaCreditID: int, EthnicProgramID: int, ProductionSourceID: int, ProgramClassID: int, FilmClassificationID: int, ExhibitionID: int, Duration: string, EndTime: string, LogEntryDate: timestamp, ProductionNO: string, ProgramTitle: string, StartTime: string, Subtitle: string, NetworkAffiliationID: int, SpecialAttentionID: int, BroadcastOriginPointID: int, CompositionID: int, Producer1: string, Producer2: string, Language1: int, Language2: int]

In [89]:
# Drop the BroadcastLogID and SequenceNO columns, which are not useful here.
# NOTE: this rebinds `logs`, so any cell that selects BroadcastLogID or
# SequenceNO fails if it is run after this one.
dropped_columns = ["BroadcastLogID", "SequenceNO"]
logs = logs.drop(*dropped_columns)

# Verify the drop. `in logs.columns` is a case-sensitive list lookup, so the
# names must match the schema's exact spelling; a misspelled name would print
# False even if the column were still present.
for column_name in dropped_columns:
    print(column_name in logs.columns)

False
False


In [20]:
# truncate=False prints full cell contents instead of shortening long values.
logs.select(["BroadcastLogID", "LogServiceID", "LogDate"]).show(n=5, truncate=False)

+--------------+------------+-------------------+
|BroadcastLogID|LogServiceID|LogDate            |
+--------------+------------+-------------------+
|1196192316    |3157        |2018-08-01 00:00:00|
|1196192317    |3157        |2018-08-01 00:00:00|
|1196192318    |3157        |2018-08-01 00:00:00|
|1196192319    |3157        |2018-08-01 00:00:00|
|1196192320    |3157        |2018-08-01 00:00:00|
+--------------+------------+-------------------+
only showing top 5 rows



In [22]:
# Working with date/time values: first check how `Duration` is typed.

logs.select("Duration").dtypes

[('Duration', 'string')]

In [24]:
# Slice the HH:MM:SS prefix of the `Duration` string into integer parts
# (substring positions are 1-based) and show five distinct combinations.
(
    logs.select(
        "Duration",
        F.substring("Duration", 1, 2).cast("int").alias("dur_hours"),
        F.substring("Duration", 4, 2).cast("int").alias("dur_minutes"),
        F.substring("Duration", 7, 2).cast("int").alias("dur_seconds"),
    )
    .distinct()
    .show(5)
)
    

+----------------+---------+-----------+-----------+
|        Duration|dur_hours|dur_minutes|dur_seconds|
+----------------+---------+-----------+-----------+
|00:04:52.0000000|        0|          4|         52|
|00:10:06.0000000|        0|         10|          6|
|00:26:41.0000000|        0|         26|         41|
|00:09:52.0000000|        0|          9|         52|
|00:04:26.0000000|        0|          4|         26|
+----------------+---------+-----------+-----------+
only showing top 5 rows



In [25]:
# Rename a single column; withColumnRenamed returns a new DataFrame.
# NOTE(review): assumes `logs` already has a `Duration_seconds` column created
# by an earlier cell; if it does not, the rename is a silent no-op. TODO confirm.
logs_rename = logs.withColumnRenamed(
    existing="Duration_seconds",
    new="duration_seconds",
)
logs_rename.printSchema()

root
 |-- BroadcastLogID: integer (nullable = true)
 |-- LogServiceID: integer (nullable = true)
 |-- LogDate: timestamp (nullable = true)
 |-- SequenceNO: integer (nullable = true)
 |-- AudienceTargetAgeID: integer (nullable = true)
 |-- AudienceTargetEthnicID: integer (nullable = true)
 |-- CategoryID: integer (nullable = true)
 |-- ClosedCaptionID: integer (nullable = true)
 |-- CountryOfOriginID: integer (nullable = true)
 |-- DubDramaCreditID: integer (nullable = true)
 |-- EthnicProgramID: integer (nullable = true)
 |-- ProductionSourceID: integer (nullable = true)
 |-- ProgramClassID: integer (nullable = true)
 |-- FilmClassificationID: integer (nullable = true)
 |-- ExhibitionID: integer (nullable = true)
 |-- Duration: string (nullable = true)
 |-- EndTime: string (nullable = true)
 |-- LogEntryDate: timestamp (nullable = true)
 |-- ProductionNO: string (nullable = true)
 |-- ProgramTitle: string (nullable = true)
 |-- StartTime: string (nullable = true)
 |-- Subtitle: string 

- Use the `toDF` method to rename all of the columns at once.
    - This method returns a new data frame.
    

In [31]:
# Columns can also be selected in sorted order:
# logs.select(sorted(logs.columns)).printSchema()

# describe(): count, mean, stddev, min and max for the chosen column.
service_id_description = logs.describe("LogServiceID")
service_id_description.show()

# summary(): the same statistics plus the 25%, 50% and 75% percentiles.
service_id_summary = logs.select("LogServiceID").summary()
service_id_summary.show()
 

+-------+------------------+
|summary|      LogServiceID|
+-------+------------------+
|  count|            238945|
|   mean| 3450.890284375065|
| stddev|199.50673962554782|
|    min|              3157|
|    max|              3925|
+-------+------------------+

+-------+------------------+
|summary|      LogServiceID|
+-------+------------------+
|  count|            238945|
|   mean| 3450.890284375065|
| stddev|199.50673962554782|
|    min|              3157|
|    25%|              3287|
|    50%|              3379|
|    75%|              3627|
|    max|              3925|
+-------+------------------+

