<a href="https://colab.research.google.com/github/monicawtavares/awesome-deep-learning/blob/master/run_spark_on_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Java, Spark, and Findspark
This installs Apache Spark 2.2.1, Java 8, and [Findspark](https://github.com/minrk/findspark), a library that makes it easy for Python to find Spark.

# Set Environment Variables
Set the locations where Spark and Java are installed.

In [11]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.2.1-bin-hadoop2.7"
# Add the following line to include python/lib to the search path:
os.environ["PYSPARK_PYTHON"] = os.environ.get("SPARK_HOME") + "/python/lib"


# Start a SparkSession
This will start a local Spark session.

In [25]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-2.2.1/spark-2.2.1-bin-hadoop2.7.tgz
!tar xf spark-2.2.1-bin-hadoop2.7.tgz
!pip install -q findspark


In [3]:
import findspark
findspark.init()
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()

# Use Spark!
That's all there is to it - you're ready to use Spark!

In [4]:
df = spark.createDataFrame([{"hello": "world"} for x in range(1000)])
df.show(3)

+-----+
|hello|
+-----+
|world|
|world|
|world|
+-----+
only showing top 3 rows



In [6]:
import pandas as pd
import os
import random
from datetime import datetime, timedelta

In [7]:
#Function to generate random dates between 2019 and 2021
def random_date (start_year=2019, end_year=2021):
    start = datetime(start_year, 1, 1)
    end = datetime(end_year + 1, 1, 1)
    return(start + (end -start)* random.random()).strftime('%Y-%m-%d')

In [8]:
#generate customer table
customer_data = {
    "CustomerID": [i for i in range(1,1001)],
    "Name": [f"Customer_{i}" for i in range(1,1001)],
    "Email": [f"Customer_{i}@example.com"for i in range(1,1001)],
    "RegistrationDate": [random_date() for _ in range(1000)]
}

customer_df = pd.DataFrame(customer_data)
display(customer_df)


Unnamed: 0,CustomerID,Name,Email,RegistrationDate
0,1,Customer_1,Customer_1@example.com,2020-09-11
1,2,Customer_2,Customer_2@example.com,2021-12-16
2,3,Customer_3,Customer_3@example.com,2020-09-21
3,4,Customer_4,Customer_4@example.com,2021-10-07
4,5,Customer_5,Customer_5@example.com,2021-08-27
...,...,...,...,...
995,996,Customer_996,Customer_996@example.com,2020-03-08
996,997,Customer_997,Customer_997@example.com,2019-02-05
997,998,Customer_998,Customer_998@example.com,2020-03-15
998,999,Customer_999,Customer_999@example.com,2021-08-18


In [9]:
#generate products table
product_data = {
    "ProductID": [i for i in range(1,1001)],
    "ProductName": [f"Customer_{i}" for i in range(1,1001)],
    "Category": [random.choice(["Eletronics", "Clothing", "Books"]) for i in range(1,1001)],
    "Price": [round(random.uniform(10, 1000), 2) for _ in range(1000)]
}

product_df = pd.DataFrame(product_data)
display(product_df)


Unnamed: 0,ProductID,ProductName,Category,Price
0,1,Customer_1,Eletronics,307.29
1,2,Customer_2,Clothing,332.91
2,3,Customer_3,Eletronics,237.77
3,4,Customer_4,Eletronics,586.32
4,5,Customer_5,Clothing,368.86
...,...,...,...,...
995,996,Customer_996,Clothing,667.23
996,997,Customer_997,Eletronics,983.20
997,998,Customer_998,Books,886.81
998,999,Customer_999,Clothing,360.92


In [10]:
#generate products table
product_data = {
    "ProductID": [i for i in range(1,1001)],
    "ProductName": [f"Customer_{i}" for i in range(1,1001)],
    "Category": [random.choice(["Eletronics", "Clothing", "Books"]) for i in range(1,1001)],
    "Price": [round(random.uniform(10, 1000), 2) for _ in range(1000)]
}

product_df = pd.DataFrame(product_data)
display(product_df)


Unnamed: 0,ProductID,ProductName,Category,Price
0,1,Customer_1,Eletronics,118.56
1,2,Customer_2,Eletronics,722.16
2,3,Customer_3,Books,61.31
3,4,Customer_4,Clothing,159.05
4,5,Customer_5,Books,807.21
...,...,...,...,...
995,996,Customer_996,Clothing,973.34
996,997,Customer_997,Eletronics,226.61
997,998,Customer_998,Eletronics,480.07
998,999,Customer_999,Clothing,948.41
