In [None]:
!pip install pyspark
!pip install findspark
!pip install pyarrow==0.14.1
!pip install pandas
!pip install numpy==1.19.5

In [None]:
import findspark
# Initialise findspark before the next cell imports pyspark.
# NOTE(review): presumably this makes the local Spark installation importable -
# confirm against the findspark documentation.
findspark.init()

In [None]:
import pandas as pd
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [None]:
# Reuse the running SparkContext when there is one, so that re-running this cell
# does not fail because a second context cannot be created in the same process.
sc = SparkContext.getOrCreate()

# Build (or fetch) the SparkSession that every later cell refers to as `spark`.
# NOTE(review): "spark.some.config.option" looks like a documentation placeholder
# rather than a real setting - confirm whether it can be dropped.
spark = (
    SparkSession.builder
    .appName("Python Spark DataFrames basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [None]:
# Read the mtcars CSV directly from its HTTPS URL into a pandas DataFrame.
mtcars = pd.read_csv('https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-BD0225EN-SkillsNetwork/labs/data/mtcars.csv')

In [None]:
# Give the column pandas labelled 'Unnamed: 0' the meaningful name 'name'.
# The result is rebound instead of using inplace=True, which avoids mutating a
# frame that other cells also reference and keeps the cell safely re-runnable.
mtcars = mtcars.rename(columns={'Unnamed: 0': 'name'})

In [None]:
# Build a Spark DataFrame from the pandas `mtcars` frame using the active session.
sdf = spark.createDataFrame(mtcars)

In [None]:
# Print the schema (column names and types) of the Spark DataFrame.
sdf.printSchema()

In [None]:
# Bind a DataFrame with the "vs" column renamed to "versus" to a new name.
# Note that `sdf` (not `sdf_new`) is what the notebook later registers as the `cars` view.
sdf_new = sdf.withColumnRenamed("vs", "versus")

In [None]:
# Display the first 5 rows of the renamed DataFrame as the cell output.
sdf_new.head(5)

In [None]:
# Register `sdf` as the temporary view "cars" so it can be queried with spark.sql.
# createOrReplaceTempView keeps the cell re-runnable: createTempView raises when a
# view with the same name already exists in the session.
sdf.createOrReplaceTempView("cars")

In [None]:
# Query every column of the `cars` view and print the result.
spark.sql("SELECT * FROM cars").show()

In [None]:
# Query only the mpg column and print at most 5 rows.
spark.sql("SELECT mpg FROM cars").show(5)

In [None]:
# SQL filter: rows with mpg above 20 and cyl below 6; print at most 5 rows.
spark.sql("SELECT * FROM cars where mpg>20 AND cyl < 6").show(5)

In [None]:
# The same kind of filter through the DataFrame API: rows with mpg below 18,
# printing at most 3 rows.
sdf.where(sdf['mpg'] < 18).show(3)

In [None]:
# Count the rows of the `cars` view in each cyl group.
spark.sql("SELECT count(*), cyl from cars GROUP BY cyl").show()

In [None]:
from pyspark.sql.functions import pandas_udf, PandasUDFType

In [None]:
@pandas_udf("float")
def convert_wt(s: pd.Series) -> pd.Series:
    """Scale a Series of weights by the notebook's imperial-to-metric-tons factor.

    Args:
        s: Series of weight values in imperial units.

    Returns:
        Series of the same length holding each input value multiplied by 0.45.
    """
    imperial_to_metric = 0.45
    return imperial_to_metric * s


# Make the UDF callable from Spark SQL under the name "convert_weight".
spark.udf.register("convert_weight", convert_wt)

In [None]:
# Show all columns plus wt aliased as weight_imperial and the result of the
# registered convert_weight UDF as weight_metric.
spark.sql("SELECT *, wt AS weight_imperial, convert_weight(wt) as weight_metric FROM cars").show()

In [None]:
# Employee id / name records: one side of the join performed later in the notebook.
columns = ["emp_id", "emp_name"]
data = [
    ("A101", "John"),
    ("A102", "Peter"),
    ("A103", "Charlie"),
]

dataframe_1 = spark.createDataFrame(data, schema=columns)

In [None]:
# Employee id / salary records: the other side of the join on emp_id.
columns = ["emp_id", "salary"]
data = [
    ("A101", 3250),
    ("A102", 6735),
    ("A103", 8650),
]

dataframe_2 = spark.createDataFrame(data, schema=columns)

In [None]:
# Inner-join the two employee DataFrames on their shared emp_id column.
combined_df = dataframe_1.join(dataframe_2, on="emp_id", how="inner")

In [None]:
# Collect the joined rows and display them as the cell output.
combined_df.collect()

In [None]:
# Salary records with one missing value, used to demonstrate fillna.
# NOTE: this rebinds `dataframe_1` (and `data` / `columns`), replacing the frame
# built earlier for the join example.
columns = ["emp_id", "salary"]
data = [
    ("A101", 1000),
    ("A102", 2000),
    ("A103", None),
]

dataframe_1 = spark.createDataFrame(data, schema=columns)

In [None]:
# Replace missing values in the salary column with 3000; the result is bound to
# a new name and is not displayed by this cell.
filled_df = dataframe_1.fillna({"salary": 3000})

In [None]:
# Show the rows of `cars` whose name starts with 'Merc' (SQL LIKE prefix match).
spark.sql("SELECT * FROM cars where name like 'Merc%'").show()


In [None]:
from pyspark.sql.functions import pandas_udf

@pandas_udf("float")
def convert_mileage(s: pd.Series) -> pd.Series:
    """Convert fuel economy from miles per gallon to kilometres per litre.

    Args:
        s: Series of mpg values.

    Returns:
        Series of the same length with each value multiplied by 0.425
        (roughly 1.609 km per mile / 3.785 litres per gallon, assuming US gallons).
    """
    return s * 0.425


# Make the UDF callable from Spark SQL under the name "convert_mileage".
spark.udf.register("convert_mileage", convert_mileage)

# `*` already includes mpg, so only the converted column is appended; the former
# extra `mpg AS mpg` produced a second, identically named column in the result.
spark.sql("SELECT *, convert_mileage(mpg) as kmpl FROM cars").show()