In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkConf,SparkContext

In [2]:
spark=SparkSession.builder.appName('DataFrame').getOrCreate()

In [3]:
spark

In [4]:
# Read the dataset

df=spark.read.csv('homeprices.csv',header=True,inferSchema=True)
df.show()

+----+--------+---+------+
|area|bedrooms|age| price|
+----+--------+---+------+
|2600|       3| 20|550000|
|3000|       4| 15|565000|
|3200|    null| 18|610000|
|3600|       3| 30|595000|
|4000|       5|  8|760000|
|4100|       6|  8|810000|
+----+--------+---+------+



In [5]:
# Prints out the schema in the tree format.

df.printSchema()

root
 |-- area: integer (nullable = true)
 |-- bedrooms: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- price: integer (nullable = true)



In [6]:
# Columns in DF

df.columns

['area', 'bedrooms', 'age', 'price']

In [7]:
# head(3)

df.head(3)

[Row(area=2600, bedrooms=3, age=20, price=550000),
 Row(area=3000, bedrooms=4, age=15, price=565000),
 Row(area=3200, bedrooms=None, age=18, price=610000)]

In [8]:
# Selecting columns

print(df.select('area'))
print(df.select('area').show())

DataFrame[area: int]
+----+
|area|
+----+
|2600|
|3000|
|3200|
|3600|
|4000|
|4100|
+----+

None


In [9]:
type(df.select('area'))

pyspark.sql.dataframe.DataFrame

In [10]:
df.select(['area','age']).show()

+----+---+
|area|age|
+----+---+
|2600| 20|
|3000| 15|
|3200| 18|
|3600| 30|
|4000|  8|
|4100|  8|
+----+---+



In [11]:
# Checking data types

df.dtypes

[('area', 'int'), ('bedrooms', 'int'), ('age', 'int'), ('price', 'int')]

In [12]:
# description

df.describe().show()

+-------+------------------+------------------+----------------+------------------+
|summary|              area|          bedrooms|             age|             price|
+-------+------------------+------------------+----------------+------------------+
|  count|                 6|                 5|               6|                 6|
|   mean|3416.6666666666665|               4.2|            16.5| 648333.3333333334|
| stddev| 587.9342366852493|1.3038404810405297|8.28854631404084|109117.67348448493|
|    min|              2600|                 3|               8|            550000|
|    max|              4100|                 6|              30|            810000|
+-------+------------------+------------------+----------------+------------------+



In [13]:
# Adding columns

df.withColumn('ageInMonth',df['age']*12).show()  # need to assign to get changed values

+----+--------+---+------+----------+
|area|bedrooms|age| price|ageInMonth|
+----+--------+---+------+----------+
|2600|       3| 20|550000|       240|
|3000|       4| 15|565000|       180|
|3200|    null| 18|610000|       216|
|3600|       3| 30|595000|       360|
|4000|       5|  8|760000|        96|
|4100|       6|  8|810000|        96|
+----+--------+---+------+----------+



In [14]:
# Drop columns

df.drop('age').show()   # not an inplace operation

+----+--------+------+
|area|bedrooms| price|
+----+--------+------+
|2600|       3|550000|
|3000|       4|565000|
|3200|    null|610000|
|3600|       3|595000|
|4000|       5|760000|
|4100|       6|810000|
+----+--------+------+



In [15]:
# Rename column

df.withColumnRenamed('area','totalArea').show()

+---------+--------+---+------+
|totalArea|bedrooms|age| price|
+---------+--------+---+------+
|     2600|       3| 20|550000|
|     3000|       4| 15|565000|
|     3200|    null| 18|610000|
|     3600|       3| 30|595000|
|     4000|       5|  8|760000|
|     4100|       6|  8|810000|
+---------+--------+---+------+



In [16]:
# filtering 

df.filter(df.bedrooms==3).show()

+----+--------+---+------+
|area|bedrooms|age| price|
+----+--------+---+------+
|2600|       3| 20|550000|
|3600|       3| 30|595000|
+----+--------+---+------+



In [17]:
# Can also use where for filtering

df.where(df.bedrooms==3).show()

+----+--------+---+------+
|area|bedrooms|age| price|
+----+--------+---+------+
|2600|       3| 20|550000|
|3600|       3| 30|595000|
+----+--------+---+------+



In [18]:
# multiple conditions

df.where((df.bedrooms==3)|(df.age==8)).show()

+----+--------+---+------+
|area|bedrooms|age| price|
+----+--------+---+------+
|2600|       3| 20|550000|
|3600|       3| 30|595000|
|4000|       5|  8|760000|
|4100|       6|  8|810000|
+----+--------+---+------+



In [19]:
# Converting DataFrame to SQL table

df.registerTempTable('df_sql')

In [20]:
# Querying SQL table

from pyspark.sql import SQLContext

sqlContext=SQLContext(spark)
sqlContext.sql('select * from df_sql').show()

+----+--------+---+------+
|area|bedrooms|age| price|
+----+--------+---+------+
|2600|       3| 20|550000|
|3000|       4| 15|565000|
|3200|    null| 18|610000|
|3600|       3| 30|595000|
|4000|       5|  8|760000|
|4100|       6|  8|810000|
+----+--------+---+------+



In [21]:
# Selecting row having min value in area

sqlContext.sql('select min(area) from df_sql').show()

+---------+
|min(area)|
+---------+
|     2600|
+---------+



In [22]:
# Nested querying

sqlContext.sql('select * from df_sql where bedrooms in (select min(bedrooms) from df_sql)').show()

+----+--------+---+------+
|area|bedrooms|age| price|
+----+--------+---+------+
|2600|       3| 20|550000|
|3600|       3| 30|595000|
+----+--------+---+------+



In [23]:
spark

In [28]:
userloan=spark.read.csv('userDetails.csv',header=True,inferSchema=True)
userloan

DataFrame[loansActiveCount: int, userId: string, loansSignedAndAcceptedCount: int, loansFullyRepaidCount: int, occupation: string, loansRequestedCount: int, monthlyIncome: string, familySize: string, incomePeriodicity: string, eaScore: double, address.city: string, age: double, repaymentDurationInDays: int, loanId: string, loanAmount: int, interestRatePerAnnumPercent: double, repaymentStatus: string, interestRatePerMonthPercent: double, collateralItemsTotalMarketPrice: double, interestAmount: int, CATPercent: int, refrenceInterestRate: int, repaymentInstallmentAmount: int, amountRepaid: int, repaymentNumberOfInstallments: int, amountNotRepaid: int, interestRatePercent: double, repaymentTotalAmount: int, totalInstallment: int, paidAfterDue: int, paidBeforeDue: int, dueDateNotpassed: int, notPaidAfterDueDate: int, score: int]

In [26]:
type(userloan)

pyspark.sql.dataframe.DataFrame

In [27]:
spark.createDataFrame(userloan)

TypeError: data is already a DataFrame

In [29]:
import plotly.express as px

In [35]:
userloan.isLocal()

False

In [39]:
loans=userloan.localCheckpoint()

In [33]:
px.bar(userloan.persist(),x=userloan.select('occupation').persist(),y=userloan.select('age').persist())

ValueError: DataFrame constructor not properly called!