In [496]:
from pyspark.sql import SparkSession
from pyspark.sql import *
from pyspark.sql import functions as F

In [474]:
spark = SparkSession.builder.appName('Practice_session_1').getOrCreate()

In [475]:
spark

# Pre-processing

In [553]:
# read the dataset
df = spark.read.csv('quarterly_data_test_v2.csv',header=True,inferSchema= True)

In [554]:
df.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Quarter: string (nullable = true)
 |-- Electricity_Transfers: integer (nullable = true)
 |-- Gas_Transfers: integer (nullable = true)
 |-- Total_Electricity_Customers: integer (nullable = true)
 |-- Total_Gas_Customers: integer (nullable = true)
 |-- Year_/_Quarter: string (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)
 |-- _c9: string (nullable = true)
 |-- _c10: string (nullable = true)
 |-- _c11: string (nullable = true)



In [555]:
type(df)

pyspark.sql.dataframe.DataFrame

In [556]:
df = df.select('Year',
 'Quarter',
 'Electricity_Transfers',
 'Gas_Transfers',
 'Total_Electricity_Customers',
 'Total_Gas_Customers',
 'Year_/_Quarter')

In [557]:
# df.drop('_c7', '_c8', '_c9', '_c10', '_c11')

In [558]:
df.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Quarter: string (nullable = true)
 |-- Electricity_Transfers: integer (nullable = true)
 |-- Gas_Transfers: integer (nullable = true)
 |-- Total_Electricity_Customers: integer (nullable = true)
 |-- Total_Gas_Customers: integer (nullable = true)
 |-- Year_/_Quarter: string (nullable = true)



In [559]:
from pyspark.sql.functions import col, trim

In [560]:
# df = df.withColumn('Quarter', trim(col('Quarter')))

In [561]:
df.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Quarter: string (nullable = true)
 |-- Electricity_Transfers: integer (nullable = true)
 |-- Gas_Transfers: integer (nullable = true)
 |-- Total_Electricity_Customers: integer (nullable = true)
 |-- Total_Gas_Customers: integer (nullable = true)
 |-- Year_/_Quarter: string (nullable = true)



In [562]:
df = df.toPandas()

In [563]:
df['Quarter'].replace({'Jan to Mar': 'Q1', 'Apr to Jun': 'Q2', 'Jul to Sep': 'Q3', 'Oct to Dec': 'Q4'}, inplace=True)
print(df['Quarter'])


0     Jan to Mar 
1     Apr to Jun 
2     Jul to Sep 
3     Oct to Dec 
4              Q1
         ...     
78    Jul to Sep 
79             Q4
80             Q1
81             Q2
82    Jul to Sep 
Name: Quarter, Length: 83, dtype: object


In [551]:
df = df.withColumn('Quarter', trim(df.Quarter))

AttributeError: 'DataFrame' object has no attribute 'withColumn'

In [509]:
df.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Quarter: string (nullable = true)
 |-- Electricity_Transfers: integer (nullable = true)
 |-- Gas_Transfers: integer (nullable = true)
 |-- Total_Electricity_Customers: integer (nullable = true)
 |-- Total_Gas_Customers: integer (nullable = true)
 |-- Year_/_Quarter: string (nullable = true)



In [512]:
# Assuming your DataFrame is named 'df'
df = (
    df.groupBy('Quarter')
    .count()
    .withColumn(
        'Quarter', 
        F.when(F.col('Quarter') == 'Jan to Mar', 'Q1')
        .when(F.col('Quarter') == 'Apr to Jun', 'Q2')
        .when(F.col('Quarter') == 'Jul to Sep', 'Q3')
        .when(F.col('Quarter') == 'Oct to Dec', 'Q4')
    )
)

df.show()

+-------+-----+
|Quarter|count|
+-------+-----+
|     Q3|    1|
|     Q4|    1|
|     Q2|    1|
|     Q1|    1|
+-------+-----+



## Gas and Electricity Transfers per Quarter

In [489]:
df.groupBy('Quarter').count().orderBy('Quarter').show()

+----------+-----+
|   Quarter|count|
+----------+-----+
|Apr to Jun|    1|
|Jan to Mar|    1|
|Jul to Sep|    1|
|Oct to Dec|    1|
+----------+-----+



In [490]:
df.groupBy('Quarter').Or.count().show()

AttributeError: 'GroupedData' object has no attribute 'Or'

### Combined Transfers

In [None]:
total_energy_transfers = df.groupBy('Quarter').agg({'Gas_Transfers': 'sum', 'Electricity_Transfers': 'sum'})

# Convert PySpark DataFrame to Pandas DataFrame
pandas_df = total_energy_transfers.toPandas()

# Plotting using Pandas
pandas_df.plot(kind='bar', x='Quarter', y=['sum(Electricity_Transfers)', 'sum(Gas_Transfers)'],
               title='Total Energy Transfers', figsize=(5, 5), rot=0, stacked = True)

### Gas Transfers

In [None]:
total_gas_transfer = df.groupBy('Quarter').sum('Gas_Transfers').toPandas()
total_gas_transfer

total_gas_transfer.plot(kind='line', x='Quarter', y='sum(Gas_Transfers)', 
               title = 'Total Gas Transfers', figsize = (5,5), rot = 0)

### Electricity Transfers

In [None]:
total_electricity_transfer = df.groupBy('Quarter').sum('Electricity_Transfers').toPandas()

total_electricity_transfer.plot(kind='line', x='Quarter', y='sum(Electricity_Transfers)', 
               title = 'Total Electricty Transfers', figsize = (5,5), rot = 0)

## Gas and Electricity Transfers per Year

### Combined

In [None]:
total_energy_transfer_per_year = df.groupBy('Year').agg({'Gas_Transfers': 'sum', 'Electricity_Transfers': 'sum'})

# Convert PySpark DataFrame to Pandas DataFrame
pandas_df_per_year = total_energy_transfer_per_year.toPandas()

# Sort the DataFrame by the 'Year' column
pandas_df_per_year = pandas_df_per_year.sort_values(by='Year')

# Plotting using Pandas
pandas_df.plot(kind='bar', x='Year', y=['sum(Electricity_Transfers)', 'sum(Gas_Transfers)'],
               title='Total Energy Transfers', figsize=(15, 15), rot=0, stacked = True)

### Gas Transfers

In [None]:
gas_transfer_per_year = df.groupBy('Year').sum('Gas_Transfers')

# Convert PySpark DataFrame to Pandas DataFrame
gas_pandas_df_per_year = gas_transfer_per_year.toPandas()

# Sort the DataFrame by the 'Year' column
gas_pandas_df_per_year = gas_pandas_df_per_year.sort_values(by='Year')

gas_pandas_df_per_year.plot(kind='bar', x='Year', y='sum(Gas_Transfers)', 
               title = 'Total Gas Transfers', figsize = (15,15), rot = 0)

### Electricity Transfers

In [None]:
electricity_transfer_per_year = df.groupBy('Year').sum('Electricity_Transfers')

# Convert PySpark DataFrame to Pandas DataFrame
electricity_pandas_df_per_year = electricity_transfer_per_year.toPandas()

# Sort the DataFrame by the 'Year' column
electricity_pandas_df_per_year = electricity_pandas_df_per_year.sort_values(by='Year')

electricity_pandas_df_per_year.plot(kind='bar', x='Year', y='sum(Electricity_Transfers)', 
               title = 'Total Electricity Transfers', figsize = (15,15), rot = 0)

## Gas and Electricity Customers per Quarter

### Combined Customers

In [None]:
total_energy_Customers = df.groupBy('Quarter').agg({'Total_Gas_Customers': 'sum', 'Total_Electricity_Customers': 'sum'})

# Convert PySpark DataFrame to Pandas DataFrame
pandas_df = total_energy_Customers.toPandas()

# Plotting using Pandas
pandas_df.plot(kind='line', x='Quarter', y=['sum(Total_Gas_Customers)', 'sum(Total_Electricity_Customers)'],
               title='Total Energy Customers per Quarter', figsize=(5, 5), rot=0, stacked = True)

### Gas Customers

In [None]:
total_gas_customers = df.groupBy('Quarter').sum('Total_Gas_Customers').toPandas()
total_gas_transfer

total_gas_customers.plot(kind='line', x='Quarter', y='sum(Total_Gas_Customers)', 
               title = 'Total Gas Customers per Quarter', figsize = (5,5), rot = 0)

### Electricity Customers

In [None]:
total_electricity_customers = df.groupBy('Quarter').sum('Total_Electricity_Customers').toPandas()
total_electricity_customers

total_electricity_customers.plot(kind='line', x='Quarter', y='sum(Total_Electricity_Customers)', 
               title = 'Total Electricity Customers per Quarter', figsize = (5,5), rot = 0)

## Gas and Electricity Customers per Year

### Combined Customers

In [None]:
total_energy_customers = df.groupBy('Year').agg({'Total_Gas_Customers': 'sum', 'Total_Electricity_Customers': 'sum'})

# Convert PySpark DataFrame to Pandas DataFrame
pandas_df = total_energy_customers.toPandas()
pandas_df = pandas_df.sort_values(by='Year')

# Plotting using Pandas
pandas_df.plot(kind='line', x='Year', y=['sum(Total_Gas_Customers)', 'sum(Total_Electricity_Customers)'],
               title='Total Energy Customers per Year', figsize=(10, 10), rot=0, stacked = True)

### Gas Customers

In [None]:
total_gas_Customers = df.groupBy('Year').sum('Total_Gas_Customers').toPandas()
total_gas_Customers = total_gas_Customers.sort_values(by='Year')

total_gas_Customers.plot(kind='line', x='Year', y='sum(Total_Gas_Customers)', 
               title = 'Total Gas Customers per Year', figsize = (5,5), rot = 0)

### Electricity Customers

In [None]:
total_electricity_Customers = df.groupBy('Year').sum('Total_Electricity_Customers').toPandas()
total_electricity_Customers = total_electricity_Customers.sort_values(by='Year')

total_electricity_Customers.plot(kind='line', x='Year', y='sum(Total_Electricity_Customers)', 
               title = 'Total Electricity Customers per Year', figsize = (5,5), rot = 0)