### **Importing the libraries required for this project.**

In [None]:
# The Pandas library will be used for loading, cleaning and organising the energy data in a data frame for further analysis.

import pandas as pd

In [None]:
# Numpy will be used for performing numerical computations on the data.

import numpy as np

In [None]:
# Matplotlib will be used for data visualisation.

import matplotlib.pyplot as plt


In [None]:
# Seaborn will be used for data visualisation.

import seaborn as sns




---



### **Reading in the data**

In [None]:
# Loading the energy dataset from its CSV file into a DataFrame.
# NOTE(review): the path looks like a Google Drive folder mounted in Colab — confirm the drive is mounted before running this cell.
csv_path = "/content/drive/MyDrive/Colab/5 Renewable Energy/owid-energy-data.csv"
data_df = pd.read_csv(csv_path)



---



### **Inspecting the data**

In [None]:
# Displaying the first 10 rows of the full, unfiltered dataset for a brief overview of its structure and contents.
# This is the last expression in the cell, so the notebook displays its result.

data_df.head(10)

### **Creating a filtered dataframe showing only data from South Africa**
As this study is focussed on renewable energy generation in South Africa, I can reduce the size of the dataset and optimise my workflow by filtering out the rows of data that apply to other countries.


In [None]:
# Keeping only the rows whose 'country' value is exactly 'South Africa'.
# .copy() makes df an independent DataFrame rather than a selection of
# data_df, so the in-place cleaning steps applied to df later on do not
# trigger pandas' SettingWithCopyWarning.

df = data_df[data_df['country'] == 'South Africa'].copy()


In [None]:
# Retrieving the dimensions of the filtered dataframe as (rows, columns).
# After filtering out the irrelevant rows, I now have 122 rows remaining of the original 21890.
df.shape



---



### **Inspecting the filtered dataset**

In [None]:
# Listing the column labels of the filtered dataframe.
df.columns

In [None]:
# Identifying the data type (dtype) of each column in the filtered dataframe.

df.dtypes

In [None]:
# Printing a summary of the dataframe, including the number of non-null values and the data type of each column.

df.info()

In [None]:
# Calculating the percentage of missing values in each column, and printing the column name next to that percentage.
# isnull().mean() gives the fraction (0 to 1) of missing rows per column; it is
# multiplied by 100 so that the printed number really is the percentage the
# trailing '%' sign claims it to be.

pct_missing_by_col = df.isnull().mean() * 100
for col, pct_missing in pct_missing_by_col.items():
    print('{} - {:.1f}%'.format(col, pct_missing))

In [None]:
# Counting the missing (null) values in each column of the dataframe.

df.isnull().sum()




---



### **Data cleaning**

In [None]:
# Dropping the rows which correspond with the years between 1900 and 1984, where no data was tracked.
# The condition is 'year < 1985' so that 1985 itself is kept, matching the
# 1900-1984 range described above.
# NOTE(review): confirm that 1985 is the first year that should be retained.
# The result is reassigned rather than dropped in place, because df comes from
# filtering data_df and in-place mutation of such a selection can raise
# pandas' SettingWithCopyWarning.

df = df.drop(index=df.index[df['year'] < 1985])

In [None]:
# Removing the columns that are not needed for the analysis.
# Every remaining row belongs to South Africa, so 'country' no longer distinguishes any rows.
columns_to_drop = ['country', 'iso_code', 'population', 'gdp']
df = df.drop(labels=columns_to_drop, axis='columns')

In [None]:
# Retrieving the dimensions of the dataframe as (rows, columns) to verify that 4 columns have been removed.
# 125 columns remaining of the original 129.
df.shape

In [None]:
# The dataset contains derived columns (percentage changes, absolute changes,
# per-capita values and shares), which will skew the analysis.
# The pattern below matches only column names that do NOT contain any of the
# listed suffixes, so filtering on it keeps the remaining columns and drops
# the derived ones.

derived_columns_pattern = '^(?!.*(_change_pct|_change_twh|_per_capita|_share_elec|_share_energy)).*$'
df = df.filter(regex=derived_columns_pattern)


In [None]:
# Listing the column labels that remain in the dataframe after the derived columns were filtered out.
df.columns

In [None]:
# Displaying the first 10 rows of the cleaned dataframe for a brief overview of its structure and contents.

df.head(10)

In [None]:
# Substituting zero for every missing (NaN) value in the dataframe.
# NOTE(review): the zeros stand in for unknown values and will feed into the
# statistics and plots computed from df afterwards — confirm this is intended.
df = df.fillna(value=0)

# Printing the DataFrame now that the NaN values have been replaced
print(df)



---



### **Summary statistics on key Category Variables**
Generating summary statistics for each variable, providing insights into the central tendency, spread, and distribution of the data.

In [None]:
# Generating descriptive statistics for the columns of the cleaned dataframe.
df.describe()



---



### **Exploratory Data Analysis: Visualisations**


#### **Energy generation over time**


In [None]:
# Line chart of electricity generation against year, drawn on a 12x6 figure.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(df['year'], df['electricity_generation'])
ax.set_xlabel('Year')
ax.set_ylabel('Electricity Generation')
ax.set_title('Electricity Generation Trends')
plt.show()


#### **Energy demand vs Energy Generation**


In [None]:
# Scatter plot with electricity demand on the x-axis and electricity generation on the y-axis, one point per row of df.
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(df['electricity_demand'], df['electricity_generation'])
ax.set_xlabel('Electricity Demand')
ax.set_ylabel('Electricity Generation')
ax.set_title('Electricity Demand vs. Generation')
plt.show()


#### **Energy generation per source**


In [None]:
# Selecting columns for stacked area plot.
# Only individual sources are listed. The aggregate columns
# 'fossil_electricity' and 'low_carbon_electricity' are left out because,
# according to the OWID energy codebook, they are totals of other columns in
# this list; stacking them on top of their own components would count that
# generation twice and inflate the height of the chart.
# NOTE(review): 'other_renewable_electricity' may itself include biofuel
# generation in the OWID data — confirm against the codebook whether it
# overlaps with 'biofuel_electricity'.
energy_sources = ['biofuel_electricity', 'coal_electricity', 'gas_electricity', 'hydro_electricity', 'nuclear_electricity', 'oil_electricity', 'solar_electricity', 'wind_electricity', 'other_renewable_electricity']

# Plotting stacked area chart: one layer per source, stacked in list order.
plt.figure(figsize=(12, 6))
plt.stackplot(df['year'], [df[column] for column in energy_sources], labels=energy_sources)
plt.xlabel('Year')
plt.ylabel('Electricity Generation')
plt.title('Electricity Generation by Energy Source')
plt.legend()
plt.show()


#### **Energy consumption per category**


In [None]:
# Bar chart comparing the consumption of each energy source, summed over all rows of df.
# NOTE(review): 'low_carbon_consumption' looks like an aggregate of several of
# the other listed sources — confirm it is meant to sit beside its components.
energy_sources = ['biofuel_consumption', 'coal_consumption', 'gas_consumption', 'low_carbon_consumption', 'hydro_consumption', 'nuclear_consumption', 'oil_consumption', 'solar_consumption', 'wind_consumption', 'other_renewable_consumption']
consumption_values = list(df[energy_sources].sum())

# The figure is deliberately wide (30x6) to leave room for the long column names on the x-axis.
fig, ax = plt.subplots(figsize=(30, 6))
ax.bar(energy_sources, consumption_values)
ax.set_xlabel('Energy Sources')
ax.set_ylabel('Total Consumption')
ax.set_title('Energy Consumption Comparison')
plt.show()




---



### **Histograms**


#### **Histogram: Distribution of electricity demand**


In [None]:
# Histogram of the electricity demand values, grouped into 5 bins.
ax = plt.gca()
ax.hist(df['electricity_demand'], bins=5)
ax.set_xlabel('Electricity demand')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Electricity demand')
plt.show()


#### **Histogram: Distribution of electricity generation**


In [None]:
# Histogram of the electricity generation values, grouped into 5 bins.
ax = plt.gca()
ax.hist(df['electricity_generation'], bins=5)
ax.set_xlabel('Electricity generation')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Electricity generation')
plt.show()


#### **Histogram: Distribution of greenhouse gas emissions**


In [None]:
# Histogram of the greenhouse gas emission values, grouped into 5 bins.
ax = plt.gca()
ax.hist(df['greenhouse_gas_emissions'], bins=5)
ax.set_xlabel('Greenhouse Gas Emissions')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Greenhouse Gas Emissions')
plt.show()


### **Correlation Analysis**


#### **Correlation heatmap: Generation, demand, gas emissions and energy per GDP**

In [None]:
# Columns whose pairwise correlations are of interest
correlation = ['electricity_generation', 'electricity_demand', 'greenhouse_gas_emissions', 'energy_per_gdp']

# Pairwise correlation of the selected columns (DataFrame.corr with its default method)
corr_matrix = df.loc[:, correlation].corr()

# Drawing the matrix as an annotated heatmap; sns.heatmap returns the Axes it drew on
ax = sns.heatmap(data=corr_matrix, annot=True, cmap='coolwarm')
ax.set_title('Correlation Heatmap')
plt.show()



---



#### **Pair plot: Generation, demand, gas emissions and energy per GDP**

In [None]:
# Plot a pairwise scatterplot of all of the variables.
# The plot is built from the data columns themselves (df[correlation]), not
# from corr_matrix: the correlation matrix only holds one coefficient per pair
# of variables, so scatter-plotting it would not show the observations.
sns.pairplot(df[correlation])

# Show the plot
plt.show()


### **Correlation Co-efficient Matrix: Generation, demand, gas emissions and energy per GDP**

In [None]:
# Calculate the Pearson correlation coefficient matrix.
# The coefficients are computed on the selected data columns, not on
# corr_matrix: correlating an already-computed correlation matrix produces
# numbers that no longer describe the relationship between the variables.
correlation_matrix = df[correlation].corr(method='pearson')

# Visualising the correlation matrix as an annotated heatmap
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")

plt.title('Correlation matrix')
plt.xlabel('Variables')
plt.ylabel('Variables')

plt.show()




---

