In [1]:
import numpy as np
import pandas as pd

## **Task: Transform the data.**
- Techniques used
  1. `Split the column content using delimiters`
  2. `Melt the data`

In [2]:
# Read data

data = pd.read_excel("Roadmap.xlsx", sheet_name= "SQL_Roadmap")
data.head()

Unnamed: 0,Level,Subtopic,Topics Covered
0,Foundational Concepts,Basics,"Introduction, Syntax, Data Types, Queries, Com..."
1,Foundational Concepts,Working with Tables,"Create, Alter, Drop Tables; Keys; Constraints"
2,Foundational Concepts,Data Manipulation,"Insert, Update, Delete, Truncate"
3,Intermediate Concepts,Data Retrieval and Filtering,"WHERE, AND, OR, NOT, LIKE, BETWEEN, NULL Handling"
4,Intermediate Concepts,Aggregations and Grouping,"Aggregate Functions, GROUP BY, HAVING"


### **Solution:**
1. Split the column `Topic Covered` split by `,`.
2. Melt the dataset.
3. Delete the rows with NA.

In [3]:
# Create a function that splits the column by delimiter

def split_column_to_multiple(data, column_name, delimiters):
    
    split_columns = data[column_name].str.split(delimiters, expand= True)

    return pd.concat([data, split_columns], axis= 1)

In [4]:
# Use the function

data_expanded= split_column_to_multiple(data, column_name= 'Topics Covered', delimiters= r",|;|\|" )
data_expanded.head()

Unnamed: 0,Level,Subtopic,Topics Covered,0,1,2,3,4,5,6
0,Foundational Concepts,Basics,"Introduction, Syntax, Data Types, Queries, Com...",Introduction,Syntax,Data Types,Queries,Comments,,
1,Foundational Concepts,Working with Tables,"Create, Alter, Drop Tables; Keys; Constraints",Create,Alter,Drop Tables,Keys,Constraints,,
2,Foundational Concepts,Data Manipulation,"Insert, Update, Delete, Truncate",Insert,Update,Delete,Truncate,,,
3,Intermediate Concepts,Data Retrieval and Filtering,"WHERE, AND, OR, NOT, LIKE, BETWEEN, NULL Handling",WHERE,AND,OR,NOT,LIKE,BETWEEN,NULL Handling
4,Intermediate Concepts,Aggregations and Grouping,"Aggregate Functions, GROUP BY, HAVING",Aggregate Functions,GROUP BY,HAVING,,,,


In [5]:
# Drop the column "Topics Covered" as it is not required

data_expanded.drop(columns= 'Topics Covered', inplace= True)
data_expanded.head()

Unnamed: 0,Level,Subtopic,0,1,2,3,4,5,6
0,Foundational Concepts,Basics,Introduction,Syntax,Data Types,Queries,Comments,,
1,Foundational Concepts,Working with Tables,Create,Alter,Drop Tables,Keys,Constraints,,
2,Foundational Concepts,Data Manipulation,Insert,Update,Delete,Truncate,,,
3,Intermediate Concepts,Data Retrieval and Filtering,WHERE,AND,OR,NOT,LIKE,BETWEEN,NULL Handling
4,Intermediate Concepts,Aggregations and Grouping,Aggregate Functions,GROUP BY,HAVING,,,,


In [6]:
# Melt the data_expended

data_melted= data_expanded.melt(id_vars= ['Level', 'Subtopic'],
                                var_name= 'Topic_number',
                                value_name= 'Topic')
data_melted

Unnamed: 0,Level,Subtopic,Topic_number,Topic
0,Foundational Concepts,Basics,0,Introduction
1,Foundational Concepts,Working with Tables,0,Create
2,Foundational Concepts,Data Manipulation,0,Insert
3,Intermediate Concepts,Data Retrieval and Filtering,0,WHERE
4,Intermediate Concepts,Aggregations and Grouping,0,Aggregate Functions
...,...,...,...,...
135,Specialized Use Cases,JSON and XML in SQL,6,
136,Specialized Use Cases,Advanced Analytical Functions,6,
137,Project-based Practice,Case Studies,6,
138,Project-based Practice,End-to-End Projects,6,


In [7]:
# Drop Topic_number column

data_melted.drop(columns= 'Topic_number', inplace=True)

In [8]:
# Drop Null rows

data_melted.dropna(subset= "Topic", inplace=True)

In [9]:
# Display data

display(data_melted)

Unnamed: 0,Level,Subtopic,Topic
0,Foundational Concepts,Basics,Introduction
1,Foundational Concepts,Working with Tables,Create
2,Foundational Concepts,Data Manipulation,Insert
3,Intermediate Concepts,Data Retrieval and Filtering,WHERE
4,Intermediate Concepts,Aggregations and Grouping,Aggregate Functions
...,...,...,...
80,Foundational Concepts,Basics,Comments
81,Foundational Concepts,Working with Tables,Constraints
83,Intermediate Concepts,Data Retrieval and Filtering,LIKE
103,Intermediate Concepts,Data Retrieval and Filtering,BETWEEN


In [13]:
# Custom order for Level column

custom_order = ['Foundational Concepts', 
                'Intermediate Concepts', 
                'Advanced Concepts', 
                'Specialized Use Cases', 
                'Project-based Practice']

data_melted['Level'] = pd.Categorical(data_melted['Level'], categories= custom_order, ordered=True)

data_melted = data_melted.sort_values(by= 'Level').reset_index(drop=True)
data_melted

Unnamed: 0,Level,Subtopic,Topic
0,Foundational Concepts,Basics,Introduction
1,Foundational Concepts,Data Manipulation,Delete
2,Foundational Concepts,Data Manipulation,Update
3,Foundational Concepts,Basics,Queries
4,Foundational Concepts,Working with Tables,Keys
...,...,...,...
60,Project-based Practice,End-to-End Projects,Custom Data Models
61,Project-based Practice,Interview-style Problems,Optimization
62,Project-based Practice,End-to-End Projects,AdventureWorks Scenarios
63,Project-based Practice,Interview-style Problems,Debugging


In [15]:
# Save the new DataFrame into Roadmap.xlsx

with pd.ExcelWriter("Roadmap.xlsx", engine= "openpyxl", mode="a") as writer:
    data_melted.to_excel(writer, sheet_name= "Transformed_Roadmap", index= False)