## **Import Libraries and Data** 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as datetime

In [2]:
# Read the raw workbook exactly as stored; its stacked header rows
# (segment on top, ship mode underneath) are untangled in the cells below.
data = pd.read_excel(io="Badly-Structured-Sales-Data-1.xlsx")
data.head(n=5)

Unnamed: 0,Segment>>,Consumer,Unnamed: 2,Unnamed: 3,Unnamed: 4,Consumer Total,Corporate,Unnamed: 7,Unnamed: 8,Unnamed: 9,Corporate Total,Home Office,Unnamed: 12,Unnamed: 13,Unnamed: 14,Home Office Total
0,Ship Mode>>,First Class,Same Day,Second Class,Standard Class,,First Class,Same Day,Second Class,Standard Class,,First Class,Same Day,Second Class,Standard Class,
1,Order ID,,,,,,,,,,,,,,,
2,CA-2011-100293,,,,,,,,,,,,,,91.056,91.056
3,CA-2011-100706,,,129.44,,129.44,,,,,,,,,,
4,CA-2011-100895,,,,605.47,605.47,,,,,,,,,,


## **Problem:** Transform the data so that it can be used for analysis. 

### **Proposed Solution:** 
- Create 4 columns
    - Segment
    - Order ID
    - Ship Mode
    - Sales

- **Workflow:**
    - Delete the columns whose names contain `Total`.
    - Rename each `Unnamed` column with the name of the preceding segment column, since it is logical to assume that these columns are sub-headers of the segment named in the previous column.
    - Merge each column name with the value in the first row (the ship mode), store the merged label as the column name, and then drop that first row.
    - Rename the columns so that they carry the merged names (the first column becomes `Order_id`).
    - Melt the dataset to get a long dataset with 3 columns.
    - Rename columns appropriately.
    - Split the column that we merged to create segment and ship mode columns. 

In [3]:
# Remove every column whose header contains "Total"
# (e.g. "Consumer Total", "Corporate Total", "Home Office Total").

columns_to_drop = list(filter(lambda header: "Total" in header, data.columns))

data.drop(labels=columns_to_drop, axis="columns", inplace=True)

display(data.head())

Unnamed: 0,Segment>>,Consumer,Unnamed: 2,Unnamed: 3,Unnamed: 4,Corporate,Unnamed: 7,Unnamed: 8,Unnamed: 9,Home Office,Unnamed: 12,Unnamed: 13,Unnamed: 14
0,Ship Mode>>,First Class,Same Day,Second Class,Standard Class,First Class,Same Day,Second Class,Standard Class,First Class,Same Day,Second Class,Standard Class
1,Order ID,,,,,,,,,,,,
2,CA-2011-100293,,,,,,,,,,,,91.056
3,CA-2011-100706,,,129.44,,,,,,,,,
4,CA-2011-100895,,,,605.47,,,,,,,,


In [4]:
# Rename "Unnamed" columns with the previous segment name.
#
# The new labels are collected in a plain list and assigned in one step.
# A pandas Index is documented as immutable, so writing into
# `data.columns.values` element by element bypasses pandas entirely.
filled_columns = []
for name in data.columns:
    if isinstance(name, str) and 'Unnamed' in name and filled_columns:
        # Carry forward the closest named column to the left. A leading
        # "Unnamed" column has nothing to its left and is kept as-is
        # (the old `i - 1` lookup would have wrapped around to the last column).
        filled_columns.append(filled_columns[-1])
    else:
        filled_columns.append(name)

data.columns = filled_columns

display(data.head(5))

Unnamed: 0,Segment>>,Consumer,Consumer.1,Consumer.2,Consumer.3,Corporate,Corporate.1,Corporate.2,Corporate.3,Home Office,Home Office.1,Home Office.2,Home Office.3
0,Ship Mode>>,First Class,Same Day,Second Class,Standard Class,First Class,Same Day,Second Class,Standard Class,First Class,Same Day,Second Class,Standard Class
1,Order ID,,,,,,,,,,,,
2,CA-2011-100293,,,,,,,,,,,,91.056
3,CA-2011-100706,,,129.44,,,,,,,,,
4,CA-2011-100895,,,,605.47,,,,,,,,


In [5]:
# Merge each column name with the value found in the first row (row 0 holds the
# ship-mode labels), producing "<column name>_<first-row value>", then drop that row.
#
# All merged labels are built first and assigned in a single step, instead of
# writing into `data.columns.values` while iterating over the same Index.
merged_columns = [
    f"{name}_{data.iloc[0, position]}"
    for position, name in enumerate(data.columns)
]
data.columns = merged_columns

# Drop first row: its content now lives in the column names
data.drop(index=0, inplace=True)

display(data.head(5))

Unnamed: 0,Segment>>_Ship Mode>>,Consumer_First Class,Consumer_Same Day,Consumer_Second Class,Consumer_Standard Class,Corporate_First Class,Corporate_Same Day,Corporate_Second Class,Corporate_Standard Class,Home Office_First Class,Home Office_Same Day,Home Office_Second Class,Home Office_Standard Class
1,Order ID,,,,,,,,,,,,
2,CA-2011-100293,,,,,,,,,,,,91.056
3,CA-2011-100706,,,129.44,,,,,,,,,
4,CA-2011-100895,,,,605.47,,,,,,,,
5,CA-2011-100916,,,,,,,,788.86,,,,


In [6]:
# Row 1 only carries the "Order ID" caption and no sales values, so remove it too

data.drop(labels=[1], axis="index", inplace=True)

display(data.head())

Unnamed: 0,Segment>>_Ship Mode>>,Consumer_First Class,Consumer_Same Day,Consumer_Second Class,Consumer_Standard Class,Corporate_First Class,Corporate_Same Day,Corporate_Second Class,Corporate_Standard Class,Home Office_First Class,Home Office_Same Day,Home Office_Second Class,Home Office_Standard Class
2,CA-2011-100293,,,,,,,,,,,,91.056
3,CA-2011-100706,,,129.44,,,,,,,,,
4,CA-2011-100895,,,,605.47,,,,,,,,
5,CA-2011-100916,,,,,,,,788.86,,,,
6,CA-2011-101266,,,13.36,,,,,,,,,


In [7]:
# Give the first column (the order identifiers) the name 'Order_id'.
#
# Every other column already carries its merged "<Segment>_<Ship Mode>" label,
# so those labels are reused rather than typed out again by hand; a hand-written
# list would silently mislabel (or fail on a length mismatch) if the set of
# segments or ship modes in the workbook ever changed.
data.columns = ['Order_id', *data.columns[1:]]

data.columns

Index(['Order_id', 'Consumer_First Class', 'Consumer_Same Day',
       'Consumer_Second Class', 'Consumer_Standard Class',
       'Corporate_First Class', 'Corporate_Same Day', 'Corporate_Second Class',
       'Corporate_Standard Class', 'Home Office_First Class',
       'Home Office_Same Day', 'Home Office_Second Class',
       'Home Office_Standard Class'],
      dtype='object')

In [8]:
# Exclude the "Grand Total" summary row so it is not melted in as if it were an order
data = data.loc[data['Order_id'].ne('Grand Total')]

In [9]:
# Melt the dataset to get a long dataset with 3 columns:
# Order_id, the merged "Segment Ship" label, and Sales.

data_long = (
    data.melt(id_vars="Order_id", var_name="Segment Ship", value_name='Sales')
    # Keep only the order / segment / ship-mode combinations that have a sale.
    .loc[lambda frame: frame['Sales'].notna()]
    # The melted values come from columns that originally also held header text,
    # so they arrive as generic objects; convert them to a numeric dtype so that
    # aggregations and describe() treat Sales as numbers.
    .assign(Sales=lambda frame: pd.to_numeric(frame['Sales']))
    .reset_index(drop=True)
)
data_long

Unnamed: 0,Order_id,Segment Ship,Sales
0,CA-2011-103366,Consumer_First Class,149.95
1,CA-2011-109043,Consumer_First Class,243.6
2,CA-2011-113166,Consumer_First Class,9.568
3,CA-2011-124023,Consumer_First Class,8.96
4,CA-2011-130155,Consumer_First Class,34.2
...,...,...,...
817,US-2014-129224,Home Office_Standard Class,4.608
818,US-2014-132031,Home Office_Standard Class,513.496
819,US-2014-132297,Home Office_Standard Class,598.31
820,US-2014-132675,Home Office_Standard Class,148.16


In [10]:
# Split the merged "<Segment>_<Ship Mode>" label back into its two parts.
#
# n=1 splits on the first underscore only, so each label yields at most two
# pieces; without the limit, a label with an extra underscore would produce a
# third column and the two-column assignment below would fail.
data_long[['Segment', 'Ship Mode']] = data_long['Segment Ship'].str.split("_", n=1, expand=True)

# The combined label is redundant once both parts have their own column
data_long.drop(columns="Segment Ship", inplace=True)

display(data_long)

Unnamed: 0,Order_id,Sales,Segment,Ship Mode
0,CA-2011-103366,149.95,Consumer,First Class
1,CA-2011-109043,243.6,Consumer,First Class
2,CA-2011-113166,9.568,Consumer,First Class
3,CA-2011-124023,8.96,Consumer,First Class
4,CA-2011-130155,34.2,Consumer,First Class
...,...,...,...,...
817,US-2014-129224,4.608,Home Office,Standard Class
818,US-2014-132031,513.496,Home Office,Standard Class
819,US-2014-132297,598.31,Home Office,Standard Class
820,US-2014-132675,148.16,Home Office,Standard Class
