# Mini Project : Data Wrangling..

In [1]:
import pandas as pd

In [5]:
# 1-> Loading the Dataset:

In [2]:
# Sample sales data
data = {'Product ID': ['101', '102', '103', '104', '105', 'null'],
        'Quantity Sold': ['100', '90', '80', '70', '60', 'null'],
        'Sales Price': [50, 40, 30, 'null', 20, 15],
        'Region': ['North', 'East', 'West', 'South', 'East', 'North']}

In [3]:
# Convert to DataFrame
df = pd.DataFrame(data)

In [4]:
print("Original Data:")
print(df)

Original Data:
  Product ID Quantity Sold Sales Price Region
0        101           100          50  North
1        102            90          40   East
2        103            80          30   West
3        104            70        null  South
4        105            60          20   East
5       null          null          15  North


In [None]:
# 2-> Handling Missing Values:

In [None]:
# Replace 'null' strings with actual NaN
df.replace('null', pd.NA, inplace=True)

In [8]:
# Drop rows with missing 'Product ID'
df.dropna(subset=['Product ID'], inplace=True)

In [None]:
# Fill missing 'Sales Price' with mean value
df['Sales Price'] = df['Sales Price'].astype(float)
df['Sales Price'].fillna(df['Sales Price'].mean(), inplace=True)

In [12]:
print("\\nAfter Handling Missing Values:")
print(df)

\nAfter Handling Missing Values:
  Product ID Quantity Sold Sales Price Region
0        101           100          50  North
1        102            90          40   East
2        103            80          30   West
3        104            70        <NA>  South
4        105            60          20   East


In [None]:
# 3->  Data Type Conversion:

In [13]:
# Convert 'Product ID' and 'Quantity Sold' to integer
df['Product ID'] = df['Product ID'].astype(int)
df['Quantity Sold'] = df['Quantity Sold'].astype(int)

In [14]:
print("\\nAfter Data Type Conversion:")
print(df)

\nAfter Data Type Conversion:
   Product ID  Quantity Sold Sales Price Region
0         101            100          50  North
1         102             90          40   East
2         103             80          30   West
3         104             70        <NA>  South
4         105             60          20   East


In [None]:
# 4-> Data Transformation:

In [15]:
# Calculate total sales and add a new column
df['Total Sales'] = df['Quantity Sold'] * df['Sales Price']

In [16]:
print("\\nAfter Data Transformation:")
print(df)

\nAfter Data Transformation:
   Product ID  Quantity Sold Sales Price Region Total Sales
0         101            100          50  North        5000
1         102             90          40   East        3600
2         103             80          30   West        2400
3         104             70        <NA>  South        <NA>
4         105             60          20   East        1200


In [None]:
# 5-> Handling Categorical Data:

In [17]:
# Convert 'Region' to categorical type
df['Region'] = df['Region'].astype('category')

In [18]:
print("\\nAfter Handling Categorical Data:")
print(df)

\nAfter Handling Categorical Data:
   Product ID  Quantity Sold Sales Price Region Total Sales
0         101            100          50  North        5000
1         102             90          40   East        3600
2         103             80          30   West        2400
3         104             70        <NA>  South        <NA>
4         105             60          20   East        1200


In [None]:
# 6-> Removing Duplicates:

In [19]:
# Drop duplicate rows if any
df.drop_duplicates(inplace=True)

In [20]:
print("\\nAfter Removing Duplicates:")
print(df)

\nAfter Removing Duplicates:
   Product ID  Quantity Sold Sales Price Region Total Sales
0         101            100          50  North        5000
1         102             90          40   East        3600
2         103             80          30   West        2400
3         104             70        <NA>  South        <NA>
4         105             60          20   East        1200


In [21]:
print("\\nFinal Data:")
print(df)

\nFinal Data:
   Product ID  Quantity Sold Sales Price Region Total Sales
0         101            100          50  North        5000
1         102             90          40   East        3600
2         103             80          30   West        2400
3         104             70        <NA>  South        <NA>
4         105             60          20   East        1200


In [22]:
# In this mini-project, we've walked through loading the data, handling missing values, converting data types,
# transforming data, handling categorical data, removing duplicates, and a bit about normalization and feature engineering.
# Depending on the dataset and the problem at hand, additional steps and techniques may be necessary.


#### Improve it more later