In [1]:
import pandas as pd
# Sample transactions: five distinct orders, each listed twice under a
# different TransactionID (rows 6-10 repeat rows 1-5 in every other column).
transactions_data = {
    'TransactionID': list(range(1, 11)),
    'CustomerID': [101, 102, 103, 104, 105] * 2,
    'ProductID': [201, 202, 203, 204, 205] * 2,
    'Quantity': [2, 1, 3, 5, 1] * 2,
    'Price': [10.0, 15.0, 7.5, 20.0, 15.0] * 2,
    'Order Date': [
        '2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04', '2024-01-05',
    ] * 2,
    'CustomerLocation': [
        'New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix',
    ] * 2,
}

# Product lookup table: one row per ProductID.
products_data = dict(
    ProductID=[201, 202, 203, 204, 205],
    ProductName=['Widget ' + suffix for suffix in 'ABCDE'],
    Category=['Gadgets'] * 3 + ['Tools'] * 2,
)

# Materialise both tables as DataFrames for the steps that follow.
transactions_df = pd.DataFrame(data=transactions_data)
products_df = pd.DataFrame(data=products_data)

Merge the two tables on ProductID:

In [2]:
# Attach each transaction's product name and category with an inner join
# on the shared ProductID key, then preview the first five rows.
integrated_df = pd.merge(
    left=transactions_df,
    right=products_df,
    how='inner',
    on='ProductID',
)
print(integrated_df.head(n=5))

   TransactionID  CustomerID  ProductID  Quantity  Price  Order Date  \
0              1         101        201         2   10.0  2024-01-01   
1              6         101        201         2   10.0  2024-01-01   
2              2         102        202         1   15.0  2024-01-02   
3              7         102        202         1   15.0  2024-01-02   
4              3         103        203         3    7.5  2024-01-03   

  CustomerLocation ProductName Category  
0         New York    Widget A  Gadgets  
1         New York    Widget A  Gadgets  
2      Los Angeles    Widget B  Gadgets  
3      Los Angeles    Widget B  Gadgets  
4          Chicago    Widget C  Gadgets  


Removing Duplicates

In [3]:
# Drop rows that are identical in every column. The result is rebound to the
# name instead of using `inplace=True`, which offers no benefit and prevents
# method chaining.
# NOTE(review): every TransactionID in the sample data is distinct, so no two
# rows match on all columns and this step removes nothing here. If repeated
# orders are meant to count as duplicates, confirm and pass a `subset=` that
# excludes TransactionID.
integrated_df = integrated_df.drop_duplicates()

Handling Outliers in Price Column Using IQR

In [7]:
# Keep only rows whose Price lies inside the 1.5 * IQR fences around the
# first and third quartiles (bounds inclusive).
IQR_MULTIPLIER = 1.5
Q1 = integrated_df['Price'].quantile(0.25)
Q3 = integrated_df['Price'].quantile(0.75)
IQR = Q3 - Q1
price_in_range = integrated_df['Price'].between(
    Q1 - IQR_MULTIPLIER * IQR,
    Q3 + IQR_MULTIPLIER * IQR,
)
# .copy() makes the filtered frame independent of the one it was sliced from,
# so assigning columns to it later does not raise SettingWithCopyWarning when
# rows were actually removed.
integrated_df = integrated_df[price_in_range].copy()

Min-Max Normalization

In [8]:
def min_max_scale(values: pd.Series) -> pd.Series:
    """Rescale a numeric Series linearly as (x - min) / (max - min).

    The smallest value maps to 0 and the largest to 1. A constant Series
    (max == min) maps to 0.0 instead of the NaN that 0/0 would produce.
    """
    lowest = values.min()
    span = values.max() - lowest
    # Dividing by 1 when the span is zero leaves (x - min) == 0 for every value.
    return (values - lowest) / (span if span != 0 else 1)


# Replace Quantity and Price with their scaled versions. `.assign` returns a
# new frame, so nothing is written into a frame that may be a filtered slice.
integrated_df = integrated_df.assign(
    Quantity=min_max_scale(integrated_df['Quantity']),
    Price=min_max_scale(integrated_df['Price']),
)
print(integrated_df.head())

   TransactionID  CustomerID  ProductID  Quantity  Price  Order Date  \
0              1         101        201      0.25    0.2  2024-01-01   
1              6         101        201      0.25    0.2  2024-01-01   
2              2         102        202      0.00    0.6  2024-01-02   
3              7         102        202      0.00    0.6  2024-01-02   
4              3         103        203      0.50    0.0  2024-01-03   

  CustomerLocation ProductName Category  
0         New York    Widget A  Gadgets  
1         New York    Widget A  Gadgets  
2      Los Angeles    Widget B  Gadgets  
3      Los Angeles    Widget B  Gadgets  
4          Chicago    Widget C  Gadgets  


One-Hot Encoding

In [9]:
# One-hot encode Category, replacing the original column. drop_first=True
# omits the indicator for the first category, so with the two categories in
# the sample data only Category_Tools remains.
integrated_df = pd.get_dummies(
    data=integrated_df,
    columns=['Category'],
    drop_first=True,
)
print(integrated_df.head(n=5))

   TransactionID  CustomerID  ProductID  Quantity  Price  Order Date  \
0              1         101        201      0.25    0.2  2024-01-01   
1              6         101        201      0.25    0.2  2024-01-01   
2              2         102        202      0.00    0.6  2024-01-02   
3              7         102        202      0.00    0.6  2024-01-02   
4              3         103        203      0.50    0.0  2024-01-03   

  CustomerLocation ProductName  Category_Tools  
0         New York    Widget A           False  
1         New York    Widget A           False  
2      Los Angeles    Widget B           False  
3      Los Angeles    Widget B           False  
4          Chicago    Widget C           False  
