In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import missingno as msno
import re

In [3]:
iphone = pd.read_csv("Raw-Csvs/iphone.csv")
print(iphone.shape)
oneplus = pd.read_csv("Raw-Csvs/oneplus.csv")
print(oneplus.shape)
oppo = pd.read_csv("Raw-Csvs/oppo.csv")
print(oppo.shape)
realme = pd.read_csv("Raw-Csvs/realme.csv")
print(realme.shape)
samsung = pd.read_csv("Raw-Csvs/samsung.csv")
print(samsung.shape)
vivo = pd.read_csv("Raw-Csvs/vivo.csv")
print(vivo.shape)
xiaomi = pd.read_csv("Raw-Csvs/xiaomi.csv")
print(xiaomi.shape)

(81, 8)
(91, 8)
(94, 8)
(92, 8)
(81, 8)
(90, 8)
(87, 8)


**Filling iphone data**

In [4]:
iphone.head(7)

Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery
0,Apple iPhone 15 Pro (512 GB) - Blue Titanium,4.4,363,M.R.P:,139900.0,₹164900,(15% off),No
1,Apple iPhone 14 (128 GB) - Purple,4.5,5729,100+ bought in past month,55990.0,₹69600,(20% off),No
2,Apple iPhone 13 (128GB) - Starlight,4.5,31459,1K+ bought in past month,45490.0,₹59900,(24% off),No
3,Apple iPhone 13 (128GB) - Midnight,4.5,31459,1K+ bought in past month,45490.0,₹59600,(24% off),No
4,Apple iPhone 13 (128GB) - Pink,4.5,31459,500+ bought in past month,45490.0,₹59900,(24% off),No
5,Apple iPhone 13 (128GB) - Blue,4.5,31459,500+ bought in past month,45490.0,₹59600,(24% off),No
6,Apple iPhone 14 (128 GB) - Midnight,4.5,5729,300+ bought in past month,52990.0,₹69600,(24% off),No


In [5]:
iphone.isnull().sum()

Product Name          0
Rating (Stars)       11
Number of Reviews    11
Bought Last Month     4
Current MRP           2
Dashed MRP            9
Discount (%)          9
Free Delivery         0
dtype: int64

In [6]:
# Drop rows with any NaN values in the iphone DataFrame
iphone.dropna(inplace=True)

# Display the updated DataFrame to verify the changes
iphone.head()

Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery
0,Apple iPhone 15 Pro (512 GB) - Blue Titanium,4.4,363,M.R.P:,139900.0,₹164900,(15% off),No
1,Apple iPhone 14 (128 GB) - Purple,4.5,5729,100+ bought in past month,55990.0,₹69600,(20% off),No
2,Apple iPhone 13 (128GB) - Starlight,4.5,31459,1K+ bought in past month,45490.0,₹59900,(24% off),No
3,Apple iPhone 13 (128GB) - Midnight,4.5,31459,1K+ bought in past month,45490.0,₹59600,(24% off),No
4,Apple iPhone 13 (128GB) - Pink,4.5,31459,500+ bought in past month,45490.0,₹59900,(24% off),No


In [7]:
iphone.columns

Index(['Product Name', 'Rating (Stars)', 'Number of Reviews',
       'Bought Last Month', 'Current MRP', 'Dashed MRP', 'Discount (%)',
       'Free Delivery'],
      dtype='object')

In [8]:
# Replace 'M.R.P:' with '0' in 'Bought Last Month' column
iphone['Bought Last Month'] = iphone['Bought Last Month'].replace('M.R.P:', '0')

# Replace NaN values in 'Dashed MRP' with 'Current MRP' and NaN values in 'Discount (%)' with 0
iphone['Dashed MRP'] = iphone['Dashed MRP'].fillna(iphone['Current MRP'])
iphone['Discount (%)'] = iphone['Discount (%)'].fillna('0')

iphone.head()

Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery
0,Apple iPhone 15 Pro (512 GB) - Blue Titanium,4.4,363,0,139900.0,₹164900,(15% off),No
1,Apple iPhone 14 (128 GB) - Purple,4.5,5729,100+ bought in past month,55990.0,₹69600,(20% off),No
2,Apple iPhone 13 (128GB) - Starlight,4.5,31459,1K+ bought in past month,45490.0,₹59900,(24% off),No
3,Apple iPhone 13 (128GB) - Midnight,4.5,31459,1K+ bought in past month,45490.0,₹59600,(24% off),No
4,Apple iPhone 13 (128GB) - Pink,4.5,31459,500+ bought in past month,45490.0,₹59900,(24% off),No


In [9]:
# Step 1: Filter only Apple iPhone data
iphone = iphone[iphone['Product Name'].str.contains('Apple', case=False) & iphone['Product Name'].str.contains('iPhone', case=False)]

# Step 2: Extract details from Product Name using regex
def extract_details(product_name):
    # Regular expression pattern to capture brand, product name, storage, and color
    pattern = r"([A-Za-z]+)\s([A-Za-z0-9]+(?:\s[A-Za-z0-9]+)*)\s\((\d+)\s?GB\)\s?-\s?([A-Za-z\s]+)"
    match = re.match(pattern, product_name)
    
    if match:
        brand = match.group(1)
        product = match.group(2)
        storage = match.group(3) + "GB"
        color = match.group(4).strip()
        return [brand, product, storage, color]
    return [None, None, None, None]

# Apply the function to extract details and assign to relevant columns
iphone[['Brand', 'Product', 'Storage', 'Color']] = iphone['Product Name'].apply(lambda x: pd.Series(extract_details(x)))

# Step 3: Convert "Bought Last Month" to numeric, handle NaN as 0
def convert_bought_last_month(value):
    if pd.isna(value):  # Check for NaN values and return 0
        return 0
    # Extract number from string (handle cases like 5K, 200+)
    match = re.search(r'(\d+)(K|\+)?', value)
    if match:
        number = int(match.group(1))  # Get the number part
        if match.group(2) == 'K':  # If 'K' is present, multiply by 1000
            number *= 1000
        return number
    return 0  # Return 0 in case of invalid data

# Step 4: Convert "Dashed MRP" to numeric, handle NaN and assign to Current MRP
def convert_dashed_mrp(value, current_mrp):
    if pd.isna(value):  # If NaN, use the Current MRP
        return current_mrp
    # Remove ₹ symbol and commas, then convert to integer
    if isinstance(value, str):
        return int(value.replace('₹', '').replace(',', '').strip())
    return current_mrp  # If conversion fails, return current MRP

# Step 5: Convert "Discount (%)" to numeric, handle NaN as 0
def convert_discount(value):
    if pd.isna(value):  # If NaN, return 0
        return 0
    # Check if the value is a string and contains '%' symbol
    if isinstance(value, str):
        # Extract the number before the '%' sign, ignoring any non-numeric characters
        match = re.search(r'(\d+)', value)
        if match:
            return float(match.group(1))
    return 0  # Return 0 if the value cannot be converted



# Step 6: Drop rows where "Current MRP" is NaN
iphone = iphone.dropna(subset=['Current MRP'])

# Step 7: Apply conversions
iphone['New Bought Last Month'] = iphone['Bought Last Month'].apply(convert_bought_last_month)
iphone['New Dashed MRP'] = iphone.apply(lambda row: convert_dashed_mrp(row['Dashed MRP'], row['Current MRP']), axis=1)
iphone['New Discount (%)'] = iphone['Discount (%)'].apply(convert_discount)

# Step 8: Map "Free Delivery" values to 0 (No) and 1 (Yes)
iphone['Free Delivery'] = iphone['Free Delivery'].map({'No': 0, 'Yes': 1})

# Step 9: Replace missing "Color" with "Red"
iphone['Color'] = iphone['Color'].fillna('Red')

# Step 10: Show the cleaned and processed data
# print(iphone[['Brand', 'Product', 'Storage', 'Color', 'New Bought Last Month', 'New Dashed MRP', 'New Discount (%)', 'Free Delivery']])

iphone

Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery,Brand,Product,Storage,Color,New Bought Last Month,New Dashed MRP,New Discount (%)
0,Apple iPhone 15 Pro (512 GB) - Blue Titanium,4.4,363,0,139900.0,₹164900,(15% off),0,Apple,iPhone 15 Pro,512GB,Blue Titanium,0,164900,15.0
1,Apple iPhone 14 (128 GB) - Purple,4.5,5729,100+ bought in past month,55990.0,₹69600,(20% off),0,Apple,iPhone 14,128GB,Purple,100,69600,20.0
2,Apple iPhone 13 (128GB) - Starlight,4.5,31459,1K+ bought in past month,45490.0,₹59900,(24% off),0,Apple,iPhone 13,128GB,Starlight,1000,59900,24.0
3,Apple iPhone 13 (128GB) - Midnight,4.5,31459,1K+ bought in past month,45490.0,₹59600,(24% off),0,Apple,iPhone 13,128GB,Midnight,1000,59600,24.0
4,Apple iPhone 13 (128GB) - Pink,4.5,31459,500+ bought in past month,45490.0,₹59900,(24% off),0,Apple,iPhone 13,128GB,Pink,500,59900,24.0
5,Apple iPhone 13 (128GB) - Blue,4.5,31459,500+ bought in past month,45490.0,₹59600,(24% off),0,Apple,iPhone 13,128GB,Blue,500,59600,24.0
6,Apple iPhone 14 (128 GB) - Midnight,4.5,5729,300+ bought in past month,52990.0,₹69600,(24% off),0,Apple,iPhone 14,128GB,Midnight,300,69600,24.0
7,Apple iPhone 13 (128GB) - Green,4.5,31459,500+ bought in past month,45490.0,₹59600,(24% off),0,Apple,iPhone 13,128GB,Green,500,59600,24.0
8,Apple iPhone 15 (128 GB) - Blue,4.5,2431,500+ bought in past month,64900.0,₹79600,(18% off),0,Apple,iPhone 15,128GB,Blue,500,79600,18.0
9,Apple iPhone 15 (128 GB) - Black,4.5,2431,500+ bought in past month,64900.0,₹79600,(18% off),0,Apple,iPhone 15,128GB,Black,500,79600,18.0


In [10]:

# Function to determine RAM based on product name
def get_ram(product_name):
    if "iPhone 16 Pro" in product_name or "iPhone 16" in product_name:
        return "8 GB"
    elif "iPhone 15 Plus" in product_name or "iPhone 15" in product_name:
        return "6 GB"
    elif "iPhone 14 Plus" in product_name or "iPhone 14" in product_name:
        return "6 GB"
    elif "iPhone 13" in product_name:
        return "4 or 6 GB"
    elif "iPhone 12" in product_name:
        return "4 GB"
    elif "iPhone 11" in product_name:
        return "4 GB"
    elif "iPhone SE (2nd gen.)" in product_name:
        return "3 GB"
    else:
        return None  # Default if not matched

# Apply the function to create a new 'RAM' column
iphone["RAM"] = iphone["Product Name"].apply(get_ram)

# Display the updated DataFrame
iphone["RAM"].unique()


array(['6 GB', '4 or 6 GB'], dtype=object)

In [11]:
iphone['Color'] = iphone['Color'].fillna('Black')
iphone['RAM'] = iphone['RAM'].replace('4 or 6 GB', '4 GB')
iphone['Brand'] = "Apple"
iphone.head()

Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery,Brand,Product,Storage,Color,New Bought Last Month,New Dashed MRP,New Discount (%),RAM
0,Apple iPhone 15 Pro (512 GB) - Blue Titanium,4.4,363,0,139900.0,₹164900,(15% off),0,Apple,iPhone 15 Pro,512GB,Blue Titanium,0,164900,15.0,6 GB
1,Apple iPhone 14 (128 GB) - Purple,4.5,5729,100+ bought in past month,55990.0,₹69600,(20% off),0,Apple,iPhone 14,128GB,Purple,100,69600,20.0,6 GB
2,Apple iPhone 13 (128GB) - Starlight,4.5,31459,1K+ bought in past month,45490.0,₹59900,(24% off),0,Apple,iPhone 13,128GB,Starlight,1000,59900,24.0,4 GB
3,Apple iPhone 13 (128GB) - Midnight,4.5,31459,1K+ bought in past month,45490.0,₹59600,(24% off),0,Apple,iPhone 13,128GB,Midnight,1000,59600,24.0,4 GB
4,Apple iPhone 13 (128GB) - Pink,4.5,31459,500+ bought in past month,45490.0,₹59900,(24% off),0,Apple,iPhone 13,128GB,Pink,500,59900,24.0,4 GB


**ONEPLUS**

In [12]:
oneplus.head()

Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery
0,Samsung Galaxy S23 Ultra 5G AI Smartphone (Pha...,4.5,2656,2K+ bought in past month,74990.0,₹149999,(50% off),No
1,"OnePlus Nord CE4 Lite 5G (Super Silver, 8GB RA...",4.1,3398,5K+ bought in past month,19999.0,₹20999,(5% off),No
2,"OnePlus Nord CE 3 5G (Aqua Surge, 8GB RAM, 128...",4.2,5398,5K+ bought in past month,16999.0,₹26999,(37% off),No
3,"OnePlus Nord CE4 Lite 5G (Mega Blue, 8GB RAM, ...",4.1,3398,4K+ bought in past month,19999.0,₹20999,(5% off),No
4,"Oneplus Nord CE4 (Dark Chrome, 8GB RAM, 128GB ...",4.2,6216,2K+ bought in past month,24999.0,,,No


In [13]:
# Step 1: Update 'Bought Last Month' to 0 where it contains 'M.R.P:' or doesn't contain numbers
oneplus['Bought Last Month'] = oneplus['Bought Last Month'].apply(lambda x: 0 if isinstance(x, str) and ('M.R.P:' in x or not any(char.isdigit() for char in x)) else x)

# Step 2: Remove rows where 'Current MRP' is NaN
oneplus = oneplus[oneplus['Current MRP'].notna()]

# Step 3: Remove rows where 'Product Name' does not contain both 'Samsung' and 'Galaxy'
oneplus = oneplus[oneplus['Product Name'].str.contains('OnePlus', case=False) | oneplus['Product Name'].str.contains('Nord', case=False)]

# Step 4: Remove rows where 'Rating (Stars)' is NaN
oneplus = oneplus[oneplus['Rating (Stars)'].notna()]
# Display the cleaned dataframe
oneplus.head()


Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery
1,"OnePlus Nord CE4 Lite 5G (Super Silver, 8GB RA...",4.1,3398,5K+ bought in past month,19999.0,₹20999,(5% off),No
2,"OnePlus Nord CE 3 5G (Aqua Surge, 8GB RAM, 128...",4.2,5398,5K+ bought in past month,16999.0,₹26999,(37% off),No
3,"OnePlus Nord CE4 Lite 5G (Mega Blue, 8GB RAM, ...",4.1,3398,4K+ bought in past month,19999.0,₹20999,(5% off),No
4,"Oneplus Nord CE4 (Dark Chrome, 8GB RAM, 128GB ...",4.2,6216,2K+ bought in past month,24999.0,,,No
5,"OnePlus Nord CE 3 Lite 5G (Pastel Lime, 8GB RA...",4.2,59050,1K+ bought in past month,15690.0,₹19999,(22% off),No


In [14]:
# Replace 'M.R.P:' with '0' in 'Bought Last Month' column
oneplus['Bought Last Month'] = oneplus['Bought Last Month'].replace('M.R.P:', '0')

# Replace NaN values in 'Dashed MRP' with 'Current MRP' and NaN values in 'Discount (%)' with 0
oneplus['Dashed MRP'] = oneplus['Dashed MRP'].fillna(oneplus['Current MRP'])
oneplus['Discount (%)'] = oneplus['Discount (%)'].fillna('0')

oneplus.head()

Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery
1,"OnePlus Nord CE4 Lite 5G (Super Silver, 8GB RA...",4.1,3398,5K+ bought in past month,19999.0,₹20999,(5% off),No
2,"OnePlus Nord CE 3 5G (Aqua Surge, 8GB RAM, 128...",4.2,5398,5K+ bought in past month,16999.0,₹26999,(37% off),No
3,"OnePlus Nord CE4 Lite 5G (Mega Blue, 8GB RAM, ...",4.1,3398,4K+ bought in past month,19999.0,₹20999,(5% off),No
4,"Oneplus Nord CE4 (Dark Chrome, 8GB RAM, 128GB ...",4.2,6216,2K+ bought in past month,24999.0,24999.0,0,No
5,"OnePlus Nord CE 3 Lite 5G (Pastel Lime, 8GB RA...",4.2,59050,1K+ bought in past month,15690.0,₹19999,(22% off),No


In [15]:
# Define color options to check against
colors = ['Black', 'Blue', 'White', 'Gray', 'Silver', 'Green', 'Red', 'Pink', 'Gold']

# Step 1: Define Brand based on Product Name
oneplus['Brand'] = oneplus['Product Name'].apply(lambda x: 'OnePlus' if 'OnePlus' in x else 'Unknown')

# Step 2: Extract Product Name (i.e., everything after 'OnePlus')
oneplus['Product'] = oneplus['Product Name'].apply(
    lambda x: re.search(r'OnePlus ([^(]+)', x).group(0) if re.search(r'OnePlus ([^(]+)', x) else None
)

# Step 3: Extract Color (if exists, otherwise default to 'Black')
oneplus['Color'] = oneplus['Product Name'].apply(
    lambda x: next((color for color in colors if color in x), 'Black')
)

# Step 4: Extract RAM (in the format of '6GB', '8GB', etc.)
oneplus['RAM'] = oneplus['Product Name'].apply(
    lambda x: re.search(r'(\d+GB)(?=\s*RAM)', x).group(0) if re.search(r'(\d+GB)(?=\s*RAM)', x) else None
)

# Step 5: Extract Storage (in the format of '128GB', '256GB', etc.)
oneplus['Storage'] = oneplus['Product Name'].apply(
    lambda x: re.search(r'(\d+GB)(?=\s*Storage)', x).group(0) if re.search(r'(\d+GB)(?=\s*Storage)', x) else None
)

# Step 6: Remove rows where RAM or Storage is missing
oneplus = oneplus.dropna(subset=['RAM', 'Storage'])

# Display the dataframe with the new columns
oneplus.head()


Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery,Brand,Product,Color,RAM,Storage
1,"OnePlus Nord CE4 Lite 5G (Super Silver, 8GB RA...",4.1,3398,5K+ bought in past month,19999.0,₹20999,(5% off),No,OnePlus,OnePlus Nord CE4 Lite 5G,Silver,8GB,128GB
2,"OnePlus Nord CE 3 5G (Aqua Surge, 8GB RAM, 128...",4.2,5398,5K+ bought in past month,16999.0,₹26999,(37% off),No,OnePlus,OnePlus Nord CE 3 5G,Black,8GB,128GB
3,"OnePlus Nord CE4 Lite 5G (Mega Blue, 8GB RAM, ...",4.1,3398,4K+ bought in past month,19999.0,₹20999,(5% off),No,OnePlus,OnePlus Nord CE4 Lite 5G,Blue,8GB,128GB
4,"Oneplus Nord CE4 (Dark Chrome, 8GB RAM, 128GB ...",4.2,6216,2K+ bought in past month,24999.0,24999.0,0,No,Unknown,,Black,8GB,128GB
5,"OnePlus Nord CE 3 Lite 5G (Pastel Lime, 8GB RA...",4.2,59050,1K+ bought in past month,15690.0,₹19999,(22% off),No,OnePlus,OnePlus Nord CE 3 Lite 5G,Black,8GB,128GB


In [16]:
# Function to convert Bought Last Month to a numeric value
def convert_bought_last_month(value):
    if pd.isna(value) or not isinstance(value, str):  # Check for NaN or non-string values
        return 0  # Return 0 if NaN or not a string
    # Extract number from string (handle cases like 5K, 200+)
    match = re.search(r'(\d+)(K|\+)?', value)
    if match:
        number = int(match.group(1))  # Get the number part
        if match.group(2) == 'K':  # If 'K' is present, multiply by 1000
            number *= 1000
        return number
    return 0  # Return 0 if regex doesn't match

# Function to convert Dashed MRP to numeric value
def convert_dashed_mrp(value, current_mrp):
    if pd.isna(value):  # If NaN, use the Current MRP
        return current_mrp
    # Remove ₹ symbol and commas, then convert to integer
    if isinstance(value, str):
        return int(value.replace('₹', '').replace(',', '').strip())
    return current_mrp  # If conversion fails, return current MRP

# Function to convert Discount (%) to numeric, handle NaN as 0
def convert_discount(value):
    if pd.isna(value):  # If NaN, return 0
        return 0
    # Check if the value is a string and contains '%' symbol
    if isinstance(value, str):
        # Extract the number before the '%' sign, ignoring any non-numeric characters
        match = re.search(r'(\d+)', value)
        if match:
            return float(match.group(1))
    return 0  # Return 0 if the value cannot be converted

# Apply the conversion functions to the respective columns
oneplus['New Bought Last Month'] = oneplus['Bought Last Month'].apply(convert_bought_last_month)
oneplus['New Dashed MRP'] = oneplus['Dashed MRP'].apply(lambda x: convert_dashed_mrp(x, oneplus['Current MRP']))
oneplus['New Discount (%)'] = oneplus['Discount (%)'].apply(convert_discount)
oneplus["Brand"] = "oneplus"
# Display the dataframe with the new columns
oneplus


Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery,Brand,Product,Color,RAM,Storage,New Bought Last Month,New Dashed MRP,New Discount (%)
1,"OnePlus Nord CE4 Lite 5G (Super Silver, 8GB RA...",4.1,3398,5K+ bought in past month,19999.0,₹20999,(5% off),No,oneplus,OnePlus Nord CE4 Lite 5G,Silver,8GB,128GB,5000,20999,5.0
2,"OnePlus Nord CE 3 5G (Aqua Surge, 8GB RAM, 128...",4.2,5398,5K+ bought in past month,16999.0,₹26999,(37% off),No,oneplus,OnePlus Nord CE 3 5G,Black,8GB,128GB,5000,26999,37.0
3,"OnePlus Nord CE4 Lite 5G (Mega Blue, 8GB RAM, ...",4.1,3398,4K+ bought in past month,19999.0,₹20999,(5% off),No,oneplus,OnePlus Nord CE4 Lite 5G,Blue,8GB,128GB,4000,20999,5.0
4,"Oneplus Nord CE4 (Dark Chrome, 8GB RAM, 128GB ...",4.2,6216,2K+ bought in past month,24999.0,24999.0,0,No,oneplus,,Black,8GB,128GB,2000,1 19999.0 2 16999.0 3 19999.0 4 ...,0.0
5,"OnePlus Nord CE 3 Lite 5G (Pastel Lime, 8GB RA...",4.2,59050,1K+ bought in past month,15690.0,₹19999,(22% off),No,oneplus,OnePlus Nord CE 3 Lite 5G,Black,8GB,128GB,1000,19999,22.0
6,"OnePlus Nord CE 3 Lite 5G (Chromatic Gray, 8GB...",4.2,59050,2K+ bought in past month,15679.0,₹19999,(22% off),No,oneplus,OnePlus Nord CE 3 Lite 5G,Gray,8GB,128GB,2000,19999,22.0
8,"OnePlus Nord CE4 Lite 5G (Ultra Orange, 8GB RA...",4.1,3398,1K+ bought in past month,19999.0,₹20999,(5% off),No,oneplus,OnePlus Nord CE4 Lite 5G,Black,8GB,128GB,1000,20999,5.0
12,"OnePlus Nord CE4 Lite 5G (Super Silver, 8GB RA...",4.1,3398,500+ bought in past month,21499.0,₹23999,(10% off),No,oneplus,OnePlus Nord CE4 Lite 5G,Silver,8GB,256GB,500,23999,10.0
13,"OnePlus 12R (Iron Gray, 16GB RAM, 256GB Storage)",4.3,3529,1K+ bought in past month,40999.0,₹45999,(11% off),No,oneplus,OnePlus 12R,Gray,16GB,256GB,1000,45999,11.0
14,"OnePlus Nord 4 5G (Oasis Green, 8GB RAM, 256GB...",4.2,1686,1K+ bought in past month,32999.0,32999.0,0,No,oneplus,OnePlus Nord 4 5G,Green,8GB,256GB,1000,1 19999.0 2 16999.0 3 19999.0 4 ...,0.0


In [17]:
# Check if the length of 'New Dashed MRP' is greater than 5 and set it to 'Dashed MRP' if true
iphone['New Dashed MRP'] = iphone.apply(lambda row: row['Dashed MRP'] if len(str(row['New Dashed MRP'])) > 5 else row['New Dashed MRP'], axis=1)
oneplus['New Dashed MRP'] = oneplus.apply(lambda row: row['Dashed MRP'] if len(str(row['New Dashed MRP'])) > 5 else row['New Dashed MRP'], axis=1)

# Display the updated DataFrames
iphone.head()
oneplus.head()

Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery,Brand,Product,Color,RAM,Storage,New Bought Last Month,New Dashed MRP,New Discount (%)
1,"OnePlus Nord CE4 Lite 5G (Super Silver, 8GB RA...",4.1,3398,5K+ bought in past month,19999.0,₹20999,(5% off),No,oneplus,OnePlus Nord CE4 Lite 5G,Silver,8GB,128GB,5000,20999.0,5.0
2,"OnePlus Nord CE 3 5G (Aqua Surge, 8GB RAM, 128...",4.2,5398,5K+ bought in past month,16999.0,₹26999,(37% off),No,oneplus,OnePlus Nord CE 3 5G,Black,8GB,128GB,5000,26999.0,37.0
3,"OnePlus Nord CE4 Lite 5G (Mega Blue, 8GB RAM, ...",4.1,3398,4K+ bought in past month,19999.0,₹20999,(5% off),No,oneplus,OnePlus Nord CE4 Lite 5G,Blue,8GB,128GB,4000,20999.0,5.0
4,"Oneplus Nord CE4 (Dark Chrome, 8GB RAM, 128GB ...",4.2,6216,2K+ bought in past month,24999.0,24999.0,0,No,oneplus,,Black,8GB,128GB,2000,24999.0,0.0
5,"OnePlus Nord CE 3 Lite 5G (Pastel Lime, 8GB RA...",4.2,59050,1K+ bought in past month,15690.0,₹19999,(22% off),No,oneplus,OnePlus Nord CE 3 Lite 5G,Black,8GB,128GB,1000,19999.0,22.0


**OPPO**

In [18]:

# Step 1: Update 'Bought Last Month' to 0 where it contains 'M.R.P:' or doesn't contain numbers
oppo['Bought Last Month'] = oppo['Bought Last Month'].apply(lambda x: 0 if isinstance(x, str) and ('M.R.P:' in x or not any(char.isdigit() for char in x)) else x)

# Step 2: Remove rows where 'Current MRP' is NaN
oppo = oppo[oppo['Current MRP'].notna()]

# Step 3: Remove rows where 'Product Name' does not contain both 'Samsung' and 'Galaxy'
oppo = oppo[oppo['Product Name'].str.contains('Oppo', case=False)]

# Step 4: Remove rows where 'Rating (Stars)' is NaN
oppo = oppo[oppo['Rating (Stars)'].notna()]
# Display the cleaned dataframe
oppo.head()


Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery
0,"OPPO A3X 4G (Nebula Red, 4GB RAM, 128GB Storag...",4.3,10,50+ bought in past month,9999.0,₹13999,(29% off),Yes
1,"OPPO A3X 4G (Nebula Red, 4GB RAM, 64GB Storage...",4.3,10,300+ bought in past month,8999.0,₹12999,(31% off),Yes
2,"Oppo A3 Pro 5G (Moonlight Purple, 8Gb Ram, 128...",3.8,3,50+ bought in past month,15600.0,₹20999,(26% off),Yes
4,"OPPO F27 Pro+ 5G (Midnight Navy, 8GB RAM, 256G...",4.0,358,500+ bought in past month,29999.0,₹34999,(14% off),Yes
5,"OPPO F27 5G (Emerald Green, 8GB RAM, 128GB Sto...",3.3,99,300+ bought in past month,20999.0,₹26999,(22% off),Yes


In [19]:
# Define color options to check against
colors = ['Black', 'Blue', 'White', 'Gray', 'Silver', 'Green', 'Red', 'Pink', 'Gold']

# Step 1: Define Brand based on Product Name
oppo['Brand'] = oppo['Product Name'].apply(lambda x: 'Oppo' if 'Oppo' in x else 'Unknown')

# Step 2: Extract Product Name (i.e., everything after 'Vivo')
oppo['Product'] = oppo['Product Name'].apply(
    lambda x: re.search(r'Oppo ([^(]+)', x).group(0) if re.search(r'Oppo ([^(]+)', x) else None
)

# Step 3: Extract Color (if exists, otherwise default to 'Black')
oppo['Color'] = oppo['Product Name'].apply(
    lambda x: next((color for color in colors if color in x), 'Black')
)

# Step 4: Extract RAM (in the format of '6GB', '8GB', etc.)
oppo['RAM'] = oppo['Product Name'].apply(
    lambda x: re.search(r'(\d+GB)(?=\s*RAM)', x).group(0) if re.search(r'(\d+GB)(?=\s*RAM)', x) else None
)

# Step 5: Extract Storage (in the format of '128GB', '256GB', etc.)
oppo['Storage'] = oppo['Product Name'].apply(
    lambda x: re.search(r'(\d+GB)(?=\s*Storage)', x).group(0) if re.search(r'(\d+GB)(?=\s*Storage)', x) else None
)

# Step 6: Remove rows where RAM or Storage is missing
oppo = oppo.dropna(subset=['RAM', 'Storage'])
oppo["Brand"] = "oppo" 
# Display the dataframe with the new columns
oppo.head()


Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery,Brand,Product,Color,RAM,Storage
0,"OPPO A3X 4G (Nebula Red, 4GB RAM, 128GB Storag...",4.3,10,50+ bought in past month,9999.0,₹13999,(29% off),Yes,oppo,,Red,4GB,128GB
1,"OPPO A3X 4G (Nebula Red, 4GB RAM, 64GB Storage...",4.3,10,300+ bought in past month,8999.0,₹12999,(31% off),Yes,oppo,,Red,4GB,64GB
4,"OPPO F27 Pro+ 5G (Midnight Navy, 8GB RAM, 256G...",4.0,358,500+ bought in past month,29999.0,₹34999,(14% off),Yes,oppo,,Black,8GB,256GB
5,"OPPO F27 5G (Emerald Green, 8GB RAM, 128GB Sto...",3.3,99,300+ bought in past month,20999.0,₹26999,(22% off),Yes,oppo,,Green,8GB,128GB
6,"OPPO A3 Pro 5G (Moonlight Purple, 8GB RAM, 128...",3.8,184,200+ bought in past month,17999.0,₹20999,(14% off),Yes,oppo,,Black,8GB,128GB


In [20]:
# Function to convert Bought Last Month to a numeric value
def convert_bought_last_month(value):
    if pd.isna(value) or not isinstance(value, str):  # Check for NaN or non-string values
        return 0  # Return 0 if NaN or not a string
    # Extract number from string (handle cases like 5K, 200+)
    match = re.search(r'(\d+)(K|\+)?', value)
    if match:
        number = int(match.group(1))  # Get the number part
        if match.group(2) == 'K':  # If 'K' is present, multiply by 1000
            number *= 1000
        return number
    return 0  # Return 0 if regex doesn't match

# Function to convert Dashed MRP to numeric value
def convert_dashed_mrp(value, current_mrp):
    if pd.isna(value):  # If NaN, use the Current MRP
        return current_mrp
    # Remove ₹ symbol and commas, then convert to integer
    if isinstance(value, str):
        return int(value.replace('₹', '').replace(',', '').strip())
    return current_mrp  # If conversion fails, return current MRP

# Function to convert Discount (%) to numeric, handle NaN as 0
def convert_discount(value):
    if pd.isna(value):  # If NaN, return 0
        return 0
    # Check if the value is a string and contains '%' symbol
    if isinstance(value, str):
        # Extract the number before the '%' sign, ignoring any non-numeric characters
        match = re.search(r'(\d+)', value)
        if match:
            return float(match.group(1))
    return 0  # Return 0 if the value cannot be converted

# Apply the conversion functions to the respective columns
oppo['New Bought Last Month'] = oppo['Bought Last Month'].apply(convert_bought_last_month)
oppo['New Dashed MRP'] = oppo['Dashed MRP'].apply(lambda x: convert_dashed_mrp(x, oppo['Current MRP']))
oppo['New Discount (%)'] = oppo['Discount (%)'].apply(convert_discount)
oppo["Brand"] = "oppo"
# Display the dataframe with the new columns
oppo


Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery,Brand,Product,Color,RAM,Storage,New Bought Last Month,New Dashed MRP,New Discount (%)
0,"OPPO A3X 4G (Nebula Red, 4GB RAM, 128GB Storag...",4.3,10,50+ bought in past month,9999.0,₹13999,(29% off),Yes,oppo,,Red,4GB,128GB,50,13999,29.0
1,"OPPO A3X 4G (Nebula Red, 4GB RAM, 64GB Storage...",4.3,10,300+ bought in past month,8999.0,₹12999,(31% off),Yes,oppo,,Red,4GB,64GB,300,12999,31.0
4,"OPPO F27 Pro+ 5G (Midnight Navy, 8GB RAM, 256G...",4.0,358,500+ bought in past month,29999.0,₹34999,(14% off),Yes,oppo,,Black,8GB,256GB,500,34999,14.0
5,"OPPO F27 5G (Emerald Green, 8GB RAM, 128GB Sto...",3.3,99,300+ bought in past month,20999.0,₹26999,(22% off),Yes,oppo,,Green,8GB,128GB,300,26999,22.0
6,"OPPO A3 Pro 5G (Moonlight Purple, 8GB RAM, 128...",3.8,184,200+ bought in past month,17999.0,₹20999,(14% off),Yes,oppo,,Black,8GB,128GB,200,20999,14.0
7,"OPPO F27 5G (Emerald Green, 8GB RAM, 256GB Sto...",3.3,99,300+ bought in past month,22999.0,₹28999,(21% off),Yes,oppo,,Green,8GB,256GB,300,28999,21.0
8,"OPPO A3 5G (Nebula Red, 6GB RAM, 128GB Storage)",3.9,37,200+ bought in past month,15999.0,₹19999,(20% off),Yes,oppo,,Red,6GB,128GB,200,19999,20.0
9,"OPPO A3X 4G (Ocean Blue, 4GB RAM, 64GB Storage...",4.2,8,100+ bought in past month,8999.0,₹12999,(31% off),Yes,oppo,,Blue,4GB,64GB,100,12999,31.0
13,"OPPO F27 Pro+ 5G (Midnight Navy, 8GB RAM, 128G...",4.0,357,200+ bought in past month,27999.0,₹32999,(15% off),Yes,oppo,,Black,8GB,128GB,200,32999,15.0
14,"OPPO F27 5G (Emerald Green, 8GB RAM, 128GB Sto...",3.3,99,50+ bought in past month,17697.0,₹25900,(32% off),Yes,oppo,,Green,8GB,128GB,50,25900,32.0


**REALME**

In [21]:

# Step 1: Update 'Bought Last Month' to 0 where it contains 'M.R.P:' or doesn't contain numbers
realme['Bought Last Month'] = realme['Bought Last Month'].apply(lambda x: 0 if isinstance(x, str) and ('M.R.P:' in x or not any(char.isdigit() for char in x)) else x)

# Step 2: Remove rows where 'Current MRP' is NaN
realme = realme[realme['Current MRP'].notna()]

# Step 3: Ensure all values in 'Product Name' are strings
realme['Product Name'] = realme['Product Name'].astype(str)

# Step 4: Remove rows where 'Product Name' does not contain 'Realme'
realme = realme[realme['Product Name'].str.contains('Realme', case=False)]

# Step 5: Remove rows where 'Rating (Stars)' is NaN
realme = realme[realme['Rating (Stars)'].notna()]
# Display the cleaned dataframe
realme.head()


Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery
2,"realme NARZO 70x 5G (Ice Blue, 6GB RAM,128GB S...",4.0,3680,5K+ bought in past month,12999.0,₹17999,(28% off),Yes
3,"realme NARZO N61 (Voyage Blue,6GB RAM+128GB St...",4.0,1057,3K+ bought in past month,8499.0,₹10999,(23% off),Yes
4,"realme NARZO 70x 5G (Forest Green, 6GB RAM,128...",4.0,3680,1K+ bought in past month,12999.0,₹17999,(28% off),Yes
5,"realme 12X 5G (Woodland Green, 8GB RAM, 128GB ...",4.1,115,1K+ bought in past month,12640.0,₹18999,(33% off),Yes
6,"realme 12+ 5G (Navigator Beige, 8GB RAM, 256GB...",3.8,128,500+ bought in past month,16698.0,₹25999,(36% off),Yes


In [22]:
# Define color options to check against
colors = ['Black', 'Blue', 'White', 'Gray', 'Silver', 'Green', 'Red', 'Pink', 'Gold']

# Step 1: Define Brand based on Product Name
realme['Brand'] = realme['Product Name'].apply(lambda x: 'Realme' if 'Realme' in x else 'Unknown')

# Step 2: Extract Product Name (i.e., everything after 'Vivo')
realme['Product'] = realme['Product Name'].apply(
    lambda x: re.search(r'Realme ([^(]+)', x).group(0) if re.search(r'Realme ([^(]+)', x) else None
)

# Step 3: Extract Color (if exists, otherwise default to 'Black')
realme['Color'] = realme['Product Name'].apply(
    lambda x: next((color for color in colors if color in x), 'Black')
)

# Step 4: Extract RAM (in the format of '6GB', '8GB', etc.)
realme['RAM'] = realme['Product Name'].apply(
    lambda x: re.search(r'(\d+GB)(?=\s*RAM)', x).group(0) if re.search(r'(\d+GB)(?=\s*RAM)', x) else None
)

# Step 5: Extract Storage (in the format of '128GB', '256GB', etc.)
realme['Storage'] = realme['Product Name'].apply(
    lambda x: re.search(r'(\d+GB)(?=\s*Storage)', x).group(0) if re.search(r'(\d+GB)(?=\s*Storage)', x) else None
)

# Step 6: Remove rows where RAM or Storage is missing
realme = realme.dropna(subset=['RAM', 'Storage'])

# Display the dataframe with the new columns
realme.head()


Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery,Brand,Product,Color,RAM,Storage
2,"realme NARZO 70x 5G (Ice Blue, 6GB RAM,128GB S...",4.0,3680,5K+ bought in past month,12999.0,₹17999,(28% off),Yes,Unknown,,Blue,6GB,128GB
3,"realme NARZO N61 (Voyage Blue,6GB RAM+128GB St...",4.0,1057,3K+ bought in past month,8499.0,₹10999,(23% off),Yes,Unknown,,Blue,6GB,128GB
4,"realme NARZO 70x 5G (Forest Green, 6GB RAM,128...",4.0,3680,1K+ bought in past month,12999.0,₹17999,(28% off),Yes,Unknown,,Green,6GB,128GB
5,"realme 12X 5G (Woodland Green, 8GB RAM, 128GB ...",4.1,115,1K+ bought in past month,12640.0,₹18999,(33% off),Yes,Unknown,,Green,8GB,128GB
6,"realme 12+ 5G (Navigator Beige, 8GB RAM, 256GB...",3.8,128,500+ bought in past month,16698.0,₹25999,(36% off),Yes,Unknown,,Black,8GB,256GB


In [23]:
# Function to convert Bought Last Month to a numeric value
def convert_bought_last_month(value):
    if pd.isna(value) or not isinstance(value, str):  # Check for NaN or non-string values
        return 0  # Return 0 if NaN or not a string
    # Extract number from string (handle cases like 5K, 200+)
    match = re.search(r'(\d+)(K|\+)?', value)
    if match:
        number = int(match.group(1))  # Get the number part
        if match.group(2) == 'K':  # If 'K' is present, multiply by 1000
            number *= 1000
        return number
    return 0  # Return 0 if regex doesn't match

# Function to convert Dashed MRP to numeric value
def convert_dashed_mrp(value, current_mrp):
    if pd.isna(value):  # If NaN, use the Current MRP
        return current_mrp
    # Remove ₹ symbol and commas, then convert to integer
    if isinstance(value, str):
        return int(value.replace('₹', '').replace(',', '').strip())
    return current_mrp  # If conversion fails, return current MRP

# Function to convert Discount (%) to numeric, handle NaN as 0
def convert_discount(value):
    if pd.isna(value):  # If NaN, return 0
        return 0
    # Check if the value is a string and contains '%' symbol
    if isinstance(value, str):
        # Extract the number before the '%' sign, ignoring any non-numeric characters
        match = re.search(r'(\d+)', value)
        if match:
            return float(match.group(1))
    return 0  # Return 0 if the value cannot be converted

# Apply the conversion functions to the respective columns
realme['New Bought Last Month'] = realme['Bought Last Month'].apply(convert_bought_last_month)
realme['New Dashed MRP'] = realme['Dashed MRP'].apply(lambda x: convert_dashed_mrp(x, realme['Current MRP']))
realme['New Discount (%)'] = realme['Discount (%)'].apply(convert_discount)
realme["Brand"] = "realme"
# Display the dataframe with the new columns
realme


Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery,Brand,Product,Color,RAM,Storage,New Bought Last Month,New Dashed MRP,New Discount (%)
2,"realme NARZO 70x 5G (Ice Blue, 6GB RAM,128GB S...",4.0,3680,5K+ bought in past month,12999.0,₹17999,(28% off),Yes,realme,,Blue,6GB,128GB,5000,17999,28.0
3,"realme NARZO N61 (Voyage Blue,6GB RAM+128GB St...",4.0,1057,3K+ bought in past month,8499.0,₹10999,(23% off),Yes,realme,,Blue,6GB,128GB,3000,10999,23.0
4,"realme NARZO 70x 5G (Forest Green, 6GB RAM,128...",4.0,3680,1K+ bought in past month,12999.0,₹17999,(28% off),Yes,realme,,Green,6GB,128GB,1000,17999,28.0
5,"realme 12X 5G (Woodland Green, 8GB RAM, 128GB ...",4.1,115,1K+ bought in past month,12640.0,₹18999,(33% off),Yes,realme,,Green,8GB,128GB,1000,18999,33.0
6,"realme 12+ 5G (Navigator Beige, 8GB RAM, 256GB...",3.8,128,500+ bought in past month,16698.0,₹25999,(36% off),Yes,realme,,Black,8GB,256GB,500,25999,36.0
7,"realme NARZO N65 5G (Deep Green 6GB RAM, 128GB...",4.0,1625,4K+ bought in past month,12499.0,₹14999,(17% off),Yes,realme,,Green,6GB,128GB,4000,14999,17.0
8,"realme NARZO 70 Turbo 5G (Turbo Yellow,6GB RAM...",4.1,445,2K+ bought in past month,16999.0,₹19999,(15% off),Yes,realme,,Black,6GB,128GB,2000,19999,15.0
9,"realme 12 Pro 5G (Submarine Blue, 12GB RAM, 25...",4.2,103,200+ bought in past month,22499.0,₹33999,(34% off),Yes,realme,,Blue,12GB,256GB,200,33999,34.0
12,"realme NARZO N65 5G (Amber Gold 6GB RAM, 128GB...",4.0,1625,4K+ bought in past month,12499.0,₹14999,(17% off),Yes,realme,,Gold,6GB,128GB,4000,14999,17.0
14,"realme NARZO 70 Turbo 5G (Turbo Purple,8GB RAM...",4.2,227,1K+ bought in past month,17999.0,₹20999,(14% off),Yes,realme,,Black,8GB,128GB,1000,20999,14.0


**SAMSUNG**

In [24]:

# Step 1: Update 'Bought Last Month' to 0 where it contains 'M.R.P:' or doesn't contain numbers
samsung['Bought Last Month'] = samsung['Bought Last Month'].apply(lambda x: 0 if isinstance(x, str) and ('M.R.P:' in x or not any(char.isdigit() for char in x)) else x)

# Step 2: Remove rows where 'Current MRP' is NaN
samsung = samsung[samsung['Current MRP'].notna()]

# Step 3: Remove rows where 'Product Name' does not contain both 'Samsung' and 'Galaxy'
samsung = samsung[samsung['Product Name'].str.contains('Samsung', case=False) & samsung['Product Name'].str.contains('Galaxy', case=False)]

# Step 4: Remove rows where 'Rating (Stars)' is NaN
samsung = samsung[samsung['Rating (Stars)'].notna()]
# Display the cleaned dataframe
samsung.head()


Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery
0,"Samsung Galaxy M35 5G (Thunder Grey,8GB RAM,12...",4.1,5149,400+ bought in past month,17499.0,₹25999,(33% off),No
1,"Samsung Galaxy A55 5G (Awesome Navy, 8GB RAM, ...",4.1,356,50+ bought in past month,39989.0,₹42999,(7% off),No
2,"Samsung Galaxy M05 (Mint Green, 4GB RAM, 64 GB...",3.9,961,5K+ bought in past month,6499.0,₹9999,(35% off),No
3,Samsung Galaxy M15 5G Prime Edition (Blue Topa...,3.9,925,4K+ bought in past month,11999.0,₹16999,(29% off),No
4,Samsung Galaxy M15 5G Prime Edition (Stone Gre...,3.9,925,4K+ bought in past month,11999.0,₹16999,(29% off),No


In [25]:

# Extract the 'Brand' (Samsung is the brand in all rows)
samsung['Brand'] = 'Samsung'

# Define color options to check against
colors = ['Black', 'Blue', 'White', 'Gray', 'Silver', 'Green', 'Red', 'Pink', 'Gold']

# Extract Product Name (i.e., everything after 'Samsung Galaxy')
samsung['Product'] = samsung['Product Name'].apply(lambda x: 'Samsung Galaxy ' + re.search(r'Samsung Galaxy ([^(]+)', x).group(1) if re.search(r'Samsung Galaxy ([^(]+)', x) else None)

# Extract Color (if exists, otherwise default to 'Black')
samsung['Color'] = samsung['Product Name'].apply(lambda x: next((color for color in colors if color in x), 'Black'))

# Extract RAM (in the format of '6GB', '8GB', etc.)
samsung['RAM'] = samsung['Product Name'].apply(lambda x: re.search(r'(\d+GB)(?=\s*(RAM|GB))', x).group(0) if re.search(r'(\d+GB)(?=\s*(RAM|GB))', x) else None)

# Extract Storage (in the format of '128GB', '256GB', etc.)
samsung['Storage'] = samsung['Product Name'].apply(lambda x: re.search(r'(\d+GB)(?=\s*Storage)', x).group(0) if re.search(r'(\d+GB)(?=\s*Storage)', x) else None)

# Remove rows where RAM or Storage is missing
samsung = samsung.dropna(subset=['RAM', 'Storage'])

# Display the dataframe with the new columns
samsung.head()

Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery,Brand,Product,Color,RAM,Storage
0,"Samsung Galaxy M35 5G (Thunder Grey,8GB RAM,12...",4.1,5149,400+ bought in past month,17499.0,₹25999,(33% off),No,Samsung,Samsung Galaxy M35 5G,Black,8GB,128GB
1,"Samsung Galaxy A55 5G (Awesome Navy, 8GB RAM, ...",4.1,356,50+ bought in past month,39989.0,₹42999,(7% off),No,Samsung,Samsung Galaxy A55 5G,Black,8GB,128GB
3,Samsung Galaxy M15 5G Prime Edition (Blue Topa...,3.9,925,4K+ bought in past month,11999.0,₹16999,(29% off),No,Samsung,Samsung Galaxy M15 5G Prime Edition,Blue,6GB,128GB
4,Samsung Galaxy M15 5G Prime Edition (Stone Gre...,3.9,925,4K+ bought in past month,11999.0,₹16999,(29% off),No,Samsung,Samsung Galaxy M15 5G Prime Edition,Black,6GB,128GB
5,Samsung Galaxy M15 5G Prime Edition (Celestial...,3.9,925,3K+ bought in past month,11999.0,₹16999,(29% off),No,Samsung,Samsung Galaxy M15 5G Prime Edition,Blue,6GB,128GB


In [26]:
import pandas as pd
import re

# Function to convert Bought Last Month to a numeric value
def convert_bought_last_month(value):
    if pd.isna(value) or not isinstance(value, str):  # Check for NaN or non-string values
        return 0  # Return 0 if NaN or not a string
    # Extract number from string (handle cases like 5K, 200+)
    match = re.search(r'(\d+)(K|\+)?', value)
    if match:
        number = int(match.group(1))  # Get the number part
        if match.group(2) == 'K':  # If 'K' is present, multiply by 1000
            number *= 1000
        return number
    return 0  # Return 0 if regex doesn't match

# Function to convert Dashed MRP to numeric value
def convert_dashed_mrp(value, current_mrp):
    if pd.isna(value):  # If NaN, use the Current MRP
        return current_mrp
    # Remove ₹ symbol and commas, then convert to integer
    if isinstance(value, str):
        return int(value.replace('₹', '').replace(',', '').strip())
    return current_mrp  # If conversion fails, return current MRP

# Function to convert Discount (%) to numeric, handle NaN as 0
def convert_discount(value):
    if pd.isna(value):  # If NaN, return 0
        return 0
    # Check if the value is a string and contains '%' symbol
    if isinstance(value, str):
        # Extract the number before the '%' sign, ignoring any non-numeric characters
        match = re.search(r'(\d+)', value)
        if match:
            return float(match.group(1))
    return 0  # Return 0 if the value cannot be converted

# Apply the conversion functions to the respective columns
samsung['New Bought Last Month'] = samsung['Bought Last Month'].apply(convert_bought_last_month)
samsung['New Dashed MRP'] = samsung['Dashed MRP'].apply(lambda x: convert_dashed_mrp(x, samsung['Current MRP']))
samsung['New Discount (%)'] = samsung['Discount (%)'].apply(convert_discount)
samsung["Brand"] = "samsung"
# Display the dataframe with the new columns
samsung


Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery,Brand,Product,Color,RAM,Storage,New Bought Last Month,New Dashed MRP,New Discount (%)
0,"Samsung Galaxy M35 5G (Thunder Grey,8GB RAM,12...",4.1,5149,400+ bought in past month,17499.0,₹25999,(33% off),No,samsung,Samsung Galaxy M35 5G,Black,8GB,128GB,400,25999,33.0
1,"Samsung Galaxy A55 5G (Awesome Navy, 8GB RAM, ...",4.1,356,50+ bought in past month,39989.0,₹42999,(7% off),No,samsung,Samsung Galaxy A55 5G,Black,8GB,128GB,50,42999,7.0
3,Samsung Galaxy M15 5G Prime Edition (Blue Topa...,3.9,925,4K+ bought in past month,11999.0,₹16999,(29% off),No,samsung,Samsung Galaxy M15 5G Prime Edition,Blue,6GB,128GB,4000,16999,29.0
4,Samsung Galaxy M15 5G Prime Edition (Stone Gre...,3.9,925,4K+ bought in past month,11999.0,₹16999,(29% off),No,samsung,Samsung Galaxy M15 5G Prime Edition,Black,6GB,128GB,4000,16999,29.0
5,Samsung Galaxy M15 5G Prime Edition (Celestial...,3.9,925,3K+ bought in past month,11999.0,₹16999,(29% off),No,samsung,Samsung Galaxy M15 5G Prime Edition,Blue,6GB,128GB,3000,16999,29.0
6,"Samsung Galaxy M35 5G (Daybreak Blue,6GB RAM,1...",4.1,5149,2K+ bought in past month,15999.0,₹24499,(35% off),No,samsung,Samsung Galaxy M35 5G,Blue,6GB,128GB,2000,24499,35.0
7,"Samsung Galaxy M35 5G (Moonlight Blue,6GB RAM,...",4.1,5149,1K+ bought in past month,15999.0,₹24499,(35% off),No,samsung,Samsung Galaxy M35 5G,Blue,6GB,128GB,1000,24499,35.0
9,"Samsung Galaxy M35 5G (Thunder Grey,6GB RAM,12...",4.1,5149,1K+ bought in past month,15999.0,₹24499,(35% off),No,samsung,Samsung Galaxy M35 5G,Black,6GB,128GB,1000,24499,35.0
10,"Samsung Galaxy A35 5G (Awesome Iceblue, 8GB RA...",3.8,218,400+ bought in past month,30989.0,₹33999,(9% off),No,samsung,Samsung Galaxy A35 5G,Black,8GB,128GB,400,33999,9.0
11,"Samsung Galaxy A35 5G (Awesome Navy, 8GB RAM, ...",4.2,54,100+ bought in past month,33989.0,₹36999,(8% off),No,samsung,Samsung Galaxy A35 5G,Black,8GB,256GB,100,36999,8.0


**VIVO**

In [27]:

# Step 1: Update 'Bought Last Month' to 0 where it contains 'M.R.P:' or doesn't contain numbers
vivo['Bought Last Month'] = vivo['Bought Last Month'].apply(lambda x: 0 if isinstance(x, str) and ('M.R.P:' in x or not any(char.isdigit() for char in x)) else x)

# Step 2: Remove rows where 'Current MRP' is NaN
vivo = vivo[vivo['Current MRP'].notna()]

# Step 3: Remove rows where 'Product Name' does not contain both 'Samsung' and 'Galaxy'
vivo = vivo[vivo['Product Name'].str.contains('Vivo', case=False)]

# Step 4: Remove rows where 'Rating (Stars)' is NaN
vivo = vivo[vivo['Rating (Stars)'].notna()]
# Display the cleaned dataframe
vivo.head()


Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery
0,"Vivo T3X 5G (Crimson Bliss, 4GB Ram, 128GB Sto...",4.2,24,1K+ bought in past month,12799.0,₹17499,(27% off),Yes
1,"Vivo T3x 5G (Crimson Bliss, 128 GB) (6 GB RAM)",3.9,87,500+ bought in past month,13888.0,₹18999,(27% off),Yes
2,"Vivo Y300 5G (Titanium Silver, 8GB RAM, 128GB ...",5.0,1,200+ bought in past month,21999.0,₹26999,(19% off),Yes
3,"Vivo T3 Lite 5G Smartphone (Vibrant Green, 4GB...",3.5,31,500+ bought in past month,10750.0,₹14499,(26% off),Yes
4,"Vivo Y300 5G (Phantom Purple, 8GB RAM, 256GB S...",5.0,1,50+ bought in past month,23999.0,₹28999,(17% off),Yes


In [28]:
# Define color options to check against
colors = ['Black', 'Blue', 'White', 'Gray', 'Silver', 'Green', 'Red', 'Pink', 'Gold']

# Step 1: Define Brand based on Product Name
vivo['Brand'] = vivo['Product Name'].apply(lambda x: 'Vivo' if 'Vivo' in x else 'Unknown')

# Step 2: Extract Product Name (i.e., everything after 'Vivo')
vivo['Product'] = vivo['Product Name'].apply(
    lambda x: re.search(r'Vivo ([^(]+)', x).group(0) if re.search(r'Vivo ([^(]+)', x) else None
)

# Step 3: Extract Color (if exists, otherwise default to 'Black')
vivo['Color'] = vivo['Product Name'].apply(
    lambda x: next((color for color in colors if color in x), 'Black')
)

# Step 4: Extract RAM (in the format of '6GB', '8GB', etc.)
vivo['RAM'] = vivo['Product Name'].apply(
    lambda x: re.search(r'(\d+GB)(?=\s*RAM)', x).group(0) if re.search(r'(\d+GB)(?=\s*RAM)', x) else None
)

# Step 5: Extract Storage (in the format of '128GB', '256GB', etc.)
vivo['Storage'] = vivo['Product Name'].apply(
    lambda x: re.search(r'(\d+GB)(?=\s*Storage)', x).group(0) if re.search(r'(\d+GB)(?=\s*Storage)', x) else None
)

# Step 6: Remove rows where RAM or Storage is missing
vivo = vivo.dropna(subset=['RAM', 'Storage'])

# Display the dataframe with the new columns
vivo.head()


Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery,Brand,Product,Color,RAM,Storage
2,"Vivo Y300 5G (Titanium Silver, 8GB RAM, 128GB ...",5.0,1,200+ bought in past month,21999.0,₹26999,(19% off),Yes,Vivo,Vivo Y300 5G,Silver,8GB,128GB
4,"Vivo Y300 5G (Phantom Purple, 8GB RAM, 256GB S...",5.0,1,50+ bought in past month,23999.0,₹28999,(17% off),Yes,Vivo,Vivo Y300 5G,Black,8GB,256GB
5,"Vivo V40 5G Smartphone (Lotus Purple, 8GB RAM,...",4.0,38,500+ bought in past month,34949.0,₹42999,(19% off),Yes,Vivo,Vivo V40 5G Smartphone,Black,8GB,256GB
12,"vivo Y18i (Space Black, 4GB RAM, 64GB Storage)...",3.8,50,500+ bought in past month,7999.0,₹11999,(33% off),Yes,Unknown,,Black,4GB,64GB
13,"Vivo V40e 5G AI Smartphone (Royal Bronze, 8GB ...",3.9,17,500+ bought in past month,28289.0,₹42999,(34% off),Yes,Vivo,Vivo V40e 5G AI Smartphone,Black,8GB,256GB


In [29]:
# Function to convert Bought Last Month to a numeric value
def convert_bought_last_month(value):
    if pd.isna(value) or not isinstance(value, str):  # Check for NaN or non-string values
        return 0  # Return 0 if NaN or not a string
    # Extract number from string (handle cases like 5K, 200+)
    match = re.search(r'(\d+)(K|\+)?', value)
    if match:
        number = int(match.group(1))  # Get the number part
        if match.group(2) == 'K':  # If 'K' is present, multiply by 1000
            number *= 1000
        return number
    return 0  # Return 0 if regex doesn't match

# Function to convert Dashed MRP to numeric value
def convert_dashed_mrp(value, current_mrp):
    if pd.isna(value):  # If NaN, use the Current MRP
        return current_mrp
    # Remove ₹ symbol and commas, then convert to integer
    if isinstance(value, str):
        return int(value.replace('₹', '').replace(',', '').strip())
    return current_mrp  # If conversion fails, return current MRP

# Function to convert Discount (%) to numeric, handle NaN as 0
def convert_discount(value):
    if pd.isna(value):  # If NaN, return 0
        return 0
    # Check if the value is a string and contains '%' symbol
    if isinstance(value, str):
        # Extract the number before the '%' sign, ignoring any non-numeric characters
        match = re.search(r'(\d+)', value)
        if match:
            return float(match.group(1))
    return 0  # Return 0 if the value cannot be converted

# Apply the conversion functions to the respective columns
vivo['New Bought Last Month'] = vivo['Bought Last Month'].apply(convert_bought_last_month)
vivo['New Dashed MRP'] = vivo['Dashed MRP'].apply(lambda x: convert_dashed_mrp(x, vivo['Current MRP']))
vivo['New Discount (%)'] = vivo['Discount (%)'].apply(convert_discount)
vivo["Brand"] = "vivo"
# Display the dataframe with the new columns
vivo


Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery,Brand,Product,Color,RAM,Storage,New Bought Last Month,New Dashed MRP,New Discount (%)
2,"Vivo Y300 5G (Titanium Silver, 8GB RAM, 128GB ...",5.0,1,200+ bought in past month,21999.0,₹26999,(19% off),Yes,vivo,Vivo Y300 5G,Silver,8GB,128GB,200,26999,19.0
4,"Vivo Y300 5G (Phantom Purple, 8GB RAM, 256GB S...",5.0,1,50+ bought in past month,23999.0,₹28999,(17% off),Yes,vivo,Vivo Y300 5G,Black,8GB,256GB,50,28999,17.0
5,"Vivo V40 5G Smartphone (Lotus Purple, 8GB RAM,...",4.0,38,500+ bought in past month,34949.0,₹42999,(19% off),Yes,vivo,Vivo V40 5G Smartphone,Black,8GB,256GB,500,42999,19.0
12,"vivo Y18i (Space Black, 4GB RAM, 64GB Storage)...",3.8,50,500+ bought in past month,7999.0,₹11999,(33% off),Yes,vivo,,Black,4GB,64GB,500,11999,33.0
13,"Vivo V40e 5G AI Smartphone (Royal Bronze, 8GB ...",3.9,17,500+ bought in past month,28289.0,₹42999,(34% off),Yes,vivo,Vivo V40e 5G AI Smartphone,Black,8GB,256GB,500,42999,34.0
14,"vivo Y18i (Gem Green, 4GB RAM, 64GB Storage) w...",3.8,50,200+ bought in past month,7999.0,₹11999,(33% off),Yes,vivo,,Green,4GB,64GB,200,11999,33.0
16,"vivo Y28s 5G (Vintage Red, 6GB RAM, 128GB Stor...",3.7,85,400+ bought in past month,14999.0,₹19999,(25% off),Yes,vivo,,Red,6GB,128GB,400,19999,25.0
17,"Vivo Y300 5G (Phantom Purple, 8GB RAM, 128GB S...",5.0,1,100+ bought in past month,21999.0,₹26999,(19% off),Yes,vivo,Vivo Y300 5G,Black,8GB,128GB,100,26999,19.0
24,"vivo Y28e 5G (Vintage Red, 4GB RAM, 128GB Stor...",3.7,82,100+ bought in past month,11999.0,₹15999,(25% off),Yes,vivo,,Red,4GB,128GB,100,15999,25.0
25,"Vivo V30 5G (Peacock Green, 8GB RAM, 128GB Sto...",4.3,17,100+ bought in past month,25125.0,₹38999,(36% off),Yes,vivo,Vivo V30 5G,Green,8GB,128GB,100,38999,36.0


**XIAOMI**

In [30]:

# Step 1: Update 'Bought Last Month' to 0 where it contains 'M.R.P:' or doesn't contain numbers
xiaomi['Bought Last Month'] = xiaomi['Bought Last Month'].apply(lambda x: 0 if isinstance(x, str) and ('M.R.P:' in x or not any(char.isdigit() for char in x)) else x)

# Step 2: Remove rows where 'Current MRP' is NaN
xiaomi = xiaomi[xiaomi['Current MRP'].notna()]

# Step 3: Remove rows where 'Product Name' does not contain both 'Samsung' and 'Galaxy'
xiaomi = xiaomi[xiaomi['Product Name'].str.contains('Xiaomi', case=False) | xiaomi['Product Name'].str.contains('Redmi', case=False)]

# Step 4: Remove rows where 'Rating (Stars)' is NaN
xiaomi = xiaomi[xiaomi['Rating (Stars)'].notna()]
# Display the cleaned dataframe
xiaomi


Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery
2,"Xiaomi 14 (Jade Green, 12GB RAM, 512GB Storage...",4.2,608,400+ bought in past month,49999.0,₹79999,(38% off),No
3,"Xiaomi Handheld Garment Steamer, Foldable trav...",4.0,289,2K+ bought in past month,2099.0,₹3999,(48% off),No
4,Xiaomi Mi 360° Home Security Camera 2K (1296p)...,4.0,1027,5K+ bought in past month,2399.0,₹4999,(52% off),No
5,Xiaomi Power Bank 4i 20000mAh 33W Super Fast C...,4.3,95546,7K+ bought in past month,1999.0,₹3999,(50% off),No
6,Xiaomi Pad 6| Qualcomm Snapdragon 870| Powered...,4.5,2505,1K+ bought in past month,24999.0,₹41999,(40% off),No
...,...,...,...,...,...,...,...,...
78,"Redmi Note 13 Pro+ (Fusion Black, 8GB RAM, 256...",3.9,926,200+ bought in past month,22863.0,₹33999,(33% off),No
81,"Redmi Note 13 Pro Coral Purple, 8GB RAM, 256GB...",3.9,1507,0,24999.0,₹30999,(19% off),No
82,Xiaomi Wireless Power Bank 10000mAh | 22.5W Fa...,4.1,1169,100+ bought in past month,1999.0,₹3899,(49% off),No
83,"Xiaomi 11T Pro 5G Hyperphone (Meteorite Black,...",4.0,2563,0,25990.0,₹69999,(63% off),No


In [31]:
# Define color options to check against
colors = ['Black', 'Blue', 'White', 'Gray', 'Silver', 'Green', 'Red', 'Pink', 'Gold']

# Set Brand column based on the name
xiaomi['Brand'] = xiaomi['Product Name'].apply(lambda x: 'Xiaomi' if 'Xiaomi' in x else 'Redmi')

# Extract Product Name (i.e., everything after 'Xiaomi' or 'Redmi')
xiaomi['Product'] = xiaomi['Product Name'].apply(
    lambda x: re.search(r'(Xiaomi|Redmi) ([^(]+)', x).group(0) if re.search(r'(Xiaomi|Redmi) ([^(]+)', x) else None
)

# Extract Color (if exists, otherwise default to 'Black')
xiaomi['Color'] = xiaomi['Product Name'].apply(
    lambda x: next((color for color in colors if color in x), 'Black')
)

# Extract RAM (in the format of '6GB', '8GB', etc.)
xiaomi['RAM'] = xiaomi['Product Name'].apply(
    lambda x: re.search(r'(\d+GB)(?=\s*RAM)', x).group(0) if re.search(r'(\d+GB)(?=\s*RAM)', x) else None
)

# Extract Storage (in the format of '128GB', '256GB', etc.)
xiaomi['Storage'] = xiaomi['Product Name'].apply(
    lambda x: re.search(r'(\d+GB)(?=\s*Storage)', x).group(0) if re.search(r'(\d+GB)(?=\s*Storage)', x) else None
)

# Remove rows where RAM or Storage is missing
xiaomi = xiaomi.dropna(subset=['RAM', 'Storage'])

# Display the dataframe with the new columns
xiaomi.head()


Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery,Brand,Product,Color,RAM,Storage
2,"Xiaomi 14 (Jade Green, 12GB RAM, 512GB Storage...",4.2,608,400+ bought in past month,49999.0,₹79999,(38% off),No,Xiaomi,Xiaomi 14,Green,12GB,512GB
7,"Xiaomi 11T Pro 5G Hyperphone (Celestial Magic,...",4.1,3953,0,23290.0,₹52999,(56% off),No,Xiaomi,Xiaomi 11T Pro 5G Hyperphone,Black,8GB,256GB
12,"Xiaomi 14 (White, 12GB RAM, 512GB Storage) | 5...",4.2,608,200+ bought in past month,49999.0,₹79999,(38% off),No,Xiaomi,Xiaomi 14,White,12GB,512GB
14,"Redmi Note 13 Pro (Scarlet Red, 8GB RAM, 128GB...",4.1,50,400+ bought in past month,18489.0,₹28999,(36% off),No,Redmi,Redmi Note 13 Pro,Red,8GB,128GB
16,"Xiaomi 14 CIVI (Shadow Black, 8GB RAM, 256GB S...",3.6,45,0,40999.0,₹54999,(25% off),No,Xiaomi,Xiaomi 14 CIVI,Black,8GB,256GB


In [32]:
# Function to convert Bought Last Month to a numeric value
def convert_bought_last_month(value):
    if pd.isna(value) or not isinstance(value, str):  # Check for NaN or non-string values
        return 0  # Return 0 if NaN or not a string
    # Extract number from string (handle cases like 5K, 200+)
    match = re.search(r'(\d+)(K|\+)?', value)
    if match:
        number = int(match.group(1))  # Get the number part
        if match.group(2) == 'K':  # If 'K' is present, multiply by 1000
            number *= 1000
        return number
    return 0  # Return 0 if regex doesn't match

# Function to convert Dashed MRP to numeric value
def convert_dashed_mrp(value, current_mrp):
    if pd.isna(value):  # If NaN, use the Current MRP
        return current_mrp
    # Remove ₹ symbol and commas, then convert to integer
    if isinstance(value, str):
        return int(value.replace('₹', '').replace(',', '').strip())
    return current_mrp  # If conversion fails, return current MRP

# Function to convert Discount (%) to numeric, handle NaN as 0
def convert_discount(value):
    if pd.isna(value):  # If NaN, return 0
        return 0
    # Check if the value is a string and contains '%' symbol
    if isinstance(value, str):
        # Extract the number before the '%' sign, ignoring any non-numeric characters
        match = re.search(r'(\d+)', value)
        if match:
            return float(match.group(1))
    return 0  # Return 0 if the value cannot be converted

# Apply the conversion functions to the respective columns
xiaomi['New Bought Last Month'] = xiaomi['Bought Last Month'].apply(convert_bought_last_month)
xiaomi['New Dashed MRP'] = xiaomi['Dashed MRP'].apply(lambda x: convert_dashed_mrp(x, xiaomi['Current MRP']))
xiaomi['New Discount (%)'] = xiaomi['Discount (%)'].apply(convert_discount)
xiaomi["Brand"] = "xiaomi"
# Display the dataframe with the new columns
xiaomi


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  xiaomi['New Bought Last Month'] = xiaomi['Bought Last Month'].apply(convert_bought_last_month)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  xiaomi['New Dashed MRP'] = xiaomi['Dashed MRP'].apply(lambda x: convert_dashed_mrp(x, xiaomi['Current MRP']))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  

Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery,Brand,Product,Color,RAM,Storage,New Bought Last Month,New Dashed MRP,New Discount (%)
2,"Xiaomi 14 (Jade Green, 12GB RAM, 512GB Storage...",4.2,608,400+ bought in past month,49999.0,₹79999,(38% off),No,xiaomi,Xiaomi 14,Green,12GB,512GB,400,79999,38.0
7,"Xiaomi 11T Pro 5G Hyperphone (Celestial Magic,...",4.1,3953,0,23290.0,₹52999,(56% off),No,xiaomi,Xiaomi 11T Pro 5G Hyperphone,Black,8GB,256GB,0,52999,56.0
12,"Xiaomi 14 (White, 12GB RAM, 512GB Storage) | 5...",4.2,608,200+ bought in past month,49999.0,₹79999,(38% off),No,xiaomi,Xiaomi 14,White,12GB,512GB,200,79999,38.0
14,"Redmi Note 13 Pro (Scarlet Red, 8GB RAM, 128GB...",4.1,50,400+ bought in past month,18489.0,₹28999,(36% off),No,xiaomi,Redmi Note 13 Pro,Red,8GB,128GB,400,28999,36.0
16,"Xiaomi 14 CIVI (Shadow Black, 8GB RAM, 256GB S...",3.6,45,0,40999.0,₹54999,(25% off),No,xiaomi,Xiaomi 14 CIVI,Black,8GB,256GB,0,54999,25.0
17,"Redmi Note 13 Pro+ (Fusion Black, 12GB RAM, 51...",3.9,926,100+ bought in past month,31999.0,₹37999,(16% off),No,xiaomi,Redmi Note 13 Pro+,Black,12GB,512GB,100,37999,16.0
21,"Redmi Note 13 5G (Arctic White, 6GB RAM, 128GB...",3.7,5908,1K+ bought in past month,15999.0,₹20999,(24% off),No,xiaomi,Redmi Note 13 5G,White,6GB,128GB,1000,20999,24.0
22,"Xiaomi 11 Lite (Vinyl Black, 8GB RAM, 128GB St...",3.5,54,0,12740.0,₹25999,(51% off),No,xiaomi,Xiaomi 11 Lite,Black,8GB,128GB,0,25999,51.0
25,"Redmi Note 13 Pro+ (Fusion Purple, 8GB RAM, 25...",3.9,926,500+ bought in past month,27999.0,₹33999,(18% off),No,xiaomi,Redmi Note 13 Pro+,Red,8GB,256GB,500,33999,18.0
26,"Redmi Note 13 Pro (Midnight Black, 8GB RAM, 25...",3.9,1507,50+ bought in past month,24999.0,₹30999,(19% off),No,xiaomi,Redmi Note 13 Pro,Black,8GB,256GB,50,30999,19.0


In [33]:
all_phones = pd.concat([iphone, oneplus, oppo, realme, samsung, vivo, xiaomi], ignore_index=True)
print(all_phones.shape)

(300, 16)


In [34]:
all_phones.sample(10)

Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery,Brand,Product,Storage,Color,New Bought Last Month,New Dashed MRP,New Discount (%),RAM
158,"realme GT 6T 5G (Fluid Silver,8GB RAM+128GB St...",4.3,2687,300+ bought in past month,30999.0,₹33999,(9% off),No,realme,,128GB,Silver,300,33999,9.0,8GB
245,"Vivo V30e 5G Smartphone (Silk Blue, 8GB RAM, 2...",4.4,7,100+ bought in past month,24490.0,₹34999,(30% off),Yes,vivo,Vivo V30e 5G Smartphone,256GB,Blue,100,34999,30.0,8GB
131,"realme NARZO 70x 5G (Forest Green, 6GB RAM,128...",4.0,3680,1K+ bought in past month,12999.0,₹17999,(28% off),Yes,realme,,128GB,Green,1000,17999,28.0,6GB
227,"vivo Y18i (Gem Green, 4GB RAM, 64GB Storage) w...",3.8,50,200+ bought in past month,7999.0,₹11999,(33% off),Yes,vivo,,64GB,Green,200,11999,33.0,4GB
286,"Redmi Note 13 Pro+ (Fusion White, 8GB RAM, 256...",3.9,926,50+ bought in past month,23435.0,₹33999,(31% off),No,xiaomi,Redmi Note 13 Pro+,256GB,White,50,33999,31.0,8GB
285,"Xiaomi 11T Pro 5G Hyperphone (Celestial Magic,...",4.0,2563,0,21499.0,₹49999,(57% off),No,xiaomi,Xiaomi 11T Pro 5G Hyperphone,128GB,Black,0,49999,57.0,8GB
288,"Xiaomi 11T Pro 5G Hyperphone (Meteorite Black,...",4.1,3953,0,24280.0,₹49999,(51% off),No,xiaomi,Xiaomi 11T Pro 5G Hyperphone,128GB,Black,0,49999,51.0,8GB
5,Apple iPhone 13 (128GB) - Blue,4.5,31459,500+ bought in past month,45490.0,₹59600,(24% off),0,Apple,iPhone 13,128GB,Blue,500,59600,24.0,4 GB
26,Apple iPhone 14 Plus (128 GB) - Midnight,4.5,2072,0,59900.0,₹79600,(25% off),0,Apple,iPhone 14 Plus,128GB,Midnight,0,79600,25.0,6 GB
106,"OPPO A3 Pro 5G (Starry Black, 8GB RAM, 256GB S...",3.8,184,100+ bought in past month,19999.0,₹22999,(13% off),Yes,oppo,,256GB,Black,100,22999,13.0,8GB


In [35]:
all_phones.to_csv('data_01.csv', index=False)

In [36]:
df = pd.read_csv('data_01.csv')
df.drop(["Product Name","Product","Bought Last Month" , "Dashed MRP" , "Discount (%)"],axis=1,inplace=True)


In [37]:
# Convert 'Brand' column to lowercase
df['Brand'] = df['Brand'].str.lower()

# Remove 'GB' from 'Storage' column and convert to numeric
df['Storage'] = df['Storage'].str.replace('GB', '').astype(float)

# Remove spaces from 'RAM' column and remove 'GB'
df['RAM'] = df['RAM'].str.replace(' ', '').str.replace('GB', '').astype(float)

df.head()

Unnamed: 0,Rating (Stars),Number of Reviews,Current MRP,Free Delivery,Brand,Storage,Color,New Bought Last Month,New Dashed MRP,New Discount (%),RAM
0,4.4,363,139900.0,0,apple,512.0,Blue Titanium,0,₹164900,15.0,6.0
1,4.5,5729,55990.0,0,apple,128.0,Purple,100,69600,20.0,6.0
2,4.5,31459,45490.0,0,apple,128.0,Starlight,1000,59900,24.0,4.0
3,4.5,31459,45490.0,0,apple,128.0,Midnight,1000,59600,24.0,4.0
4,4.5,31459,45490.0,0,apple,128.0,Pink,500,59900,24.0,4.0


In [38]:
print(df.isnull().sum())
df.dropna(inplace=True)
df.isnull().sum()

Rating (Stars)           0
Number of Reviews        0
Current MRP              0
Free Delivery            0
Brand                    0
Storage                  2
Color                    2
New Bought Last Month    0
New Dashed MRP           0
New Discount (%)         0
RAM                      0
dtype: int64


Rating (Stars)           0
Number of Reviews        0
Current MRP              0
Free Delivery            0
Brand                    0
Storage                  0
Color                    0
New Bought Last Month    0
New Dashed MRP           0
New Discount (%)         0
RAM                      0
dtype: int64

In [39]:
# Remove long multi-line entry from 'New Dashed MRP'
df['New Dashed MRP'] = df['New Dashed MRP'].apply(lambda x: x if '\n' not in str(x) else None)

# Drop rows with None or NaN values in 'New Dashed MRP' after cleaning
df = df.dropna(subset=['New Dashed MRP'])

# Map 'Yes' and 'No' in 'Free Delivery' to 1 and 0
df['Free Delivery'] = df['Free Delivery'].map({'Yes': 1, 'No': 0, '1': 1, '0': 0})


In [40]:
# Count the number of entries with '₹' symbol
count_with_symbol = df['New Dashed MRP'].str.contains('₹').sum()

# Remove '₹' symbol from 'New Dashed MRP' column
df['New Dashed MRP'] = df['New Dashed MRP'].str.replace('₹', '')

print(f"Number of entries with '₹' symbol: {count_with_symbol}")

Number of entries with '₹' symbol: 9


In [41]:
# Remove any non-numeric characters from 'Number of Reviews' and convert to numeric
df['Number of Reviews'] = df['Number of Reviews'].str.replace(r'\D', '', regex=True).astype(float)

# Display the updated DataFrame
df.head()

Unnamed: 0,Rating (Stars),Number of Reviews,Current MRP,Free Delivery,Brand,Storage,Color,New Bought Last Month,New Dashed MRP,New Discount (%),RAM
0,4.4,363.0,139900.0,0,apple,512.0,Blue Titanium,0,164900,15.0,6.0
1,4.5,5729.0,55990.0,0,apple,128.0,Purple,100,69600,20.0,6.0
2,4.5,31459.0,45490.0,0,apple,128.0,Starlight,1000,59900,24.0,4.0
3,4.5,31459.0,45490.0,0,apple,128.0,Midnight,1000,59600,24.0,4.0
4,4.5,31459.0,45490.0,0,apple,128.0,Pink,500,59900,24.0,4.0


In [42]:
df.to_csv('data_02.csv', index=False)