In [25]:
import pandas as pd
import numpy as np
import re

In [26]:
df = pd.read_csv("Amazon-Oppo-csvs/amazon_page_1.csv")
df

Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery
0,"OPPO A3X 4G (Nebula Red, 4GB RAM, 64GB Storage...",4.7,13.0,300+ bought in past month,8999.0,₹12999,(31% off),Yes
1,"Oppo F27 Pro+ 5G (Midnight Navy, 8GB RAM, 256G...",3.9,178.0,500+ bought in past month,29999.0,₹34999,(14% off),Yes
2,"OPPO A3X 4G (Ocean Blue, 4GB RAM, 128GB Storag...",,,M.R.P:,9942.0,₹13999,(29% off),Yes
3,"Oppo A3 Pro 5G (Moonlight Purple, 8GB RAM, 128...",3.7,73.0,300+ bought in past month,17999.0,₹20999,(14% off),Yes
4,"OPPO A3 5G (Nebula Red, 6GB RAM, 128GB Storage)",4.2,29.0,400+ bought in past month,15999.0,₹19999,(20% off),Yes
5,"Oppo F27 5G (Emerald Green, 8GB RAM, 128GB Sto...",3.5,28.0,400+ bought in past month,20999.0,₹26999,(22% off),Yes
6,"Oppo F27 5G (Emerald Green, 8GB RAM, 256GB Sto...",3.3,29.0,400+ bought in past month,22999.0,₹28999,(21% off),Yes
7,"OPPO A3 5G (Ocean Blue, 6GB RAM, 128GB Storage)",4.2,29.0,200+ bought in past month,15999.0,₹19999,(20% off),Yes
8,"OPPO A3X 5G (Starry Purple, 4GB RAM, 128GB Sto...",3.4,77.0,200+ bought in past month,13499.0,₹15999,(16% off),Yes
9,"Oppo F27 Pro+ 5G (Dusk Pink, 8GB RAM, 256GB St...",4.0,52.0,200+ bought in past month,29999.0,₹34999,(14% off),Yes


In [27]:

# Step 1: Update 'Bought Last Month' to 0 where it contains 'M.R.P:' or doesn't contain numbers
df['Bought Last Month'] = df['Bought Last Month'].apply(lambda x: 0 if isinstance(x, str) and ('M.R.P:' in x or not any(char.isdigit() for char in x)) else x)

# Step 2: Remove rows where 'Current MRP' is NaN
df = df[df['Current MRP'].notna()]

# Step 3: Remove rows where 'Product Name' does not contain both 'Samsung' and 'Galaxy'
df = df[df['Product Name'].str.contains('Oppo', case=False)]

# Step 4: Remove rows where 'Rating (Stars)' is NaN
df = df[df['Rating (Stars)'].notna()]
# Display the cleaned dataframe
df


Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery
0,"OPPO A3X 4G (Nebula Red, 4GB RAM, 64GB Storage...",4.7,13,300+ bought in past month,8999.0,₹12999,(31% off),Yes
1,"Oppo F27 Pro+ 5G (Midnight Navy, 8GB RAM, 256G...",3.9,178,500+ bought in past month,29999.0,₹34999,(14% off),Yes
3,"Oppo A3 Pro 5G (Moonlight Purple, 8GB RAM, 128...",3.7,73,300+ bought in past month,17999.0,₹20999,(14% off),Yes
4,"OPPO A3 5G (Nebula Red, 6GB RAM, 128GB Storage)",4.2,29,400+ bought in past month,15999.0,₹19999,(20% off),Yes
5,"Oppo F27 5G (Emerald Green, 8GB RAM, 128GB Sto...",3.5,28,400+ bought in past month,20999.0,₹26999,(22% off),Yes
6,"Oppo F27 5G (Emerald Green, 8GB RAM, 256GB Sto...",3.3,29,400+ bought in past month,22999.0,₹28999,(21% off),Yes
7,"OPPO A3 5G (Ocean Blue, 6GB RAM, 128GB Storage)",4.2,29,200+ bought in past month,15999.0,₹19999,(20% off),Yes
8,"OPPO A3X 5G (Starry Purple, 4GB RAM, 128GB Sto...",3.4,77,200+ bought in past month,13499.0,₹15999,(16% off),Yes
9,"Oppo F27 Pro+ 5G (Dusk Pink, 8GB RAM, 256GB St...",4.0,52,200+ bought in past month,29999.0,₹34999,(14% off),Yes
10,"WeConnect Care Accident, Liquid & Screen Damag...",4.3,84,0,2399.0,₹3999,(40% off),Yes


In [28]:
# Define color options to check against
colors = ['Black', 'Blue', 'White', 'Gray', 'Silver', 'Green', 'Red', 'Pink', 'Gold']

# Step 1: Define Brand based on Product Name
df['Brand'] = df['Product Name'].apply(lambda x: 'Oppo' if 'Oppo' in x else 'Unknown')

# Step 2: Extract Product Name (i.e., everything after 'Vivo')
df['Product'] = df['Product Name'].apply(
    lambda x: re.search(r'Oppo ([^(]+)', x).group(0) if re.search(r'Oppo ([^(]+)', x) else None
)

# Step 3: Extract Color (if exists, otherwise default to 'Black')
df['Color'] = df['Product Name'].apply(
    lambda x: next((color for color in colors if color in x), 'Black')
)

# Step 4: Extract RAM (in the format of '6GB', '8GB', etc.)
df['RAM'] = df['Product Name'].apply(
    lambda x: re.search(r'(\d+GB)(?=\s*RAM)', x).group(0) if re.search(r'(\d+GB)(?=\s*RAM)', x) else None
)

# Step 5: Extract Storage (in the format of '128GB', '256GB', etc.)
df['Storage'] = df['Product Name'].apply(
    lambda x: re.search(r'(\d+GB)(?=\s*Storage)', x).group(0) if re.search(r'(\d+GB)(?=\s*Storage)', x) else None
)

# Step 6: Remove rows where RAM or Storage is missing
df = df.dropna(subset=['RAM', 'Storage'])

# Display the dataframe with the new columns
df


Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery,Brand,Product,Color,RAM,Storage
0,"OPPO A3X 4G (Nebula Red, 4GB RAM, 64GB Storage...",4.7,13,300+ bought in past month,8999.0,₹12999,(31% off),Yes,Unknown,,Red,4GB,64GB
1,"Oppo F27 Pro+ 5G (Midnight Navy, 8GB RAM, 256G...",3.9,178,500+ bought in past month,29999.0,₹34999,(14% off),Yes,Oppo,Oppo F27 Pro+ 5G,Black,8GB,256GB
3,"Oppo A3 Pro 5G (Moonlight Purple, 8GB RAM, 128...",3.7,73,300+ bought in past month,17999.0,₹20999,(14% off),Yes,Oppo,Oppo A3 Pro 5G,Black,8GB,128GB
4,"OPPO A3 5G (Nebula Red, 6GB RAM, 128GB Storage)",4.2,29,400+ bought in past month,15999.0,₹19999,(20% off),Yes,Unknown,,Red,6GB,128GB
5,"Oppo F27 5G (Emerald Green, 8GB RAM, 128GB Sto...",3.5,28,400+ bought in past month,20999.0,₹26999,(22% off),Yes,Oppo,Oppo F27 5G,Green,8GB,128GB
6,"Oppo F27 5G (Emerald Green, 8GB RAM, 256GB Sto...",3.3,29,400+ bought in past month,22999.0,₹28999,(21% off),Yes,Oppo,Oppo F27 5G,Green,8GB,256GB
7,"OPPO A3 5G (Ocean Blue, 6GB RAM, 128GB Storage)",4.2,29,200+ bought in past month,15999.0,₹19999,(20% off),Yes,Unknown,,Blue,6GB,128GB
8,"OPPO A3X 5G (Starry Purple, 4GB RAM, 128GB Sto...",3.4,77,200+ bought in past month,13499.0,₹15999,(16% off),Yes,Unknown,,Black,4GB,128GB
9,"Oppo F27 Pro+ 5G (Dusk Pink, 8GB RAM, 256GB St...",4.0,52,200+ bought in past month,29999.0,₹34999,(14% off),Yes,Oppo,Oppo F27 Pro+ 5G,Pink,8GB,256GB
13,"OPPO A3X 5G (Starry Purple, 4GB RAM, 64GB Stor...",3.4,77,50+ bought in past month,12499.0,₹14999,(17% off),Yes,Unknown,,Black,4GB,64GB


In [29]:
# Function to convert Bought Last Month to a numeric value
def convert_bought_last_month(value):
    if pd.isna(value) or not isinstance(value, str):  # Check for NaN or non-string values
        return 0  # Return 0 if NaN or not a string
    # Extract number from string (handle cases like 5K, 200+)
    match = re.search(r'(\d+)(K|\+)?', value)
    if match:
        number = int(match.group(1))  # Get the number part
        if match.group(2) == 'K':  # If 'K' is present, multiply by 1000
            number *= 1000
        return number
    return 0  # Return 0 if regex doesn't match

# Function to convert Dashed MRP to numeric value
def convert_dashed_mrp(value, current_mrp):
    if pd.isna(value):  # If NaN, use the Current MRP
        return current_mrp
    # Remove ₹ symbol and commas, then convert to integer
    if isinstance(value, str):
        return int(value.replace('₹', '').replace(',', '').strip())
    return current_mrp  # If conversion fails, return current MRP

# Function to convert Discount (%) to numeric, handle NaN as 0
def convert_discount(value):
    if pd.isna(value):  # If NaN, return 0
        return 0
    # Check if the value is a string and contains '%' symbol
    if isinstance(value, str):
        # Extract the number before the '%' sign, ignoring any non-numeric characters
        match = re.search(r'(\d+)', value)
        if match:
            return float(match.group(1))
    return 0  # Return 0 if the value cannot be converted

# Apply the conversion functions to the respective columns
df['New Bought Last Month'] = df['Bought Last Month'].apply(convert_bought_last_month)
df['New Dashed MRP'] = df['Dashed MRP'].apply(lambda x: convert_dashed_mrp(x, df['Current MRP']))
df['New Discount (%)'] = df['Discount (%)'].apply(convert_discount)

# Display the dataframe with the new columns
df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['New Bought Last Month'] = df['Bought Last Month'].apply(convert_bought_last_month)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['New Dashed MRP'] = df['Dashed MRP'].apply(lambda x: convert_dashed_mrp(x, df['Current MRP']))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['New Discount (%)

Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery,Brand,Product,Color,RAM,Storage,New Bought Last Month,New Dashed MRP,New Discount (%)
0,"OPPO A3X 4G (Nebula Red, 4GB RAM, 64GB Storage...",4.7,13,300+ bought in past month,8999.0,₹12999,(31% off),Yes,Unknown,,Red,4GB,64GB,300,12999,31.0
1,"Oppo F27 Pro+ 5G (Midnight Navy, 8GB RAM, 256G...",3.9,178,500+ bought in past month,29999.0,₹34999,(14% off),Yes,Oppo,Oppo F27 Pro+ 5G,Black,8GB,256GB,500,34999,14.0
3,"Oppo A3 Pro 5G (Moonlight Purple, 8GB RAM, 128...",3.7,73,300+ bought in past month,17999.0,₹20999,(14% off),Yes,Oppo,Oppo A3 Pro 5G,Black,8GB,128GB,300,20999,14.0
4,"OPPO A3 5G (Nebula Red, 6GB RAM, 128GB Storage)",4.2,29,400+ bought in past month,15999.0,₹19999,(20% off),Yes,Unknown,,Red,6GB,128GB,400,19999,20.0
5,"Oppo F27 5G (Emerald Green, 8GB RAM, 128GB Sto...",3.5,28,400+ bought in past month,20999.0,₹26999,(22% off),Yes,Oppo,Oppo F27 5G,Green,8GB,128GB,400,26999,22.0
6,"Oppo F27 5G (Emerald Green, 8GB RAM, 256GB Sto...",3.3,29,400+ bought in past month,22999.0,₹28999,(21% off),Yes,Oppo,Oppo F27 5G,Green,8GB,256GB,400,28999,21.0
7,"OPPO A3 5G (Ocean Blue, 6GB RAM, 128GB Storage)",4.2,29,200+ bought in past month,15999.0,₹19999,(20% off),Yes,Unknown,,Blue,6GB,128GB,200,19999,20.0
8,"OPPO A3X 5G (Starry Purple, 4GB RAM, 128GB Sto...",3.4,77,200+ bought in past month,13499.0,₹15999,(16% off),Yes,Unknown,,Black,4GB,128GB,200,15999,16.0
9,"Oppo F27 Pro+ 5G (Dusk Pink, 8GB RAM, 256GB St...",4.0,52,200+ bought in past month,29999.0,₹34999,(14% off),Yes,Oppo,Oppo F27 Pro+ 5G,Pink,8GB,256GB,200,34999,14.0
13,"OPPO A3X 5G (Starry Purple, 4GB RAM, 64GB Stor...",3.4,77,50+ bought in past month,12499.0,₹14999,(17% off),Yes,Unknown,,Black,4GB,64GB,50,14999,17.0


In [30]:
df.to_csv('Filtered-csvs/Oppo/amazon_page_1.csv', index=False)

> finally add a column to the dataframe that contains ram : 

```python
iPhone 16
8 GB
iPhone 16 Plus
8 GB
iPhone 14
6 GB
iPhone 14 Plus
6 GB
iPhone 13
4 or 6 GB
iPhone 12
4 GB
iPhone 11
4 GB
iPhone SE (2nd gen.)
3 GB