In [65]:
import pandas as pd
import numpy as np
import re

In [66]:
df = pd.read_csv("Amazon-Realme-csvs/amazon_page_2.csv")
df

Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery
0,,,,M.R.P:,399.0,₹999,(60% off),Yes
1,,3.9,455.0,4K+ bought in past month,12934.0,₹17999,(28% off),Yes
2,,4.2,68.0,50+ bought in past month,949.0,₹1699,(44% off),No
3,,3.9,76.0,100+ bought in past month,27999.0,₹33999,(18% off),Yes
4,,4.0,132.0,200+ bought in past month,17495.0,₹24999,(30% off),Yes
5,,3.6,14.0,200+ bought in past month,10999.0,₹14999,(27% off),Yes
6,,4.2,15066.0,8K+ bought in past month,1799.0,₹2999,(40% off),Yes
7,,4.1,145.0,200+ bought in past month,14379.0,₹20999,(32% off),Yes
8,,4.1,637.0,M.R.P:,15999.0,₹19999,(20% off),Yes
9,,4.5,4.0,300+ bought in past month,8448.0,₹9999,(16% off),Yes


In [68]:

# Step 1: Update 'Bought Last Month' to 0 where it contains 'M.R.P:' or doesn't contain numbers
df['Bought Last Month'] = df['Bought Last Month'].apply(lambda x: 0 if isinstance(x, str) and ('M.R.P:' in x or not any(char.isdigit() for char in x)) else x)

# Step 2: Remove rows where 'Current MRP' is NaN
df = df[df['Current MRP'].notna()]

# Step 3: Ensure all values in 'Product Name' are strings
df['Product Name'] = df['Product Name'].astype(str)

# Step 4: Remove rows where 'Product Name' does not contain 'Realme'
df = df[df['Product Name'].str.contains('Realme', case=False)]

# Step 5: Remove rows where 'Rating (Stars)' is NaN
df = df[df['Rating (Stars)'].notna()]
# Display the cleaned dataframe
df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Bought Last Month'] = df['Bought Last Month'].apply(lambda x: 0 if isinstance(x, str) and ('M.R.P:' in x or not any(char.isdigit() for char in x)) else x)


Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery


In [69]:
# Define color options to check against
colors = ['Black', 'Blue', 'White', 'Gray', 'Silver', 'Green', 'Red', 'Pink', 'Gold']

# Step 1: Define Brand based on Product Name
df['Brand'] = df['Product Name'].apply(lambda x: 'Realme' if 'Realme' in x else 'Unknown')

# Step 2: Extract Product Name (i.e., everything after 'Vivo')
df['Product'] = df['Product Name'].apply(
    lambda x: re.search(r'Realme ([^(]+)', x).group(0) if re.search(r'Realme ([^(]+)', x) else None
)

# Step 3: Extract Color (if exists, otherwise default to 'Black')
df['Color'] = df['Product Name'].apply(
    lambda x: next((color for color in colors if color in x), 'Black')
)

# Step 4: Extract RAM (in the format of '6GB', '8GB', etc.)
df['RAM'] = df['Product Name'].apply(
    lambda x: re.search(r'(\d+GB)(?=\s*RAM)', x).group(0) if re.search(r'(\d+GB)(?=\s*RAM)', x) else None
)

# Step 5: Extract Storage (in the format of '128GB', '256GB', etc.)
df['Storage'] = df['Product Name'].apply(
    lambda x: re.search(r'(\d+GB)(?=\s*Storage)', x).group(0) if re.search(r'(\d+GB)(?=\s*Storage)', x) else None
)

# Step 6: Remove rows where RAM or Storage is missing
df = df.dropna(subset=['RAM', 'Storage'])

# Display the dataframe with the new columns
df


Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery,Brand,Product,Color,RAM,Storage


In [70]:
# Function to convert Bought Last Month to a numeric value
def convert_bought_last_month(value):
    if pd.isna(value) or not isinstance(value, str):  # Check for NaN or non-string values
        return 0  # Return 0 if NaN or not a string
    # Extract number from string (handle cases like 5K, 200+)
    match = re.search(r'(\d+)(K|\+)?', value)
    if match:
        number = int(match.group(1))  # Get the number part
        if match.group(2) == 'K':  # If 'K' is present, multiply by 1000
            number *= 1000
        return number
    return 0  # Return 0 if regex doesn't match

# Function to convert Dashed MRP to numeric value
def convert_dashed_mrp(value, current_mrp):
    if pd.isna(value):  # If NaN, use the Current MRP
        return current_mrp
    # Remove ₹ symbol and commas, then convert to integer
    if isinstance(value, str):
        return int(value.replace('₹', '').replace(',', '').strip())
    return current_mrp  # If conversion fails, return current MRP

# Function to convert Discount (%) to numeric, handle NaN as 0
def convert_discount(value):
    if pd.isna(value):  # If NaN, return 0
        return 0
    # Check if the value is a string and contains '%' symbol
    if isinstance(value, str):
        # Extract the number before the '%' sign, ignoring any non-numeric characters
        match = re.search(r'(\d+)', value)
        if match:
            return float(match.group(1))
    return 0  # Return 0 if the value cannot be converted

# Apply the conversion functions to the respective columns
df['New Bought Last Month'] = df['Bought Last Month'].apply(convert_bought_last_month)
df['New Dashed MRP'] = df['Dashed MRP'].apply(lambda x: convert_dashed_mrp(x, df['Current MRP']))
df['New Discount (%)'] = df['Discount (%)'].apply(convert_discount)

# Display the dataframe with the new columns
df


Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery,Brand,Product,Color,RAM,Storage,New Bought Last Month,New Dashed MRP,New Discount (%)


In [71]:
df.to_csv('Filtered-csvs/Realme/amazon_page_2.csv', index=False)

> finally add a column to the dataframe that contains ram : 

```python
iPhone 16
8 GB
iPhone 16 Plus
8 GB
iPhone 14
6 GB
iPhone 14 Plus
6 GB
iPhone 13
4 or 6 GB
iPhone 12
4 GB
iPhone 11
4 GB
iPhone SE (2nd gen.)
3 GB