In [67]:
import pandas as pd
import numpy as np
import re

In [68]:
df = pd.read_csv("Amazon-Xiaomi-csvs/amazon_page_10.csv")
df

Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery
0,"Tecno POVA 6 NEO 5G (Aurora Cloud, 8GB+256GB) ...",3.9,230.0,1K+ bought in past month,13999,₹16999,(18% off),No
1,"Redmi Note 13 5G (Arctic White, 6GB RAM, 128GB...",3.7,5707.0,1K+ bought in past month,15999,₹20999,(24% off),No
2,VIDO Transparent Back Case Cover for Xiaomi 11...,4.1,111.0,100+ bought in past month,149,₹999,(85% off),No
3,"Redmi Note 10T 5G (Graphite Black, 6GB RAM, 12...",4.1,101748.0,M.R.P:,13479,₹18999,(29% off),No
4,Xiaomi Redmi Pad Pro Keyboard,4.2,24.0,200+ bought in past month,4160,₹5999,(31% off),No
5,"Xiaomi Mens Men's Advanced Tech Running, Walki...",3.7,233.0,M.R.P:,3208,₹5999,(47% off),No
6,(Renewed) MI Xiaomi Wireless Home Security Cam...,4.0,138.0,50+ bought in past month,1699,₹4499,(62% off),No
7,Xiaomi Redmi Smart Watch 3 Active Gray| 1.83 I...,4.5,3182.0,,5990,,,No
8,Mi Xiaomi 2A Fast Charger USB-A Power Adapter ...,4.2,24568.0,1K+ bought in past month,499,₹599,(17% off),No
9,Mi Xiaomi 120W Hyper Charge Adapter with 6A Hy...,5.0,2.0,(₹10.67₹10.67/Grams),2999,₹10.67,(63% off),No


In [69]:

# Step 1: Update 'Bought Last Month' to 0 where it contains 'M.R.P:' or doesn't contain numbers
df['Bought Last Month'] = df['Bought Last Month'].apply(lambda x: 0 if isinstance(x, str) and ('M.R.P:' in x or not any(char.isdigit() for char in x)) else x)

# Step 2: Remove rows where 'Current MRP' is NaN
df = df[df['Current MRP'].notna()]

# Step 3: Remove rows where 'Product Name' does not contain both 'Samsung' and 'Galaxy'
df = df[df['Product Name'].str.contains('Xiaomi', case=False) | df['Product Name'].str.contains('Redmi', case=False)]

# Step 4: Remove rows where 'Rating (Stars)' is NaN
df = df[df['Rating (Stars)'].notna()]
# Display the cleaned dataframe
df


Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery
1,"Redmi Note 13 5G (Arctic White, 6GB RAM, 128GB...",3.7,5707,1K+ bought in past month,15999,₹20999,(24% off),No
2,VIDO Transparent Back Case Cover for Xiaomi 11...,4.1,111,100+ bought in past month,149,₹999,(85% off),No
3,"Redmi Note 10T 5G (Graphite Black, 6GB RAM, 12...",4.1,101748,0,13479,₹18999,(29% off),No
4,Xiaomi Redmi Pad Pro Keyboard,4.2,24,200+ bought in past month,4160,₹5999,(31% off),No
5,"Xiaomi Mens Men's Advanced Tech Running, Walki...",3.7,233,0,3208,₹5999,(47% off),No
6,(Renewed) MI Xiaomi Wireless Home Security Cam...,4.0,138,50+ bought in past month,1699,₹4499,(62% off),No
7,Xiaomi Redmi Smart Watch 3 Active Gray| 1.83 I...,4.5,3182,,5990,,,No
8,Mi Xiaomi 2A Fast Charger USB-A Power Adapter ...,4.2,24568,1K+ bought in past month,499,₹599,(17% off),No
9,Mi Xiaomi 120W Hyper Charge Adapter with 6A Hy...,5.0,2,(₹10.67₹10.67/Grams),2999,₹10.67,(63% off),No
12,Xiaomi Pad 6 Cover - Blue,4.6,119,0,1559,₹2999,(48% off),No


In [70]:
# Define color options to check against
colors = ['Black', 'Blue', 'White', 'Gray', 'Silver', 'Green', 'Red', 'Pink', 'Gold']

# Set Brand column based on the name
df['Brand'] = df['Product Name'].apply(lambda x: 'Xiaomi' if 'Xiaomi' in x else 'Redmi')

# Extract Product Name (i.e., everything after 'Xiaomi' or 'Redmi')
df['Product'] = df['Product Name'].apply(
    lambda x: re.search(r'(Xiaomi|Redmi) ([^(]+)', x).group(0) if re.search(r'(Xiaomi|Redmi) ([^(]+)', x) else None
)

# Extract Color (if exists, otherwise default to 'Black')
df['Color'] = df['Product Name'].apply(
    lambda x: next((color for color in colors if color in x), 'Black')
)

# Extract RAM (in the format of '6GB', '8GB', etc.)
df['RAM'] = df['Product Name'].apply(
    lambda x: re.search(r'(\d+GB)(?=\s*RAM)', x).group(0) if re.search(r'(\d+GB)(?=\s*RAM)', x) else None
)

# Extract Storage (in the format of '128GB', '256GB', etc.)
df['Storage'] = df['Product Name'].apply(
    lambda x: re.search(r'(\d+GB)(?=\s*Storage)', x).group(0) if re.search(r'(\d+GB)(?=\s*Storage)', x) else None
)

# Remove rows where RAM or Storage is missing
df = df.dropna(subset=['RAM', 'Storage'])

# Display the dataframe with the new columns
df


Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery,Brand,Product,Color,RAM,Storage
1,"Redmi Note 13 5G (Arctic White, 6GB RAM, 128GB...",3.7,5707,1K+ bought in past month,15999,₹20999,(24% off),No,Redmi,Redmi Note 13 5G,White,6GB,128GB
3,"Redmi Note 10T 5G (Graphite Black, 6GB RAM, 12...",4.1,101748,0,13479,₹18999,(29% off),No,Redmi,Redmi Note 10T 5G,Black,6GB,128GB


In [71]:
# Function to convert Bought Last Month to a numeric value
def convert_bought_last_month(value):
    if pd.isna(value) or not isinstance(value, str):  # Check for NaN or non-string values
        return 0  # Return 0 if NaN or not a string
    # Extract number from string (handle cases like 5K, 200+)
    match = re.search(r'(\d+)(K|\+)?', value)
    if match:
        number = int(match.group(1))  # Get the number part
        if match.group(2) == 'K':  # If 'K' is present, multiply by 1000
            number *= 1000
        return number
    return 0  # Return 0 if regex doesn't match

# Function to convert Dashed MRP to numeric value
def convert_dashed_mrp(value, current_mrp):
    if pd.isna(value):  # If NaN, use the Current MRP
        return current_mrp
    # Remove ₹ symbol and commas, then convert to integer
    if isinstance(value, str):
        return int(value.replace('₹', '').replace(',', '').strip())
    return current_mrp  # If conversion fails, return current MRP

# Function to convert Discount (%) to numeric, handle NaN as 0
def convert_discount(value):
    if pd.isna(value):  # If NaN, return 0
        return 0
    # Check if the value is a string and contains '%' symbol
    if isinstance(value, str):
        # Extract the number before the '%' sign, ignoring any non-numeric characters
        match = re.search(r'(\d+)', value)
        if match:
            return float(match.group(1))
    return 0  # Return 0 if the value cannot be converted

# Apply the conversion functions to the respective columns
df['New Bought Last Month'] = df['Bought Last Month'].apply(convert_bought_last_month)
df['New Dashed MRP'] = df['Dashed MRP'].apply(lambda x: convert_dashed_mrp(x, df['Current MRP']))
df['New Discount (%)'] = df['Discount (%)'].apply(convert_discount)

# Display the dataframe with the new columns
df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['New Bought Last Month'] = df['Bought Last Month'].apply(convert_bought_last_month)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['New Dashed MRP'] = df['Dashed MRP'].apply(lambda x: convert_dashed_mrp(x, df['Current MRP']))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['New Discount (%)

Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery,Brand,Product,Color,RAM,Storage,New Bought Last Month,New Dashed MRP,New Discount (%)
1,"Redmi Note 13 5G (Arctic White, 6GB RAM, 128GB...",3.7,5707,1K+ bought in past month,15999,₹20999,(24% off),No,Redmi,Redmi Note 13 5G,White,6GB,128GB,1000,20999,24.0
3,"Redmi Note 10T 5G (Graphite Black, 6GB RAM, 12...",4.1,101748,0,13479,₹18999,(29% off),No,Redmi,Redmi Note 10T 5G,Black,6GB,128GB,0,18999,29.0


In [72]:
df.to_csv('Filtered-csvs/Xiaomi/amazon_page_10.csv', index=False)

> finally add a column to the dataframe that contains ram : 

```python
iPhone 16
8 GB
iPhone 16 Plus
8 GB
iPhone 14
6 GB
iPhone 14 Plus
6 GB
iPhone 13
4 or 6 GB
iPhone 12
4 GB
iPhone 11
4 GB
iPhone SE (2nd gen.)
3 GB