In [31]:
import pandas as pd
import numpy as np
import re

In [32]:
df = pd.read_csv("Amazon-Vivo-csvs/amazon_page_5.csv")
df

Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery
0,"Samsung Galaxy M05 (Mint Green, 4GB RAM, 64 GB...",3.9,731.0,5K+ bought in past month,7999.0,₹9999,(20% off),No
1,"Redmi Note 13 5G (Stealth Black, 6GB RAM, 128G...",3.7,5707.0,500+ bought in past month,15999.0,₹20999,(24% off),No
2,"Vivo V40 Pro 5G AI Smartphone (Ganges Blue, 8G...",4.1,28.0,200+ bought in past month,45965.0,₹54999,(16% off),No
3,"vivo Y28s 5G (Twinkling Purple, 4GB RAM, 128GB...",3.7,74.0,100+ bought in past month,13499.0,₹17999,(25% off),No
4,"Vivo Y27 (Garden Green, 6GB RAM, 128GB Storage...",4.0,953.0,M.R.P:,10999.0,₹18999,(42% off),No
5,"Vivo V29 Pro 5G - 8GB RAM, 256GB Storage, 4600...",4.0,1.0,M.R.P:,33249.0,₹44999,(26% off),No
6,"vivo Y18e (Space Black, 4GB RAM, 64GB Storage)...",3.9,153.0,100+ bought in past month,7999.0,₹11999,(33% off),No
7,"vivo Y28s 5G(Twinkling Purple, 8GB RAM, 128GB ...",3.7,74.0,100+ bought in past month,16499.0,₹21999,(25% off),No
8,"iQOO Z9 Lite 5G (Mocha Brown, 6GB RAM, 128GB S...",4.0,1973.0,5K+ bought in past month,11498.0,₹15499,(26% off),No
9,"Samsung Galaxy M05 (Mint Green, 4GB RAM, 64 GB...",3.9,731.0,5K+ bought in past month,7999.0,₹9999,(20% off),No


In [33]:

# Step 1: Update 'Bought Last Month' to 0 where it contains 'M.R.P:' or doesn't contain numbers
df['Bought Last Month'] = df['Bought Last Month'].apply(lambda x: 0 if isinstance(x, str) and ('M.R.P:' in x or not any(char.isdigit() for char in x)) else x)

# Step 2: Remove rows where 'Current MRP' is NaN
df = df[df['Current MRP'].notna()]

# Step 3: Remove rows where 'Product Name' does not contain both 'Samsung' and 'Galaxy'
df = df[df['Product Name'].str.contains('Vivo', case=False)]

# Step 4: Remove rows where 'Rating (Stars)' is NaN
df = df[df['Rating (Stars)'].notna()]
# Display the cleaned dataframe
df


Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery
2,"Vivo V40 Pro 5G AI Smartphone (Ganges Blue, 8G...",4.1,28,200+ bought in past month,45965.0,₹54999,(16% off),No
3,"vivo Y28s 5G (Twinkling Purple, 4GB RAM, 128GB...",3.7,74,100+ bought in past month,13499.0,₹17999,(25% off),No
4,"Vivo Y27 (Garden Green, 6GB RAM, 128GB Storage...",4.0,953,0,10999.0,₹18999,(42% off),No
5,"Vivo V29 Pro 5G - 8GB RAM, 256GB Storage, 4600...",4.0,1,0,33249.0,₹44999,(26% off),No
6,"vivo Y18e (Space Black, 4GB RAM, 64GB Storage)...",3.9,153,100+ bought in past month,7999.0,₹11999,(33% off),No
7,"vivo Y28s 5G(Twinkling Purple, 8GB RAM, 128GB ...",3.7,74,100+ bought in past month,16499.0,₹21999,(25% off),No
12,"Vivo V40 5G Smartphone (Titanium Grey, 8GB RAM...",4.1,24,100+ bought in past month,34781.0,₹42999,(19% off),No
15,"vivo Y18e (Gem Green, 4GB RAM, 64GB Storage) w...",3.9,153,100+ bought in past month,7999.0,₹11999,(33% off),No
16,"vivo Y18 (Space Black, 4GB RAM, 128GB Storage)...",3.8,37,0,9499.0,₹13999,(32% off),No
17,"Vivo V29 Pro 5G - 8GB RAM, 256GB Storage, 4600...",4.0,1,0,31600.0,₹39999,(21% off),No


In [34]:
# Define color options to check against
colors = ['Black', 'Blue', 'White', 'Gray', 'Silver', 'Green', 'Red', 'Pink', 'Gold']

# Step 1: Define Brand based on Product Name
df['Brand'] = df['Product Name'].apply(lambda x: 'Vivo' if 'Vivo' in x else 'Unknown')

# Step 2: Extract Product Name (i.e., everything after 'Vivo')
df['Product'] = df['Product Name'].apply(
    lambda x: re.search(r'Vivo ([^(]+)', x).group(0) if re.search(r'Vivo ([^(]+)', x) else None
)

# Step 3: Extract Color (if exists, otherwise default to 'Black')
df['Color'] = df['Product Name'].apply(
    lambda x: next((color for color in colors if color in x), 'Black')
)

# Step 4: Extract RAM (in the format of '6GB', '8GB', etc.)
df['RAM'] = df['Product Name'].apply(
    lambda x: re.search(r'(\d+GB)(?=\s*RAM)', x).group(0) if re.search(r'(\d+GB)(?=\s*RAM)', x) else None
)

# Step 5: Extract Storage (in the format of '128GB', '256GB', etc.)
df['Storage'] = df['Product Name'].apply(
    lambda x: re.search(r'(\d+GB)(?=\s*Storage)', x).group(0) if re.search(r'(\d+GB)(?=\s*Storage)', x) else None
)

# Step 6: Remove rows where RAM or Storage is missing
df = df.dropna(subset=['RAM', 'Storage'])

# Display the dataframe with the new columns
df


Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery,Brand,Product,Color,RAM,Storage
2,"Vivo V40 Pro 5G AI Smartphone (Ganges Blue, 8G...",4.1,28,200+ bought in past month,45965.0,₹54999,(16% off),No,Vivo,Vivo V40 Pro 5G AI Smartphone,Blue,8GB,256GB
3,"vivo Y28s 5G (Twinkling Purple, 4GB RAM, 128GB...",3.7,74,100+ bought in past month,13499.0,₹17999,(25% off),No,Unknown,,Black,4GB,128GB
4,"Vivo Y27 (Garden Green, 6GB RAM, 128GB Storage...",4.0,953,0,10999.0,₹18999,(42% off),No,Vivo,Vivo Y27,Green,6GB,128GB
5,"Vivo V29 Pro 5G - 8GB RAM, 256GB Storage, 4600...",4.0,1,0,33249.0,₹44999,(26% off),No,Vivo,"Vivo V29 Pro 5G - 8GB RAM, 256GB Storage, 4600...",Black,8GB,256GB
6,"vivo Y18e (Space Black, 4GB RAM, 64GB Storage)...",3.9,153,100+ bought in past month,7999.0,₹11999,(33% off),No,Unknown,,Black,4GB,64GB
7,"vivo Y28s 5G(Twinkling Purple, 8GB RAM, 128GB ...",3.7,74,100+ bought in past month,16499.0,₹21999,(25% off),No,Unknown,,Black,8GB,128GB
12,"Vivo V40 5G Smartphone (Titanium Grey, 8GB RAM...",4.1,24,100+ bought in past month,34781.0,₹42999,(19% off),No,Vivo,Vivo V40 5G Smartphone,Black,8GB,256GB
15,"vivo Y18e (Gem Green, 4GB RAM, 64GB Storage) w...",3.9,153,100+ bought in past month,7999.0,₹11999,(33% off),No,Unknown,,Green,4GB,64GB
16,"vivo Y18 (Space Black, 4GB RAM, 128GB Storage)...",3.8,37,0,9499.0,₹13999,(32% off),No,Unknown,,Black,4GB,128GB
17,"Vivo V29 Pro 5G - 8GB RAM, 256GB Storage, 4600...",4.0,1,0,31600.0,₹39999,(21% off),No,Vivo,"Vivo V29 Pro 5G - 8GB RAM, 256GB Storage, 4600...",Blue,8GB,256GB


In [35]:
# Function to convert Bought Last Month to a numeric value
def convert_bought_last_month(value):
    if pd.isna(value) or not isinstance(value, str):  # Check for NaN or non-string values
        return 0  # Return 0 if NaN or not a string
    # Extract number from string (handle cases like 5K, 200+)
    match = re.search(r'(\d+)(K|\+)?', value)
    if match:
        number = int(match.group(1))  # Get the number part
        if match.group(2) == 'K':  # If 'K' is present, multiply by 1000
            number *= 1000
        return number
    return 0  # Return 0 if regex doesn't match

# Function to convert Dashed MRP to numeric value
def convert_dashed_mrp(value, current_mrp):
    if pd.isna(value):  # If NaN, use the Current MRP
        return current_mrp
    # Remove ₹ symbol and commas, then convert to integer
    if isinstance(value, str):
        return int(value.replace('₹', '').replace(',', '').strip())
    return current_mrp  # If conversion fails, return current MRP

# Function to convert Discount (%) to numeric, handle NaN as 0
def convert_discount(value):
    if pd.isna(value):  # If NaN, return 0
        return 0
    # Check if the value is a string and contains '%' symbol
    if isinstance(value, str):
        # Extract the number before the '%' sign, ignoring any non-numeric characters
        match = re.search(r'(\d+)', value)
        if match:
            return float(match.group(1))
    return 0  # Return 0 if the value cannot be converted

# Apply the conversion functions to the respective columns
df['New Bought Last Month'] = df['Bought Last Month'].apply(convert_bought_last_month)
df['New Dashed MRP'] = df['Dashed MRP'].apply(lambda x: convert_dashed_mrp(x, df['Current MRP']))
df['New Discount (%)'] = df['Discount (%)'].apply(convert_discount)

# Display the dataframe with the new columns
df


Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery,Brand,Product,Color,RAM,Storage,New Bought Last Month,New Dashed MRP,New Discount (%)
2,"Vivo V40 Pro 5G AI Smartphone (Ganges Blue, 8G...",4.1,28,200+ bought in past month,45965.0,₹54999,(16% off),No,Vivo,Vivo V40 Pro 5G AI Smartphone,Blue,8GB,256GB,200,54999,16.0
3,"vivo Y28s 5G (Twinkling Purple, 4GB RAM, 128GB...",3.7,74,100+ bought in past month,13499.0,₹17999,(25% off),No,Unknown,,Black,4GB,128GB,100,17999,25.0
4,"Vivo Y27 (Garden Green, 6GB RAM, 128GB Storage...",4.0,953,0,10999.0,₹18999,(42% off),No,Vivo,Vivo Y27,Green,6GB,128GB,0,18999,42.0
5,"Vivo V29 Pro 5G - 8GB RAM, 256GB Storage, 4600...",4.0,1,0,33249.0,₹44999,(26% off),No,Vivo,"Vivo V29 Pro 5G - 8GB RAM, 256GB Storage, 4600...",Black,8GB,256GB,0,44999,26.0
6,"vivo Y18e (Space Black, 4GB RAM, 64GB Storage)...",3.9,153,100+ bought in past month,7999.0,₹11999,(33% off),No,Unknown,,Black,4GB,64GB,100,11999,33.0
7,"vivo Y28s 5G(Twinkling Purple, 8GB RAM, 128GB ...",3.7,74,100+ bought in past month,16499.0,₹21999,(25% off),No,Unknown,,Black,8GB,128GB,100,21999,25.0
12,"Vivo V40 5G Smartphone (Titanium Grey, 8GB RAM...",4.1,24,100+ bought in past month,34781.0,₹42999,(19% off),No,Vivo,Vivo V40 5G Smartphone,Black,8GB,256GB,100,42999,19.0
15,"vivo Y18e (Gem Green, 4GB RAM, 64GB Storage) w...",3.9,153,100+ bought in past month,7999.0,₹11999,(33% off),No,Unknown,,Green,4GB,64GB,100,11999,33.0
16,"vivo Y18 (Space Black, 4GB RAM, 128GB Storage)...",3.8,37,0,9499.0,₹13999,(32% off),No,Unknown,,Black,4GB,128GB,0,13999,32.0
17,"Vivo V29 Pro 5G - 8GB RAM, 256GB Storage, 4600...",4.0,1,0,31600.0,₹39999,(21% off),No,Vivo,"Vivo V29 Pro 5G - 8GB RAM, 256GB Storage, 4600...",Blue,8GB,256GB,0,39999,21.0


In [36]:
df.to_csv('Filtered-csvs/Vivo/amazon_page_5.csv', index=False)

> finally add a column to the dataframe that contains ram : 

```python
iPhone 16
8 GB
iPhone 16 Plus
8 GB
iPhone 14
6 GB
iPhone 14 Plus
6 GB
iPhone 13
4 or 6 GB
iPhone 12
4 GB
iPhone 11
4 GB
iPhone SE (2nd gen.)
3 GB