In [160]:
import pandas as pd
import numpy as np
import re

In [161]:
df = pd.read_csv("Amazon-Samsung-csvs/amazon_samsung_page_10.csv")
df

Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery
0,"Samsung Galaxy Tab S9 FE, S Pen in-Box, 27.69 ...",4.3,90.0,M.R.P:,34999.0,₹44999,(22% off),Yes
1,Isoelite Remote Compatible for Samsung Side Di...,4.1,154.0,M.R.P:,399.0,₹999,(60% off),Yes
2,(Refurbished) Samsung Galaxy M14 4G (Arctic Bl...,4.1,7.0,50+ bought in past month,7999.0,₹8500,(6% off),Yes
3,"Samsung Galaxy A15 5G (Blue Black, 6GB, 128GB ...",3.9,882.0,400+ bought in past month,15499.0,₹19999,(23% off),Yes
4,"Samsung Galaxy Z Flip6 5G AI Smartphone (Blue,...",1.0,1.0,50+ bought in past month,79230.0,₹109999,(28% off),Yes
5,"Samsung Galaxy A15 5G (Blue, 8GB, 128GB Storag...",3.9,882.0,300+ bought in past month,16999.0,₹21499,(21% off),Yes
6,Samsung Galaxy Tab A9+ 27.94 cm (11.0 inch) Di...,3.9,46.0,200+ bought in past month,20999.0,₹32999,(36% off),Yes
7,Samsung Galaxy Z Fold6 5G AI Smartphone (Silve...,4.3,73.0,M.R.P:,164499.0,₹176999,(7% off),Yes
8,Samsung Usb To Type C 1.5 Meter 25W Fast Charg...,3.6,2315.0,700+ bought in past month,199.0,₹699,(72% off),Yes
9,"Samsung 8 kg, 5 star, Eco Bubble Tech, Digital...",4.2,25810.0,4K+ bought in past month,19990.0,₹27000,(26% off),Yes


In [162]:

# Step 1: Update 'Bought Last Month' to 0 where it contains 'M.R.P:' or doesn't contain numbers
df['Bought Last Month'] = df['Bought Last Month'].apply(lambda x: 0 if isinstance(x, str) and ('M.R.P:' in x or not any(char.isdigit() for char in x)) else x)

# Step 2: Remove rows where 'Current MRP' is NaN
df = df[df['Current MRP'].notna()]

# Step 3: Remove rows where 'Product Name' does not contain both 'Samsung' and 'Galaxy'
df = df[df['Product Name'].str.contains('Samsung', case=False) & df['Product Name'].str.contains('Galaxy', case=False)]

# Step 4: Remove rows where 'Rating (Stars)' is NaN
df = df[df['Rating (Stars)'].notna()]
# Display the cleaned dataframe
df


Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery
0,"Samsung Galaxy Tab S9 FE, S Pen in-Box, 27.69 ...",4.3,90,0,34999.0,₹44999,(22% off),Yes
2,(Refurbished) Samsung Galaxy M14 4G (Arctic Bl...,4.1,7,50+ bought in past month,7999.0,₹8500,(6% off),Yes
3,"Samsung Galaxy A15 5G (Blue Black, 6GB, 128GB ...",3.9,882,400+ bought in past month,15499.0,₹19999,(23% off),Yes
4,"Samsung Galaxy Z Flip6 5G AI Smartphone (Blue,...",1.0,1,50+ bought in past month,79230.0,₹109999,(28% off),Yes
5,"Samsung Galaxy A15 5G (Blue, 8GB, 128GB Storag...",3.9,882,300+ bought in past month,16999.0,₹21499,(21% off),Yes
6,Samsung Galaxy Tab A9+ 27.94 cm (11.0 inch) Di...,3.9,46,200+ bought in past month,20999.0,₹32999,(36% off),Yes
7,Samsung Galaxy Z Fold6 5G AI Smartphone (Silve...,4.3,73,0,164499.0,₹176999,(7% off),Yes
8,Samsung Usb To Type C 1.5 Meter 25W Fast Charg...,3.6,2315,700+ bought in past month,199.0,₹699,(72% off),Yes
10,Samsung Galaxy Tab A9+ 27.94 cm (11.0 inch) Di...,4.1,1533,1K+ bought in past month,19931.0,₹27999,(29% off),Yes
11,Magnetic Braided Solo Loop for Samsung Galaxy ...,4.4,3,0,1399.0,₹1700,(18% off),Yes


In [163]:

# Extract the 'Brand' (Samsung is the brand in all rows)
df['Brand'] = 'Samsung'

# Define color options to check against
colors = ['Black', 'Blue', 'White', 'Gray', 'Silver', 'Green', 'Red', 'Pink', 'Gold']

# Extract Product Name (i.e., everything after 'Samsung Galaxy')
df['Product'] = df['Product Name'].apply(lambda x: 'Samsung Galaxy ' + re.search(r'Samsung Galaxy ([^(]+)', x).group(1) if re.search(r'Samsung Galaxy ([^(]+)', x) else None)

# Extract Color (if exists, otherwise default to 'Black')
df['Color'] = df['Product Name'].apply(lambda x: next((color for color in colors if color in x), 'Black'))

# Extract RAM (in the format of '6GB', '8GB', etc.)
df['RAM'] = df['Product Name'].apply(lambda x: re.search(r'(\d+GB)(?=\s*(RAM|GB))', x).group(0) if re.search(r'(\d+GB)(?=\s*(RAM|GB))', x) else None)

# Extract Storage (in the format of '128GB', '256GB', etc.)
df['Storage'] = df['Product Name'].apply(lambda x: re.search(r'(\d+GB)(?=\s*Storage)', x).group(0) if re.search(r'(\d+GB)(?=\s*Storage)', x) else None)

# Remove rows where RAM or Storage is missing
df = df.dropna(subset=['RAM', 'Storage'])

# Display the dataframe with the new columns
df

Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery,Brand,Product,Color,RAM,Storage
4,"Samsung Galaxy Z Flip6 5G AI Smartphone (Blue,...",1.0,1,50+ bought in past month,79230.0,₹109999,(28% off),Yes,Samsung,Samsung Galaxy Z Flip6 5G AI Smartphone,Blue,12GB,256GB
7,Samsung Galaxy Z Fold6 5G AI Smartphone (Silve...,4.3,73,0,164499.0,₹176999,(7% off),Yes,Samsung,Samsung Galaxy Z Fold6 5G AI Smartphone,Silver,12GB,512GB
14,"Samsung Galaxy S20 Ultra (Cosmic Gray, 12GB RA...",4.2,285,0,69999.0,₹99999,(30% off),Yes,Samsung,Samsung Galaxy S20 Ultra,Gray,12GB,128GB
18,"Samsung Galaxy A13 Black, 6GB RAM, 128GB Stora...",4.0,1432,0,13580.0,₹20990,(35% off),Yes,Samsung,"Samsung Galaxy A13 Black, 6GB RAM, 128GB Stora...",Black,6GB,128GB


In [164]:
import pandas as pd
import re

# Function to convert Bought Last Month to a numeric value
def convert_bought_last_month(value):
    if pd.isna(value) or not isinstance(value, str):  # Check for NaN or non-string values
        return 0  # Return 0 if NaN or not a string
    # Extract number from string (handle cases like 5K, 200+)
    match = re.search(r'(\d+)(K|\+)?', value)
    if match:
        number = int(match.group(1))  # Get the number part
        if match.group(2) == 'K':  # If 'K' is present, multiply by 1000
            number *= 1000
        return number
    return 0  # Return 0 if regex doesn't match

# Function to convert Dashed MRP to numeric value
def convert_dashed_mrp(value, current_mrp):
    if pd.isna(value):  # If NaN, use the Current MRP
        return current_mrp
    # Remove ₹ symbol and commas, then convert to integer
    if isinstance(value, str):
        return int(value.replace('₹', '').replace(',', '').strip())
    return current_mrp  # If conversion fails, return current MRP

# Function to convert Discount (%) to numeric, handle NaN as 0
def convert_discount(value):
    if pd.isna(value):  # If NaN, return 0
        return 0
    # Check if the value is a string and contains '%' symbol
    if isinstance(value, str):
        # Extract the number before the '%' sign, ignoring any non-numeric characters
        match = re.search(r'(\d+)', value)
        if match:
            return float(match.group(1))
    return 0  # Return 0 if the value cannot be converted

# Apply the conversion functions to the respective columns
df['New Bought Last Month'] = df['Bought Last Month'].apply(convert_bought_last_month)
df['New Dashed MRP'] = df['Dashed MRP'].apply(lambda x: convert_dashed_mrp(x, df['Current MRP']))
df['New Discount (%)'] = df['Discount (%)'].apply(convert_discount)

# Display the dataframe with the new columns
df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['New Bought Last Month'] = df['Bought Last Month'].apply(convert_bought_last_month)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['New Dashed MRP'] = df['Dashed MRP'].apply(lambda x: convert_dashed_mrp(x, df['Current MRP']))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['New Discount (%)

Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery,Brand,Product,Color,RAM,Storage,New Bought Last Month,New Dashed MRP,New Discount (%)
4,"Samsung Galaxy Z Flip6 5G AI Smartphone (Blue,...",1.0,1,50+ bought in past month,79230.0,₹109999,(28% off),Yes,Samsung,Samsung Galaxy Z Flip6 5G AI Smartphone,Blue,12GB,256GB,50,109999,28.0
7,Samsung Galaxy Z Fold6 5G AI Smartphone (Silve...,4.3,73,0,164499.0,₹176999,(7% off),Yes,Samsung,Samsung Galaxy Z Fold6 5G AI Smartphone,Silver,12GB,512GB,0,176999,7.0
14,"Samsung Galaxy S20 Ultra (Cosmic Gray, 12GB RA...",4.2,285,0,69999.0,₹99999,(30% off),Yes,Samsung,Samsung Galaxy S20 Ultra,Gray,12GB,128GB,0,99999,30.0
18,"Samsung Galaxy A13 Black, 6GB RAM, 128GB Stora...",4.0,1432,0,13580.0,₹20990,(35% off),Yes,Samsung,"Samsung Galaxy A13 Black, 6GB RAM, 128GB Stora...",Black,6GB,128GB,0,20990,35.0


In [165]:
df.to_csv('Filtered-csvs/Samsung/amazon_samsung_page_10.csv', index=False)

> finally add a column to the dataframe that contains ram : 

```python
iPhone 16
8 GB
iPhone 16 Plus
8 GB
iPhone 14
6 GB
iPhone 14 Plus
6 GB
iPhone 13
4 or 6 GB
iPhone 12
4 GB
iPhone 11
4 GB
iPhone SE (2nd gen.)
3 GB