In [68]:
import pandas as pd
import numpy as np
import re

In [69]:
df = pd.read_csv("../Raw-Csvs/Amazon-Iphone-csvs/amazon_page_10.csv")
df

Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery
0,"Samsung Galaxy M05 (Mint Green, 4GB RAM, 64 GB...",3.9,770.0,5K+ bought in past month,6999,₹9999,(30% off),Yes
1,Samsung Galaxy S23 Ultra 5G AI Smartphone (Gre...,4.4,2505.0,5K+ bought in past month,79999,₹149999,(47% off),Yes
2,Apple iPhone 14 (256 GB) - Starlight,4.6,98.0,M.R.P:,69900,₹89900,(22% off),Yes
3,Mobile Phone 6 Silver 64GB Storage 2 GB RAM Co...,,,M.R.P:,13999,₹29999,(53% off),Yes
4,Apple iPhone 14 Plus (512 GB) - Blue,3.7,14.0,M.R.P:,89900,₹119900,(25% off),Yes
5,"realme GT 6T 5G (Razor Green,12GB RAM+256GB St...",4.3,182.0,1K+ bought in past month,35998,₹37999,(5% off),Yes
6,Apple iPhone 14 Plus (512 GB) - Midnight,4.2,8.0,M.R.P:,89900,₹119900,(25% off),Yes
7,"OnePlus 12R (Iron Gray, 8 GB RAM, 256 GB Storage)",4.2,416.0,2K+ bought in past month,38999,₹42999,(9% off),Yes
8,Apple iPhone 14 Plus (512 GB) - Starlight,4.3,9.0,M.R.P:,89900,₹119900,(25% off),Yes
9,"OnePlus Nord CE 3 Lite 5G (Pastel Lime, 8GB RA...",4.2,22285.0,2K+ bought in past month,15663,₹19999,(22% off),Yes


In [70]:
# Step 1: Filter only Apple iPhone data
df = df[df['Product Name'].str.contains('Apple', case=False) & df['Product Name'].str.contains('iPhone', case=False)]

# Step 2: Extract details from Product Name using regex
def extract_details(product_name):
    # Regular expression pattern to capture brand, product name, storage, and color
    pattern = r"([A-Za-z]+)\s([A-Za-z0-9]+(?:\s[A-Za-z0-9]+)*)\s\((\d+)\s?GB\)\s?-\s?([A-Za-z\s]+)"
    match = re.match(pattern, product_name)
    
    if match:
        brand = match.group(1)
        product = match.group(2)
        storage = match.group(3) + "GB"
        color = match.group(4).strip()
        return [brand, product, storage, color]
    return [None, None, None, None]

# Apply the function to extract details and assign to relevant columns
df[['Brand', 'Product', 'Storage', 'Color']] = df['Product Name'].apply(lambda x: pd.Series(extract_details(x)))

# Step 3: Convert "Bought Last Month" to numeric, handle NaN as 0
def convert_bought_last_month(value):
    if pd.isna(value):  # Check for NaN values and return 0
        return 0
    # Extract number from string (handle cases like 5K, 200+)
    match = re.search(r'(\d+)(K|\+)?', value)
    if match:
        number = int(match.group(1))  # Get the number part
        if match.group(2) == 'K':  # If 'K' is present, multiply by 1000
            number *= 1000
        return number
    return 0  # Return 0 in case of invalid data

# Step 4: Convert "Dashed MRP" to numeric, handle NaN and assign to Current MRP
def convert_dashed_mrp(value, current_mrp):
    if pd.isna(value):  # If NaN, use the Current MRP
        return current_mrp
    # Remove ₹ symbol and commas, then convert to integer
    if isinstance(value, str):
        return int(value.replace('₹', '').replace(',', '').strip())
    return current_mrp  # If conversion fails, return current MRP

# Step 5: Convert "Discount (%)" to numeric, handle NaN as 0
def convert_discount(value):
    if pd.isna(value):  # If NaN, return 0
        return 0
    # Check if the value is a string and contains '%' symbol
    if isinstance(value, str):
        # Extract the number before the '%' sign, ignoring any non-numeric characters
        match = re.search(r'(\d+)', value)
        if match:
            return float(match.group(1))
    return 0  # Return 0 if the value cannot be converted



# Step 6: Drop rows where "Current MRP" is NaN
df = df.dropna(subset=['Current MRP'])

# Step 7: Apply conversions
df['New Bought Last Month'] = df['Bought Last Month'].apply(convert_bought_last_month)
df['New Dashed MRP'] = df.apply(lambda row: convert_dashed_mrp(row['Dashed MRP'], row['Current MRP']), axis=1)
df['New Discount (%)'] = df['Discount (%)'].apply(convert_discount)

# Step 8: Map "Free Delivery" values to 0 (No) and 1 (Yes)
df['Free Delivery'] = df['Free Delivery'].map({'No': 0, 'Yes': 1})

# Step 9: Replace missing "Color" with "Red"
df['Color'] = df['Color'].fillna('Red')

# Step 10: Show the cleaned and processed data
# print(df[['Brand', 'Product', 'Storage', 'Color', 'New Bought Last Month', 'New Dashed MRP', 'New Discount (%)', 'Free Delivery']])

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[['Brand', 'Product', 'Storage', 'Color']] = df['Product Name'].apply(lambda x: pd.Series(extract_details(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[['Brand', 'Product', 'Storage', 'Color']] = df['Product Name'].apply(lambda x: pd.Series(extract_details(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-

Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery,Brand,Product,Storage,Color,New Bought Last Month,New Dashed MRP,New Discount (%)
2,Apple iPhone 14 (256 GB) - Starlight,4.6,98,M.R.P:,69900,₹89900,(22% off),1,Apple,iPhone 14,256GB,Starlight,0,89900,22.0
4,Apple iPhone 14 Plus (512 GB) - Blue,3.7,14,M.R.P:,89900,₹119900,(25% off),1,Apple,iPhone 14 Plus,512GB,Blue,0,119900,25.0
6,Apple iPhone 14 Plus (512 GB) - Midnight,4.2,8,M.R.P:,89900,₹119900,(25% off),1,Apple,iPhone 14 Plus,512GB,Midnight,0,119900,25.0
8,Apple iPhone 14 Plus (512 GB) - Starlight,4.3,9,M.R.P:,89900,₹119900,(25% off),1,Apple,iPhone 14 Plus,512GB,Starlight,0,119900,25.0
17,Apple iPhone 12 Pro Max (128GB) - Pacific Blue,4.6,446,M.R.P:,109000,₹119900,(9% off),1,Apple,iPhone 12 Pro Max,128GB,Pacific Blue,0,119900,9.0


In [71]:

# Function to determine RAM based on product name
def get_ram(product_name):
    if "iPhone 16 Pro" in product_name or "iPhone 16" in product_name:
        return "8 GB"
    elif "iPhone 15 Plus" in product_name or "iPhone 15" in product_name:
        return "6 GB"
    elif "iPhone 14 Plus" in product_name or "iPhone 14" in product_name:
        return "6 GB"
    elif "iPhone 13" in product_name:
        return "4 or 6 GB"
    elif "iPhone 12" in product_name:
        return "4 GB"
    elif "iPhone 11" in product_name:
        return "4 GB"
    elif "iPhone SE (2nd gen.)" in product_name:
        return "3 GB"
    else:
        return None  # Default if not matched

# Apply the function to create a new 'RAM' column
df["RAM"] = df["Product Name"].apply(get_ram)

# Display the updated DataFrame
df


Unnamed: 0,Product Name,Rating (Stars),Number of Reviews,Bought Last Month,Current MRP,Dashed MRP,Discount (%),Free Delivery,Brand,Product,Storage,Color,New Bought Last Month,New Dashed MRP,New Discount (%),RAM
2,Apple iPhone 14 (256 GB) - Starlight,4.6,98,M.R.P:,69900,₹89900,(22% off),1,Apple,iPhone 14,256GB,Starlight,0,89900,22.0,6 GB
4,Apple iPhone 14 Plus (512 GB) - Blue,3.7,14,M.R.P:,89900,₹119900,(25% off),1,Apple,iPhone 14 Plus,512GB,Blue,0,119900,25.0,6 GB
6,Apple iPhone 14 Plus (512 GB) - Midnight,4.2,8,M.R.P:,89900,₹119900,(25% off),1,Apple,iPhone 14 Plus,512GB,Midnight,0,119900,25.0,6 GB
8,Apple iPhone 14 Plus (512 GB) - Starlight,4.3,9,M.R.P:,89900,₹119900,(25% off),1,Apple,iPhone 14 Plus,512GB,Starlight,0,119900,25.0,6 GB
17,Apple iPhone 12 Pro Max (128GB) - Pacific Blue,4.6,446,M.R.P:,109000,₹119900,(9% off),1,Apple,iPhone 12 Pro Max,128GB,Pacific Blue,0,119900,9.0,4 GB


In [72]:
df = df[df["RAM"].notna()]

In [73]:
df.to_csv('../Filtered-csvs/Iphone/amazon_page_10.csv', index=False)

> finally add a column to the dataframe that contains ram : 

```python
iPhone 16
8 GB
iPhone 16 Plus
8 GB
iPhone 14
6 GB
iPhone 14 Plus
6 GB
iPhone 13
4 or 6 GB
iPhone 12
4 GB
iPhone 11
4 GB
iPhone SE (2nd gen.)
3 GB