In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the raw laptop listings; the relative path is resolved against the
# notebook's working directory.
df = pd.read_csv("flipkart_laptop.csv")

In [4]:
# Preview the first rows of the raw frame to see which columns need cleaning.
df.head()

Unnamed: 0.1,Unnamed: 0,Product_name,Prices,Description,Reviews
0,0,Acer Aspire 3 Intel Celeron Dual Core - (8 GB/...,"₹15,990",Intel Celeron Dual Core Processor8 GB DDR4 RAM...,3.8
1,1,Acer Aspire 3 Backlit AMD Ryzen 7 Octa Core 77...,"₹38,990",AMD Ryzen 7 Octa Core Processor16 GB DDR4 RAMW...,4.0
2,2,DELL Vostro Intel Core i3 12th Gen 1215U - (8 ...,"₹42,906",Intel Core i3 Processor (12th Gen)8 GB DDR4 RA...,4.4
3,3,HP Victus Intel Core i5 13th Gen 13420H - (16 ...,"₹69,990",Intel Core i5 Processor (13th Gen)16 GB DDR4 R...,4.4
4,4,ASUS Vivobook Go 15 OLED AMD Ryzen 3 Quad Core...,"₹34,990",AMD Ryzen 3 Quad Core Processor8 GB LPDDR5 RAM...,3.9


In [5]:

# Extract key fields from Product_name.
# The titles previewed above look like
#   "<brand/model> <Intel|AMD processor> - (<RAM>/<storage>/<OS>/...) <model> <type>"
# (NOTE(review): layout inferred from the first rows only - confirm on the full file).
# expand=False makes every single-group extract return a Series, with NaN
# wherever the pattern does not match.

# Everything in front of the first "Intel"/"AMD" token.
df["Brand_Model"] = df["Product_name"].str.extract(r"^(.*?)\s(?:Intel|AMD)", expand=False)

# The character class stops at the first "-" or "(", so the capture includes
# the blank that precedes the " - (" separator; strip it so values compare
# and group cleanly.
df["Processor"] = (
    df["Product_name"]
    .str.extract(r"((?:Intel|AMD)[^-\(]*)", expand=False)
    .str.strip()
)

# First "<n> GB" that is immediately followed by "/".
df["RAM"] = df["Product_name"].str.extract(r"(\d+\s*GB)(?=/)", expand=False)

# First "/<n> GB|TB <SSD|HDD|EMMC>" entry. TB capacities and eMMC drives are
# accepted as well, so e.g. "1 TB SSD" no longer comes back as NaN.
df["Storage"] = df["Product_name"].str.extract(
    r"/\s*(\d+\s*(?:GB|TB)\s*(?:SSD|HDD|EMMC))", expand=False
)

df["OS"] = df["Product_name"].str.extract(r"(Windows\s\d+\s\w+|DOS|Ubuntu)", expand=False)

# First token after a closing parenthesis; it must be followed by whitespace,
# so a token at the very end of the title is not captured.
df["Model"] = df["Product_name"].str.extract(r"\)\s*([A-Za-z0-9\-]+)\s", expand=False)

# Alternatives are ordered most specific first so plain "Laptop" is the fallback.
df["Type"] = df["Product_name"].str.extract(
    r"(Thin and Light Laptop|Gaming Laptop|Laptop)", expand=False
)

# Turn the price text into numbers in a single pass:
#   1. cast to str so the .str accessor is always available,
#   2. remove the mis-decoded rupee sign (the generic filter in step 3 would
#      strip it as well; it is kept as an explicit, self-documenting step),
#   3. drop every character that is not a digit or a dot (commas, spaces,
#      invisible unicode),
#   4. convert to numeric, turning anything unparseable into NaN.
df['Prices'] = pd.to_numeric(
    df['Prices']
    .astype(str)
    .str.replace("â‚¹", "", regex=False)
    .str.replace(r'[^\d.]', '', regex=True),
    errors='coerce',
)


# Tidy the free-text description. There is no astype(str) here: the .str
# accessor passes missing values through as NaN, so the later fillna can
# still replace them.
df['Description'] = (
    df['Description']
    .str.replace(r'\s+', ' ', regex=True)                  # collapse whitespace runs into one space
    .str.replace(r'(?<=[a-z])(?=[A-Z])', ' ', regex=True)  # split lower->upper joins ("aB" -> "a B")
    .str.strip()                                           # trim both ends
)
# Only lowercase->uppercase boundaries are split; joins such as "Processor8"
# or "RAMW" (letter->digit, upper->upper) are left untouched.

# Optional: remove stop words or apply stemming/lemmatization (for NLP tasks)

# Drop rows where Product_name is missing, then the column itself: the
# information needed from it has already been extracted into separate fields.
df = df.dropna(subset=['Product_name']).drop(columns=['Product_name'])

# 1. Drop rows where no brand/model could be extracted
df = df.dropna(subset=['Brand_Model'])

# 2. Rebuild a 1-based Sr.no column on a fresh, gap-free index
df = df.reset_index(drop=True)
df.insert(0, 'Sr.no', range(1, len(df) + 1))

# 3. Drop every leftover index column ('Unnamed: 0', 'Unnamed: 0.1', ...)
#    explicitly instead of relying on the reorder step to discard them
df = df.drop(columns=[col for col in df.columns if str(col).startswith('Unnamed')])

# 4. Fill remaining gaps. Every extracted text column gets the same
#    placeholder, including RAM and Storage, so no NaN reaches the CSV.
#    Prices and Reviews use 0 as a "missing" sentinel - exclude zeros before
#    computing averages on these columns.
df = df.fillna({
    'Brand_Model': 'Not available',
    'Prices': 0,
    'RAM': 'Not available',
    'Storage': 'Not available',
    'Processor': 'Not available',
    'OS': 'Not available',
    'Model': 'Not available',
    'Type': 'Not available',
    'Description': 'No Description',
    'Reviews': 0
})

# 5. Reorder columns; columns not listed here are dropped, and listed columns
#    that do not exist are skipped
desired_order = ['Sr.no', 'Brand_Model', 'Prices', 'RAM', 'Storage', 'Processor', 'OS', 'Model', 'Type', 'Description', 'Reviews']
df = df[[col for col in desired_order if col in df.columns]]

# Save cleaned data (index=False: Sr.no already numbers the rows)
df.to_csv("flipkart_laptop_cleaned.csv", index=False)


print("✅ Cleaned data saved to 'flipkart_laptop_cleaned.csv'")



✅ Cleaned data saved to 'flipkart_laptop_cleaned.csv'


In [6]:
# Preview the cleaned frame to spot-check the extracted columns.
df.head()

Unnamed: 0,Sr.no,Brand_Model,Prices,RAM,Storage,Processor,OS,Model,Type,Description,Reviews
0,1,Acer Aspire 3,15990,8 GB,256 GB SSD,Intel Celeron Dual Core,Windows 11 Home,A311-45,Not available,Intel Celeron Dual Core Processor8 GB DDR4 RAM...,3.8
1,2,Acer Aspire 3 Backlit,38990,16 GB,512 GB SSD,AMD Ryzen 7 Octa Core 7730U,Windows 11 Home,A325-42,Not available,AMD Ryzen 7 Octa Core Processor16 GB DDR4 RAMW...,4.0
2,3,DELL Vostro,42906,8 GB,512 GB SSD,Intel Core i3 12th Gen 1215U,Windows 11 Home,3520,Laptop,Intel Core i3 Processor (12th Gen)8 GB DDR4 RA...,4.4
3,4,HP Victus,69990,16 GB,512 GB SSD,Intel Core i5 13th Gen 13420H,Windows 11 Home,Not available,Not available,Intel Core i5 Processor (13th Gen)16 GB DDR4 R...,4.4
4,5,ASUS Vivobook Go 15 OLED,34990,8 GB,512 GB SSD,AMD Ryzen 3 Quad Core 7320U,Windows 11 Home,Not available,Not available,AMD Ryzen 3 Quad Core Processor8 GB LPDDR5 RAM...,3.9
