# Part-2: Data Cleaning

In [1]:
import numpy as np
import pandas as pd

import re

In [2]:
df = pd.read_csv('data/laptops_flipkart.csv')

df.head()

Unnamed: 0,product_title,product_price,product_rating,product_review,product_feature
0,ASUS Chromebook Intel Celeron Dual Core N4500 ...,"₹13,990",3.8,"2,594 Ratings & 228 Reviews",Intel Celeron Dual Core Processor4 GB LPDDR4X ...
1,Lenovo Chromebook MediaTek Kompanio 520 - (8 G...,"₹23,850",3.8,"1,463 Ratings & 125 Reviews",MediaTek Kompanio 520 Processor8 GB LPDDR4X RA...
2,"ASUS Vivobook 15, with Backlit Keyboard, Intel...","₹48,990",4.2,"7,525 Ratings & 419 Reviews",Intel Core i5 Processor (12th Gen)16 GB DDR4 R...
3,HP Victus Intel Core i5 12th Gen 12450H - (16 ...,"₹62,990",4.4,61 Ratings & 3 Reviews,Intel Core i5 Processor (12th Gen)16 GB DDR4 R...
4,Acer Swift Go 14 TouchScreen AI PC Intel Core ...,"₹84,990",4.1,88 Ratings & 9 Reviews,Intel Core Ultra 7 Processor16 GB LPDDR5X RAMW...


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 888 entries, 0 to 887
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   product_title    888 non-null    object 
 1   product_price    887 non-null    object 
 2   product_rating   709 non-null    float64
 3   product_review   709 non-null    object 
 4   product_feature  888 non-null    object 
dtypes: float64(1), object(4)
memory usage: 34.8+ KB


## Data Extraction

### **01. Brand**

In [4]:
# Extract the brand name as the first word from the 'Title' column
# Store it in a new column called 'brand' for brand-level analysis

In [5]:
# Brand

# Extracting the brand using regex
df['brand'] = df['product_title'].str.extract(r'^\s*([A-Za-z]+)')

df.head()

Unnamed: 0,product_title,product_price,product_rating,product_review,product_feature,brand
0,ASUS Chromebook Intel Celeron Dual Core N4500 ...,"₹13,990",3.8,"2,594 Ratings & 228 Reviews",Intel Celeron Dual Core Processor4 GB LPDDR4X ...,ASUS
1,Lenovo Chromebook MediaTek Kompanio 520 - (8 G...,"₹23,850",3.8,"1,463 Ratings & 125 Reviews",MediaTek Kompanio 520 Processor8 GB LPDDR4X RA...,Lenovo
2,"ASUS Vivobook 15, with Backlit Keyboard, Intel...","₹48,990",4.2,"7,525 Ratings & 419 Reviews",Intel Core i5 Processor (12th Gen)16 GB DDR4 R...,ASUS
3,HP Victus Intel Core i5 12th Gen 12450H - (16 ...,"₹62,990",4.4,61 Ratings & 3 Reviews,Intel Core i5 Processor (12th Gen)16 GB DDR4 R...,HP
4,Acer Swift Go 14 TouchScreen AI PC Intel Core ...,"₹84,990",4.1,88 Ratings & 9 Reviews,Intel Core Ultra 7 Processor16 GB LPDDR5X RAMW...,Acer


In [6]:
df['brand'].value_counts()

brand
HP           188
SAMSUNG      153
Lenovo       150
ASUS         141
Acer          91
DELL          59
MSI           43
CHUWI         22
Apple         18
Infinix        9
Ultimus        4
Colorful       4
Primebook      3
MICROSOFT      2
Thomson        1
Name: count, dtype: int64

### **02. Price**

In [7]:
# Create a new column to classify products into price tiers (e.g., Budget, Mid-range, Premium)
# Helps segment products for pricing strategy or market positioning analysis

In [8]:
# Price

df['price'] = df['product_price'].str.replace(r'[^\d.]', '', regex=True)
df['price'] = pd.to_numeric(df['price'], errors='coerce')

df.head()

Unnamed: 0,product_title,product_price,product_rating,product_review,product_feature,brand,price
0,ASUS Chromebook Intel Celeron Dual Core N4500 ...,"₹13,990",3.8,"2,594 Ratings & 228 Reviews",Intel Celeron Dual Core Processor4 GB LPDDR4X ...,ASUS,13990.0
1,Lenovo Chromebook MediaTek Kompanio 520 - (8 G...,"₹23,850",3.8,"1,463 Ratings & 125 Reviews",MediaTek Kompanio 520 Processor8 GB LPDDR4X RA...,Lenovo,23850.0
2,"ASUS Vivobook 15, with Backlit Keyboard, Intel...","₹48,990",4.2,"7,525 Ratings & 419 Reviews",Intel Core i5 Processor (12th Gen)16 GB DDR4 R...,ASUS,48990.0
3,HP Victus Intel Core i5 12th Gen 12450H - (16 ...,"₹62,990",4.4,61 Ratings & 3 Reviews,Intel Core i5 Processor (12th Gen)16 GB DDR4 R...,HP,62990.0
4,Acer Swift Go 14 TouchScreen AI PC Intel Core ...,"₹84,990",4.1,88 Ratings & 9 Reviews,Intel Core Ultra 7 Processor16 GB LPDDR5X RAMW...,Acer,84990.0


### **03. Ratings and Reviews**

In [9]:
# Convert the 'Rating' column from string to numeric
# Handle any non-numeric or missing rating values gracefully

In [10]:
# product review to number of ratings and reviews

df['num_of_ratings'] = df['product_review'].str.extract(r'([\d,]+) Ratings')[0].str.replace(',', '', regex=True).astype(float)
df['num_of_reviews'] = df['product_review'].str.extract(r'([\d,]+) Reviews')[0].str.replace(',', '', regex=True).astype(float)

df.head()

Unnamed: 0,product_title,product_price,product_rating,product_review,product_feature,brand,price,num_of_ratings,num_of_reviews
0,ASUS Chromebook Intel Celeron Dual Core N4500 ...,"₹13,990",3.8,"2,594 Ratings & 228 Reviews",Intel Celeron Dual Core Processor4 GB LPDDR4X ...,ASUS,13990.0,2594.0,228.0
1,Lenovo Chromebook MediaTek Kompanio 520 - (8 G...,"₹23,850",3.8,"1,463 Ratings & 125 Reviews",MediaTek Kompanio 520 Processor8 GB LPDDR4X RA...,Lenovo,23850.0,1463.0,125.0
2,"ASUS Vivobook 15, with Backlit Keyboard, Intel...","₹48,990",4.2,"7,525 Ratings & 419 Reviews",Intel Core i5 Processor (12th Gen)16 GB DDR4 R...,ASUS,48990.0,7525.0,419.0
3,HP Victus Intel Core i5 12th Gen 12450H - (16 ...,"₹62,990",4.4,61 Ratings & 3 Reviews,Intel Core i5 Processor (12th Gen)16 GB DDR4 R...,HP,62990.0,61.0,3.0
4,Acer Swift Go 14 TouchScreen AI PC Intel Core ...,"₹84,990",4.1,88 Ratings & 9 Reviews,Intel Core Ultra 7 Processor16 GB LPDDR5X RAMW...,Acer,84990.0,88.0,9.0


### **04. Processor** 

In [11]:
# Extract processor information from the 'Specs' or 'Title' column
# Useful to analyze the distribution of CPU types (e.g., i3, i5, Ryzen)

In [12]:
# Regex pattern
processor_regex = r'(?:MediaTek|AMD|Intel|Apple)[\s\w]+Processor'

# Extract processor as a string instead of a list
df['processor'] = df['product_feature'].apply(lambda x: ' '.join(re.findall(processor_regex, x)))

df.head()

Unnamed: 0,product_title,product_price,product_rating,product_review,product_feature,brand,price,num_of_ratings,num_of_reviews,processor
0,ASUS Chromebook Intel Celeron Dual Core N4500 ...,"₹13,990",3.8,"2,594 Ratings & 228 Reviews",Intel Celeron Dual Core Processor4 GB LPDDR4X ...,ASUS,13990.0,2594.0,228.0,Intel Celeron Dual Core Processor
1,Lenovo Chromebook MediaTek Kompanio 520 - (8 G...,"₹23,850",3.8,"1,463 Ratings & 125 Reviews",MediaTek Kompanio 520 Processor8 GB LPDDR4X RA...,Lenovo,23850.0,1463.0,125.0,MediaTek Kompanio 520 Processor
2,"ASUS Vivobook 15, with Backlit Keyboard, Intel...","₹48,990",4.2,"7,525 Ratings & 419 Reviews",Intel Core i5 Processor (12th Gen)16 GB DDR4 R...,ASUS,48990.0,7525.0,419.0,Intel Core i5 Processor
3,HP Victus Intel Core i5 12th Gen 12450H - (16 ...,"₹62,990",4.4,61 Ratings & 3 Reviews,Intel Core i5 Processor (12th Gen)16 GB DDR4 R...,HP,62990.0,61.0,3.0,Intel Core i5 Processor
4,Acer Swift Go 14 TouchScreen AI PC Intel Core ...,"₹84,990",4.1,88 Ratings & 9 Reviews,Intel Core Ultra 7 Processor16 GB LPDDR5X RAMW...,Acer,84990.0,88.0,9.0,Intel Core Ultra 7 Processor


In [13]:
df['processor'].value_counts()

processor
Intel Core i5 Processor                177
Intel Core i3 Processor                145
Intel Celeron Dual Core Processor      118
Intel Core Ultra 7 Processor           114
AMD Ryzen 5 Hexa Core Processor         52
Intel Core i7 Processor                 49
Intel Celeron Quad Core Processor       38
MediaTek Kompanio 520 Processor         37
AMD Ryzen 7 Octa Core Processor         27
AMD Ryzen 3 Quad Core Processor         24
AMD Ryzen 5 Quad Core Processor         17
Intel Core Ultra 5 Processor            14
AMD Ryzen 3 Dual Core Processor         13
Intel Core i9 Processor                  9
Apple M3 Processor                       6
AMD Athlon Dual Core Processor           6
Apple M4 Processor                       6
Apple M2 Processor                       5
Intel Core 5 Processor                   5
Intel Core Ultra 9 Processor             3
AMD Ryzen 5 Dual Core Processor          3
MediaTek MT8183 Processor                3
                                         3
A

In [14]:
df.loc[(df['processor'] == 'Apple M1 Processor') | 
       (df['processor'] == 'Apple M2 Processor') | 
       (df['processor'] == 'Apple M3 Processor') | 
       (df['processor'] == 'Apple M4 Processor'), 
       'processor'] = 'Apple M series'

In [15]:
df.loc[(df['processor'] == 'AMD Ryzen 3 Octa Core Processor') | 
       (df['processor'] == 'AMD Ryzen 3 Dual Core Processor') | 
       (df['processor'] == 'AMD Ryzen 3 Quad Core Processor'), 
       'processor'] = 'AMD Ryzen 3'

In [16]:
df.loc[(df['processor'] == 'AMD Ryzen 5 Hexa Core Processor') | 
       (df['processor'] == 'AMD Ryzen 5 Quad Core Processor') | 
       (df['processor'] == 'AMD Ryzen 5 Dual Core Processor') | 
       (df['processor'] == 'AMD Ryzen 5 Processor'), 
       'processor'] = 'AMD Ryzen 5'

In [17]:
df.loc[(df['processor'] == 'AMD Ryzen 7 Octa Core Processor') | 
       (df['processor'] == 'AMD Athlon Dual Core Processor') | 
       (df['processor'] == 'AMD Ryzen 9 16 Core Processor') | 
       (df['processor'] == 'AMD Ryzen 9 Octa Core Processor'), 
       'processor'] = 'AMD Ryzen 7'

In [18]:
df.loc[(df['processor'] == 'Intel Celeron Quad Core Processor') | 
       (df['processor'] == 'Intel Pentium Dual Core Processor') | 
       (df['processor'] == 'Intel Celeron Dual Core Processor') |
       (df['processor'] == 'Intel Pentium Gold Processor'), 
       'processor'] = 'Intel i1'

In [19]:
df.loc[(df['processor'] == 'Intel Core i3 Processor') |  
       (df['processor'] == 'Intel OptaneIntel Core i3 Processor') | 
       (df['processor'] == 'Intel Core 3 Processor'), 
       'processor'] = 'Intel i3'

In [20]:
df.loc[(df['processor'] == 'Intel Core N Processor') | 
       (df['processor'] == 'Intel Core 5 Processor') | 
       (df['processor'] == 'Intel Core Ultra 5 Processor')|
       (df['processor'] == 'Intel Core i5 Processor'), 
       'processor'] = 'Intel i5'

In [21]:
df.loc[(df['processor'] == 'Intel Core Ultra 7 Processor') | 
       (df['processor'] == 'Intel Core i7 Processor') | 
       (df['processor'] == 'Intel Core i9 Processor')|
       (df['processor'] == 'Intel Core Ultra 9 Processor'), 
       'processor'] = 'Intel i7'

In [22]:
df.loc[(df['processor'] == 'MediaTek Kompanio 520 Processor') | 
       (df['processor'] == 'MediaTek MT8183 Processor') | 
       (df['processor'] == 'MediaTek MT8788 Processor'), 
       'processor'] = 'MediaTek'

In [23]:
df['processor'].value_counts()

processor
Intel i5          197
Intel i7          175
Intel i1          158
Intel i3          147
AMD Ryzen 5        74
MediaTek           41
AMD Ryzen 3        39
AMD Ryzen 7        36
Apple M series     18
                    3
Name: count, dtype: int64

### **05. RAM**

In [24]:
# Extract RAM size from the specifications text
# Standardize the format to numeric (e.g., 8GB → 8)

In [25]:
df['product_feature'][0:5]

0    Intel Celeron Dual Core Processor4 GB LPDDR4X ...
1    MediaTek Kompanio 520 Processor8 GB LPDDR4X RA...
2    Intel Core i5 Processor (12th Gen)16 GB DDR4 R...
3    Intel Core i5 Processor (12th Gen)16 GB DDR4 R...
4    Intel Core Ultra 7 Processor16 GB LPDDR5X RAMW...
Name: product_feature, dtype: object

In [26]:
# RAM Information

# regex = r'(?:[\w\s]+[\w\s]+)?\d+\sGB[\s\w]+RAM'
# regex = r'(?<=processor)(\S.*?)(?=RAM)'
regex = r'(?<=Processor).*?RAM'

df['RAM'] = df['product_feature'].apply(lambda x : ' '.join(re.findall(regex, x)))

df.head()

Unnamed: 0,product_title,product_price,product_rating,product_review,product_feature,brand,price,num_of_ratings,num_of_reviews,processor,RAM
0,ASUS Chromebook Intel Celeron Dual Core N4500 ...,"₹13,990",3.8,"2,594 Ratings & 228 Reviews",Intel Celeron Dual Core Processor4 GB LPDDR4X ...,ASUS,13990.0,2594.0,228.0,Intel i1,4 GB LPDDR4X RAM
1,Lenovo Chromebook MediaTek Kompanio 520 - (8 G...,"₹23,850",3.8,"1,463 Ratings & 125 Reviews",MediaTek Kompanio 520 Processor8 GB LPDDR4X RA...,Lenovo,23850.0,1463.0,125.0,MediaTek,8 GB LPDDR4X RAM
2,"ASUS Vivobook 15, with Backlit Keyboard, Intel...","₹48,990",4.2,"7,525 Ratings & 419 Reviews",Intel Core i5 Processor (12th Gen)16 GB DDR4 R...,ASUS,48990.0,7525.0,419.0,Intel i5,(12th Gen)16 GB DDR4 RAM
3,HP Victus Intel Core i5 12th Gen 12450H - (16 ...,"₹62,990",4.4,61 Ratings & 3 Reviews,Intel Core i5 Processor (12th Gen)16 GB DDR4 R...,HP,62990.0,61.0,3.0,Intel i5,(12th Gen)16 GB DDR4 RAM
4,Acer Swift Go 14 TouchScreen AI PC Intel Core ...,"₹84,990",4.1,88 Ratings & 9 Reviews,Intel Core Ultra 7 Processor16 GB LPDDR5X RAMW...,Acer,84990.0,88.0,9.0,Intel i7,16 GB LPDDR5X RAM


In [27]:
df['RAM'].value_counts()

RAM
16 GB LPDDR5X RAM                                                                                89
 (12th Gen)8 GB DDR4 RAM                                                                         76
8 GB DDR4 RAM                                                                                    68
 (12th Gen)16 GB DDR4 RAM                                                                        67
4 GB LPDDR4X RAM                                                                                 66
4 GB LPDDR4 RAM                                                                                  54
 (11th Gen)8 GB DDR4 RAM                                                                         50
8 GB LPDDR4X RAM                                                                                 43
32 GB LPDDR5X RAM                                                                                40
 (13th Gen)8 GB LPDDR4X RAM                                                                     

In [28]:
df['RAM'] = df['RAM'].str.strip()

In [29]:
df['RAM'].value_counts()

RAM
16 GB LPDDR5X RAM                                                                                89
(12th Gen)8 GB DDR4 RAM                                                                          76
8 GB DDR4 RAM                                                                                    68
(12th Gen)16 GB DDR4 RAM                                                                         67
4 GB LPDDR4X RAM                                                                                 66
4 GB LPDDR4 RAM                                                                                  54
(11th Gen)8 GB DDR4 RAM                                                                          50
8 GB LPDDR4X RAM                                                                                 43
32 GB LPDDR5X RAM                                                                                40
(13th Gen)8 GB LPDDR4X RAM                                                                      

In [30]:
df.loc[(df['RAM'] == '4 GB LPDDR4X RAM') | 
       (df['RAM'] == '4 GB LPDDR4 RAM') | 
       (df['RAM'] == '4 GB DDR4 RAM'), 
       'RAM'] = '4'

In [31]:
df.loc[(df['RAM'] == '(12th Gen)8 GB DDR4 RAM') | 
       (df['RAM'] == '8 GB DDR4 RAM') | 
       (df['RAM'] == '(11th Gen)8 GB DDR4 RAM') |
       (df['RAM'] == '8 GB LPDDR4X RAM') | 
       (df['RAM'] == '(13th Gen)8 GB LPDDR4X RAM') | 
       (df['RAM'] == '(13th Gen)8 GB DDR4 RAM') |
       (df['RAM'] == '8 GB LPDDR5 RAM') | 
       (df['RAM'] == '(13th Gen)8 GB LPDDR5 RAM') | 
       (df['RAM'] == '(12th Gen)8 GB LPDDR5 RAM') |
       (df['RAM'] == '8 GB Unified Memory RAM') | 
       (df['RAM'] == '(10th Gen)8 GB DDR4 RAM') | 
       (df['RAM'] == '8 GB LPDDR4 RAM') |
       (df['RAM'] == '(13th Gen)8 GB DDR5 RAM') |
       (df['RAM'] == '(7th Gen)8 GB DDR5 RAM') | 
       (df['RAM'] == '(11th Gen)8 GB DDR5 RAM') | 
       (df['RAM'] == '(5th Gen)8 GB DDR4 RAM') |
       (df['RAM'] == ': Intel i3-1215U (Base- 0.9 GHz & Turbo up to 4.40 GHz) 6 CoresRAM  (12th Gen)8 GB DDR4 RAM') |
       (df['RAM'] == ': Intel i3-1115G4 (Base- 1.70 GHz & Turbo up to 4.10 GHz) 2 CoresRAM  (11th Gen)8 GB DDR4 RAM') |
       (df['RAM'] == ': Ryzen R5-5625U (Base- 2.30 GHz & Turbo up to 4.30 GHz) 6 CoresRAM 8 GB DDR4 RAM') |
       (df['RAM'] == ': Intel i5-1235U (Base- 3.30 GHz & Turbo up to 4.40 GHz) 10 CoresRAM  (12th Gen)8 GB DDR4 RAM') |
       (df['RAM'] == '-i3-1115G4 Processor upto 4.1 GHz SpeedRAM  (11th Gen)8 GB DDR4 RAM'), 
       'RAM'] = '8'

In [32]:
df.loc[(df['RAM'] == '(12th Gen)12 GB LPDDR4 RAM') | 
       (df['RAM'] == '(12th Gen)12 GB LPDDR5 RAM') | 
       (df['RAM'] == '(12th Gen)12 GB DDR5 RAM') |
       (df['RAM'] == ': Cutting-edge Intel i3 12th Gen processor, LPDDR4 12GB RAM  (12th Gen)12 GB LPDDR4 RAM'), 
       'RAM'] = '12'

In [33]:
df.loc[(df['RAM'] == '16 GB LPDDR5X RAM') | 
       (df['RAM'] == '(12th Gen)16 GB DDR4 RAM') | 
       (df['RAM'] == '16 GB DDR4 RAM') |
       (df['RAM'] == '(13th Gen)16 GB DDR4 RAM') | 
       (df['RAM'] == '16 GB DDR5 RAM') | 
       (df['RAM'] == '(13th Gen)16 GB LPDDR5 RAM') |
       (df['RAM'] == '(13th Gen)16 GB DDR5 RAM') | 
       (df['RAM'] == '(12th Gen)16 GB LPDDR5 RAM') | 
       (df['RAM'] == '16 GB Unified Memory RAM') |
       (df['RAM'] == '(12th Gen)16 GB DDR5 RAM') | 
       (df['RAM'] == '(11th Gen)16 GB DDR4 RAM') | 
       (df['RAM'] == '16 GB LPDDR5 RAM') |
       (df['RAM'] == '(14th Gen)16 GB DDR5 RAM') |
       (df['RAM'] == '16 GB LPDDR4X RAM') | 
       (df['RAM'] == '(13th Gen)16 GB LPDDR4X RAM') | 
       (df['RAM'] == '(5th Gen)16 GB DDR4 RAM') |
       (df['RAM'] == '(7th Gen)16 GB DDR4 RAM') | 
       (df['RAM'] == '(12th Gen)16 GB LPDDR4 RAM') |
       (df['RAM'] == '(13th Gen)16 GB LPDDR5X RAM'), 
       'RAM'] = '16'

In [34]:
df.loc[(df['RAM'] == '24 GB DDR5 RAM') | 
       (df['RAM'] == '(13th Gen)24 GB DDR5 RAM') | 
       (df['RAM'] == '(12th Gen)24 GB DDR5 RAM') |
       (df['RAM'] == '24 GB Unified Memory RAM'), 
       'RAM'] = '24'

In [35]:
df.loc[(df['RAM'] == '32 GB LPDDR5X RAM') | 
       (df['RAM'] == '(13th Gen)32 GB LPDDR5X RAM') | 
       (df['RAM'] == '(14th Gen)32 GB DDR5 RAM') |
       (df['RAM'] == '32 GB DDR5 RAM'), 
       'RAM'] = '32'

In [36]:
df['RAM'].value_counts()

RAM
16    359
8     348
4     126
32     44
24      7
12      4
Name: count, dtype: int64

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 888 entries, 0 to 887
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   product_title    888 non-null    object 
 1   product_price    887 non-null    object 
 2   product_rating   709 non-null    float64
 3   product_review   709 non-null    object 
 4   product_feature  888 non-null    object 
 5   brand            888 non-null    object 
 6   price            887 non-null    float64
 7   num_of_ratings   709 non-null    float64
 8   num_of_reviews   709 non-null    float64
 9   processor        888 non-null    object 
 10  RAM              888 non-null    object 
dtypes: float64(4), object(7)
memory usage: 76.4+ KB


### **06. OS** 

In [38]:
# Identify and extract the operating system (Windows, DOS, macOS, etc.)
# Helps segment devices by user preference or software compatibility

In [39]:
# OS

# regex = r'(?:\d+\sbit|Android|Mac|Window|DOS)[\s\w]+Operating System'
regex = r'(?<=RAM)(\S.*?)(?=Operating System)'


df['OS'] = df['product_feature'].apply(lambda x : ' '.join(re.findall(regex, x)))

df.head()

Unnamed: 0,product_title,product_price,product_rating,product_review,product_feature,brand,price,num_of_ratings,num_of_reviews,processor,RAM,OS
0,ASUS Chromebook Intel Celeron Dual Core N4500 ...,"₹13,990",3.8,"2,594 Ratings & 228 Reviews",Intel Celeron Dual Core Processor4 GB LPDDR4X ...,ASUS,13990.0,2594.0,228.0,Intel i1,4,Chrome
1,Lenovo Chromebook MediaTek Kompanio 520 - (8 G...,"₹23,850",3.8,"1,463 Ratings & 125 Reviews",MediaTek Kompanio 520 Processor8 GB LPDDR4X RA...,Lenovo,23850.0,1463.0,125.0,MediaTek,8,Chrome
2,"ASUS Vivobook 15, with Backlit Keyboard, Intel...","₹48,990",4.2,"7,525 Ratings & 419 Reviews",Intel Core i5 Processor (12th Gen)16 GB DDR4 R...,ASUS,48990.0,7525.0,419.0,Intel i5,16,Windows 11
3,HP Victus Intel Core i5 12th Gen 12450H - (16 ...,"₹62,990",4.4,61 Ratings & 3 Reviews,Intel Core i5 Processor (12th Gen)16 GB DDR4 R...,HP,62990.0,61.0,3.0,Intel i5,16,Windows 11
4,Acer Swift Go 14 TouchScreen AI PC Intel Core ...,"₹84,990",4.1,88 Ratings & 9 Reviews,Intel Core Ultra 7 Processor16 GB LPDDR5X RAMW...,Acer,84990.0,88.0,9.0,Intel i7,16,Windows 11


In [40]:
# Strip leading and trailing spaces from the 'OS' column
df['OS'] = df['OS'].str.strip()

In [41]:
df.loc[(df['OS'] == '64 bit Windows 11') | 
       (df['OS'] == 'Windows 11 Home') | 
       (df['OS'] == '64 bit Windows 11') |
       (df['OS'] == 'Windows 11 Home') |
       (df['OS'] == '64 bit Windows 11 Home') |
       (df['OS'] == '64 bit Windows 11 Home') | 
       (df['OS'] == ', 512GB SSD StorageGraphics: Intel integrated UHD Graphics GPU2 in 1 Tablet Notebook360-degree foldable screenBluetooth: 5.23 Type-C interfaces :- PD Fast charging, USB 3.0 data transmission, USB 2.0 data Transmission1MP HD CamCompact and PortableIntel Core i3 Processor (12th Gen)12 GB LPDDR4 RAM64 bit Windows 11 Home') |
       (df['OS'] == 'DOS') |
       (df['OS'] == 'DOS') |
       (df['OS'] == '32 bit Windows 11') |
       (df['OS'] == '64 bit Windows 10') | 
       (df['OS'] == 'Windows 10') |       
       (df['OS'] == '64 bit DOS') |
       (df['OS'] == 'Windows 11') |
       (df['OS'] == '64 bit Windows 10') |
       (df['OS'] == '32 bit Windows 11'), 
       'OS'] = 'Windows OS'

In [42]:
df.loc[(df['OS'] == '64 bit Chrome') | 
       (df['OS'] == '64 bit Chrome') |
       (df['OS'] == 'Android') | 
       (df['OS'] == 'Chrome'),
       'OS'] = 'Chrome OS'

In [43]:
df['OS'].value_counts()

OS
Windows OS    710
Chrome OS     160
Mac OS         18
Name: count, dtype: int64

### **07. Display**

In [44]:
# Parse the display size or resolution from the specs
# Enables screen size analysis across price ranges

In [45]:
# Display

regex = r'\d+\.?\d+\scm.*Display'

df['display'] = df['product_feature'].apply(lambda x : ' '.join(re.findall(regex,x)))

df.head()

Unnamed: 0,product_title,product_price,product_rating,product_review,product_feature,brand,price,num_of_ratings,num_of_reviews,processor,RAM,OS,display
0,ASUS Chromebook Intel Celeron Dual Core N4500 ...,"₹13,990",3.8,"2,594 Ratings & 228 Reviews",Intel Celeron Dual Core Processor4 GB LPDDR4X ...,ASUS,13990.0,2594.0,228.0,Intel i1,4,Chrome OS,35.56 cm (14 Inch) Display
1,Lenovo Chromebook MediaTek Kompanio 520 - (8 G...,"₹23,850",3.8,"1,463 Ratings & 125 Reviews",MediaTek Kompanio 520 Processor8 GB LPDDR4X RA...,Lenovo,23850.0,1463.0,125.0,MediaTek,8,Chrome OS,35.56 cm (14 inch) Display
2,"ASUS Vivobook 15, with Backlit Keyboard, Intel...","₹48,990",4.2,"7,525 Ratings & 419 Reviews",Intel Core i5 Processor (12th Gen)16 GB DDR4 R...,ASUS,48990.0,7525.0,419.0,Intel i5,16,Windows OS,39.62 cm (15.6 Inch) Display
3,HP Victus Intel Core i5 12th Gen 12450H - (16 ...,"₹62,990",4.4,61 Ratings & 3 Reviews,Intel Core i5 Processor (12th Gen)16 GB DDR4 R...,HP,62990.0,61.0,3.0,Intel i5,16,Windows OS,39.62 cm (15.6 Inch) Display
4,Acer Swift Go 14 TouchScreen AI PC Intel Core ...,"₹84,990",4.1,88 Ratings & 9 Reviews,Intel Core Ultra 7 Processor16 GB LPDDR5X RAMW...,Acer,84990.0,88.0,9.0,Intel i7,16,Windows OS,35.56 cm (14 Inch) Touchscreen Display


In [46]:
# Strip leading and trailing spaces from the 'OS' column
# df['display'] = df['display'].str.strip()

In [47]:
df.loc[(df['display'] == '35.56 cm (14 Inch) Display') | 
       (df['display'] == '35.56 cm (14 Inch) Touchscreen Display') | 
       (df['display'] == '35.56 cm (14 inch) Display') | 
       (df['display'] == '35.81 cm (14.1 inch) Display') |
       (df['display'] == '35.56 cm (14 inch) Touchscreen Display') |
       (df['display'] == '34.54 cm (13.6 Inch) Display') | 
       (df['display'] == '33.02 cm (13 Inch) Display') | 
       (df['display'] == '34.54 cm (13.6 inch) Display') | 
       (df['display'] == '34.29 cm (13.5 inch) Touchscreen Display') |
       (df['display'] == '35.56 cm (14 Inch) Touchscreen Display') |
       (df['display'] == '33.78 cm (13.3 inch) Touchscreen Display') |
       (df['display'] == '29.46 cm (11.6 Inch) Display') | 
       (df['display'] == '29.46 cm (11.6 inch) Display') | 
       (df['display'] == '31.5 cm (12.4 Inch) Touchscreen Display') | 
       (df['display'] == '33.78 cm (13.3 inch) Display') |
       (df['display'] == '36.83 cm (14.5 Inch) Display') |
       (df['display'] == '35.81 cm (14.1 Inch) Display') | 
       (df['display'] == '37.85 cm (14.9 inch) Display'),
       'display'] = '14'

In [48]:
df.loc[(df['display'] == '39.62 cm (15.6 Inch) Display') | 
       (df['display'] == '39.62 cm (15.6 inch) Display') | 
       (df['display'] == '39.62 cm (15.6 Inch) Touchscreen Display') | 
       (df['display'] == '38.86 cm (15.3 inch) Display') |
       (df['display'] == '38.1 cm (15 Inch) Display') |
       (df['display'] == '39.37 cm (15.5 inch) Display') | 
       (df['display'] == '38.86 cm (15.3 Inch) Display'),
       'display'] = '15'

In [49]:
df.loc[(df['display'] == '40.64 cm (16 Inch) Touchscreen Display') | 
       (df['display'] == '40.64 cm (16 Inch) Display') | 
       (df['display'] == '40.64 cm (16 inch) Display') | 
       (df['display'] == '40.89 cm (16.1 Inch) Display') |
       (df['display'] == '43.94 cm (17.3 Inch) Display') |
       (df['display'] == '40.89 cm (16.1 inch) Display'),
       'display'] = '16'

In [50]:
df['display'].value_counts()

display
15    459
14    354
16     75
Name: count, dtype: int64

### **08. Warranty** 

In [51]:
# Extract warranty duration or coverage details if available
# Useful for understanding value-adds offered by different brands

In [52]:
# Warranty

regex = r'\d+\sYear.*Warranty'

df['warranty'] = df['product_feature'].apply(lambda x : ' '.join(re.findall(regex,x)))

df.head()

Unnamed: 0,product_title,product_price,product_rating,product_review,product_feature,brand,price,num_of_ratings,num_of_reviews,processor,RAM,OS,display,warranty
0,ASUS Chromebook Intel Celeron Dual Core N4500 ...,"₹13,990",3.8,"2,594 Ratings & 228 Reviews",Intel Celeron Dual Core Processor4 GB LPDDR4X ...,ASUS,13990.0,2594.0,228.0,Intel i1,4,Chrome OS,14,1 Year Onsite Warranty
1,Lenovo Chromebook MediaTek Kompanio 520 - (8 G...,"₹23,850",3.8,"1,463 Ratings & 125 Reviews",MediaTek Kompanio 520 Processor8 GB LPDDR4X RA...,Lenovo,23850.0,1463.0,125.0,MediaTek,8,Chrome OS,14,1 Year Carry-in Warranty
2,"ASUS Vivobook 15, with Backlit Keyboard, Intel...","₹48,990",4.2,"7,525 Ratings & 419 Reviews",Intel Core i5 Processor (12th Gen)16 GB DDR4 R...,ASUS,48990.0,7525.0,419.0,Intel i5,16,Windows OS,15,1 Year Onsite Warranty
3,HP Victus Intel Core i5 12th Gen 12450H - (16 ...,"₹62,990",4.4,61 Ratings & 3 Reviews,Intel Core i5 Processor (12th Gen)16 GB DDR4 R...,HP,62990.0,61.0,3.0,Intel i5,16,Windows OS,15,1 Year Onsite Warranty
4,Acer Swift Go 14 TouchScreen AI PC Intel Core ...,"₹84,990",4.1,88 Ratings & 9 Reviews,Intel Core Ultra 7 Processor16 GB LPDDR5X RAMW...,Acer,84990.0,88.0,9.0,Intel i7,16,Windows OS,14,1 Year Carry-in Warranty


In [53]:
# Strip leading and trailing spaces from the 'OS' column
df['warranty'] = df['warranty'].str.strip()

In [54]:
df.loc[(df['warranty'] == '1 Year Onsite Warranty ') | 
       (df['warranty'] == '1 Year Carry-in Warranty') | 
       (df['warranty'] == '1 Year Domestic Warranty') | 
       (df['warranty'] == '1 Year International Travelers Warranty') |
       (df['warranty'] == '1 Year Warranty') |
       (df['warranty'] == '20211 Year Onsite Warranty') | 
       (df['warranty'] == '1 Year Limited Warranty') | 
       (df['warranty'] == '20211 Year Carry-in Warranty') | 
       (df['warranty'] == '1 Year Manufacturing Warranty') |
       (df['warranty'] == '1 Year Manufacturer Warranty') |
       (df['warranty'] == '20241 Year Carry-in Warranty') |
       (df['warranty'] == '1 Year Pick and Drop Warranty') | 
       (df['warranty'] == '111 Year Onsite Warranty') | 
       (df['warranty'] == '20191 Year Onsite Warranty') | 
       (df['warranty'] == '20211 Year On-Site Warranty') |
       (df['warranty'] == '1 Year Domestic Onsite Warranty') |
       (df['warranty'] == '1 Year On-Site Warranty') | 
       (df['warranty'] == '2021 Year Onsite Warranty') |
       (df['warranty'] == '1 Year McAfee1 Year Warranty') |
       (df['warranty'] == '1 Year McAfee1 Year Onsite Warranty') |
       (df['warranty'] == '1 Year Onsite�Warranty') | 
       (df['warranty'] == '1 Year Onsite Warranty') |
       (df['warranty'] == '     ') |
       (df['warranty'] == '1 Years Carry in Warranty'),
       'warranty'] = '1'

In [55]:
df.loc[(df['warranty'] == '2 Year Onsite Warranty') | 
       (df['warranty'] == '2 Year Carry-in Warranty') | 
       (df['warranty'] == '20212 Year Warranty') | 
       (df['warranty'] == '2 Years On-Site Warranty') |
       (df['warranty'] == '20212 Years Warranty') |
       (df['warranty'] == '2 Years Warranty'),
       'warranty'] = '2'

In [56]:
df['warranty'] = df['warranty'].replace(r'^\s*$', '1', regex=True)

In [57]:
df['warranty'].value_counts()

warranty
1    864
2     24
Name: count, dtype: int64

### **09. Storage** 

In [58]:
# Extract storage type and size (HDD, SSD, Hybrid) from the specs
# Normalize into structured columns for type and capacity

In [59]:
# Regex pattern for extracting storage info immediately after "Operating System"
# regex_storage = r'Operating System(\S+\s?\S*)SSD'
regex_storage = r'(?<=Operating System)(\S.*?)SSD'

# Apply the regex to extract storage info
df['storage'] = df['product_feature'].apply(lambda x: re.findall(regex_storage, x))

# Convert list to string if needed (assuming only one storage value per entry)
df['storage'] = df['storage'].apply(lambda x: x[0] if x else None)

df.head()  

Unnamed: 0,product_title,product_price,product_rating,product_review,product_feature,brand,price,num_of_ratings,num_of_reviews,processor,RAM,OS,display,warranty,storage
0,ASUS Chromebook Intel Celeron Dual Core N4500 ...,"₹13,990",3.8,"2,594 Ratings & 228 Reviews",Intel Celeron Dual Core Processor4 GB LPDDR4X ...,ASUS,13990.0,2594.0,228.0,Intel i1,4,Chrome OS,14,1,
1,Lenovo Chromebook MediaTek Kompanio 520 - (8 G...,"₹23,850",3.8,"1,463 Ratings & 125 Reviews",MediaTek Kompanio 520 Processor8 GB LPDDR4X RA...,Lenovo,23850.0,1463.0,125.0,MediaTek,8,Chrome OS,14,1,
2,"ASUS Vivobook 15, with Backlit Keyboard, Intel...","₹48,990",4.2,"7,525 Ratings & 419 Reviews",Intel Core i5 Processor (12th Gen)16 GB DDR4 R...,ASUS,48990.0,7525.0,419.0,Intel i5,16,Windows OS,15,1,512 GB
3,HP Victus Intel Core i5 12th Gen 12450H - (16 ...,"₹62,990",4.4,61 Ratings & 3 Reviews,Intel Core i5 Processor (12th Gen)16 GB DDR4 R...,HP,62990.0,61.0,3.0,Intel i5,16,Windows OS,15,1,512 GB
4,Acer Swift Go 14 TouchScreen AI PC Intel Core ...,"₹84,990",4.1,88 Ratings & 9 Reviews,Intel Core Ultra 7 Processor16 GB LPDDR5X RAMW...,Acer,84990.0,88.0,9.0,Intel i7,16,Windows OS,14,1,1 TB


In [60]:
# Strip leading and trailing spaces from the 'OS' column
df['storage'] = df['storage'].str.strip()

In [61]:
df.loc[(df['storage'] == '512 GB') | 
       (df['storage'] == '512 GB HDD|512 GB') , 
       'storage'] = '512'

In [62]:
df.loc[(df['storage'] == '256 GB') | 
       (df['storage'] == '128 GB') |
       (df['storage'] == '1 TB HDD|256 GB') , 
       'storage'] = '256'

In [63]:
df.loc[(df['storage'] == '1 TB'), 'storage'] = 1024

In [64]:
df['storage'].value_counts()

storage
512     542
1024    116
256      66
Name: count, dtype: int64

In [65]:
df.head()

Unnamed: 0,product_title,product_price,product_rating,product_review,product_feature,brand,price,num_of_ratings,num_of_reviews,processor,RAM,OS,display,warranty,storage
0,ASUS Chromebook Intel Celeron Dual Core N4500 ...,"₹13,990",3.8,"2,594 Ratings & 228 Reviews",Intel Celeron Dual Core Processor4 GB LPDDR4X ...,ASUS,13990.0,2594.0,228.0,Intel i1,4,Chrome OS,14,1,
1,Lenovo Chromebook MediaTek Kompanio 520 - (8 G...,"₹23,850",3.8,"1,463 Ratings & 125 Reviews",MediaTek Kompanio 520 Processor8 GB LPDDR4X RA...,Lenovo,23850.0,1463.0,125.0,MediaTek,8,Chrome OS,14,1,
2,"ASUS Vivobook 15, with Backlit Keyboard, Intel...","₹48,990",4.2,"7,525 Ratings & 419 Reviews",Intel Core i5 Processor (12th Gen)16 GB DDR4 R...,ASUS,48990.0,7525.0,419.0,Intel i5,16,Windows OS,15,1,512.0
3,HP Victus Intel Core i5 12th Gen 12450H - (16 ...,"₹62,990",4.4,61 Ratings & 3 Reviews,Intel Core i5 Processor (12th Gen)16 GB DDR4 R...,HP,62990.0,61.0,3.0,Intel i5,16,Windows OS,15,1,512.0
4,Acer Swift Go 14 TouchScreen AI PC Intel Core ...,"₹84,990",4.1,88 Ratings & 9 Reviews,Intel Core Ultra 7 Processor16 GB LPDDR5X RAMW...,Acer,84990.0,88.0,9.0,Intel i7,16,Windows OS,14,1,1024.0


In [66]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 888 entries, 0 to 887
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   product_title    888 non-null    object 
 1   product_price    887 non-null    object 
 2   product_rating   709 non-null    float64
 3   product_review   709 non-null    object 
 4   product_feature  888 non-null    object 
 5   brand            888 non-null    object 
 6   price            887 non-null    float64
 7   num_of_ratings   709 non-null    float64
 8   num_of_reviews   709 non-null    float64
 9   processor        888 non-null    object 
 10  RAM              888 non-null    object 
 11  OS               888 non-null    object 
 12  display          888 non-null    object 
 13  warranty         888 non-null    object 
 14  storage          724 non-null    object 
dtypes: float64(4), object(11)
memory usage: 104.2+ KB


In [67]:
# Display the final column names in the DataFrame
# Confirms that all necessary fields—including extracted specs like processor, RAM, OS, etc.—are present
# Ensures the data is structured and ready for analysis or export

In [68]:
df.columns

Index(['product_title', 'product_price', 'product_rating', 'product_review',
       'product_feature', 'brand', 'price', 'num_of_ratings', 'num_of_reviews',
       'processor', 'RAM', 'OS', 'display', 'warranty', 'storage'],
      dtype='object')

In [69]:
# Reorder and select only the most relevant columns for final analysis
# This step improves readability and focuses on key product features and metrics

In [70]:
df = df[['brand', 'OS', 'processor', 'RAM', 'storage', 'display', 'num_of_ratings', 'num_of_reviews', 'warranty', 'product_rating', 'price']]

df.head()

Unnamed: 0,brand,OS,processor,RAM,storage,display,num_of_ratings,num_of_reviews,warranty,product_rating,price
0,ASUS,Chrome OS,Intel i1,4,,14,2594.0,228.0,1,3.8,13990.0
1,Lenovo,Chrome OS,MediaTek,8,,14,1463.0,125.0,1,3.8,23850.0
2,ASUS,Windows OS,Intel i5,16,512.0,15,7525.0,419.0,1,4.2,48990.0
3,HP,Windows OS,Intel i5,16,512.0,15,61.0,3.0,1,4.4,62990.0
4,Acer,Windows OS,Intel i7,16,1024.0,14,88.0,9.0,1,4.1,84990.0


In [71]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 888 entries, 0 to 887
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   brand           888 non-null    object 
 1   OS              888 non-null    object 
 2   processor       888 non-null    object 
 3   RAM             888 non-null    object 
 4   storage         724 non-null    object 
 5   display         888 non-null    object 
 6   num_of_ratings  709 non-null    float64
 7   num_of_reviews  709 non-null    float64
 8   warranty        888 non-null    object 
 9   product_rating  709 non-null    float64
 10  price           887 non-null    float64
dtypes: float64(4), object(7)
memory usage: 76.4+ KB


In [72]:
df['RAM'] = pd.to_numeric(df['RAM'], errors='coerce')
df['storage'] = pd.to_numeric(df['storage'], errors='coerce')
df['display'] = pd.to_numeric(df['display'], errors='coerce')
df['num_of_ratings'] = pd.to_numeric(df['num_of_ratings'], errors='coerce')
df['num_of_reviews'] = pd.to_numeric(df['num_of_reviews'], errors='coerce')
df['warranty'] = pd.to_numeric(df['warranty'], errors='coerce')
df['price'] = pd.to_numeric(df['price'], errors='coerce')

In [73]:
df.describe()

Unnamed: 0,RAM,storage,display,num_of_ratings,num_of_reviews,warranty,product_rating,price
count,888.0,724.0,888.0,709.0,709.0,888.0,709.0,887.0
mean,12.0,570.696133,14.685811,1562.554302,133.620592,1.027027,4.129619,59322.45885
std,6.543928,211.16724,0.620344,2285.772823,176.884658,0.162254,0.293811,44268.831089
min,4.0,256.0,14.0,2.0,0.0,1.0,2.8,11990.0
25%,8.0,512.0,14.0,119.0,11.0,1.0,3.9,28090.0
50%,8.0,512.0,15.0,914.0,78.0,1.0,4.1,41990.0
75%,16.0,512.0,15.0,2242.0,194.0,1.0,4.3,73340.0
max,32.0,1024.0,16.0,17381.0,1180.0,2.0,5.0,239990.0


In [84]:
# Convert the 'storage' column to numeric, forcing errors to be NaN
df['storage'] = pd.to_numeric(df['storage'], errors='coerce')

# Replace NaN values with the median of the column
df.loc[df['storage'].isnull(), 'storage'] = df['storage'].median()

In [85]:
df.loc[df['num_of_ratings'].isnull(), 'num_of_ratings'] = df['num_of_ratings'].median()

In [86]:
df.loc[df['num_of_reviews'].isnull(), 'num_of_reviews'] = df['num_of_reviews'].median()

In [87]:
df.loc[df['warranty'].isnull(), 'warranty'] = df['warranty'].median()

In [88]:
df.loc[df['product_rating'].isnull(), 'product_rating'] = 4.1

In [89]:
df.loc[df['price'].isnull(), 'price'] = df['price'].median()

In [90]:
df['storage'].mode()

0    512.0
Name: storage, dtype: float64

In [91]:
df.loc[df['storage'].isnull(), 'storage'] = 512.0

In [92]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 888 entries, 0 to 887
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   brand           888 non-null    object 
 1   OS              888 non-null    object 
 2   processor       888 non-null    object 
 3   RAM             888 non-null    int64  
 4   storage         888 non-null    float64
 5   display         888 non-null    int64  
 6   num_of_ratings  888 non-null    float64
 7   num_of_reviews  888 non-null    float64
 8   warranty        888 non-null    int64  
 9   product_rating  888 non-null    float64
 10  price           888 non-null    float64
dtypes: float64(5), int64(3), object(3)
memory usage: 76.4+ KB


In [83]:
# df.to_csv('data/laptops_flipkart_cleaned.csv', index = False)