# Mobile Phone Data Cleaning and Preprocessing Pipeline

## 1. Import Required Libraries

In [3]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split


## 2. Load and Inspect Raw Data

In [4]:
# Load dataset
df = pd.read_csv('mobiles.csv')

# Initial data inspection
print("\nInitial Dataset Shape:", df.shape)
print("\nFirst 5 Rows:")
display(df.head())

print("\nDataset Info:")
print(df.info())

print("\nMissing Values Summary:")
print(df.isnull().sum())



Initial Dataset Shape: (1020, 12)

First 5 Rows:


Unnamed: 0,mobile_name,price,rating,specs_score,connectivity,processor,storage,battery,display,camera,extra_storage,os
0,OPPO Reno 11,"₹29,990",4.7,86.0,"Dual Sim, 3G, 4G, 5G, VoLTE, Wi-Fi, NFC, IR Bl...","Dimensity 8200, Octa Core, 3.1 GHz Processor","8 GB RAM, 256 GB inbuilt",4800 mAh Battery with 67W Fast Charging,"6.7 inches, 1080 x 2412 px, 120 Hz Display wit...",50 MP + 32 MP + 8 MP Triple Rear & 32 MP Front...,Android v14,No FM Radio
1,Poco X6 Pro 5G,"₹19,999",4.5,84.0,"Dual Sim, 3G, 4G, 5G, VoLTE, Wi-Fi, NFC, IR Bl...","Dimensity 8300 Ultra, Octa Core, 3.35 GHz Proc...","12 GB RAM, 256 GB inbuilt",5500 mAh Battery with 90W Fast Charging,"6.67 inches, 1220 x 2712 px, 120 Hz Display wi...",64 MP + 8 MP + 2 MP Triple Rear & 16 MP Front ...,Android v14,No FM Radio
2,Xiaomi Redmi Note 13 Pro Plus,"₹31,999",4.35,87.0,"Dual Sim, 3G, 4G, 5G, VoLTE, Wi-Fi, NFC, IR Bl...","Dimensity 7200 Ultra, Octa Core, 2.8 GHz Proce...","8 GB RAM, 256 GB inbuilt",5000 mAh Battery with 120W Fast Charging,"6.67 inches, 1220 x 2712 px, 120 Hz Display wi...",200 MP + 8 MP + 2 MP Triple Rear & 16 MP Front...,Memory Card Not Supported,Android v13
3,OPPO Reno 11 Pro,"₹40,990",4.75,88.0,"Dual Sim, 3G, 4G, 5G, VoLTE, Wi-Fi, NFC, IR Bl...","Dimensity 8200, Octa Core, 3.1 GHz Processor","12 GB RAM, 256 GB inbuilt",4600 mAh Battery with 80W Fast Charging,"6.74 inches, 1240 x 2772 px, 120 Hz Display wi...",50 MP + 32 MP + 8 MP Triple Rear & 32 MP Front...,Android v14,No FM Radio
4,Xiaomi Redmi Note 13 Pro Max 5G,"₹33,999",4.0,87.0,"Dual Sim, 3G, 4G, 5G, VoLTE, Wi-Fi, NFC, IR Bl...","Snapdragon 7 Gen1, Octa Core, 2.4 GHz Processor","12 GB RAM, 256 GB inbuilt",5200 mAh Battery with 120W Fast Charging,"6.67 inches, 1220 x 2712 px, 144 Hz Display wi...",200 MP + 13 MP + 8 MP Triple Rear & 32 MP Fron...,Memory Card Not Supported,Android v13



Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1020 entries, 0 to 1019
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   mobile_name    1020 non-null   object 
 1   price          1020 non-null   object 
 2   rating         1020 non-null   float64
 3   specs_score    865 non-null    float64
 4   connectivity   1020 non-null   object 
 5   processor      1020 non-null   object 
 6   storage        1020 non-null   object 
 7   battery        1020 non-null   object 
 8   display        1020 non-null   object 
 9   camera         1020 non-null   object 
 10  extra_storage  1017 non-null   object 
 11  os             1009 non-null   object 
dtypes: float64(2), object(10)
memory usage: 95.8+ KB
None

Missing Values Summary:
mobile_name        0
price              0
rating             0
specs_score      155
connectivity       0
processor          0
storage            0
battery            0
display   

## 3. Handle Missing Values

### 3.1 Numerical Columns Processing

In [5]:
num_cols = ['price', 'rating', 'specs_score', 'storage', 'battery']

for col in num_cols:
    # Currency conversion for price
    if col == 'price':
        df[col] = df[col].replace(r'[₹,]', '', regex=True).astype(float)
    
    # Extract numerical values from text fields
    elif col == 'storage':
        df[col] = df[col].str.extract(r'(\d+)\s*GB inbuilt', expand=False).astype(float)
    
    elif col == 'battery':
        df[col] = df[col].str.extract(r'(\d+)', expand=False).astype(float)
    
    # Convert to numeric and handle NaNs
    else:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    
    # Median imputation
    if df[col].notna().any():
        df[col] = df[col].fillna(df[col].median())
    else:
        print(f"Warning: Column '{col}' has no valid values")


### 3.2 Categorical Columns Processing

In [6]:
cat_cols = ['connectivity', 'processor', 'display', 'camera', 'extra_storage', 'os']

# Impute missing values and standardize text
for col in cat_cols:
    df[col] = df[col].fillna('Unknown').str.lower()

# Remove duplicates
df = df.drop_duplicates()


## 4. Feature Engineering

### 4.1 Display Feature Extraction

In [7]:
def extract_display_features(display_str):
    """Extract screen size, resolution, and refresh rate from display description"""
    size = re.search(r'(\d+\.\d+)\s*inches', display_str)
    resolution = re.search(r'(\d+\s*x\s*\d+)', display_str)
    refresh_rate = re.search(r'(\d+)\s*hz', display_str)
    return (
        float(size.group(1)) if size else np.nan,
        resolution.group(1) if resolution else 'Unknown',
        int(refresh_rate.group(1)) if refresh_rate else np.nan
    )

# Apply feature extraction
df['screen_size'], df['resolution'], df['refresh_rate'] = zip(*df['display'].apply(extract_display_features))


### 4.2 Outlier Handling

In [8]:
# Cap prices at 99th percentile
price_cap = df['price'].quantile(0.99)
df['price'] = np.where(df['price'] > price_cap, price_cap, df['price'])


## 5. Feature Encoding and Scaling

### 5.1 One-Hot Encoding

In [9]:
# Encode categorical features
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_cols = encoder.fit_transform(df[cat_cols + ['resolution']])
encoded_df = pd.DataFrame(encoded_cols, columns=encoder.get_feature_names_out(cat_cols + ['resolution']))

# Merge encoded features
df = pd.concat([df.drop(cat_cols + ['resolution'], axis=1), encoded_df], axis=1)


### 5.2 Feature Scaling

In [10]:
# Scale numerical features
scaler = StandardScaler()
scaled_features = num_cols + ['screen_size', 'refresh_rate']
df[scaled_features] = scaler.fit_transform(df[scaled_features])

# Create new feature
df['battery_per_screen_size'] = df['battery'] / df['screen_size']


## 6. Train-Test Split

In [11]:
# Prepare data for modeling
X = df.drop(['mobile_name', 'price'], axis=1)
y = df['price']

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42
)

# Final validation
print("\nFinal Training Set Shape:", X_train.shape)
print("\nMissing Values in Training Data:")
print(X_train.isnull().sum())



Final Training Set Shape: (816, 1195)

Missing Values in Training Data:
rating                      1
specs_score                 1
storage                     1
battery                     1
screen_size                23
                           ..
resolution_720 x 1680       1
resolution_750 x 1334       1
resolution_828 x 1792       1
resolution_Unknown          1
battery_per_screen_size    23
Length: 1195, dtype: int64
