# Flipkart Mobile Dataset

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw Flipkart mobile listings; the relative path is resolved against
# the notebook's current working directory.
dataset = pd.read_csv("Flipkart Mobile.csv")

In [3]:
# Preview the raw frame as loaded (all 16 columns, before any are dropped).
dataset

Unnamed: 0,brand,model,base_color,processor,screen_size,ROM,RAM,display_size,num_rear_camera,num_front_camera,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,sales
0,Apple,iPhone SE,Black,Water,Very Small,64,2,4.7,1,1,1800,4.5,38645,32999,0.17,127.52
1,Apple,iPhone 12 Mini,Red,Ceramic,Small,64,4,5.4,2,1,2815,4.5,244,57149,0.04,1.39
2,Apple,iPhone SE,Red,Water,Very Small,64,2,4.7,1,1,1800,4.5,38645,32999,0.17,127.52
3,Apple,iPhone XR,Others,iOS,Medium,64,3,6.1,1,1,2942,4.6,5366,42999,0.10,23.07
4,Apple,iPhone 12,Red,Ceramic,Medium,128,4,6.1,2,1,2815,4.6,745,69149,0.02,5.15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
425,Xiaomi,Redmi 6 Pro,Black,Qualcomm,Small,32,3,5.8,2,1,4000,4.3,1870,7999,0.30,1.50
426,Xiaomi,Redmi 6 Pro,Red,Qualcomm,Small,64,4,5.8,2,1,4000,4.3,1783,9699,0.28,1.73
427,Xiaomi,Mi 11 Lite,Others,Qualcomm,Large,128,6,6.5,3,1,4250,4.2,1554,21999,0.12,3.42
428,Xiaomi,Redmi 8A Dual,Blue,Qualcomm,Medium,32,3,6.2,2,1,5000,4.2,8161,8299,0.07,6.77


# DATASET Description

ABOUT DATASET

The dataset includes data on mobile phones from the five most popular 
brands in India: Apple, Poco, Realme, Samsung, and Xiaomi. Attributes such as 
RAM, ROM, and display size distinguish one product from another, and every 
product differs from the others in at least one attribute. The dataset has no 
null values.

Columns: There are 16 columns, each with a self-explanatory title.
Rows: There are 430 rows, each describing a mobile with at least one distinct feature.

DESCRIPTION OF ATTRIBUTES

I. brand: Brand Name (Categorical)

II. model: Model Name (Categorical)

III. base_color: Phone Color (Categorical)

IV. processor: Processor brand used (Categorical)

V. screen_size: Categorical screen size (Categorical)

VI. ROM: ROM in gigabyte (Numeric – Discrete)

VII. RAM: RAM in gigabyte (Numeric – Discrete)

VIII. display_size: Actual display size in inches (Numeric – Continuous)

IX. num_rear_camera: No. of cameras on back (Numeric – Discrete)

X. num_front_camera: No. of cameras on front (Numeric – Discrete)

XI. battery_capacity: Battery capacity in mAh (Numeric – Continuous)

XII. ratings: Customer rating for the product (Numeric – Continuous)

XIII. num_of_ratings: Number of people who rated the product, also used as the 
equivalent number of units sold for our problem (Numeric – Continuous)

XIV. sales_price: Selling price of the unit after discount (Numeric –
Continuous)

XV. discount_percent: Discount offered, stored as a fraction (e.g. 0.17) 
rather than as a percentage (Numeric – Continuous)

XVI. sales: Sales of product in crore rupees (Numeric – Continuous)



In [4]:
# Remove the two columns that are left out of the rest of this analysis.
dataset = dataset.drop(columns=["num_of_ratings", "num_front_camera"])

In [5]:
# Confirm that the two dropped columns are gone (14 columns remain).
dataset

Unnamed: 0,brand,model,base_color,processor,screen_size,ROM,RAM,display_size,num_rear_camera,battery_capacity,ratings,sales_price,discount_percent,sales
0,Apple,iPhone SE,Black,Water,Very Small,64,2,4.7,1,1800,4.5,32999,0.17,127.52
1,Apple,iPhone 12 Mini,Red,Ceramic,Small,64,4,5.4,2,2815,4.5,57149,0.04,1.39
2,Apple,iPhone SE,Red,Water,Very Small,64,2,4.7,1,1800,4.5,32999,0.17,127.52
3,Apple,iPhone XR,Others,iOS,Medium,64,3,6.1,1,2942,4.6,42999,0.10,23.07
4,Apple,iPhone 12,Red,Ceramic,Medium,128,4,6.1,2,2815,4.6,69149,0.02,5.15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
425,Xiaomi,Redmi 6 Pro,Black,Qualcomm,Small,32,3,5.8,2,4000,4.3,7999,0.30,1.50
426,Xiaomi,Redmi 6 Pro,Red,Qualcomm,Small,64,4,5.8,2,4000,4.3,9699,0.28,1.73
427,Xiaomi,Mi 11 Lite,Others,Qualcomm,Large,128,6,6.5,3,4250,4.2,21999,0.12,3.42
428,Xiaomi,Redmi 8A Dual,Blue,Qualcomm,Medium,32,3,6.2,2,5000,4.2,8299,0.07,6.77


In [6]:
# Column names, non-null counts and dtypes in one summary.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 430 entries, 0 to 429
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   brand             430 non-null    object 
 1   model             430 non-null    object 
 2   base_color        430 non-null    object 
 3   processor         430 non-null    object 
 4   screen_size       430 non-null    object 
 5   ROM               430 non-null    int64  
 6   RAM               430 non-null    int64  
 7   display_size      430 non-null    float64
 8   num_rear_camera   430 non-null    int64  
 9   battery_capacity  430 non-null    int64  
 10  ratings           430 non-null    float64
 11  sales_price       430 non-null    int64  
 12  discount_percent  430 non-null    float64
 13  sales             430 non-null    float64
dtypes: float64(4), int64(5), object(5)
memory usage: 47.2+ KB


In [7]:
# Per-column dtypes (the same dtype information that info() prints).
dataset.dtypes

brand                object
model                object
base_color           object
processor            object
screen_size          object
ROM                   int64
RAM                   int64
display_size        float64
num_rear_camera       int64
battery_capacity      int64
ratings             float64
sales_price           int64
discount_percent    float64
sales               float64
dtype: object

# Check for null values in the dataset

In [8]:
# Count missing values per column; a count of 0 means the column is complete.
dataset.isnull().sum()

brand               0
model               0
base_color          0
processor           0
screen_size         0
ROM                 0
RAM                 0
display_size        0
num_rear_camera     0
battery_capacity    0
ratings             0
sales_price         0
discount_percent    0
sales               0
dtype: int64

In [9]:
# Built-in summary statistics for the numeric columns.
dataset.describe()

Unnamed: 0,ROM,RAM,display_size,num_rear_camera,battery_capacity,ratings,sales_price,discount_percent,sales
count,430.0,430.0,430.0,430.0,430.0,430.0,430.0,430.0,430.0
mean,105.748837,5.32093,6.369767,2.904651,4529.397674,4.339302,25433.234884,0.108,29.752326
std,63.164064,2.182635,0.369549,0.95235,986.907252,0.151494,22471.926588,0.073432,58.399588
min,8.0,1.0,4.7,1.0,1800.0,3.0,5742.0,0.01,0.0
25%,64.0,4.0,6.3,2.0,4000.0,4.3,11999.0,0.06,1.64
50%,128.0,4.0,6.5,3.0,4500.0,4.3,16989.5,0.09,9.655
75%,128.0,6.0,6.5,4.0,5000.0,4.4,28999.0,0.16,29.7175
max,512.0,12.0,7.6,4.0,7000.0,4.6,157999.0,0.44,550.19


In [10]:
# Partition the column names by dtype: object-typed columns are treated as
# qualitative (categorical), every other dtype as quantitative (numeric).
quan = [
    columnName
    for columnName in dataset.columns
    if not (dataset[columnName].dtype == 'object')
]
qual = [
    columnName
    for columnName in dataset.columns
    if dataset[columnName].dtype == 'object'
]


In [11]:
# Names of the numeric (non-object) columns.
quan

['ROM',
 'RAM',
 'display_size',
 'num_rear_camera',
 'battery_capacity',
 'ratings',
 'sales_price',
 'discount_percent',
 'sales']

In [12]:
# Names of the categorical (object-typed) columns.
qual

['brand', 'model', 'base_color', 'processor', 'screen_size']

In [13]:
# Build the summary-statistics table: one row per statistic, one column per
# numeric feature. "Lesser" and "Greater" are the fences Q1 - 1.5*IQR and
# Q3 + 1.5*IQR against which Min and Max are later compared.
descriptive = pd.DataFrame(
    index=[
        "Mean", "Median", "Mode", "Q1:25%", "Q2:50%", "Q3:75%", "99", "Q4:100%",
        "IQR", "1.5Rule", "Lesser", "Greater", "Min", "Max",
    ],
    columns=quan,
)

for columnName in quan:
    values = dataset[columnName]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    fence_width = 1.5 * iqr
    highest = values.max()

    column_stats = {
        "Mean": values.mean(),
        "Median": values.median(),
        "Mode": values.mode()[0],  # first mode when several values tie
        "Q1:25%": q1,
        "Q2:50%": values.quantile(0.50),
        "Q3:75%": q3,
        "99": np.percentile(values, 99),
        "Q4:100%": highest,
        "IQR": iqr,
        "1.5Rule": fence_width,
        "Lesser": q1 - fence_width,
        "Greater": q3 + fence_width,
        "Min": values.min(),
        "Max": highest,
    }
    # Write cell by cell so each value is stored exactly as computed.
    for stat_name, stat_value in column_stats.items():
        descriptive.loc[stat_name, columnName] = stat_value

In [14]:
# Summary statistics computed on the data before any outlier capping.
descriptive

Unnamed: 0,ROM,RAM,display_size,num_rear_camera,battery_capacity,ratings,sales_price,discount_percent,sales
Mean,105.748837,5.32093,6.369767,2.904651,4529.397674,4.339302,25433.234884,0.108,29.752326
Median,128.0,4.0,6.5,3.0,4500.0,4.3,16989.5,0.09,9.655
Mode,128.0,4.0,6.5,3.0,5000.0,4.3,14999.0,0.09,23.07
Q1:25%,64.0,4.0,6.3,2.0,4000.0,4.3,11999.0,0.06,1.64
Q2:50%,128.0,4.0,6.5,3.0,4500.0,4.3,16989.5,0.09,9.655
Q3:75%,128.0,6.0,6.5,4.0,5000.0,4.4,28999.0,0.16,29.7175
99,256.0,12.0,6.9,4.0,7000.0,4.6,91999.0,0.3455,223.7309
Q4:100%,512.0,12.0,7.6,4.0,7000.0,4.6,157999.0,0.44,550.19
IQR,64.0,2.0,0.2,2.0,1000.0,0.1,17000.0,0.1,28.0775
1.5Rule,96.0,3.0,0.3,3.0,1500.0,0.15,25500.0,0.15,42.11625


# Check for outliers in the dataset

In [15]:
def outliear_check(quan, descriptive):
    """Find the columns whose extremes fall outside their fences.

    Parameters
    ----------
    quan : iterable of column names to inspect.
    descriptive : table indexable as ``descriptive[column][stat]`` that holds
        the "Min", "Max", "Lesser" and "Greater" statistics for each column.

    Returns
    -------
    tuple of two lists ``(Lesser, Greater)``: the columns whose "Min" is below
    "Lesser", and the columns whose "Max" is above "Greater", each in the
    order the names appear in ``quan``.
    """
    below_fence = [
        name for name in quan
        if descriptive[name]["Min"] < descriptive[name]["Lesser"]
    ]
    above_fence = [
        name for name in quan
        if descriptive[name]["Max"] > descriptive[name]["Greater"]
    ]
    return below_fence, above_fence


In [16]:
# Columns with values below the lower fence / above the upper fence.
Lesser, Greater=outliear_check(quan, descriptive)

In [17]:
# Columns whose minimum lies below the lower fence.
Lesser

['display_size', 'battery_capacity', 'ratings']

In [18]:
# Columns whose maximum lies above the upper fence.
Greater

['ROM',
 'RAM',
 'display_size',
 'battery_capacity',
 'ratings',
 'sales_price',
 'discount_percent',
 'sales']

# Handle outliers in the dataset (cap values at the IQR fences)

In [19]:
# Cap the outliers instead of dropping rows: anything below the lower fence is
# raised to that fence, anything above the upper fence is lowered to it.
for columnName in Lesser:
    lower_fence = descriptive[columnName]["Lesser"]
    is_too_low = dataset[columnName] < lower_fence
    dataset.loc[is_too_low, columnName] = lower_fence

for columnName in Greater:
    upper_fence = descriptive[columnName]["Greater"]
    is_too_high = dataset[columnName] > upper_fence
    dataset.loc[is_too_high, columnName] = upper_fence

In [20]:
# Rebuild the summary-statistics table from the current state of `dataset`
# (after the outlier capping step), so Min/Max can be checked against the
# fences again. One row per statistic, one column per numeric feature.
descriptive = pd.DataFrame(
    index=[
        "Mean", "Median", "Mode", "Q1:25%", "Q2:50%", "Q3:75%", "99", "Q4:100%",
        "IQR", "1.5Rule", "Lesser", "Greater", "Min", "Max",
    ],
    columns=quan,
)

for columnName in quan:
    values = dataset[columnName]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    fence_width = 1.5 * iqr
    highest = values.max()

    column_stats = {
        "Mean": values.mean(),
        "Median": values.median(),
        "Mode": values.mode()[0],  # first mode when several values tie
        "Q1:25%": q1,
        "Q2:50%": values.quantile(0.50),
        "Q3:75%": q3,
        "99": np.percentile(values, 99),
        "Q4:100%": highest,
        "IQR": iqr,
        "1.5Rule": fence_width,
        "Lesser": q1 - fence_width,
        "Greater": q3 + fence_width,
        "Min": values.min(),
        "Max": highest,
    }
    # Write cell by cell so each value is stored exactly as computed.
    for stat_name, stat_value in column_stats.items():
        descriptive.loc[stat_name, columnName] = stat_value

In [21]:
# Summary statistics recomputed after the outlier capping step.
descriptive

Unnamed: 0,ROM,RAM,display_size,num_rear_camera,battery_capacity,ratings,sales_price,discount_percent,sales
Mean,102.251163,5.237209,6.412791,2.904651,4530.560465,4.344651,22826.14186,0.10693,20.133852
Median,128.0,4.0,6.5,3.0,4500.0,4.3,16989.5,0.09,9.655
Mode,128.0,4.0,6.5,3.0,5000.0,4.3,54499.0,0.09,71.83375
Q1:25%,64.0,4.0,6.3,2.0,4000.0,4.3,11999.0,0.06,1.64
Q2:50%,128.0,4.0,6.5,3.0,4500.0,4.3,16989.5,0.09,9.655
Q3:75%,128.0,6.0,6.5,4.0,5000.0,4.4,28999.0,0.16,29.7175
99,224.0,9.0,6.8,4.0,6500.0,4.55,54499.0,0.31,71.83375
Q4:100%,224.0,9.0,6.8,4.0,6500.0,4.55,54499.0,0.31,71.83375
IQR,64.0,2.0,0.2,2.0,1000.0,0.1,17000.0,0.1,28.0775
1.5Rule,96.0,3.0,0.3,3.0,1500.0,0.15,25500.0,0.15,42.11625


In [22]:
# NOTE(review): this cell re-defines outliear_check, which an earlier cell in
# this notebook already defines with the same logic; the redefinition is
# redundant and silently shadows the earlier one.
def outliear_check(quan, descriptive):
    """Find the columns whose extremes fall outside their fences.

    Parameters
    ----------
    quan : iterable of column names to inspect.
    descriptive : table indexable as ``descriptive[column][stat]`` that holds
        the "Min", "Max", "Lesser" and "Greater" statistics for each column.

    Returns
    -------
    tuple of two lists ``(Lesser, Greater)``: the columns whose "Min" is below
    "Lesser", and the columns whose "Max" is above "Greater", each in the
    order the names appear in ``quan``.
    """
    below_fence = [
        name for name in quan
        if descriptive[name]["Min"] < descriptive[name]["Lesser"]
    ]
    above_fence = [
        name for name in quan
        if descriptive[name]["Max"] > descriptive[name]["Greater"]
    ]
    return below_fence, above_fence


In [23]:
# Re-run the fence check against the recomputed statistics.
Lesser,Greater=  outliear_check(quan, descriptive)

In [24]:
# Empty when no column still has values below its lower fence.
Lesser

[]

In [25]:
# Empty when no column still has values above its upper fence.
Greater

[]

In [26]:
# Numeric (quantitative) columns only.
df1=(dataset[quan])

In [27]:
# Numeric subset of the dataset.
df1

Unnamed: 0,ROM,RAM,display_size,num_rear_camera,battery_capacity,ratings,sales_price,discount_percent,sales
0,64,2,6.0,1,2500,4.50,32999,0.17,71.83375
1,64,4,6.0,2,2815,4.50,54499,0.04,1.39000
2,64,2,6.0,1,2500,4.50,32999,0.17,71.83375
3,64,3,6.1,1,2942,4.55,42999,0.10,23.07000
4,128,4,6.1,2,2815,4.55,54499,0.02,5.15000
...,...,...,...,...,...,...,...,...,...
425,32,3,6.0,2,4000,4.30,7999,0.30,1.50000
426,64,4,6.0,2,4000,4.30,9699,0.28,1.73000
427,128,6,6.5,3,4250,4.20,21999,0.12,3.42000
428,32,3,6.2,2,5000,4.20,8299,0.07,6.77000


In [28]:
# NOTE(review): df1 already consists of exactly the `quan` columns, so this
# re-wrap selects the same columns again and appears to leave the contents
# unchanged.
df1=pd.DataFrame(df1,columns=quan)

In [29]:
# Display df1 again after the re-wrap above.
df1

Unnamed: 0,ROM,RAM,display_size,num_rear_camera,battery_capacity,ratings,sales_price,discount_percent,sales
0,64,2,6.0,1,2500,4.50,32999,0.17,71.83375
1,64,4,6.0,2,2815,4.50,54499,0.04,1.39000
2,64,2,6.0,1,2500,4.50,32999,0.17,71.83375
3,64,3,6.1,1,2942,4.55,42999,0.10,23.07000
4,128,4,6.1,2,2815,4.55,54499,0.02,5.15000
...,...,...,...,...,...,...,...,...,...
425,32,3,6.0,2,4000,4.30,7999,0.30,1.50000
426,64,4,6.0,2,4000,4.30,9699,0.28,1.73000
427,128,6,6.5,3,4250,4.20,21999,0.12,3.42000
428,32,3,6.2,2,5000,4.20,8299,0.07,6.77000


In [30]:
# Categorical (qualitative) columns only.
df2=(dataset[qual])

In [31]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each category so the indicator columns are not perfectly collinear.
df2 = pd.get_dummies(df2, drop_first=True)

In [32]:
# One-hot encoded indicator columns (named "<column>_<level>").
df2

Unnamed: 0,brand_Poco,brand_Realme,brand_Samsung,brand_Xiaomi,model_5 Pro,model_6,model_6i,model_7,model_7 Pro,model_8,...,processor_Exynos,processor_MediaTek,processor_Others,processor_Qualcomm,processor_Water,processor_iOS,screen_size_Medium,screen_size_Small,screen_size_Very Large,screen_size_Very Small
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,True
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,True
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,True,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
425,False,False,False,True,False,False,False,False,False,False,...,False,False,False,True,False,False,False,True,False,False
426,False,False,False,True,False,False,False,False,False,False,...,False,False,False,True,False,False,False,True,False,False
427,False,False,False,True,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
428,False,False,False,True,False,False,False,False,False,False,...,False,False,False,True,False,False,True,False,False,False


In [33]:
# Keep the one-hot encoded columns produced by get_dummies. Re-wrapping with
# columns=qual would select the original categorical names, which no longer
# exist after encoding, and would leave df2 as an all-NaN frame.
df2 = pd.DataFrame(df2, columns=df2.columns)

In [34]:
# Wrap the full dataset (numeric and un-encoded categorical columns) in a new
# DataFrame object.
two = pd.DataFrame(dataset)

In [35]:
# NOTE(review): only `two` is concatenated here, so `preprocessed` has the same
# columns as `dataset`; df1 and the one-hot encoded df2 are not included.
# Confirm that leaving the categorical columns un-encoded is intended.
preprocessed = pd.concat([two], axis=1)

In [36]:
# Inspect the frame that is about to be written to disk.
preprocessed

Unnamed: 0,brand,model,base_color,processor,screen_size,ROM,RAM,display_size,num_rear_camera,battery_capacity,ratings,sales_price,discount_percent,sales
0,Apple,iPhone SE,Black,Water,Very Small,64,2,6.0,1,2500,4.50,32999,0.17,71.83375
1,Apple,iPhone 12 Mini,Red,Ceramic,Small,64,4,6.0,2,2815,4.50,54499,0.04,1.39000
2,Apple,iPhone SE,Red,Water,Very Small,64,2,6.0,1,2500,4.50,32999,0.17,71.83375
3,Apple,iPhone XR,Others,iOS,Medium,64,3,6.1,1,2942,4.55,42999,0.10,23.07000
4,Apple,iPhone 12,Red,Ceramic,Medium,128,4,6.1,2,2815,4.55,54499,0.02,5.15000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
425,Xiaomi,Redmi 6 Pro,Black,Qualcomm,Small,32,3,6.0,2,4000,4.30,7999,0.30,1.50000
426,Xiaomi,Redmi 6 Pro,Red,Qualcomm,Small,64,4,6.0,2,4000,4.30,9699,0.28,1.73000
427,Xiaomi,Mi 11 Lite,Others,Qualcomm,Large,128,6,6.5,3,4250,4.20,21999,0.12,3.42000
428,Xiaomi,Redmi 8A Dual,Blue,Qualcomm,Medium,32,3,6.2,2,5000,4.20,8299,0.07,6.77000


In [37]:
# Persist the preprocessed frame next to the notebook; index=False leaves the
# row index out of the CSV.
preprocessed.to_csv("preprocessed-Flipkart Mobile.csv",index=False)

In [38]:
# Display the saved frame once more (same object as shown before the export).
preprocessed

Unnamed: 0,brand,model,base_color,processor,screen_size,ROM,RAM,display_size,num_rear_camera,battery_capacity,ratings,sales_price,discount_percent,sales
0,Apple,iPhone SE,Black,Water,Very Small,64,2,6.0,1,2500,4.50,32999,0.17,71.83375
1,Apple,iPhone 12 Mini,Red,Ceramic,Small,64,4,6.0,2,2815,4.50,54499,0.04,1.39000
2,Apple,iPhone SE,Red,Water,Very Small,64,2,6.0,1,2500,4.50,32999,0.17,71.83375
3,Apple,iPhone XR,Others,iOS,Medium,64,3,6.1,1,2942,4.55,42999,0.10,23.07000
4,Apple,iPhone 12,Red,Ceramic,Medium,128,4,6.1,2,2815,4.55,54499,0.02,5.15000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
425,Xiaomi,Redmi 6 Pro,Black,Qualcomm,Small,32,3,6.0,2,4000,4.30,7999,0.30,1.50000
426,Xiaomi,Redmi 6 Pro,Red,Qualcomm,Small,64,4,6.0,2,4000,4.30,9699,0.28,1.73000
427,Xiaomi,Mi 11 Lite,Others,Qualcomm,Large,128,6,6.5,3,4250,4.20,21999,0.12,3.42000
428,Xiaomi,Redmi 8A Dual,Blue,Qualcomm,Medium,32,3,6.2,2,5000,4.20,8299,0.07,6.77000
