## Smartphones: ETL-process

In [11]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

Features for the ETL process (value range --> dtype)
+ Screen Size (inches) 4.0 – 7.6 inches       --> float
+ Screen Resolution (pixels):                 --> int
  + Width: 720 – 3200 px                      --> int 
  + Height: 1280 – 3840 px                    --> int
+ Pixel Density (ppi): 150 – 600 ppi          --> int
+ Weight (grams):  100 – 300 g                --> float
+ Thickness (mm):  5 – 12 mm                  --> float
+ Battery Capacity (mAh): 2000 – 6000 mAh     --> int
+ Processor Clock Speed (GHz): 1.0 – 3.5 GHz  --> float

In [2]:
# feature (column) names, grouped by what they describe
screen_features = [
    "Screen_Size(inches)",
    "SR_Width",
    "SR_Height",
    "Pixel_Density",
]
hardware_features = [
    "Weight(gr)",
    "Thickness(mm)",
    "Battery_Capacity(mAh)",
    "Processor_Clock_Speed",
]

# brand names the categorical data is drawn from
company_brands = ["Samsung", "OnePlus", "Xiaomi"]

print(f"Hardware features: {hardware_features}")
print(f"Screen features: {screen_features}")

Hardware features: ['Weight(gr)', 'Thickness(mm)', 'Battery_Capacity(mAh)', 'Processor_Clock_Speed']
Screen features: ['Screen_Size(inches)', 'SR_Width', 'SR_Height', 'Pixel_Density']


In [None]:
# define constant: default number of rows generated per feature
DATA_LEN = 3000

# define function --> generate discrete data (int)
def generate_int_data(feature: str, feature_range: tuple, data_len: int = DATA_LEN,
                      seed: "int | None" = None) -> pd.DataFrame:
  """Generate a single-column DataFrame of uniformly distributed random integers.

  Args:
    feature: name of the resulting column.
    feature_range: ``(low, high)`` pair; values are drawn from the half-open
      interval ``[low, high)``, so ``high`` itself is never produced.
    data_len: number of rows to generate.
    seed: optional seed for a private random state, making this call
      reproducible on its own. With ``None`` (default) NumPy's global random
      state is used, so a prior ``np.random.seed(...)`` still applies.

  Returns:
    DataFrame with ``data_len`` rows, a 0..data_len-1 index and one column
    named ``feature``.

  Raises:
    ValueError: raised by NumPy when ``low >= high`` or ``data_len`` is negative.
  """
  low, high = feature_range

  # both np.random (module-level, global state) and RandomState expose randint
  rng = np.random if seed is None else np.random.RandomState(seed)
  return pd.Series(rng.randint(low, high, data_len), name=feature).to_frame()

# define function --> generate real data (float)
def generate_float_data(feature: str, feature_range: tuple, data_len: int = DATA_LEN,
                        seed: "int | None" = None) -> pd.DataFrame:
  """Generate a single-column DataFrame of uniformly distributed random floats.

  Args:
    feature: name of the resulting column.
    feature_range: ``(low, high)`` pair passed to NumPy's ``uniform``; values
      are drawn from the half-open interval ``[low, high)``.
    data_len: number of rows to generate.
    seed: optional seed for a private random state, making this call
      reproducible on its own. With ``None`` (default) NumPy's global random
      state is used, so a prior ``np.random.seed(...)`` still applies.

  Returns:
    DataFrame with ``data_len`` rows, a 0..data_len-1 index and one float
    column named ``feature``.
  """
  low, high = feature_range

  # both np.random (module-level, global state) and RandomState expose uniform
  rng = np.random if seed is None else np.random.RandomState(seed)
  return pd.Series(rng.uniform(low, high, data_len), name=feature).to_frame()

# define function --> generate categorical data (object)
def produce_categorical_data(feature: str, categories: list, data_len: int = DATA_LEN,
                             seed: "int | None" = None) -> pd.DataFrame:
  """Generate a single-column DataFrame of values picked at random from ``categories``.

  Args:
    feature: name of the resulting column.
    categories: one-dimensional collection of values to choose from; each row
      is drawn independently (with replacement) and with equal probability.
    data_len: number of rows to generate.
    seed: optional seed for a private random state, making this call
      reproducible on its own. With ``None`` (default) NumPy's global random
      state is used, so a prior ``np.random.seed(...)`` still applies.

  Returns:
    DataFrame with ``data_len`` rows, a 0..data_len-1 index and one column
    named ``feature``.

  Raises:
    ValueError: raised by NumPy when ``categories`` is not one-dimensional, or
      is empty while ``data_len`` is non-zero.
  """
  # both np.random (module-level, global state) and RandomState expose choice
  rng = np.random if seed is None else np.random.RandomState(seed)
  return pd.Series(rng.choice(categories, data_len), name=feature).to_frame()


In [4]:
# seed NumPy's global random state so that Restart & Run All regenerates
# exactly the same synthetic dataset (the generators below draw from it)
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# produce screen feature data
screen_size = generate_float_data(screen_features[0], (4.0, 7.6))
sr_width = generate_int_data(screen_features[1], (720, 3200))
sr_height = generate_int_data(screen_features[2], (1280, 3840))
pixel_density = generate_int_data(screen_features[3], (150, 600))

# produce hardware feature data
weight = generate_float_data(hardware_features[0], (100.0, 300.0))
thickness = generate_float_data(hardware_features[1], (5.0,12.0))
battery_cap = generate_int_data(hardware_features[2], (2000, 6000))
clockspeed = generate_float_data(hardware_features[-1], (1.0, 3.5))

# create a price data + company brand names
price_sale = generate_float_data("Price_Sale", (400, 1300))
# NOTE(review): this rebinds company_brands from the list of brand names to the
# generated "Brand" DataFrame. Re-running this cell on its own would then pass
# that DataFrame in as the category list; re-run the cell defining the list first.
company_brands = produce_categorical_data("Brand", company_brands)

In [5]:
# assemble the dataset: one single-column frame per feature, placed side by side
feature_frames = [
    company_brands,
    screen_size,
    sr_height,
    sr_width,
    pixel_density,
    weight,
    thickness,
    battery_cap,
    clockspeed,
    price_sale,
]
mobile_phone_df = pd.concat(feature_frames, axis=1)

In [6]:
# preview the first five rows of the combined dataset
mobile_phone_df.head(5)

Unnamed: 0,Brand,Screen_Size(inches),SR_Height,SR_Width,Pixel_Density,Weight(gr),Thickness(mm),Battery_Capacity(mAh),Processor_Clock_Speed,Price_Sale
0,Xiaomi,6.292954,3832,1922,495,160.855807,10.008206,5534,1.18983,799.48162
1,OnePlus,6.329885,3573,3003,346,226.017067,7.185694,2310,1.81892,1075.259705
2,Samsung,4.060633,2465,2004,403,136.166661,11.960697,5818,2.962226,872.690904
3,Samsung,4.913971,2762,1526,367,290.568013,7.309192,4056,1.466584,967.706494
4,OnePlus,7.267191,1324,2598,387,185.541028,7.677568,3125,3.110436,1076.824278


In [22]:
# reduce number of data
SAMPLE_SIZE = 800   # rows kept in the random subset
SAMPLE_SEED = 42    # pins the random draw so a re-run selects the same rows

# first 1000 rows in their original order
# NOTE(review): not referenced by any later cell visible here; kept in case it is used elsewhere
mobile_phone_df1 = mobile_phone_df.iloc[0:1000]

# random subset drawn without replacement, renumbered 0..SAMPLE_SIZE-1
mobile_phone_df2 = (
    mobile_phone_df
    .sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED)
    .reset_index(drop=True)
)
mobile_phone_df2

Unnamed: 0,Brand,Screen_Size(inches),SR_Height,SR_Width,Pixel_Density,Weight(gr),Thickness(mm),Battery_Capacity(mAh),Processor_Clock_Speed,Price_Sale
0,Samsung,6.214724,3165,1799,235,232.911487,7.719020,4935,1.370790,1134.120131
1,Xiaomi,7.529126,2751,1583,174,204.467314,7.855609,4693,2.784647,528.949218
2,Xiaomi,6.744018,2301,977,490,145.417790,11.724540,4235,1.166043,1146.400208
3,OnePlus,6.026166,3605,859,341,183.514596,10.533808,2058,1.234825,843.869780
4,Samsung,5.623507,3833,1149,412,123.284643,7.296229,5082,1.370265,1087.905764
...,...,...,...,...,...,...,...,...,...,...
795,OnePlus,4.934964,1333,3183,463,218.372196,5.170715,2275,1.622167,463.755628
796,Xiaomi,6.979174,2035,2963,454,239.081336,9.657847,3660,2.602158,860.968341
797,OnePlus,6.572178,3619,2652,334,141.291226,7.084243,5538,1.085605,1159.248539
798,OnePlus,6.134351,2036,1461,317,201.244296,5.879575,4169,2.571864,1170.859695


In [23]:
# inspect the sampled dataset: per-column dtype, non-null count and memory usage
mobile_phone_df2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 800 entries, 0 to 799
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Brand                  800 non-null    object 
 1   Screen_Size(inches)    800 non-null    float64
 2   SR_Height              800 non-null    int32  
 3   SR_Width               800 non-null    int32  
 4   Pixel_Density          800 non-null    int32  
 5   Weight(gr)             800 non-null    float64
 6   Thickness(mm)          800 non-null    float64
 7   Battery_Capacity(mAh)  800 non-null    int32  
 8   Processor_Clock_Speed  800 non-null    float64
 9   Price_Sale             800 non-null    float64
dtypes: float64(5), int32(4), object(1)
memory usage: 53.1+ KB


In [24]:
# shape of dataset: rows are samples, columns are features
n_samples, n_features = mobile_phone_df2.shape
print(f"Samples: {n_samples}")
print(f"Features: {n_features}")

Samples: 800
Features: 10


In [25]:
# number of sampled phones per brand, most frequent first
brand_counts = mobile_phone_df2["Brand"].value_counts()
brand_counts

Brand
Xiaomi     286
Samsung    258
OnePlus    256
Name: count, dtype: int64

In [35]:
# display the full generated dataset (the notebook renders only the first and last rows)
mobile_phone_df

Unnamed: 0,Brand,Screen_Size(inches),SR_Height,SR_Width,Pixel_Density,Weight(gr),Thickness(mm),Battery_Capacity(mAh),Processor_Clock_Speed,Price_Sale
0,Xiaomi,6.292954,3832,1922,495,160.855807,10.008206,5534,1.189830,799.481620
1,OnePlus,6.329885,3573,3003,346,226.017067,7.185694,2310,1.818920,1075.259705
2,Samsung,4.060633,2465,2004,403,136.166661,11.960697,5818,2.962226,872.690904
3,Samsung,4.913971,2762,1526,367,290.568013,7.309192,4056,1.466584,967.706494
4,OnePlus,7.267191,1324,2598,387,185.541028,7.677568,3125,3.110436,1076.824278
...,...,...,...,...,...,...,...,...,...,...
2995,OnePlus,7.501829,2962,2037,170,237.484378,10.136111,4711,3.440148,1013.824336
2996,Samsung,7.509296,2344,2773,508,157.845278,9.437900,2066,3.049072,952.005867
2997,Xiaomi,4.937489,3280,2771,220,191.408742,10.948394,2846,2.332527,730.390425
2998,OnePlus,4.900864,3481,2631,401,190.389149,10.285059,5636,2.039995,1228.994747


In [9]:
# save dataset; with pandas' default settings the row index is written as well,
# as an unnamed first column
# NOTE(review): this writes the full mobile_phone_df, not the reduced
# mobile_phone_df2 sample -- confirm that is the intended output
output_path = "mobile_phone_sale_price.csv"
mobile_phone_df.to_csv(output_path)