In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline


In [7]:
df = pd.read_csv('D:\DataSci\Resources\laptop_pricing_dataset_mod1.csv')
print(df.head(5))

   Unnamed: 0 Manufacturer  Category     Screen  GPU  OS  CPU_core  \
0           0         Acer         4  IPS Panel    2   1         5   
1           1         Dell         3    Full HD    1   1         3   
2           2         Dell         3    Full HD    1   1         7   
3           3         Dell         4  IPS Panel    2   1         5   
4           4           HP         4    Full HD    2   1         7   

   Screen_Size_cm  CPU_frequency  RAM_GB  Storage_GB_SSD  Weight_kg  Price  
0          35.560            1.6       8             256       1.60    978  
1          39.624            2.0       4             256       2.20    634  
2          39.624            2.7       8             256       2.20    946  
3          33.782            1.6       8             128       1.22   1244  
4          39.624            1.8       8             256       1.91    837  


In [8]:
df.dtypes

Unnamed: 0          int64
Manufacturer       object
Category            int64
Screen             object
GPU                 int64
OS                  int64
CPU_core            int64
Screen_Size_cm    float64
CPU_frequency     float64
RAM_GB              int64
Storage_GB_SSD      int64
Weight_kg         float64
Price               int64
dtype: object

In [10]:
#it looks like we need to round the screen size data to two decimal places. so we will do that by using the command
df['Screen_Size_cm'] = np.round(df['Screen_Size_cm'], 2)
print(df.head(5))

   Unnamed: 0 Manufacturer  Category     Screen  GPU  OS  CPU_core  \
0           0         Acer         4  IPS Panel    2   1         5   
1           1         Dell         3    Full HD    1   1         3   
2           2         Dell         3    Full HD    1   1         7   
3           3         Dell         4  IPS Panel    2   1         5   
4           4           HP         4    Full HD    2   1         7   

   Screen_Size_cm  CPU_frequency  RAM_GB  Storage_GB_SSD  Weight_kg  Price  
0           35.56            1.6       8             256       1.60    978  
1           39.62            2.0       4             256       2.20    634  
2           39.62            2.7       8             256       2.20    946  
3           33.78            1.6       8             128       1.22   1244  
4           39.62            1.8       8             256       1.91    837  


In [None]:
#identifying columns with missing values
missing_data = df.isnull()
print(missing_data.head(5))
for column in list(missing_data.columns):
    print(column)
    print(missing_data[column].value_counts()) #value_counts returns the frequency of unique values in the column 
    print(" ")

In [24]:
mean = np.mean(df['Weight_kg'],axis=0)
df['Weight_kg'].replace(np.NaN,mean, inplace=True)
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 238 entries, 0 to 237
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      238 non-null    int64  
 1   Manufacturer    238 non-null    object 
 2   Category        238 non-null    int64  
 3   Screen          238 non-null    object 
 4   GPU             238 non-null    int64  
 5   OS              238 non-null    int64  
 6   CPU_core        238 non-null    int64  
 7   Screen_Size_cm  234 non-null    float64
 8   CPU_frequency   238 non-null    float64
 9   RAM_GB          238 non-null    int64  
 10  Storage_GB_SSD  238 non-null    int64  
 11  Weight_kg       238 non-null    float64
 12  Price           238 non-null    int64  
dtypes: float64(3), int64(8), object(2)
memory usage: 24.3+ KB


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Weight_kg'].replace(np.NaN,mean, inplace=True)


In [27]:
df['Screen_Size_cm'] = df['Screen_Size_cm']/2.54
df.rename(columns={'Screen_Size_cm':'Screen_size_inch'}, inplace=True)
print(df.head(5))


   Unnamed: 0 Manufacturer  Category     Screen  GPU  OS  CPU_core  \
0           0         Acer         4  IPS Panel    2   1         5   
1           1         Dell         3    Full HD    1   1         3   
2           2         Dell         3    Full HD    1   1         7   
3           3         Dell         4  IPS Panel    2   1         5   
4           4           HP         4    Full HD    2   1         7   

   Screen_size_inch  CPU_frequency  RAM_GB  Storage_GB_SSD  Weight_kg  Price  
0         14.000000            1.6       8             256       1.60    978  
1         15.598425            2.0       4             256       2.20    634  
2         15.598425            2.7       8             256       2.20    946  
3         13.299213            1.6       8             128       1.22   1244  
4         15.598425            1.8       8             256       1.91    837  


In [30]:
#lets create 3 bins - low, mid and high
bins = np.linspace(df['Price'].min(),df['Price'].max(),4 )
bin_names = ["Low", "Medium", "High"]
df["Price_binned"]= pd.cut(df["Price"], bins, labels=bin_names, include_lowest=True)
print(df.head(5))

   Unnamed: 0 Manufacturer  Category     Screen  GPU  OS  CPU_core  \
0           0         Acer         4  IPS Panel    2   1         5   
1           1         Dell         3    Full HD    1   1         3   
2           2         Dell         3    Full HD    1   1         7   
3           3         Dell         4  IPS Panel    2   1         5   
4           4           HP         4    Full HD    2   1         7   

   Screen_size_inch  CPU_frequency  RAM_GB  Storage_GB_SSD  Weight_kg  Price  \
0         14.000000            1.6       8             256       1.60    978   
1         15.598425            2.0       4             256       2.20    634   
2         15.598425            2.7       8             256       2.20    946   
3         13.299213            1.6       8             128       1.22   1244   
4         15.598425            1.8       8             256       1.91    837   

  Price_binned  
0          Low  
1          Low  
2          Low  
3          Low  
4          Lo

In [34]:
print(df['Screen'].unique())
#so as we see there are only two types for screens, either they are an IPS Panel or they arent
dummy = pd.get_dummies(df["Screen"])
df = pd.concat([df, dummy], axis = 1)
df.drop('Screen', axis=1, inplace=True)
df.head(5)

['IPS Panel' 'Full HD']


Unnamed: 0.1,Unnamed: 0,Manufacturer,Category,GPU,OS,CPU_core,Screen_size_inch,CPU_frequency,RAM_GB,Storage_GB_SSD,Weight_kg,Price,Price_binned,Full HD,IPS Panel
0,0,Acer,4,2,1,5,14.0,1.6,8,256,1.6,978,Low,False,True
1,1,Dell,3,1,1,3,15.598425,2.0,4,256,2.2,634,Low,True,False
2,2,Dell,3,1,1,7,15.598425,2.7,8,256,2.2,946,Low,True,False
3,3,Dell,4,2,1,5,13.299213,1.6,8,128,1.22,1244,Low,False,True
4,4,HP,4,2,1,7,15.598425,1.8,8,256,1.91,837,Low,True,False


In [36]:
df.drop( "CPU_core", axis='columns',inplace=True)
print(df.head(5))

   Unnamed: 0 Manufacturer  Category  GPU  OS  Screen_size_inch  \
0           0         Acer         4    2   1         14.000000   
1           1         Dell         3    1   1         15.598425   
2           2         Dell         3    1   1         15.598425   
3           3         Dell         4    2   1         13.299213   
4           4           HP         4    2   1         15.598425   

   CPU_frequency  RAM_GB  Storage_GB_SSD  Weight_kg  Price Price_binned  \
0            1.6       8             256       1.60    978          Low   
1            2.0       4             256       2.20    634          Low   
2            2.7       8             256       2.20    946          Low   
3            1.6       8             128       1.22   1244          Low   
4            1.8       8             256       1.91    837          Low   

   Full HD  IPS Panel  
0    False       True  
1     True      False  
2     True      False  
3    False       True  
4     True      False  
