In [9]:
import numpy as np
import pandas as pd 


### Load Train Data

In [10]:
dfTrain = pd.read_csv("laptops_train.csv")

### Explore Train Data

In [11]:
dfTrain.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price
0,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,11912523.48
1,Apple,Macbook Air,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,7993374.48
2,HP,250 G6,Notebook,"15.6""",Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,,1.86kg,5112900.0
3,Apple,MacBook Pro,Ultrabook,"15.4""",IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,,1.83kg,22563005.4
4,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,,1.37kg,16037611.2


In [12]:
dfTrain.columns

Index(['Manufacturer', 'Model Name', 'Category', 'Screen Size', 'Screen',
       'CPU', 'RAM', ' Storage', 'GPU', 'Operating System',
       'Operating System Version', 'Weight', 'Price'],
      dtype='object')

In [13]:
dfTrain.info() # only operating system version column have null values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 977 entries, 0 to 976
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Manufacturer              977 non-null    object 
 1   Model Name                977 non-null    object 
 2   Category                  977 non-null    object 
 3   Screen Size               977 non-null    object 
 4   Screen                    977 non-null    object 
 5   CPU                       977 non-null    object 
 6   RAM                       977 non-null    object 
 7    Storage                  977 non-null    object 
 8   GPU                       977 non-null    object 
 9   Operating System          977 non-null    object 
 10  Operating System Version  841 non-null    object 
 11  Weight                    977 non-null    object 
 12  Price                     977 non-null    float64
dtypes: float64(1), object(12)
memory usage: 99.4+ KB


In [14]:
dfTrain.describe(include = "all") 

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price
count,977,977,977,977,977,977,977,977,977,977,841.0,977,977.0
unique,19,488,6,18,38,106,8,36,98,7,4.0,166,
top,Dell,XPS 13,Notebook,"15.6""",Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,Windows,10.0,2.2kg,
freq,232,22,549,494,397,151,483,318,225,837,819.0,98,
mean,,,,,,,,,,,,,10018990.0
std,,,,,,,,,,,,,6306430.0
min,,,,,,,,,,,,,1706375.0
25%,,,,,,,,,,,,,5326308.0
50%,,,,,,,,,,,,,8527428.0
75%,,,,,,,,,,,,,13115700.0


In [15]:
dfTrain.duplicated().sum() #no duplicates 

0

## Training Data preprocessing

### removing nan from Operating System Version

In [16]:
dfTrain.fillna("unknown" , inplace = True)

In [17]:
dfTrain['Operating System Version']

0      unknown
1      unknown
2      unknown
3      unknown
4      unknown
        ...   
972         10
973         10
974         10
975         10
976          7
Name: Operating System Version, Length: 977, dtype: object

### Screen Size

In [18]:
# remove " from size
dfTrain['Screen Size'] = dfTrain['Screen Size'].str.replace('"',"")

In [19]:
dfTrain['Screen Size']

0      13.3
1      13.3
2      15.6
3      15.4
4      13.3
       ... 
972    17.3
973    14.0
974    17.3
975    15.6
976    14.0
Name: Screen Size, Length: 977, dtype: object

In [20]:
# change data type
dfTrain['Screen Size'] = dfTrain['Screen Size'].astype(float)

In [21]:
dfTrain['Screen Size']

0      13.3
1      13.3
2      15.6
3      15.4
4      13.3
       ... 
972    17.3
973    14.0
974    17.3
975    15.6
976    14.0
Name: Screen Size, Length: 977, dtype: float64

### Screen

In [22]:
dfTrain["Screen"].unique() # there is type , resolution , touch or not  

array(['IPS Panel Retina Display 2560x1600', '1440x900',
       'Full HD 1920x1080', 'IPS Panel Retina Display 2880x1800',
       '1366x768', 'IPS Panel Full HD 1920x1080',
       'IPS Panel Retina Display 2304x1440',
       'IPS Panel Full HD / Touchscreen 1920x1080',
       'Full HD / Touchscreen 1920x1080',
       'Touchscreen / Quad HD+ 3200x1800',
       'IPS Panel Touchscreen 1920x1200', 'Touchscreen 2256x1504',
       'Quad HD+ / Touchscreen 3200x1800', 'IPS Panel 1366x768',
       'IPS Panel 4K Ultra HD / Touchscreen 3840x2160',
       'IPS Panel Full HD 2160x1440',
       '4K Ultra HD / Touchscreen 3840x2160', 'Touchscreen 2560x1440',
       '1600x900', 'IPS Panel 4K Ultra HD 3840x2160',
       '4K Ultra HD 3840x2160', 'Touchscreen 1366x768',
       'IPS Panel Full HD 1366x768', 'IPS Panel 2560x1440',
       'IPS Panel Full HD 2560x1440',
       'IPS Panel Retina Display 2736x1824', 'Touchscreen 2400x1600',
       '2560x1440', 'IPS Panel Quad HD+ 2560x1440',
       'IPS Panel 

In [23]:
#divide screen column into three columns typeOfScreen ,resolution , touch
#extract resolution 
import re  
def extract_resolution(string):
    pattern = r'\b(\d+x\d+)\b'  # Matches the pattern of digits followed by 'x' and more digits
    match = re.search(pattern, string)
    if match:
        return match.group(1)
    else:
        return None
dfTrain["resolution"] =dfTrain["Screen"].apply(extract_resolution)

In [24]:
dfTrain["resolution"]

0      2560x1600
1       1440x900
2      1920x1080
3      2880x1800
4      2560x1600
         ...    
972    1920x1080
973    1920x1080
974    1920x1080
975    1920x1080
976     1366x768
Name: resolution, Length: 977, dtype: object

In [25]:
dfTrain.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price,resolution
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,unknown,1.37kg,11912523.48,2560x1600
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,unknown,1.34kg,7993374.48,1440x900
2,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,unknown,1.86kg,5112900.0,1920x1080
3,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,unknown,1.83kg,22563005.4,2880x1800
4,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,unknown,1.37kg,16037611.2,2560x1600


In [26]:
#extract touchscreen
dfTrain["touchScren"] =dfTrain["Screen"].str.extract(r'(Touchscreen)')

In [27]:
dfTrain["touchScren"]

0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
      ... 
972    NaN
973    NaN
974    NaN
975    NaN
976    NaN
Name: touchScren, Length: 977, dtype: object

In [28]:
dfTrain["touchScren"] = dfTrain["touchScren"].replace('Touchscreen',1)
dfTrain["touchScren"] = dfTrain["touchScren"].replace(np.nan,0)

In [29]:
dfTrain.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price,resolution,touchScren
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,unknown,1.37kg,11912523.48,2560x1600,0.0
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,unknown,1.34kg,7993374.48,1440x900,0.0
2,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,unknown,1.86kg,5112900.0,1920x1080,0.0
3,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,unknown,1.83kg,22563005.4,2880x1800,0.0
4,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,unknown,1.37kg,16037611.2,2560x1600,0.0


In [30]:
#extract screenType
dfTrain["screenType"] = dfTrain["Screen"]
dfTrain["screenType"] = dfTrain["screenType"].replace(r'(\b\d+x\d+\b)', '' ,regex = True)
dfTrain["screenType"] = dfTrain["screenType"].replace(r'(Touchscreen)', '' , regex = True)
dfTrain["screenType"] = dfTrain["screenType"].replace('Display','', regex = True)
dfTrain["screenType"] = dfTrain["screenType"].replace('',np.nan, regex = True)

In [31]:
dfTrain.drop("Screen", axis = 1)
dfTrain.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price,resolution,touchScren,screenType
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,unknown,1.37kg,11912523.48,2560x1600,0.0,IPS Panel Retina
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,unknown,1.34kg,7993374.48,1440x900,0.0,
2,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,unknown,1.86kg,5112900.0,1920x1080,0.0,Full HD
3,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,unknown,1.83kg,22563005.4,2880x1800,0.0,IPS Panel Retina
4,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,unknown,1.37kg,16037611.2,2560x1600,0.0,IPS Panel Retina


### CPU

In [32]:
dfTrain['CPU'].unique()
#it seems that there is type of CPU and frequency so i will divide this column into two cpuModel and frequency

array(['Intel Core i5 2.3GHz', 'Intel Core i5 1.8GHz',
       'Intel Core i5 7200U 2.5GHz', 'Intel Core i7 2.7GHz',
       'Intel Core i5 3.1GHz', 'AMD A9-Series 9420 3GHz',
       'Intel Core i7 2.2GHz', 'Intel Core i7 8550U 1.8GHz',
       'Intel Core i5 8250U 1.6GHz', 'Intel Core i3 6006U 2GHz',
       'Intel Core i7 2.8GHz', 'Intel Core M m3 1.2GHz',
       'Intel Core i7 7500U 2.7GHz', 'Intel Core i7 2.9GHz',
       'Intel Core i3 7100U 2.4GHz', 'Intel Atom x5-Z8350 1.44GHz',
       'Intel Core i5 7300HQ 2.5GHz', 'AMD E-Series E2-9000e 1.5GHz',
       'Intel Core i5 1.6GHz', 'Intel Core i7 8650U 1.9GHz',
       'Intel Atom x5-Z8300 1.44GHz', 'AMD E-Series E2-6110 1.5GHz',
       'AMD A6-Series 9220 2.5GHz',
       'Intel Celeron Dual Core N3350 1.1GHz',
       'Intel Core i3 7130U 2.7GHz', 'Intel Core i7 7700HQ 2.8GHz',
       'Intel Core i5 2.0GHz', 'AMD Ryzen 1700 3GHz',
       'Intel Pentium Quad Core N4200 1.1GHz',
       'Intel Atom x5-Z8550 1.44GHz',
       'Intel Celeron Du

In [33]:
#extract frequency
dfTrain["cpuFrequency"] = dfTrain["CPU"].str.extract(r'(\d+\.+\d)')

In [34]:
dfTrain["cpuFrequency"]

0      2.3
1      1.8
2      2.5
3      2.7
4      3.1
      ... 
972    2.6
973    2.3
974    2.8
975    2.7
976    2.3
Name: cpuFrequency, Length: 977, dtype: object

In [35]:
dfTrain.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price,resolution,touchScren,screenType,cpuFrequency
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,unknown,1.37kg,11912523.48,2560x1600,0.0,IPS Panel Retina,2.3
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,unknown,1.34kg,7993374.48,1440x900,0.0,,1.8
2,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,unknown,1.86kg,5112900.0,1920x1080,0.0,Full HD,2.5
3,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,unknown,1.83kg,22563005.4,2880x1800,0.0,IPS Panel Retina,2.7
4,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,unknown,1.37kg,16037611.2,2560x1600,0.0,IPS Panel Retina,3.1


In [36]:
#extract Model
dfTrain["cpuModel"] = dfTrain["CPU"]
dfTrain["cpuModel"] = dfTrain["cpuModel"].replace(r'(\d+\.+\d)' , '' , regex = True)
dfTrain["cpuModel"] = dfTrain["cpuModel"].replace(r'(GHz)' , '' , regex = True)

In [37]:
dfTrain["cpuModel"]

0             Intel Core i5 
1             Intel Core i5 
2       Intel Core i5 7200U 
3             Intel Core i7 
4             Intel Core i5 
               ...          
972    Intel Core i7 6700HQ 
973     Intel Core i5 6200U 
974    Intel Core i7 7700HQ 
975    Intel Core i5 7200U 0
976     Intel Core i5 6200U 
Name: cpuModel, Length: 977, dtype: object

In [38]:
dfTrain["cpuFrequency"] = dfTrain["cpuFrequency"].astype(float)

In [39]:
dfTrain["cpuFrequency"]

0      2.3
1      1.8
2      2.5
3      2.7
4      3.1
      ... 
972    2.6
973    2.3
974    2.8
975    2.7
976    2.3
Name: cpuFrequency, Length: 977, dtype: float64

In [40]:
dfTrain.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price,resolution,touchScren,screenType,cpuFrequency,cpuModel
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,unknown,1.37kg,11912523.48,2560x1600,0.0,IPS Panel Retina,2.3,Intel Core i5
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,unknown,1.34kg,7993374.48,1440x900,0.0,,1.8,Intel Core i5
2,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,unknown,1.86kg,5112900.0,1920x1080,0.0,Full HD,2.5,Intel Core i5 7200U
3,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,unknown,1.83kg,22563005.4,2880x1800,0.0,IPS Panel Retina,2.7,Intel Core i7
4,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,unknown,1.37kg,16037611.2,2560x1600,0.0,IPS Panel Retina,3.1,Intel Core i5


### RAM

In [41]:
#remove GB
dfTrain['RAM'] = dfTrain['RAM'].replace(r'(GB)' , '' , regex = True)
dfTrain['RAM'] = dfTrain['RAM'].astype(float)
dfTrain['RAM']


0       8.0
1       8.0
2       8.0
3      16.0
4       8.0
       ... 
972    32.0
973     8.0
974    16.0
975     8.0
976     4.0
Name: RAM, Length: 977, dtype: float64

In [42]:
dfTrain.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price,resolution,touchScren,screenType,cpuFrequency,cpuModel
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8.0,128GB SSD,Intel Iris Plus Graphics 640,macOS,unknown,1.37kg,11912523.48,2560x1600,0.0,IPS Panel Retina,2.3,Intel Core i5
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8.0,128GB Flash Storage,Intel HD Graphics 6000,macOS,unknown,1.34kg,7993374.48,1440x900,0.0,,1.8,Intel Core i5
2,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8.0,256GB SSD,Intel HD Graphics 620,No OS,unknown,1.86kg,5112900.0,1920x1080,0.0,Full HD,2.5,Intel Core i5 7200U
3,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16.0,512GB SSD,AMD Radeon Pro 455,macOS,unknown,1.83kg,22563005.4,2880x1800,0.0,IPS Panel Retina,2.7,Intel Core i7
4,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8.0,256GB SSD,Intel Iris Plus Graphics 650,macOS,unknown,1.37kg,16037611.2,2560x1600,0.0,IPS Panel Retina,3.1,Intel Core i5


### Storage

In [43]:
dfTrain[' Storage'].unique()
# there is a TB and GB as unit so convert TB to GB to have one unit

array(['128GB SSD', '128GB Flash Storage', '256GB SSD', '512GB SSD',
       '500GB HDD', '256GB Flash Storage', '1TB HDD',
       '32GB Flash Storage', '128GB SSD +  1TB HDD',
       '256GB SSD +  256GB SSD', '64GB Flash Storage',
       '256GB SSD +  1TB HDD', '256GB SSD +  2TB HDD', '32GB SSD',
       '2TB HDD', '64GB SSD', '1TB Hybrid', '512GB SSD +  1TB HDD',
       '1TB SSD', '256GB SSD +  500GB HDD', '128GB SSD +  2TB HDD',
       '512GB SSD +  512GB SSD', '16GB SSD', '16GB Flash Storage',
       '512GB SSD +  256GB SSD', '512GB SSD +  2TB HDD',
       '64GB Flash Storage +  1TB HDD', '1GB SSD', '1TB HDD +  1TB HDD',
       '32GB HDD', '1TB SSD +  1TB HDD', '512GB Flash Storage',
       '128GB HDD', '240GB SSD', '8GB SSD', '508GB Hybrid'], dtype=object)

In [44]:
dfTrain[' Storage'] = dfTrain[' Storage'].replace(r'(TB)' , '000GB' , regex = True)
dfTrain[' Storage']

0                    128GB SSD
1          128GB Flash Storage
2                    256GB SSD
3                    512GB SSD
4                    256GB SSD
                ...           
972    256GB SSD +  1000GB HDD
973                  256GB SSD
974    256GB SSD +  1000GB HDD
975    128GB SSD +  1000GB HDD
976               508GB Hybrid
Name:  Storage, Length: 977, dtype: object

In [45]:
#notice that some of devices have two storage so it needs to be split 
dfTrain['secondStorage'] = dfTrain[' Storage'].str.split("+" , n = 1).str[1].str.strip()

In [46]:
dfTrain['secondStorage']

0             NaN
1             NaN
2             NaN
3             NaN
4             NaN
          ...    
972    1000GB HDD
973           NaN
974    1000GB HDD
975    1000GB HDD
976           NaN
Name: secondStorage, Length: 977, dtype: object

In [47]:
dfTrain.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price,resolution,touchScren,screenType,cpuFrequency,cpuModel,secondStorage
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8.0,128GB SSD,Intel Iris Plus Graphics 640,macOS,unknown,1.37kg,11912523.48,2560x1600,0.0,IPS Panel Retina,2.3,Intel Core i5,
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8.0,128GB Flash Storage,Intel HD Graphics 6000,macOS,unknown,1.34kg,7993374.48,1440x900,0.0,,1.8,Intel Core i5,
2,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8.0,256GB SSD,Intel HD Graphics 620,No OS,unknown,1.86kg,5112900.0,1920x1080,0.0,Full HD,2.5,Intel Core i5 7200U,
3,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16.0,512GB SSD,AMD Radeon Pro 455,macOS,unknown,1.83kg,22563005.4,2880x1800,0.0,IPS Panel Retina,2.7,Intel Core i7,
4,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8.0,256GB SSD,Intel Iris Plus Graphics 650,macOS,unknown,1.37kg,16037611.2,2560x1600,0.0,IPS Panel Retina,3.1,Intel Core i5,


In [48]:
#second storage size
dfTrain['secondStorageSize'] = dfTrain['secondStorage'].str.extract(r'(\d+)')
dfTrain['secondStorage'] =  dfTrain['secondStorage'].replace(r'(\d+)' , '' , regex = True)
dfTrain['secondStorage'] =  dfTrain['secondStorage'].replace(r'(GB)' , '' , regex = True)

In [49]:
dfTrain['secondStorageSize']

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
       ... 
972    1000
973     NaN
974    1000
975    1000
976     NaN
Name: secondStorageSize, Length: 977, dtype: object

In [50]:
dfTrain['secondStorage']

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
       ... 
972     HDD
973     NaN
974     HDD
975     HDD
976     NaN
Name: secondStorage, Length: 977, dtype: object

In [51]:
dfTrain['primaryStorage'] = dfTrain[' Storage'].str.split("+" , n = 1).str[0].str.strip()
dfTrain['primaryStorage'] = dfTrain['primaryStorage'].replace(r'(\d+)','' ,regex = True)
dfTrain['primaryStorage'] = dfTrain['primaryStorage'].replace(r'(GB)','' ,regex = True)

In [52]:
dfTrain['primaryStorage']

0                 SSD
1       Flash Storage
2                 SSD
3                 SSD
4                 SSD
            ...      
972               SSD
973               SSD
974               SSD
975               SSD
976            Hybrid
Name: primaryStorage, Length: 977, dtype: object

In [53]:
dfTrain['primaryStorageSize'] = dfTrain[' Storage'].str.split("+" , n = 1).str[0].str.strip()
dfTrain['primaryStorageSize'] = dfTrain['primaryStorageSize'].str.extract(r'(\d+)')
dfTrain['primaryStorageSize']

0      128
1      128
2      256
3      512
4      256
      ... 
972    256
973    256
974    256
975    128
976    508
Name: primaryStorageSize, Length: 977, dtype: object

In [54]:
dfTrain.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,...,Price,resolution,touchScren,screenType,cpuFrequency,cpuModel,secondStorage,secondStorageSize,primaryStorage,primaryStorageSize
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8.0,128GB SSD,Intel Iris Plus Graphics 640,macOS,...,11912523.48,2560x1600,0.0,IPS Panel Retina,2.3,Intel Core i5,,,SSD,128
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8.0,128GB Flash Storage,Intel HD Graphics 6000,macOS,...,7993374.48,1440x900,0.0,,1.8,Intel Core i5,,,Flash Storage,128
2,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8.0,256GB SSD,Intel HD Graphics 620,No OS,...,5112900.0,1920x1080,0.0,Full HD,2.5,Intel Core i5 7200U,,,SSD,256
3,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16.0,512GB SSD,AMD Radeon Pro 455,macOS,...,22563005.4,2880x1800,0.0,IPS Panel Retina,2.7,Intel Core i7,,,SSD,512
4,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8.0,256GB SSD,Intel Iris Plus Graphics 650,macOS,...,16037611.2,2560x1600,0.0,IPS Panel Retina,3.1,Intel Core i5,,,SSD,256


In [55]:
dfTrain = dfTrain.drop(' Storage', axis = 1)

In [56]:
dfTrain.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,GPU,Operating System,Operating System Version,...,Price,resolution,touchScren,screenType,cpuFrequency,cpuModel,secondStorage,secondStorageSize,primaryStorage,primaryStorageSize
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8.0,Intel Iris Plus Graphics 640,macOS,unknown,...,11912523.48,2560x1600,0.0,IPS Panel Retina,2.3,Intel Core i5,,,SSD,128
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8.0,Intel HD Graphics 6000,macOS,unknown,...,7993374.48,1440x900,0.0,,1.8,Intel Core i5,,,Flash Storage,128
2,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8.0,Intel HD Graphics 620,No OS,unknown,...,5112900.0,1920x1080,0.0,Full HD,2.5,Intel Core i5 7200U,,,SSD,256
3,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16.0,AMD Radeon Pro 455,macOS,unknown,...,22563005.4,2880x1800,0.0,IPS Panel Retina,2.7,Intel Core i7,,,SSD,512
4,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8.0,Intel Iris Plus Graphics 650,macOS,unknown,...,16037611.2,2560x1600,0.0,IPS Panel Retina,3.1,Intel Core i5,,,SSD,256


### GPU

In [57]:
dfTrain['GPU'].unique()
#no need for preprocessing

array(['Intel Iris Plus Graphics 640', 'Intel HD Graphics 6000',
       'Intel HD Graphics 620', 'AMD Radeon Pro 455',
       'Intel Iris Plus Graphics 650', 'AMD Radeon R5',
       'Intel Iris Pro Graphics', 'Nvidia GeForce MX150',
       'Intel UHD Graphics 620', 'Intel HD Graphics 520',
       'AMD Radeon Pro 555', 'AMD Radeon R5 M430',
       'Intel HD Graphics 615', 'AMD Radeon Pro 560',
       'Nvidia GeForce 940MX', 'Intel HD Graphics 400',
       'Nvidia GeForce GTX 1050', 'AMD Radeon R2', 'AMD Radeon 530',
       'Nvidia GeForce 930MX', 'Intel HD Graphics',
       'Intel HD Graphics 500', 'Nvidia GeForce 930MX ',
       'Nvidia GeForce GTX 1060', 'Nvidia GeForce 150MX',
       'Intel Iris Graphics 540', 'AMD Radeon RX 580',
       'Nvidia GeForce 920MX', 'AMD Radeon R4 Graphics', 'AMD Radeon 520',
       'Nvidia GeForce GTX 1070', 'Nvidia GeForce GTX 1050 Ti',
       'Nvidia GeForce MX130', 'AMD R4 Graphics',
       'Nvidia GeForce GTX 940MX', 'AMD Radeon RX 560',
       'Nvid

### Weight

In [58]:
# remove kg and change dtype
dfTrain['Weight'].unique()

array(['1.37kg', '1.34kg', '1.86kg', '1.83kg', '2.1kg', '2.04kg', '1.3kg',
       '1.6kg', '2.2kg', '0.92kg', '1.22kg', '0.98kg', '2.5kg', '1.62kg',
       '1.91kg', '2.3kg', '1.35kg', '1.88kg', '1.89kg', '1.65kg',
       '2.71kg', '1.2kg', '1.44kg', '2.8kg', '2kg', '2.65kg', '2.77kg',
       '3.2kg', '0.69kg', '1.49kg', '2.4kg', '2.13kg', '2.43kg', '1.7kg',
       '1.4kg', '1.8kg', '1.9kg', '3kg', '1.252kg', '2.7kg', '2.02kg',
       '1.63kg', '1.96kg', '1.21kg', '2.45kg', '1.25kg', '1.5kg',
       '2.62kg', '1.38kg', '1.58kg', '1.85kg', '1.23kg', '1.26kg',
       '2.16kg', '2.36kg', '2.05kg', '1.32kg', '1.75kg', '0.97kg',
       '2.9kg', '2.56kg', '1.48kg', '1.74kg', '1.1kg', '1.56kg', '2.03kg',
       '1.05kg', '4.4kg', '1.90kg', '1.29kg', '2.0kg', '1.95kg', '2.06kg',
       '1.12kg', '1.42kg', '3.49kg', '3.35kg', '2.23kg', '4.42kg',
       '2.69kg', '2.37kg', '4.7kg', '3.6kg', '2.08kg', '4.3kg', '1.68kg',
       '1.41kg', '4.14kg', '2.18kg', '2.24kg', '2.67kg', '2.14kg',
       '1.

In [59]:
dfTrain['Weight'] = dfTrain['Weight'].str.replace('kg','')
dfTrain['Weight'] = dfTrain['Weight'].astype(float)
dfTrain['Weight'] 

0      1.37
1      1.34
2      1.86
3      1.83
4      1.37
       ... 
972    4.42
973    1.95
974    2.73
975    2.04
976    1.70
Name: Weight, Length: 977, dtype: float64

In [60]:
dfTrain.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,GPU,Operating System,Operating System Version,...,Price,resolution,touchScren,screenType,cpuFrequency,cpuModel,secondStorage,secondStorageSize,primaryStorage,primaryStorageSize
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8.0,Intel Iris Plus Graphics 640,macOS,unknown,...,11912523.48,2560x1600,0.0,IPS Panel Retina,2.3,Intel Core i5,,,SSD,128
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8.0,Intel HD Graphics 6000,macOS,unknown,...,7993374.48,1440x900,0.0,,1.8,Intel Core i5,,,Flash Storage,128
2,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8.0,Intel HD Graphics 620,No OS,unknown,...,5112900.0,1920x1080,0.0,Full HD,2.5,Intel Core i5 7200U,,,SSD,256
3,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16.0,AMD Radeon Pro 455,macOS,unknown,...,22563005.4,2880x1800,0.0,IPS Panel Retina,2.7,Intel Core i7,,,SSD,512
4,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8.0,Intel Iris Plus Graphics 650,macOS,unknown,...,16037611.2,2560x1600,0.0,IPS Panel Retina,3.1,Intel Core i5,,,SSD,256


In [61]:
dfTrain = dfTrain.drop(['Screen', 'CPU'] , axis = 1)

In [62]:
dfTrain.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,RAM,GPU,Operating System,Operating System Version,Weight,Price,resolution,touchScren,screenType,cpuFrequency,cpuModel,secondStorage,secondStorageSize,primaryStorage,primaryStorageSize
0,Apple,MacBook Pro,Ultrabook,13.3,8.0,Intel Iris Plus Graphics 640,macOS,unknown,1.37,11912523.48,2560x1600,0.0,IPS Panel Retina,2.3,Intel Core i5,,,SSD,128
1,Apple,Macbook Air,Ultrabook,13.3,8.0,Intel HD Graphics 6000,macOS,unknown,1.34,7993374.48,1440x900,0.0,,1.8,Intel Core i5,,,Flash Storage,128
2,HP,250 G6,Notebook,15.6,8.0,Intel HD Graphics 620,No OS,unknown,1.86,5112900.0,1920x1080,0.0,Full HD,2.5,Intel Core i5 7200U,,,SSD,256
3,Apple,MacBook Pro,Ultrabook,15.4,16.0,AMD Radeon Pro 455,macOS,unknown,1.83,22563005.4,2880x1800,0.0,IPS Panel Retina,2.7,Intel Core i7,,,SSD,512
4,Apple,MacBook Pro,Ultrabook,13.3,8.0,Intel Iris Plus Graphics 650,macOS,unknown,1.37,16037611.2,2560x1600,0.0,IPS Panel Retina,3.1,Intel Core i5,,,SSD,256


## Test Data preprocessing

In [63]:
dfTest = pd.read_csv('laptops_test.csv')

In [64]:
dfTest.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price
0,HP,15-bs053od (i7-7500U/6GB/1TB/W10),Notebook,"15.6""",1366x768,Intel Core i7 7500U 2.7GHz,6GB,1TB HDD,Intel HD Graphics 620,Windows,10,2.04kg,5148468.0
1,Asus,Rog GL753VE-DS74,Gaming,"17.3""",Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,16GB,256GB SSD + 1TB HDD,Nvidia GeForce GTX 1050 Ti,Windows,10,2.99kg,15552108.0
2,Dell,Inspiron 7579,2 in 1 Convertible,"15.6""",IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 7500U 2.7GHz,12GB,512GB SSD,Intel HD Graphics 620,Windows,10,2.19kg,11550708.0
3,Toshiba,Portege Z30-C-1CV,Notebook,"13.3""",Full HD 1920x1080,Intel Core i5 6200U 2.3GHz,4GB,128GB SSD,Intel HD Graphics 520,Windows,7,1.2kg,10625940.0
4,Lenovo,IdeaPad 320-15ABR,Notebook,"15.6""",Full HD 1920x1080,AMD A12-Series 9720P 3.6GHz,6GB,256GB SSD,AMD Radeon 530,Windows,10,2.2kg,4881708.0


In [65]:
dfTest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 325 entries, 0 to 324
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Manufacturer              325 non-null    object 
 1   Model Name                325 non-null    object 
 2   Category                  325 non-null    object 
 3   Screen Size               325 non-null    object 
 4   Screen                    325 non-null    object 
 5   CPU                       325 non-null    object 
 6   RAM                       325 non-null    object 
 7    Storage                  325 non-null    object 
 8   GPU                       325 non-null    object 
 9   Operating System          325 non-null    object 
 10  Operating System Version  291 non-null    object 
 11  Weight                    325 non-null    object 
 12  Price                     325 non-null    float64
dtypes: float64(1), object(12)
memory usage: 33.1+ KB


In [66]:
dfTest.columns

Index(['Manufacturer', 'Model Name', 'Category', 'Screen Size', 'Screen',
       'CPU', 'RAM', ' Storage', 'GPU', 'Operating System',
       'Operating System Version', 'Weight', 'Price'],
      dtype='object')

In [67]:
dfTest.fillna("unknown", inplace= True)

In [68]:
dfTest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 325 entries, 0 to 324
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Manufacturer              325 non-null    object 
 1   Model Name                325 non-null    object 
 2   Category                  325 non-null    object 
 3   Screen Size               325 non-null    object 
 4   Screen                    325 non-null    object 
 5   CPU                       325 non-null    object 
 6   RAM                       325 non-null    object 
 7    Storage                  325 non-null    object 
 8   GPU                       325 non-null    object 
 9   Operating System          325 non-null    object 
 10  Operating System Version  325 non-null    object 
 11  Weight                    325 non-null    object 
 12  Price                     325 non-null    float64
dtypes: float64(1), object(12)
memory usage: 33.1+ KB


In [69]:
dfTest.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price
0,HP,15-bs053od (i7-7500U/6GB/1TB/W10),Notebook,"15.6""",1366x768,Intel Core i7 7500U 2.7GHz,6GB,1TB HDD,Intel HD Graphics 620,Windows,10,2.04kg,5148468.0
1,Asus,Rog GL753VE-DS74,Gaming,"17.3""",Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,16GB,256GB SSD + 1TB HDD,Nvidia GeForce GTX 1050 Ti,Windows,10,2.99kg,15552108.0
2,Dell,Inspiron 7579,2 in 1 Convertible,"15.6""",IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 7500U 2.7GHz,12GB,512GB SSD,Intel HD Graphics 620,Windows,10,2.19kg,11550708.0
3,Toshiba,Portege Z30-C-1CV,Notebook,"13.3""",Full HD 1920x1080,Intel Core i5 6200U 2.3GHz,4GB,128GB SSD,Intel HD Graphics 520,Windows,7,1.2kg,10625940.0
4,Lenovo,IdeaPad 320-15ABR,Notebook,"15.6""",Full HD 1920x1080,AMD A12-Series 9720P 3.6GHz,6GB,256GB SSD,AMD Radeon 530,Windows,10,2.2kg,4881708.0


In [70]:
dfTest['Screen Size'] = dfTest['Screen Size'].str.replace('"' , '')
dfTest['Screen Size'] = dfTest['Screen Size'].astype(float)

In [71]:
dfTest['Screen Size']

0      15.6
1      17.3
2      15.6
3      13.3
4      15.6
       ... 
320    14.0
321    13.3
322    14.0
323    15.6
324    15.6
Name: Screen Size, Length: 325, dtype: float64

In [72]:
dfTest.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price
0,HP,15-bs053od (i7-7500U/6GB/1TB/W10),Notebook,15.6,1366x768,Intel Core i7 7500U 2.7GHz,6GB,1TB HDD,Intel HD Graphics 620,Windows,10,2.04kg,5148468.0
1,Asus,Rog GL753VE-DS74,Gaming,17.3,Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,16GB,256GB SSD + 1TB HDD,Nvidia GeForce GTX 1050 Ti,Windows,10,2.99kg,15552108.0
2,Dell,Inspiron 7579,2 in 1 Convertible,15.6,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 7500U 2.7GHz,12GB,512GB SSD,Intel HD Graphics 620,Windows,10,2.19kg,11550708.0
3,Toshiba,Portege Z30-C-1CV,Notebook,13.3,Full HD 1920x1080,Intel Core i5 6200U 2.3GHz,4GB,128GB SSD,Intel HD Graphics 520,Windows,7,1.2kg,10625940.0
4,Lenovo,IdeaPad 320-15ABR,Notebook,15.6,Full HD 1920x1080,AMD A12-Series 9720P 3.6GHz,6GB,256GB SSD,AMD Radeon 530,Windows,10,2.2kg,4881708.0


In [73]:
import re  
def extract_resolution(string):
    pattern = r'\b(\d+x\d+)\b'  # Matches the pattern of digits followed by 'x' and more digits
    match = re.search(pattern, string)
    if match:
        return match.group(1)
    else:
        return None
dfTest["resolution"] =dfTest["Screen"].apply(extract_resolution)

In [74]:
dfTest.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price,resolution
0,HP,15-bs053od (i7-7500U/6GB/1TB/W10),Notebook,15.6,1366x768,Intel Core i7 7500U 2.7GHz,6GB,1TB HDD,Intel HD Graphics 620,Windows,10,2.04kg,5148468.0,1366x768
1,Asus,Rog GL753VE-DS74,Gaming,17.3,Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,16GB,256GB SSD + 1TB HDD,Nvidia GeForce GTX 1050 Ti,Windows,10,2.99kg,15552108.0,1920x1080
2,Dell,Inspiron 7579,2 in 1 Convertible,15.6,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 7500U 2.7GHz,12GB,512GB SSD,Intel HD Graphics 620,Windows,10,2.19kg,11550708.0,1920x1080
3,Toshiba,Portege Z30-C-1CV,Notebook,13.3,Full HD 1920x1080,Intel Core i5 6200U 2.3GHz,4GB,128GB SSD,Intel HD Graphics 520,Windows,7,1.2kg,10625940.0,1920x1080
4,Lenovo,IdeaPad 320-15ABR,Notebook,15.6,Full HD 1920x1080,AMD A12-Series 9720P 3.6GHz,6GB,256GB SSD,AMD Radeon 530,Windows,10,2.2kg,4881708.0,1920x1080


In [75]:
dfTest["touchScren"] =dfTest["Screen"].str.extract(r'(Touchscreen)')
dfTest["touchScren"] = dfTest["touchScren"].replace('Touchscreen',1)
dfTest["touchScren"] = dfTest["touchScren"].replace(np.nan,0)

In [76]:
dfTest["touchScren"]

0      0.0
1      0.0
2      1.0
3      0.0
4      0.0
      ... 
320    1.0
321    1.0
322    0.0
323    0.0
324    0.0
Name: touchScren, Length: 325, dtype: float64

In [77]:
dfTest["screenType"] = dfTest["Screen"]
dfTest["screenType"] = dfTest["screenType"].replace(r'(\b\d+x\d+\b)', '' ,regex = True)
dfTest["screenType"] = dfTest["screenType"].replace(r'(Touchscreen)', '' , regex = True)
dfTest["screenType"] = dfTest["screenType"].replace('Display','', regex = True)
dfTest["screenType"] = dfTest["screenType"].replace('',np.nan, regex = True)

In [78]:
dfTest["screenType"]

0                         NaN
1                    Full HD 
2       IPS Panel Full HD /  
3                    Full HD 
4                    Full HD 
                ...          
320     IPS Panel Full HD /  
321    IPS Panel Quad HD+ /  
322                       NaN
323                       NaN
324                       NaN
Name: screenType, Length: 325, dtype: object

In [79]:
dfTest.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price,resolution,touchScren,screenType
0,HP,15-bs053od (i7-7500U/6GB/1TB/W10),Notebook,15.6,1366x768,Intel Core i7 7500U 2.7GHz,6GB,1TB HDD,Intel HD Graphics 620,Windows,10,2.04kg,5148468.0,1366x768,0.0,
1,Asus,Rog GL753VE-DS74,Gaming,17.3,Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,16GB,256GB SSD + 1TB HDD,Nvidia GeForce GTX 1050 Ti,Windows,10,2.99kg,15552108.0,1920x1080,0.0,Full HD
2,Dell,Inspiron 7579,2 in 1 Convertible,15.6,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 7500U 2.7GHz,12GB,512GB SSD,Intel HD Graphics 620,Windows,10,2.19kg,11550708.0,1920x1080,1.0,IPS Panel Full HD /
3,Toshiba,Portege Z30-C-1CV,Notebook,13.3,Full HD 1920x1080,Intel Core i5 6200U 2.3GHz,4GB,128GB SSD,Intel HD Graphics 520,Windows,7,1.2kg,10625940.0,1920x1080,0.0,Full HD
4,Lenovo,IdeaPad 320-15ABR,Notebook,15.6,Full HD 1920x1080,AMD A12-Series 9720P 3.6GHz,6GB,256GB SSD,AMD Radeon 530,Windows,10,2.2kg,4881708.0,1920x1080,0.0,Full HD


In [80]:
dfTest = dfTest.drop('Screen',axis = 1 )

In [81]:
dfTest.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price,resolution,touchScren,screenType
0,HP,15-bs053od (i7-7500U/6GB/1TB/W10),Notebook,15.6,Intel Core i7 7500U 2.7GHz,6GB,1TB HDD,Intel HD Graphics 620,Windows,10,2.04kg,5148468.0,1366x768,0.0,
1,Asus,Rog GL753VE-DS74,Gaming,17.3,Intel Core i7 7700HQ 2.8GHz,16GB,256GB SSD + 1TB HDD,Nvidia GeForce GTX 1050 Ti,Windows,10,2.99kg,15552108.0,1920x1080,0.0,Full HD
2,Dell,Inspiron 7579,2 in 1 Convertible,15.6,Intel Core i7 7500U 2.7GHz,12GB,512GB SSD,Intel HD Graphics 620,Windows,10,2.19kg,11550708.0,1920x1080,1.0,IPS Panel Full HD /
3,Toshiba,Portege Z30-C-1CV,Notebook,13.3,Intel Core i5 6200U 2.3GHz,4GB,128GB SSD,Intel HD Graphics 520,Windows,7,1.2kg,10625940.0,1920x1080,0.0,Full HD
4,Lenovo,IdeaPad 320-15ABR,Notebook,15.6,AMD A12-Series 9720P 3.6GHz,6GB,256GB SSD,AMD Radeon 530,Windows,10,2.2kg,4881708.0,1920x1080,0.0,Full HD


In [82]:
dfTest["cpuFrequency"] = dfTest["CPU"].str.extract(r'(\d+\.+\d)')
dfTrain["cpuFrequency"] = dfTrain["cpuFrequency"].astype(float)
dfTest["cpuModel"] = dfTest["CPU"]
dfTest["cpuModel"] = dfTest["cpuModel"].replace(r'(\d+\.+\d)' , '' , regex = True)
dfTest["cpuModel"] = dfTest["cpuModel"].replace(r'(GHz)' , '' , regex = True)


In [83]:
dfTest = dfTest.drop('CPU',axis = 1)

In [84]:
dfTest.head()


Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price,resolution,touchScren,screenType,cpuFrequency,cpuModel
0,HP,15-bs053od (i7-7500U/6GB/1TB/W10),Notebook,15.6,6GB,1TB HDD,Intel HD Graphics 620,Windows,10,2.04kg,5148468.0,1366x768,0.0,,2.7,Intel Core i7 7500U
1,Asus,Rog GL753VE-DS74,Gaming,17.3,16GB,256GB SSD + 1TB HDD,Nvidia GeForce GTX 1050 Ti,Windows,10,2.99kg,15552108.0,1920x1080,0.0,Full HD,2.8,Intel Core i7 7700HQ
2,Dell,Inspiron 7579,2 in 1 Convertible,15.6,12GB,512GB SSD,Intel HD Graphics 620,Windows,10,2.19kg,11550708.0,1920x1080,1.0,IPS Panel Full HD /,2.7,Intel Core i7 7500U
3,Toshiba,Portege Z30-C-1CV,Notebook,13.3,4GB,128GB SSD,Intel HD Graphics 520,Windows,7,1.2kg,10625940.0,1920x1080,0.0,Full HD,2.3,Intel Core i5 6200U
4,Lenovo,IdeaPad 320-15ABR,Notebook,15.6,6GB,256GB SSD,AMD Radeon 530,Windows,10,2.2kg,4881708.0,1920x1080,0.0,Full HD,3.6,AMD A12-Series 9720P


In [85]:
dfTest['RAM'] = dfTest['RAM'].replace(r'(GB)' , '' , regex = True)
dfTest['RAM'] = dfTest['RAM'].astype(float)
dfTest.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price,resolution,touchScren,screenType,cpuFrequency,cpuModel
0,HP,15-bs053od (i7-7500U/6GB/1TB/W10),Notebook,15.6,6.0,1TB HDD,Intel HD Graphics 620,Windows,10,2.04kg,5148468.0,1366x768,0.0,,2.7,Intel Core i7 7500U
1,Asus,Rog GL753VE-DS74,Gaming,17.3,16.0,256GB SSD + 1TB HDD,Nvidia GeForce GTX 1050 Ti,Windows,10,2.99kg,15552108.0,1920x1080,0.0,Full HD,2.8,Intel Core i7 7700HQ
2,Dell,Inspiron 7579,2 in 1 Convertible,15.6,12.0,512GB SSD,Intel HD Graphics 620,Windows,10,2.19kg,11550708.0,1920x1080,1.0,IPS Panel Full HD /,2.7,Intel Core i7 7500U
3,Toshiba,Portege Z30-C-1CV,Notebook,13.3,4.0,128GB SSD,Intel HD Graphics 520,Windows,7,1.2kg,10625940.0,1920x1080,0.0,Full HD,2.3,Intel Core i5 6200U
4,Lenovo,IdeaPad 320-15ABR,Notebook,15.6,6.0,256GB SSD,AMD Radeon 530,Windows,10,2.2kg,4881708.0,1920x1080,0.0,Full HD,3.6,AMD A12-Series 9720P


In [86]:
dfTest[' Storage'] = dfTest[' Storage'].replace(r'(TB)' , '000GB' , regex = True)
dfTest[' Storage']

0                   1000GB HDD
1      256GB SSD +  1000GB HDD
2                    512GB SSD
3                    128GB SSD
4                    256GB SSD
                ...           
320                  128GB SSD
321                  512GB SSD
322         64GB Flash Storage
323                 1000GB HDD
324                  500GB HDD
Name:  Storage, Length: 325, dtype: object

In [87]:
dfTest['secondStorage'] = dfTest[' Storage'].str.split("+" , n = 1).str[1].str.strip()
dfTest['secondStorageSize'] = dfTest['secondStorage'].str.extract(r'(\d+)')
dfTest['secondStorage'] =  dfTest['secondStorage'].replace(r'(\d+)' , '' , regex = True)
dfTest['secondStorage'] =  dfTest['secondStorage'].replace(r'(GB)' , '' , regex = True)

In [88]:
dfTest['primaryStorage'] = dfTest[' Storage'].str.split("+" , n = 1).str[0].str.strip()
dfTest['primaryStorage'] = dfTest['primaryStorage'].replace(r'(\d+)','' ,regex = True)
dfTest['primaryStorage'] = dfTest['primaryStorage'].replace(r'(GB)','' ,regex = True)

dfTest['primaryStorageSize'] = dfTest[' Storage'].str.split("+" , n = 1).str[0].str.strip()
dfTest['primaryStorageSize'] = dfTest['primaryStorageSize'].str.extract(r'(\d+)')


In [89]:
dfTest = dfTest.drop(' Storage', axis = 1)

In [90]:
dfTest.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,RAM,GPU,Operating System,Operating System Version,Weight,Price,resolution,touchScren,screenType,cpuFrequency,cpuModel,secondStorage,secondStorageSize,primaryStorage,primaryStorageSize
0,HP,15-bs053od (i7-7500U/6GB/1TB/W10),Notebook,15.6,6.0,Intel HD Graphics 620,Windows,10,2.04kg,5148468.0,1366x768,0.0,,2.7,Intel Core i7 7500U,,,HDD,1000
1,Asus,Rog GL753VE-DS74,Gaming,17.3,16.0,Nvidia GeForce GTX 1050 Ti,Windows,10,2.99kg,15552108.0,1920x1080,0.0,Full HD,2.8,Intel Core i7 7700HQ,HDD,1000.0,SSD,256
2,Dell,Inspiron 7579,2 in 1 Convertible,15.6,12.0,Intel HD Graphics 620,Windows,10,2.19kg,11550708.0,1920x1080,1.0,IPS Panel Full HD /,2.7,Intel Core i7 7500U,,,SSD,512
3,Toshiba,Portege Z30-C-1CV,Notebook,13.3,4.0,Intel HD Graphics 520,Windows,7,1.2kg,10625940.0,1920x1080,0.0,Full HD,2.3,Intel Core i5 6200U,,,SSD,128
4,Lenovo,IdeaPad 320-15ABR,Notebook,15.6,6.0,AMD Radeon 530,Windows,10,2.2kg,4881708.0,1920x1080,0.0,Full HD,3.6,AMD A12-Series 9720P,,,SSD,256


In [91]:
dfTest['Weight'] = dfTest['Weight'].str.replace('kg','')
dfTest['Weight'] = dfTest['Weight'].str.replace('s','')
dfTest['Weight']  = dfTest['Weight'].astype(float)

In [92]:
dfTest.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,RAM,GPU,Operating System,Operating System Version,Weight,Price,resolution,touchScren,screenType,cpuFrequency,cpuModel,secondStorage,secondStorageSize,primaryStorage,primaryStorageSize
0,HP,15-bs053od (i7-7500U/6GB/1TB/W10),Notebook,15.6,6.0,Intel HD Graphics 620,Windows,10,2.04,5148468.0,1366x768,0.0,,2.7,Intel Core i7 7500U,,,HDD,1000
1,Asus,Rog GL753VE-DS74,Gaming,17.3,16.0,Nvidia GeForce GTX 1050 Ti,Windows,10,2.99,15552108.0,1920x1080,0.0,Full HD,2.8,Intel Core i7 7700HQ,HDD,1000.0,SSD,256
2,Dell,Inspiron 7579,2 in 1 Convertible,15.6,12.0,Intel HD Graphics 620,Windows,10,2.19,11550708.0,1920x1080,1.0,IPS Panel Full HD /,2.7,Intel Core i7 7500U,,,SSD,512
3,Toshiba,Portege Z30-C-1CV,Notebook,13.3,4.0,Intel HD Graphics 520,Windows,7,1.2,10625940.0,1920x1080,0.0,Full HD,2.3,Intel Core i5 6200U,,,SSD,128
4,Lenovo,IdeaPad 320-15ABR,Notebook,15.6,6.0,AMD Radeon 530,Windows,10,2.2,4881708.0,1920x1080,0.0,Full HD,3.6,AMD A12-Series 9720P,,,SSD,256


## Model

In [93]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


## Training

In [94]:
trainingData = dfTrain

In [95]:
x = trainingData.drop('Price' , axis = 1)
y = trainingData['Price']

In [104]:
x.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,RAM,GPU,Operating System,Operating System Version,Weight,resolution,touchScren,screenType,cpuFrequency,cpuModel,secondStorage,secondStorageSize,primaryStorage,primaryStorageSize
0,1,236,4,7,3,51,6,4,35,10,0,IPS Panel Retina,2.3,Intel Core i5,,,SSD,128
1,1,237,4,7,3,45,6,4,32,1,0,,1.8,Intel Core i5,,,Flash Storage,128
2,7,38,3,14,3,47,4,4,70,3,0,Full HD,2.5,Intel Core i5 7200U,,,SSD,256
3,1,236,4,13,5,7,6,4,67,12,0,IPS Panel Retina,2.7,Intel Core i7,,,SSD,512
4,1,236,4,7,3,52,6,4,35,10,0,IPS Panel Retina,3.1,Intel Core i5,,,SSD,256


In [134]:
x['screenType'] = x['screenType'].astype(str)
x['cpuModel'] = x['cpuModel'].astype(str)
x['secondStorage'] = x['secondStorage'].astype(str)
x['primaryStorage'] = x['primaryStorage'].astype(str)
x['cpuFrequency'] = x['cpuFrequency'].astype(float)
x['secondStorageSize'] = x['secondStorageSize'].astype(float)

encoder = preprocessing.LabelEncoder()
xColumns = x.columns
for col in xColumns:
    x[col] = encoder.fit_transform(x[col])
x

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,RAM,GPU,Operating System,Operating System Version,Weight,resolution,touchScren,screenType,cpuFrequency,cpuModel,secondStorage,secondStorageSize,primaryStorage,primaryStorageSize
0,1,236,4,7,3,51,6,4,35,10,0,10,13,29,2,5,3,2
1,1,237,4,7,3,45,6,4,32,1,0,13,8,29,2,5,0,2
2,7,38,3,14,3,47,4,4,70,3,0,17,15,4,2,5,3,6
3,1,236,4,13,5,7,6,4,67,12,0,10,17,9,2,5,3,10
4,1,236,4,7,3,52,6,4,35,10,0,10,20,29,2,5,3,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
972,4,43,1,16,7,72,5,0,156,3,0,17,16,18,0,3,3,6
973,16,339,3,10,3,41,5,0,78,3,0,17,13,85,2,5,3,6
974,2,318,1,16,5,71,5,0,127,3,0,17,18,42,0,3,3,6
975,7,293,3,14,3,62,5,0,85,3,0,6,17,16,0,3,3,2


### Testing

In [135]:
testingData = dfTest
xTest = testingData.drop('Price' , axis = 1)
yTest = testingData['Price']


In [136]:
xTest = xTest.rename(columns={'Model Name': 'modelName' ,'Operating System':'operatingSystem','Operating System Version':'operatingSystemVersion' })

xTest

Unnamed: 0,Manufacturer,modelName,Category,Screen Size,RAM,GPU,operatingSystem,operatingSystemVersion,Weight,resolution,touchScren,screenType,cpuFrequency,cpuModel,secondStorage,secondStorageSize,primaryStorage,primaryStorageSize
0,HP,15-bs053od (i7-7500U/6GB/1TB/W10),Notebook,15.6,6.0,Intel HD Graphics 620,Windows,10,2.04,1366x768,0.0,,2.7,Intel Core i7 7500U,,,HDD,1000
1,Asus,Rog GL753VE-DS74,Gaming,17.3,16.0,Nvidia GeForce GTX 1050 Ti,Windows,10,2.99,1920x1080,0.0,Full HD,2.8,Intel Core i7 7700HQ,HDD,1000,SSD,256
2,Dell,Inspiron 7579,2 in 1 Convertible,15.6,12.0,Intel HD Graphics 620,Windows,10,2.19,1920x1080,1.0,IPS Panel Full HD /,2.7,Intel Core i7 7500U,,,SSD,512
3,Toshiba,Portege Z30-C-1CV,Notebook,13.3,4.0,Intel HD Graphics 520,Windows,7,1.20,1920x1080,0.0,Full HD,2.3,Intel Core i5 6200U,,,SSD,128
4,Lenovo,IdeaPad 320-15ABR,Notebook,15.6,6.0,AMD Radeon 530,Windows,10,2.20,1920x1080,0.0,Full HD,3.6,AMD A12-Series 9720P,,,SSD,256
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
320,Lenovo,Yoga 500-14ISK,2 in 1 Convertible,14.0,4.0,Intel HD Graphics 520,Windows,10,1.80,1920x1080,1.0,IPS Panel Full HD /,2.5,Intel Core i7 6500U,,,SSD,128
321,Lenovo,Yoga 900-13ISK,2 in 1 Convertible,13.3,16.0,Intel HD Graphics 520,Windows,10,1.30,3200x1800,1.0,IPS Panel Quad HD+ /,2.5,Intel Core i7 6500U,,,SSD,512
322,Lenovo,IdeaPad 100S-14IBR,Notebook,14.0,2.0,Intel HD Graphics,Windows,10,1.50,1366x768,0.0,,1.6,Intel Celeron Dual Core N3050,,,Flash Storage,64
323,HP,15-AC110nv (i7-6500U/6GB/1TB/Radeon,Notebook,15.6,6.0,AMD Radeon R5 M330,Windows,10,2.19,1366x768,0.0,,2.5,Intel Core i7 6500U,,,HDD,1000


In [140]:
xTest.resolution = encoder.fit_transform(xTest.resolution)

In [141]:
xTest['screenType'] = xTest['screenType'].astype(str)
xTest['cpuModel'] = xTest['cpuModel'].astype(str)
xTest['secondStorage'] = xTest['secondStorage'].astype(str)
xTest['primaryStorage'] = xTest['primaryStorage'].astype(str)
xTest['cpuFrequency'] = xTest['cpuFrequency'].astype(float)
xTest['secondStorageSize'] = xTest['secondStorageSize'].astype(float)
xTest['modelName'] = xTest['modelName'].astype(str)
encoder = preprocessing.LabelEncoder()
xTestColumns = xTest.columns
for col in xTestColumns:
    xTest[col] = encoder.fit_transform(xTest[col])
xTest

Unnamed: 0,Manufacturer,modelName,Category,Screen Size,RAM,GPU,operatingSystem,operatingSystemVersion,Weight,resolution,touchScren,screenType,cpuFrequency,cpuModel,secondStorage,secondStorageSize,primaryStorage,primaryStorageSize
0,5,142,3,8,2,25,5,0,47,0,0,7,14,31,2,2,1,1
1,2,44,1,9,5,38,5,0,81,2,0,9,15,33,0,1,3,5
2,3,185,0,8,4,25,5,0,54,2,1,2,14,31,2,2,3,8
3,10,16,3,5,1,21,5,1,11,2,0,9,10,17,2,2,3,2
4,6,165,3,8,2,3,5,0,56,2,0,9,17,0,2,2,3,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
320,6,96,0,6,1,21,5,0,37,2,1,2,12,25,2,2,3,2
321,6,98,0,5,5,21,5,0,18,7,1,3,12,25,2,2,3,8
322,6,160,3,6,0,15,5,0,29,0,0,7,6,3,2,2,0,9
323,5,0,3,8,2,9,5,0,54,0,0,7,12,25,2,2,1,1


In [142]:
#model 
model = DecisionTreeRegressor()
model.fit(xTest,yTest)
prediction = model.predict(xTest)
r2 = r2_score(yTest, prediction)
print("r2 = " , r2)

r2 =  0.999888509603256


In [144]:
import csv
csv_file = 'predictions.csv'

with open(csv_file, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Prediction'])
    for predict in prediction:
        writer.writerow([predict])

print('Predictions saved successfully to', csv_file)

Predictions saved successfully to predictions.csv
