## Importing the Libraries


In [1]:
import numpy as np
import pandas as pd

## Importing the dataset


In [2]:
dataset = pd.read_csv('../Data/laptop_price.csv', encoding='iso-8859-1')


## Check is there any NULL values in the dataset


In [3]:
# dataset.isnull().sum()
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   laptop_ID         1303 non-null   int64  
 1   Company           1303 non-null   object 
 2   Product           1303 non-null   object 
 3   TypeName          1303 non-null   object 
 4   Inches            1303 non-null   float64
 5   ScreenResolution  1303 non-null   object 
 6   Cpu               1303 non-null   object 
 7   Ram               1303 non-null   object 
 8   Gpu               1303 non-null   object 
 9   OpSys             1303 non-null   object 
 10  Weight            1303 non-null   object 
 11  Price_euros       1303 non-null   float64
dtypes: float64(2), int64(1), object(9)
memory usage: 122.3+ KB


## Ignore Kg and GB in Ram , weight columns


In [4]:
dataset['Ram'] = dataset['Ram'].str.replace('GB', '').astype('Int32')
dataset['Weight'] = dataset['Weight'].str.replace('kg', '').astype('float32')

## Compute correlation


In [5]:
# dataset.corr()['Price_euros']

In [6]:
dataset['Company'].value_counts()

Company
Dell         297
Lenovo       297
HP           274
Asus         158
Acer         103
MSI           54
Toshiba       48
Apple         21
Samsung        9
Razer          7
Mediacom       7
Microsoft      6
Xiaomi         4
Vero           4
Chuwi          3
Google         3
Fujitsu        3
LG             3
Huawei         2
Name: count, dtype: int64

Naming the companies with less than 10 items as others


In [7]:
def add_company(company):
    if company == 'Samsung' or company == 'Razer' or company == 'Mediacom' or company == 'Microsoft' or company == 'Xiaomi' or company == 'Vero' or company == 'Chuwi' or company == 'Google' or company == 'Fujitsu' or company == 'LG' or company == 'Huawei':
        return 'Other'
    else:
        return company

dataset['Company'] = dataset['Company'].apply(add_company)
        

In [8]:
dataset['TouchScreen'] = dataset['ScreenResolution'].apply(lambda x: 1 if 'TouchScreen' in x else 0)
dataset['IPS'] = dataset['ScreenResolution'].apply(lambda x: 1 if 'IPS' in x else 0)

In [9]:
dataset['CPU_Name'] = dataset['Cpu'].apply(lambda x: ''.join(x.split()[0:3]))

In [10]:
dataset.head()

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Gpu,OpSys,Weight,Price_euros,TouchScreen,IPS,CPU_Name
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,Intel Iris Plus Graphics 640,macOS,1.37,1339.69,0,1,IntelCorei5
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,Intel HD Graphics 6000,macOS,1.34,898.94,0,0,IntelCorei5
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,Intel HD Graphics 620,No OS,1.86,575.0,0,0,IntelCorei5
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16,AMD Radeon Pro 455,macOS,1.83,2537.45,0,1,IntelCorei7
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,Intel Iris Plus Graphics 650,macOS,1.37,1803.6,0,1,IntelCorei5


In [11]:
def set_processor(name):
    if name == 'IntelCorei7' or name == 'IntelCorei5' or name == 'IntelCorei3':
        return name
    else:
        if name.split()[0] == 'Intel':
            return 'Intel'
        elif name.split()[0] == 'AMD':
            return 'AMD'
        else:
            return 'Other'

dataset['CPU_Name'] = dataset['CPU_Name'].apply(set_processor)

In [12]:
dataset['CPU_Name'].value_counts()

CPU_Name
IntelCorei7    527
IntelCorei5    423
Other          217
IntelCorei3    136
Name: count, dtype: int64

In [13]:
def set_category(name):
    if name.split()[0] == 'Intel':
        return 'Intel'
    elif name.split()[0] == 'AMD':
        return 'AMD'
    elif name.split()[0] == 'Nvidia':
        return 'Nvidia'
    else:
        return 'Other'
    
dataset['Gpu'] = dataset['Gpu'].apply(set_category)
    


In [14]:
dataset['Gpu'].value_counts()

Gpu
Intel     722
Nvidia    400
AMD       180
Other       1
Name: count, dtype: int64

In [15]:
dataset = dataset[dataset['Gpu'] != 'Other']

In [16]:
dataset.shape

(1302, 15)

In [17]:
dataset['OpSys'].value_counts()

OpSys
Windows 10      1072
No OS             66
Linux             62
Windows 7         45
Chrome OS         26
macOS             13
Mac OS X           8
Windows 10 S       8
Android            2
Name: count, dtype: int64

In [18]:
def set_os(name):
    if name.split()[0] == 'Windows':
        return 'Windows'
    elif name.split()[0] == 'Linux': 
        return 'Linux'
    elif name.split()[0] == 'Mac' or name.split()[0] == 'macOS' :
        return 'Mac'
    else:
        return 'Other'
    
dataset['OS'] = dataset['OpSys'].apply(set_os)

In [19]:
dataset['OS'].value_counts()

OS
Windows    1125
Other        94
Linux        62
Mac          21
Name: count, dtype: int64

In [20]:
dataset.head()

Unnamed: 0,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Gpu,OpSys,Weight,Price_euros,TouchScreen,IPS,CPU_Name,OS
0,1,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,Intel,macOS,1.37,1339.69,0,1,IntelCorei5,Mac
1,2,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,Intel,macOS,1.34,898.94,0,0,IntelCorei5,Mac
2,3,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,Intel,No OS,1.86,575.0,0,0,IntelCorei5,Other
3,4,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16,AMD,macOS,1.83,2537.45,0,1,IntelCorei7,Mac
4,5,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,Intel,macOS,1.37,1803.6,0,1,IntelCorei5,Mac


In [21]:
dataset = dataset.drop(columns=['laptop_ID', 'Product', 'Inches', 'ScreenResolution','Cpu','OpSys'])

In [22]:
dataset.head()

Unnamed: 0,Company,TypeName,Ram,Gpu,Weight,Price_euros,TouchScreen,IPS,CPU_Name,OS
0,Apple,Ultrabook,8,Intel,1.37,1339.69,0,1,IntelCorei5,Mac
1,Apple,Ultrabook,8,Intel,1.34,898.94,0,0,IntelCorei5,Mac
2,HP,Notebook,8,Intel,1.86,575.0,0,0,IntelCorei5,Other
3,Apple,Ultrabook,16,AMD,1.83,2537.45,0,1,IntelCorei7,Mac
4,Apple,Ultrabook,8,Intel,1.37,1803.6,0,1,IntelCorei5,Mac


In [23]:
dataset.head()

Unnamed: 0,Company,TypeName,Ram,Gpu,Weight,Price_euros,TouchScreen,IPS,CPU_Name,OS
0,Apple,Ultrabook,8,Intel,1.37,1339.69,0,1,IntelCorei5,Mac
1,Apple,Ultrabook,8,Intel,1.34,898.94,0,0,IntelCorei5,Mac
2,HP,Notebook,8,Intel,1.86,575.0,0,0,IntelCorei5,Other
3,Apple,Ultrabook,16,AMD,1.83,2537.45,0,1,IntelCorei7,Mac
4,Apple,Ultrabook,8,Intel,1.37,1803.6,0,1,IntelCorei5,Mac


## Move 6th column in to Last

In [24]:
columns = dataset.columns.to_list()
columns.append(columns.pop(5))

dataset = dataset[columns]

In [25]:
dataset.head()

Unnamed: 0,Company,TypeName,Ram,Gpu,Weight,TouchScreen,IPS,CPU_Name,OS,Price_euros
0,Apple,Ultrabook,8,Intel,1.37,0,1,IntelCorei5,Mac,1339.69
1,Apple,Ultrabook,8,Intel,1.34,0,0,IntelCorei5,Mac,898.94
2,HP,Notebook,8,Intel,1.86,0,0,IntelCorei5,Other,575.0
3,Apple,Ultrabook,16,AMD,1.83,0,1,IntelCorei7,Mac,2537.45
4,Apple,Ultrabook,8,Intel,1.37,0,1,IntelCorei5,Mac,1803.6


##


In [26]:
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

## Encoding categorical Data (Independant variables)


In [27]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

coulmn_Trans = ColumnTransformer(transformers=[('encoder', OneHotEncoder(),[0, 1, 3, 7, 8])],remainder='passthrough')
X = np.array(coulmn_Trans.fit_transform(X))

## Splitting the dataset into the Training set & Test set

In [28]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Feature Scaling

In [29]:
# from sklearn.preprocessing import StandardScaler

# sc_X = StandardScaler()
# sc_y = StandardScaler()

# X_train = sc_X.fit_transform(X_train)


## Training the Regression model on the Training set

In [52]:
from sklearn.ensemble import RandomForestRegressor

regressor = RandomForestRegressor(n_estimators=100, criterion='absolute_error', random_state=0)

regressor.fit(X_train, y_train)

## Predicting the Test set results


In [53]:
y_pred = regressor.predict(X_test)

np.printoptions(precision=2)

print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[ 772.43555  675.     ]
 [1284.64635 1348.48   ]
 [1934.44635 2099.     ]
 [2099.4289  1899.     ]
 [ 447.6376   459.     ]
 [1675.33    1398.99   ]
 [2205.5596  1799.     ]
 [1628.3726  1499.     ]
 [1560.7023  1629.     ]
 [ 438.2741   349.     ]
 [ 249.564    229.     ]
 [1905.813   2013.1    ]
 [1142.078   1272.     ]
 [1120.983   1249.     ]
 [1911.31695 1778.     ]
 [ 651.19245  677.35   ]
 [ 558.66355  468.     ]
 [ 577.7078   614.     ]
 [1215.466   1799.     ]
 [ 886.4365  1199.     ]
 [ 780.6268   959.     ]
 [1047.7436  1049.6    ]
 [ 602.0235   705.15   ]
 [ 980.4943   659.     ]
 [1137.20335 1008.52   ]
 [1176.1967   999.     ]
 [1959.92155 1763.     ]
 [ 924.06145 1154.     ]
 [ 541.81     615.     ]
 [2547.952   1269.     ]
 [ 447.6376   459.     ]
 [4416.04515 6099.     ]
 [ 487.5321   429.     ]
 [1311.9594  1729.     ]
 [1071.77515 1099.     ]
 [ 358.35025  345.99   ]
 [1470.558    752.     ]
 [1393.2079  1228.99   ]
 [1828.4208  2041.     ]
 [ 367.1293   443.99   ]


## Evaluating the Model Performance


In [54]:
from sklearn.metrics import r2_score

r2_score(y_test, y_pred)

0.8187028956219002