In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('laptops.csv', encoding='latin1')

# Fill missing values for 'Operating System Version' with a placeholder
df['Operating System Version'] = df['Operating System Version'].fillna('Unknown')

# Clean and convert 'Weight' to numeric
df['Weight'] = df['Weight'].str.extract('(\d+\.\d+|\d+)')[0].astype(float)

# Check if 'Price (Euros)' contains commas and convert to string if necessary
if df['Price (Euros)'].dtype == 'object':
    df['Price (Euros)'] = df['Price (Euros)'].str.replace(',', '').astype(float)
else:
    df['Price (Euros)'] = df['Price (Euros)'].astype(float)

# Check if 'Weight' has been cleaned and converted correctly
print(df['Weight'].head())
print(df['Weight'].dtype)

# Check if 'Price (Euros)' has been converted correctly
print(df['Price (Euros)'].head())
print(df['Price (Euros)'].dtype)

# Create a new column for the formatted name
df['Formatted Name'] = df.apply(lambda row: f"{row['Manufacturer']} {row['Model Name']} {row['Category']} {row['CPU'].split()[-1]}", axis=1)

# Display the first few rows to check the new column
print(df[['Formatted Name']].head())


  df['Weight'] = df['Weight'].str.extract('(\d+\.\d+|\d+)')[0].astype(float)


0    1.37
1    1.34
2    1.86
3    1.83
4    1.37
Name: Weight, dtype: float64
float64
0    133969.0
1     89894.0
2     57500.0
3    253745.0
4    180360.0
Name: Price (Euros), dtype: float64
float64
                       Formatted Name
0  Apple MacBook Pro Ultrabook 2.3GHz
1  Apple Macbook Air Ultrabook 1.8GHz
2           HP 250 G6 Notebook 2.5GHz
3  Apple MacBook Pro Ultrabook 2.7GHz
4  Apple MacBook Pro Ultrabook 3.1GHz


In [2]:
# Create a new column for the formatted name
df['Formatted Name'] = df.apply(lambda row: f"{row['Manufacturer']} {row['Model Name']} {row['Category']} {row['CPU'].split()[-1]}", axis=1)

# Display the first few rows to check the new column
print(df[['Formatted Name']].head())

                       Formatted Name
0  Apple MacBook Pro Ultrabook 2.3GHz
1  Apple Macbook Air Ultrabook 1.8GHz
2           HP 250 G6 Notebook 2.5GHz
3  Apple MacBook Pro Ultrabook 2.7GHz
4  Apple MacBook Pro Ultrabook 3.1GHz


In [3]:
from sklearn.preprocessing import LabelEncoder

label_encoders = {}
for column in ['Manufacturer', 'Model Name', 'Category']:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])


In [4]:
print(df.columns)

Index(['Manufacturer', 'Model Name', 'Category', 'Screen Size', 'Screen',
       'CPU', 'RAM', ' Storage', 'GPU', 'Operating System',
       'Operating System Version', 'Weight', 'Price (Euros)',
       'Formatted Name'],
      dtype='object')


In [5]:
# Strip leading and trailing spaces from column names
df.columns = df.columns.str.strip()

# Verify the column names
print(df.columns)

Index(['Manufacturer', 'Model Name', 'Category', 'Screen Size', 'Screen',
       'CPU', 'RAM', 'Storage', 'GPU', 'Operating System',
       'Operating System Version', 'Weight', 'Price (Euros)',
       'Formatted Name'],
      dtype='object')


In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import NearestNeighbors

features = ['Manufacturer', 'Model Name', 'Category']  # Add other relevant features
X = df[features].values

# Initialize and train the k-NN model
knn = NearestNeighbors(n_neighbors=3)
knn.fit(X)

In [7]:
import joblib

# Assuming you have trained your model and label encoders
joblib.dump(knn, 'knn_model.pkl')
joblib.dump(label_encoders, 'label_encoders.pkl')

['label_encoders.pkl']

In [8]:
df.sample()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price (Euros),Formatted Name
318,7,365,3,"14.0""",Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,Windows,10,1.95,98000.0,HP ProBook 640 Notebook 2.5GHz
