# Into to Data Science Mini-project

## Preprocessing

In [42]:
# Import data from the csv file called "complete_laptop_data0.csv" and create a dataframe called "df"

import pandas as pd

df = pd.read_csv("complete_laptop_data0.csv")

# Print the first 5 rows of the dataframe
df.head()


Unnamed: 0,id,link,name,user rating,Price,Sales Package,Model Number,Part Number,Model Name,Series,...,Security Chip,Inbuilt 4G LTE,Cloud Storage,S-video,Dock Port,Firewire Port,RJ11,Read/Write Speed,EMMC Storage Capacity,Optane Memory
0,0,https://www.flipkart.com/asus-rog-strix-scar-1...,ASUS ROG Strix SCAR 17 Core i9 12th Gen - (32 ...,5.0,"?2,34,990","Laptop, Power Adaptor, User Guide, Warranty Do...",G733ZW-LL139WS,90NR08G2-M007S0,G733ZW-LL139WS,ROG Strix SCAR 17,...,,,,,,,,,,
1,1,https://www.flipkart.com/asus-rog-strix-scar-1...,ASUS ROG Strix SCAR 15 Core i9 12th Gen - (32 ...,,"?2,29,990","Laptop, Power Adaptor, User Guide, Warranty Do...",G533ZW-LN136WS,90NR0872-M007L0,G533ZW-LN136WS,ROG Strix SCAR 15,...,,,,,,,,,,
2,2,https://www.flipkart.com/hp-victus-ryzen-7-oct...,HP Victus Ryzen 7 Octa Core 5800H - (16 GB/512...,,"?1,04,091","Laptop, battery, adapter, cables and user manuals",16-e0351AX,552X1PA#ACJ,16-e0351AX,Victus,...,,,,,,,,,,
3,3,https://www.flipkart.com/lenovo-ideapad-gaming...,Lenovo IdeaPad Gaming 3i Ryzen 7 Octa Core R7-...,,"?87,717","Laptop, Power Adaptor, User Guide, Warranty Do...",15ACH6,82K201RSIN,IdeaPad Gaming 3,IdeaPad Gaming 3i,...,,,,,,,,,,
4,4,https://www.flipkart.com/lenovo-yoga-slim-7-co...,Lenovo Yoga Slim 7 Core i5 11th Gen - (16 GB/5...,,"?75,990","Laptop, Adaptor",82A300MBIN,1LS2B001653,Yoga Slim 7 14ITL05,Yoga Slim 7,...,,,,,,,,,,


### Functions

In [43]:
import re

# Function to extract the first number from a string
def extract_number(s):
    numbers = re.findall(r'\d+', s)
    return int(numbers[0]) if numbers else 1

# Function to convert "Processor Generation" to a year format
def generation_to_year(row):
    generation = row['Processor Generation']
    brand = row['Processor Brand']

    if isinstance(generation, str):
        if brand == 'AMD':
            if '6th Gen' in generation:
                return 2022
            elif '5th Gen' in generation:
                return 2021
            elif '4th Gen' in generation:
                return 2020
            elif '3rd Gen' in generation:
                return 2019
        elif brand == 'Intel':
            if '12th Gen' in generation:
                return 2021
            elif '11th Gen' in generation:
                return 2020
            elif '10th Gen' in generation:
                return 2019
    return None

# Function to convert capacities to MB
def convert_to_gb(capacity):
    if isinstance(capacity, float):
        return capacity
    if capacity.endswith('TB'):
        return int(capacity.replace('TB', '')) * 1024
    elif capacity.endswith('GB'):
        return int(capacity.replace('GB', ''))
    elif capacity.endswith('MB'):
        return round(int(capacity.replace('MB', '')) / 1024, 2)  # Convert from MB to GB and round to 2 decimal places
    else:
        return None
    
# Function to calculate total pixels in "Screen Resolution" column
def calculate_total_pixels(resolution):
    matches = re.findall(r'\d+', resolution)
    if len(matches) >= 2:
        total_pixels = int(matches[0]) * int(matches[-1])
        return total_pixels
    else:
        return None


### Field modifications

In [44]:
# Modify/create the following columns and delete the rest (original column name in brackets):
# name (name): keep original
# brand (name): take the first word of the name column
# price (Price): keep original, remove "?" from the start of the field and remove ","
# processor brand (Processor Brand): keep original (remove entries with other than Intel or AMD)
# processor number (Processor Name):  modify from i3, i5, i7... and Ryzen 3, Ryzen 5, Ryzen 7... to 3, 5, 7...
# processor year (Processor Generation): convert to a year format
# processor clock speed (Clock Speed): only take the first number part of the field
# processor core count (Number of Cores): keep original
# SSD (SSD): modify from Yes/No to true/false
# drive capacity (SSD Capacity/HDD Capacity): combine fields "SSD Capacity" and "HDD Capacity" into one field, if field SSD is true, only take the SSD capacity, if SSD is false, only take the HDD capacity, convert to GB
# RAM (RAM): convert to GB
# GPU memory (Dedicated Graphic Memory Capacity): convert to GB
# screen size (Screen Size): only take the last number part of the field
# pixel count (Screen Resolution): keep original
# weight (Weight): take only the number part of the field
# speakers (Speakers): true/false (if null, false, otherwise true)
# touchscreen (Touchscreen): modify to true/false
# microphone (Internal Mic): true/false (if null, false, otherwise true)
# webcam (Web Camera): true/false (if null, false, otherwise true)
# pixel count (Screen Resolution): calculate total pixels by multiplying the first two numbers

# Create a new dataframe called "df2" with the modified columns
df2 = df.copy()

# brand (name): take the first word of the name column
df2["brand"] = df2["name"].str.split(" ").str[0]
# price (Price): keep original, remove "?" from the start of the field and remove ","
df2["price"] = df2["Price"].str.replace("?", "").str.replace(",", "")
# processor brand (Processor Brand): keep original (remove entries with other than Intel or AMD)
df2 = df2[df2["Processor Brand"].isin(["Intel", "AMD"])]
df2["processor brand"] = df2["Processor Brand"]
# processor number (Processor Name): modify from i3, i5, i7... and Ryzen 3, Ryzen 5, Ryzen 7... to 3, 5, 7...
df2["processor number"] = df2["Processor Name"].apply(extract_number)
# processor year (Processor Generation): convert to a year format
df2["processor year"] = df2.apply(generation_to_year, axis=1)
# processor clock speed (Clock Speed): only take the first number part of the field
df2["processor clock speed"] = df2["Clock Speed"].str.extract(r"([-+]?\d*\.\d+|\d+)")
# processor core count (Number of Cores): keep original
df2["processor core count"] = df2["Number of Cores"]
# SSD (SSD): modify from Yes/No to true/false
df2["SSD"] = df2["SSD"].str.replace(" ", "").str.replace("Yes", "True").str.replace("No", "False")
# drive capacity (SSD Capacity/HDD Capacity): combine fields "SSD Capacity" and "HDD Capacity" into one field, if field SSD is true, only take the SSD capacity, if SSD is false, only take the HDD capacity, convert to GB
df2["drive capacity"] = df2["SSD Capacity"].combine_first(df2["HDD Capacity"])
df2["drive capacity"] = df2["drive capacity"].apply(convert_to_gb)
# RAM (RAM): convert to GB
df2["RAM"] = df2["RAM"].apply(convert_to_gb)
# GPU memory capacity (Dedicated Graphic Memory Capacity): convert to GB
df2["GPU memory"] = df2["Dedicated Graphic Memory Capacity"].apply(convert_to_gb)
# screen size (Screen Size): only take the last number part of the field
df2["screen size"] = df2["Screen Size"].str.split(" ").str[2].str.replace("(","")
# weight (Weight): take only the number part of the field
df2["weight"] = df2["Weight"].str.split(" ").str[0]
# speakers (Speakers): true/false (if null, false, otherwise true)
df2["speakers"] = df2["Speakers"].isnull() == False
# touchscreen (Touchscreen): modify to true/false
df2["touchscreen"] = df2["Touchscreen"].str.replace(" ", "").isnull() == False
# microphone (Internal Mic): true/false (if null, false, otherwise true)
df2["microphone"] = df2["Internal Mic"].isnull() == False
# webcam (Web Camera): true/false (if null, false, otherwise true)
df2["webcam"] = df2["Web Camera"].isnull() == False
# pixel count (Screen Resolution): calculate total pixels by multiplying the first two numbers
df2["pixel count"] = df2["Screen Resolution"].apply(calculate_total_pixels)


df2 = df2[["brand", "price", "processor brand", "processor number", "processor year", "processor clock speed", "processor core count", "SSD", "drive capacity", "RAM", "GPU memory", "screen size", "pixel count", "weight", "speakers", "touchscreen", "microphone", "webcam"]]

df2.head()


Unnamed: 0,brand,price,processor brand,processor number,processor year,processor clock speed,processor core count,SSD,drive capacity,RAM,GPU memory,screen size,pixel count,weight,speakers,touchscreen,microphone,webcam
0,ASUS,234990,Intel,9,2021.0,2.5,14.0,True,1024.0,32,8.0,17.3,3686400.0,2.9,True,True,True,False
1,ASUS,229990,Intel,9,2021.0,2.5,14.0,True,1024.0,32,8.0,15.6,3686400.0,2.3,True,True,True,False
2,HP,104091,AMD,7,,3.2,8.0,True,512.0,16,4.0,16.1,2073600.0,2.48,True,True,True,True
3,Lenovo,87717,AMD,7,2021.0,3.2,8.0,True,512.0,16,4.0,15.6,2073600.0,2.25,True,True,True,True
4,Lenovo,75990,Intel,5,2020.0,,,True,512.0,16,,14.0,,,True,True,True,True


### Writing to a file

In [45]:
# Write df2 to a csv file called "preprocessed_laptop_data.csv"

df2.to_csv("preprocessed_laptop_data.csv", index=False)
