In [1]:
pip install pandas scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [14]:
import pandas as pd

# Specify the path to your CSV file
csv_file_path = '../data.csv'

# Load the CSV file into a DataFrame
df = pd.read_csv(csv_file_path)

# Get the list of all column names from headers
column_headers = list(df.columns.values)
print("The Column Header :", column_headers)

The Column Header : ['Unnamed: 0.1', 'Unnamed: 0', 'brand', 'name', 'price', 'spec_rating', 'processor', 'CPU', 'Ram', 'Ram_type', 'ROM', 'ROM_type', 'GPU', 'display_size', 'resolution_width', 'resolution_height', 'OS', 'warranty']


In [15]:
import re

df[['generation', 'tier']] = df['processor'].str.extract(r'(?:(\d{1,2}(?:th|rd|nd)?)\s*Gen)?(?:.*?((?:Apple M\d+|Ryzen \d+|i\d|Athlon|Celeron|M\d)(?: [a-zA-Z]+)?))\b', flags=re.IGNORECASE)

In [18]:

# Map launch years based on processor type and generation
launch_years = {
    'intel': {'1st': 2006, '2nd': 2008, '3rd': 2010, '4th': 2013, '5th': 2015, '6th': 2015, '7th': 2017, '8th': 2017, '9th': 2018, '10th': 2019, '11th': 2020, '12th': 2021, '13th': 2022},
    'amd': {'1st': 2011, '2nd': 2012, '3rd': 2015, '4th': 2017, '5th': 2019, '6th': 2020, '7th': 2020},
    'apple': {'M1': 2020, 'M2': 2021, 'M1 Pro': 2021, 'M1 Max': 2021}
}

def get_launch_year(row, launch_years):
    processor_brand = next((brand.lower() for brand in launch_years if brand.lower() in row['processor'].lower()), None)
    
    if processor_brand and row['generation']:
        return launch_years[processor_brand].get(row['generation'], None)
    
    tier_brand = row['brand'].lower()
    print(tier_brand)
    
    if not row['generation'] and tier_brand in launch_years:
        print(tier_brand)
        for gen in launch_years[tier_brand]:
            if gen in row['tier']:
                return gen
    
    return None

# Create a new column 'launch_year' based on the 'generation' and 'tier'
df['launch_year'] = df.apply(lambda row: get_launch_year(row, launch_years), axis=1)

primebook


In [17]:
from sklearn.preprocessing import LabelEncoder

def encode_column(df, column):
    # Create a LabelEncoder instance
    label_encoder = LabelEncoder()
    if column in df.columns:
        # Encode the column
        df[str(column + '_encoded')] = label_encoder.fit_transform(df[column])
        df = df.drop(column, axis=1)
    else:
        print('Column {} not present into Dataframe', column)
        
    return df

In [6]:
df = df.drop(df.columns[:2],axis=1)

df = encode_column(df, 'brand')
df = encode_column(df, 'OS')
df = encode_column(df, 'ROM_type')
df = encode_column(df, 'Ram_type')

df.head()


Unnamed: 0,name,price,spec_rating,processor,CPU,Ram,ROM,GPU,display_size,resolution_width,resolution_height,warranty,generation,tier,launch_year,brand_encoded,OS_encoded,ROM_type_encoded,Ram_type_encoded
0,Victus 15-fb0157AX Gaming Laptop,49900,73.0,5th Gen AMD Ryzen 5 5600H,"Hexa Core, 12 Threads",8GB,512GB,4GB AMD Radeon RX 6500M,15.6,1920.0,1080.0,1,5th,Ryzen 5,2019.0,9,12,1,2
1,15s-fq5007TU Laptop,39900,60.0,12th Gen Intel Core i3 1215U,"Hexa Core (2P + 4E), 8 Threads",8GB,512GB,Intel UHD Graphics,15.6,1920.0,1080.0,1,12th,i3,2021.0,9,12,1,2
2,One 14 Z8-415 Laptop,26990,69.323529,11th Gen Intel Core i3 1115G4,"Dual Core, 4 Threads",8GB,512GB,Intel Iris Xe Graphics,14.0,1920.0,1080.0,1,11th,i3,2020.0,1,12,1,2
3,Yoga Slim 6 14IAP8 82WU0095IN Laptop,59729,66.0,12th Gen Intel Core i5 1240P,"12 Cores (4P + 8E), 16 Threads",16GB,512GB,Intel Integrated Iris Xe,14.0,2240.0,1400.0,1,12th,i5,2021.0,14,12,1,8
4,MacBook Air 2020 MGND3HN Laptop,69990,69.323529,Apple M1,Octa Core (4P + 4E),8GB,256GB,Apple M1 Integrated Graphics,13.3,2560.0,1600.0,1,,Apple M1,,2,7,1,2


In [7]:
df = df.drop('name', axis=1)

In [8]:
df.processor.unique()

array(['5th Gen AMD Ryzen 5 5600H', '12th Gen Intel Core i3 1215U',
       '11th Gen Intel Core i3 1115G4', '12th Gen Intel Core i5 1240P',
       'Apple M1', '13th Gen Intel Core i5 13420H',
       '12th Gen Intel Core i5 12500H', '12th Gen Intel Core i7 1255U',
       'Intel Celeron  N4020', 'MediaTek MTK8788',
       '7th Gen AMD Ryzen 3 7320U', '11th Gen Intel Core i5 11400H ',
       '13th Gen Intel Core i9 13900H', '12th Gen Intel Core i5 12450H',
       '11th Gen Intel Core i5 11300H', 'Apple M2',
       '11th Gen Intel Core i5 1135G7 ', '5th Gen AMD Ryzen 7  5800H',
       '5th Gen AMD Ryzen 5 5500U', '3rd Gen AMD Athlon 3050U',
       'Intel Core i3 N305', '13th Gen Intel Core i7 1355U',
       '6th Gen AMD Ryzen 5 6600H', '13th Gen Intel Core i9 13900HX',
       '12th Gen Intel Core i7 12650H', '13th Gen Intel Core i5 1340P',
       '12th Gen Intel Core i5 1235U ', '13th Gen Intel Core i5 13450HX',
       '13th Gen Intel Core i9 13980HX', '11th Gen Intel Core i5 1135G7',
    

In [9]:
df['is_cpu_intel'] = df['processor'].str.contains('Intel', case=False).astype(int)

In [10]:
df['is_cpu_amd'] = df['processor'].str.contains('AMD', case=False).astype(int)

In [11]:
df['is_cpu_apple'] = df['processor'].str.contains('Apple', case=False).astype(int)

In [12]:
df.head()

Unnamed: 0,price,spec_rating,processor,CPU,Ram,ROM,GPU,display_size,resolution_width,resolution_height,...,generation,tier,launch_year,brand_encoded,OS_encoded,ROM_type_encoded,Ram_type_encoded,is_cpu_intel,is_cpu_amd,is_cpu_apple
0,49900,73.0,5th Gen AMD Ryzen 5 5600H,"Hexa Core, 12 Threads",8GB,512GB,4GB AMD Radeon RX 6500M,15.6,1920.0,1080.0,...,5th,Ryzen 5,2019.0,9,12,1,2,0,1,0
1,39900,60.0,12th Gen Intel Core i3 1215U,"Hexa Core (2P + 4E), 8 Threads",8GB,512GB,Intel UHD Graphics,15.6,1920.0,1080.0,...,12th,i3,2021.0,9,12,1,2,1,0,0
2,26990,69.323529,11th Gen Intel Core i3 1115G4,"Dual Core, 4 Threads",8GB,512GB,Intel Iris Xe Graphics,14.0,1920.0,1080.0,...,11th,i3,2020.0,1,12,1,2,1,0,0
3,59729,66.0,12th Gen Intel Core i5 1240P,"12 Cores (4P + 8E), 16 Threads",16GB,512GB,Intel Integrated Iris Xe,14.0,2240.0,1400.0,...,12th,i5,2021.0,14,12,1,8,1,0,0
4,69990,69.323529,Apple M1,Octa Core (4P + 4E),8GB,256GB,Apple M1 Integrated Graphics,13.3,2560.0,1600.0,...,,Apple M1,,2,7,1,2,0,0,1


In [14]:
print(df.tier)

0       Ryzen 5
1            i3
2            i3
3            i5
4      Apple M1
         ...   
888          i3
889     Ryzen 7
890     Ryzen 9
891          i7
892     Ryzen 9
Name: tier, Length: 893, dtype: object


In [19]:
excel_file_path = './tmp.xlsx'
df.to_excel(excel_file_path, index=False)