In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
%matplotlib inline

In [2]:
def get_cpu_name(x, pattern='[a-zA-Z0-9-.]+'):
    """Build a short ``'<model> @ <frequency>GHz'`` label from a CPU description.

    The description is split into tokens with ``re.findall(pattern, x)`` and
    the label is assembled from token positions:

    * last token is exactly ``GHz`` (e.g. ``'i3-8130U (..., 2.20 GHz)'``):
      first token is the model, second-to-last token is the frequency value;
    * last token contains ``GHz`` (e.g. ``'... i3-4160 @ 3.60GHz'``):
      second-to-last token is the model, last token is the frequency;
    * otherwise: first token is the model and the third-/second-to-last
      tokens are taken as the frequency value and unit.

    Parameters
    ----------
    x : str
        Raw CPU description.
    pattern : str, optional
        Regular expression that defines a single token.

    Returns
    -------
    str or None
        The short label, or ``None`` when ``x`` is not a string, does not
        mention "GHz" (case-insensitive), or has too few tokens for the
        branch that applies.
    """
    # Non-string cells (e.g. NaN from pandas) cannot be parsed.
    if not isinstance(x, str) or 'ghz' not in x.lower():
        return None

    names = re.findall(pattern, x)
    if not names:
        # A custom pattern may match nothing even though 'ghz' is present.
        return None

    last = names[-1].lower()
    if last == 'ghz':
        # Frequency value and unit are separate tokens: '... 2.20 GHz'.
        if len(names) < 2:
            return None
        return f'{names[0]} @ {names[-2]}{names[-1]}'
    if 'ghz' in last:
        # Value and unit are glued together: '... <model> @ 3.60GHz'.
        if len(names) < 2:
            return None
        return f'{names[-2]} @ {names[-1]}'
    # 'GHz' is followed by one trailing token: '... 3.00 GHz <something>'.
    if len(names) < 3:
        return None
    return f'{names[0]} @ {names[-3]}{names[-2]}'

### Get CPU prices from the Intel report

In [3]:
# Path to the Intel CPU price list (CSV) that is loaded in the next cell.
# NOTE(review): absolute, machine-specific path — the notebook will not run
# elsewhere unless this is pointed at a local copy of the file.
filename = '/home/phongdk/workspace/user-income/external_data/device/cpu_price_by_intel.csv'

In [4]:
# Read only the first 305 rows, discard rows with any missing value and
# the unused 'Unnamed: 1' column.
df = (
    pd.read_csv(filename, nrows=305)
    .dropna()
    .drop(columns='Unnamed: 1')
)
df.columns = ['cpu_model', 'price']

# Remove rows whose price cell is the literal text 'Price'
# (presumably repeated table headers — TODO confirm against the CSV).
is_price_label = df['price'] == 'Price'
df = df[~is_price_label]
df.shape

(254, 2)

In [5]:
# Derive the short '<model> @ <freq>GHz' label for every Intel price row;
# Series.apply forwards the keyword argument to get_cpu_name.
df['cpu_model_short'] = df['cpu_model'].apply(get_cpu_name, pattern='[a-zA-Z0-9-.]+')
df.head()

Unnamed: 0,cpu_model,price,cpu_model_short
2,Intel® X-series Processor Family,Jan'19 (01/07),
5,"i9-9980XE (24.75M cache, 18 Cores, 36 Threads,...","$1,979",i9-9980XE @ 3.00GHz
6,"i9-7980XE (24.75M cache, 18 Cores, 36 Threads,...","$1,979",i9-7980XE @ 2.60GHz
7,"i9-9960X (22M cache, 16 Cores, 32 Threads, 3.1...","$1,684",i9-9960X @ 3.10GHz
8,"i9-7960X (22M cache, 16 Cores, 32 Threads, 2.8...","$1,684",i9-7960X @ 2.80GHz


In [6]:
# Tab-separated price list; only the model and VND price columns are kept.
cpu_price_vnese_df = pd.read_csv(
    '/home/phongdk/workspace/user-income/external_data/device/CPU_prices.csv',
    sep='\t',
    usecols=['CPU model', 'Price, in VND'],
)

### Price from Bachan's file

In [7]:
# Strip the literal 'CPU ' token before parsing so that the model name is
# the token directly in front of the '<freq>GHz' token.
cpu_price_vnese_df['cpu_model_short'] = cpu_price_vnese_df['CPU model'].apply(
    lambda model: get_cpu_name(model.replace('CPU ', ''))
)
cpu_price_vnese_df.head()

Unnamed: 0,CPU model,"Price, in VND",cpu_model_short
0,Intel(R) Pentium(R) CPU G2030 @ 3.00GHz,1453760,G2030 @ 3.00GHz
1,Intel(R) Pentium(R) CPU G3250 @ 3.20GHz,1294755,G3250 @ 3.20GHz
2,Intel(R) Core(TM) i3-4005U CPU @ 1.70GHz,6246625,i3-4005U @ 1.70GHz
3,Intel(R) Core(TM) i3-3217U CPU @ 1.80GHz,5110875,i3-3217U @ 1.80GHz
4,Intel(R) Core(TM) i3-5005U CPU @ 2.00GHz,6246625,i3-5005U @ 2.00GHz


In [8]:
# Number of distinct short labels in each price source.
print(
    len(set(cpu_price_vnese_df['cpu_model_short'].values)),
    len(set(df['cpu_model_short'].values)),
)

141 231


In [9]:
# Short labels that appear in both price sources.
print(set(cpu_price_vnese_df['cpu_model_short'].values) & set(df['cpu_model_short'].values))

{'i5-6200U @ 2.30GHz', 'i7-6500U @ 2.50GHz', 'i3-6100 @ 3.70GHz', 'i5-6400 @ 2.70GHz', 'i3-5005U @ 2.00GHz', 'i7-4510U @ 2.00GHz', 'i5-7200U @ 2.50GHz', 'G4400 @ 3.30GHz', 'i3-6100U @ 2.30GHz', 'i5-5250U @ 1.60GHz', 'i3-5010U @ 2.10GHz'}


### Get the list of CPUs from the database

In [10]:
# Headerless file: its single column is named 'cpu model'; rows with a
# missing value are discarded.
cpu_list_from_db = pd.read_csv(
    '/home/phongdk/tmp/cpu_list',
    names=['cpu model'],
).dropna()
cpu_list_from_db.shape

(2371, 1)

In [11]:
# Strip the literal 'CPU ' token before parsing so that the model name is
# the token directly in front of the '<freq>GHz' token.
cpu_list_from_db['cpu_model_short'] = cpu_list_from_db['cpu model'].apply(
    lambda model: get_cpu_name(model.replace('CPU ', ''))
)
cpu_list_from_db.head()

Unnamed: 0,cpu model,cpu_model_short
0,Intel(R) Core(TM) i3-4160 CPU @ 3.60GHz,i3-4160 @ 3.60GHz
1,Intel(R) Pentium(R) CPU P6200 @ 2.13GHz,P6200 @ 2.13GHz
2,Intel(R) Pentium(R) CPU G2030 @ 3.00GHz,G2030 @ 3.00GHz
3,Intel(R) Core(TM) i3-2120 CPU @ 3.30GHz,i3-2120 @ 3.30GHz
4,Intel(R) Core(TM) i5-4570 CPU @ 3.20GHz,i5-4570 @ 3.20GHz


In [12]:
# Short labels found both in the database list and in the Intel price list.
print(set(cpu_list_from_db['cpu_model_short'].values) & set(df['cpu_model_short'].values))

{'i7-6950X @ 3.00GHz', 'i5-5350U @ 1.80GHz', 'i7-5775C @ 3.30GHz', 'i5-8500 @ 3.00GHz', 'i5-5257U @ 2.70GHz', 'i7-7700T @ 2.90GHz', 'i3-5005U @ 2.00GHz', 'i7-8500Y @ 1.50GHz', 'i7-5700HQ @ 2.70GHz', '4410Y @ 1.50GHz', 'i3-6006U @ 2.00GHz', 'i7-7700K @ 4.20GHz', '3865U @ 1.80GHz', 'i5-6300U @ 2.40GHz', 'i3-6300T @ 3.30GHz', 'i7-8550U @ 1.80GHz', 'i7-8700 @ 3.20GHz', 'i7-4510U @ 2.00GHz', 'i5-6198DU @ 2.30GHz', 'i5-5250U @ 1.60GHz', 'i3-6300 @ 3.80GHz', 'i5-8500T @ 2.10GHz', 'G5400 @ 3.70GHz', 'i9-8950HK @ 2.90GHz', 'i7-8750H @ 2.20GHz', 'i5-6200U @ 2.30GHz', 'i5-6400T @ 2.20GHz', 'i3-7350K @ 4.20GHz', 'G5600 @ 3.90GHz', 'G4920 @ 3.20GHz', 'i3-7320 @ 4.10GHz', 'i7-6560U @ 2.20GHz', 'i5-8600 @ 3.10GHz', 'i5-6260U @ 1.80GHz', 'i5-6300HQ @ 2.30GHz', 'i5-5675C @ 3.10GHz', 'i5-8250U @ 1.60GHz', 'i7-7700HQ @ 2.80GHz', 'i7-6770HQ @ 2.60GHz', 'm7-6Y75 @ 1.20GHz', 'i5-6267U @ 2.90GHz', 'E-2176M @ 2.70GHz', 'i7-7567U @ 3.50GHz', 'i7-6700K @ 4.00GHz', 'i7-8700T @ 2.40GHz', 'i3-6100 @ 3.70GHz', 'i3-

In [13]:
# How many short labels the database list shares with the Intel price list.
print(len(set(cpu_list_from_db['cpu_model_short'].values) & set(df['cpu_model_short'].values)))

156


In [14]:
# Sample CPU description string kept for ad-hoc regex experiments.
s = 'i3-8130U (4M cache, 2 Cores, 4 Threads, 2.20 GHz)'

In [26]:
# Case-insensitive search for tokens that start with 'i3' in the sample string.
re.findall('i3[a-zA-Z0-9-.]+', s, flags=re.IGNORECASE)

['i3-8130U']