# Priprava in čiščenje podatkov

## Get the data - Reading CSV Files with Encodings

In [25]:
import pandas as pd 
import numpy as np

In [28]:
# https://medium.com/cloud-computer/analyzing-big-data-with-grep-and-awk-c07d362b6ab8
# head ukaz je podprt samo v linux bash-u
!head -n 3 data/INPUT_laptops.csv

'head' is not recognized as an internal or external command,
operable program or batch file.


    df = pd.read_csv("filename.csv", encoding="some_encoding")

In [26]:
laptops = pd.read_csv("data/INPUT_laptops.csv", encoding="Latin-1")
laptops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Manufacturer              1303 non-null   object
 1   Model Name                1303 non-null   object
 2   Category                  1303 non-null   object
 3   Screen Size               1303 non-null   object
 4   Screen                    1303 non-null   object
 5   CPU                       1303 non-null   object
 6   RAM                       1303 non-null   object
 7    Storage                  1303 non-null   object
 8   GPU                       1303 non-null   object
 9   Operating System          1303 non-null   object
 10  Operating System Version  1133 non-null   object
 11  Weight                    1303 non-null   object
 12  Price (Euros)             1303 non-null   object
dtypes: object(13)
memory usage: 132.5+ KB


In [27]:
laptops.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price (Euros)
0,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969
1,Apple,Macbook Air,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,89894
2,HP,250 G6,Notebook,"15.6""",Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,,1.86kg,57500
3,Apple,MacBook Pro,Ultrabook,"15.4""",IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,,1.83kg,253745
4,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,,1.37kg,180360


In [31]:
# laptops[" Storage"]

## Cleaning Column Names

In [28]:
laptops.columns

Index(['Manufacturer', 'Model Name', 'Category', 'Screen Size', 'Screen',
       'CPU', 'RAM', ' Storage', 'GPU', 'Operating System',
       'Operating System Version', 'Weight', 'Price (Euros)'],
      dtype='object')

In [29]:
def clean_column_name(name:str):
    name = name.strip().lower()
    name = name.replace("operating system", "os").replace(" ", "_")
    name = name.replace("(", "")
    name = name.replace(")", "")
    return name

new_columns_names = [clean_column_name(c) for c in laptops.columns]
laptops.columns = new_columns_names
laptops.head()

Unnamed: 0,manufacturer,model_name,category,screen_size,screen,cpu,ram,storage,gpu,os,os_version,weight,price_euros
0,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969
1,Apple,Macbook Air,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,89894
2,HP,250 G6,Notebook,"15.6""",Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,,1.86kg,57500
3,Apple,MacBook Pro,Ultrabook,"15.4""",IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,,1.83kg,253745
4,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,,1.37kg,180360


## Kako operirati s stolpci v kodi

In [34]:
# NACIN 1
# uporabljamo stringe, ampak uporabljamo neko standardno preciscevanje podatkov
# npr: predpona_ime_stolpca
# pri tem nacinu moramo poskrbeti, da precistimo imena stolpcev -> se znebimo predsledkov, posebnih znakov itd...

# NACIN 2
from enum import Enum
class Laptops(Enum):
    MODEL_NAME = "model_name"
    CATEGORY = "category"
    
Laptops.MODEL_NAME.value

'model_name'

## Converting String Columns to Numeric


<p><img alt="string to numeric cleaning workflow" src="images/cleaning_workflow.svg"></p>


In [30]:
laptops["screen_size"].dtype

dtype('O')

In [21]:
laptops["screen_size"].unique()

array(['13.3"', '15.6"', '15.4"', '14.0"', '12.0"', '11.6"', '17.3"',
       '10.1"', '13.5"', '12.5"', '13.0"', '18.4"', '13.9"', '12.3"',
       '17.0"', '15.0"', '14.1"', '11.3"'], dtype=object)

In [31]:
laptops["screen_size"].value_counts()

15.6"    665
14.0"    197
13.3"    164
17.3"    164
12.5"     39
11.6"     33
12.0"      6
13.5"      6
13.9"      6
12.3"      5
10.1"      4
15.4"      4
15.0"      4
13.0"      2
18.4"      1
17.0"      1
14.1"      1
11.3"      1
Name: screen_size, dtype: int64

In [36]:
laptops["screen_size"] = laptops["screen_size"].str.replace('"','')
laptops["screen_size"] = laptops["screen_size"].astype("float")

AttributeError: Can only use .str accessor with string values!


<p></p><center><img alt="vectorized_string_methods" src="images/Syntax.png"></center><p></p>





In [37]:
laptops.dtypes

manufacturer     object
model_name       object
category         object
screen_size     float64
screen           object
cpu              object
ram              object
storage          object
gpu              object
os               object
os_version       object
weight           object
price_euros      object
dtype: object

In [38]:
print(laptops["ram"].unique())
laptops["ram"] = laptops["ram"].str.replace("GB", "")
laptops["ram"] = laptops["ram"].astype("int")
laptops.dtypes

['8GB' '16GB' '4GB' '2GB' '12GB' '6GB' '32GB' '24GB' '64GB']


manufacturer     object
model_name       object
category         object
screen_size     float64
screen           object
cpu              object
ram               int32
storage          object
gpu              object
os               object
os_version       object
weight           object
price_euros      object
dtype: object

## Renaming Columns

In [39]:
laptops.rename(columns={"ram":"ram_gb"}, inplace=True)
laptops

Unnamed: 0,manufacturer,model_name,category,screen_size,screen,cpu,ram_gb,storage,gpu,os,os_version,weight,price_euros
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,89894
2,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,No OS,,1.86kg,57500
3,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16,512GB SSD,AMD Radeon Pro 455,macOS,,1.83kg,253745
4,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,macOS,,1.37kg,180360
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1298,Lenovo,Yoga 500-14ISK,2 in 1 Convertible,14.0,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 6500U 2.5GHz,4,128GB SSD,Intel HD Graphics 520,Windows,10,1.8kg,63800
1299,Lenovo,Yoga 900-13ISK,2 in 1 Convertible,13.3,IPS Panel Quad HD+ / Touchscreen 3200x1800,Intel Core i7 6500U 2.5GHz,16,512GB SSD,Intel HD Graphics 520,Windows,10,1.3kg,149900
1300,Lenovo,IdeaPad 100S-14IBR,Notebook,14.0,1366x768,Intel Celeron Dual Core N3050 1.6GHz,2,64GB Flash Storage,Intel HD Graphics,Windows,10,1.5kg,22900
1301,HP,15-AC110nv (i7-6500U/6GB/1TB/Radeon,Notebook,15.6,1366x768,Intel Core i7 6500U 2.5GHz,6,1TB HDD,AMD Radeon R5 M330,Windows,10,2.19kg,76400


In [41]:
laptops.rename(columns={"screen_size":"screen_size_inch"}, inplace=True)
laptops

Unnamed: 0,manufacturer,model_name,category,screen_size_inch,screen,cpu,ram_gb,storage,gpu,os,os_version,weight,price_euros
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,89894
2,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,No OS,,1.86kg,57500
3,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16,512GB SSD,AMD Radeon Pro 455,macOS,,1.83kg,253745
4,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,macOS,,1.37kg,180360
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1298,Lenovo,Yoga 500-14ISK,2 in 1 Convertible,14.0,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 6500U 2.5GHz,4,128GB SSD,Intel HD Graphics 520,Windows,10,1.8kg,63800
1299,Lenovo,Yoga 900-13ISK,2 in 1 Convertible,13.3,IPS Panel Quad HD+ / Touchscreen 3200x1800,Intel Core i7 6500U 2.5GHz,16,512GB SSD,Intel HD Graphics 520,Windows,10,1.3kg,149900
1300,Lenovo,IdeaPad 100S-14IBR,Notebook,14.0,1366x768,Intel Celeron Dual Core N3050 1.6GHz,2,64GB Flash Storage,Intel HD Graphics,Windows,10,1.5kg,22900
1301,HP,15-AC110nv (i7-6500U/6GB/1TB/Radeon,Notebook,15.6,1366x768,Intel Core i7 6500U 2.5GHz,6,1TB HDD,AMD Radeon R5 M330,Windows,10,2.19kg,76400


In [40]:
laptops["ram_gb"].describe()

count    1303.000000
mean        8.382195
std         5.084665
min         2.000000
25%         4.000000
50%         8.000000
75%         8.000000
max        64.000000
Name: ram_gb, dtype: float64

## Extracting Values from Strings

In [48]:
laptops["cpu"]

0                       Intel Core i5 2.3GHz
1                       Intel Core i5 1.8GHz
2                 Intel Core i5 7200U 2.5GHz
3                       Intel Core i7 2.7GHz
4                       Intel Core i5 3.1GHz
                        ...                 
1298              Intel Core i7 6500U 2.5GHz
1299              Intel Core i7 6500U 2.5GHz
1300    Intel Celeron Dual Core N3050 1.6GHz
1301              Intel Core i7 6500U 2.5GHz
1302    Intel Celeron Dual Core N3050 1.6GHz
Name: cpu, Length: 1303, dtype: object

In [43]:
# izlusci CPU manufacturers from cpu
laptops["cpu_manufacturer"] = laptops["cpu"].str.split().str[0]
laptops["cpu_manufacturer"].value_counts()
# izbrisi stolpec
# laptops.drop(columns=["cpu_manufacturer"],inplace=True)

Intel      1240
AMD          62
Samsung       1
Name: cpu_manufacturer, dtype: int64

In [44]:
# izlusci GPU manufacturers from gpu
laptops["gpu_manufacturer"] = laptops["gpu"].str.split().str[0]
laptops["gpu_manufacturer"].value_counts()

Intel     722
Nvidia    400
AMD       180
ARM         1
Name: gpu_manufacturer, dtype: int64

## Correcting Bad Values - map() method

Primer kako s slovarjem in metodo map popravimo vrednosti
### pozor
Bodi pozoren na default argument na_action https://pandas.pydata.org/docs/reference/api/pandas.Series.map.html

In [59]:
s = pd.Series(['pair', 'oranje', 'bananna', 'oranje', 'oranje', 'oranje'])
s

0       pair
1     oranje
2    bananna
3     oranje
4     oranje
5     oranje
dtype: object

In [60]:
# pripravimo slovar parov za korekcije
corrections = {
    "pair": "pear",
    "oranje": "orange",
    "bananna": "banana"
}
# POZOR: Ce v dict ni dolocenega kljuca, se bo ta v originalnem df po mappiranju preslikala v NaN
# za to delaj z value_counts

In [61]:
s.map(corrections)
s

0       pair
1     oranje
2    bananna
3     oranje
4     oranje
5     oranje
dtype: object

Primer na našem laptop dataframe-u

In [62]:
laptops["os"].value_counts()
# vidimo da je MacOS napisan na dva načina (nekonsistentni vnosi) - macOS in Mac OS

Windows      1125
No OS          66
Linux          62
Chrome OS      27
macOS          13
Mac OS          8
Android         2
Name: os, dtype: int64

In [63]:
mapping_dict = {
    'Android': 'Android',
    'Chrome OS': 'Chrome OS',
    'Linux': 'Linux',
    'Mac OS': 'macOS', # presilkamo v enoznačen macOS
    'No OS': 'No OS',
    'Windows': 'Windows',
    'macOS': 'macOS' # presilkamo v enoznačen macOS
}

In [64]:
laptops["os"] = laptops["os"].map(mapping_dict)
laptops["os"].value_counts()

Windows      1125
No OS          66
Linux          62
Chrome OS      27
macOS          21
Android         2
Name: os, dtype: int64

## Introduction to Missing Data 

### Trade-Offs in Missing Data Conventions


### Missing Data in Pandas




### None: Pythonic missing data


In [65]:
vals1 = np.array([1,None,2,4]) # ni smiselno None uporabljati v np arrayih
vals1.sum()

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

### NaN: Missing numerical data



In [67]:
vals2 = np.array([1, np.nan, 3, 4])

vals2.dtype



dtype('float64')

In [68]:
print(f"sestevanje z NaN vrne: {np.nan + 1}")
print(f"Mnozenje z NaN vrne: {np.nan * 0}")

sestevanje z NaN vrne: nan
Mnozenje z NaN vrne: nan


In [76]:
# metode ki jih lahko uporabljamo z nan
print(np.nansum(vals2))


8.0


In [77]:
np.nanmean(vals2)

2.6666666666666665

### NaN and None in Pandas



In [None]:
pd.Series([1, np.nan, 2, None]) # None vrednost se bo pretvorila v nan
# pd.Series([1, np.nan, 2, None], dtype=int) # bi javilo da ne more None convertati v int: "ValueError: cannot convert float NaN to integer"


<table>
<thead><tr>
<th>Typeclass</th>
<th>Conversion When Storing NAs</th>
<th>NA Sentinel Value</th>
</tr>
</thead>
<tbody>
<tr>
<td><code>floating</code></td>
<td>No change</td>
<td><code>np.nan</code></td>
</tr>
<tr>
<td><code>object</code></td>
<td>No change</td>
<td><code>None</code> or <code>np.nan</code></td>
</tr>
<tr>
<td><code>integer</code></td>
<td>Cast to <code>float64</code></td>
<td><code>np.nan</code></td>
</tr>
<tr>
<td><code>boolean</code></td>
<td>Cast to <code>object</code></td>
<td><code>None</code> or <code>np.nan</code></td>
</tr>
</tbody>
</table>


### Operating on Null Values



- `isnull()`: Generate a boolean mask indicating missing values
- `notnull()`: Opposite of isnull()
- `dropna()`: Return a filtered version of the data
- `fillna()`: Return a copy of the data with missing values filled or imputed



#### Detecting null values



In [74]:
data = pd.Series([1, np.nan, 'hello', None, -1, 0, -9999, "NA"])

In [75]:
data

0        1
1      NaN
2    hello
3     None
4       -1
5        0
6    -9999
7       NA
dtype: object

In [78]:
data.isnull()

0    False
1     True
2    False
3     True
4    False
5    False
6    False
7    False
dtype: bool

In [None]:
# izberi podatke, ki niso manjkajoci (indeksi se ohraniju)
data[data.notnull()] #.reset_index(drop=True)

In [None]:
# enako kot zgoraj
# v tem primeru prikazemo se metodo reset_index ki ponovno indeksira (bi jo lahko uporabili tudi zgoraj)
data.dropna().reset_index(drop=True)

#### Dropping null values


In [86]:
df = pd.DataFrame([[1,      np.nan, 2],
                   [2,      3,      5],
                   [np.nan, 4,      6]])
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [83]:
# privzeto po vrticah
df.dropna(inplace=True)
df

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [85]:
# odstranimo po stolpcih
df.dropna(axis="columns", inplace=True)
df


Unnamed: 0,2
0,2
1,5
2,6


In [87]:
# how: all or any
df[3] = np.nan
df.dropna(axis="columns", how="all")

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [None]:
df.dropna(axis="rows", thresh=3) # ostane le vrstica, ki ima vsaj 3 vrednosti

In [None]:
df.dropna(axis="columns", thresh=3) # ostanejo le stolpci, ki imajo dovolj podatkov - threshold ponavadi izracunamo % glede na stevilo vrstic

#### Filling null values



In [93]:
data = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))

TypeError: __init__() got multiple values for argument 'index'

In [89]:
data

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [90]:
data.fillna(0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [92]:
data.fillna(method="ffill") # forward fill

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [None]:
data.fillna(method="bfill") # backward filld

In [None]:
df

In [99]:
df = pd.DataFrame([[1,      np.nan, 2],
                   [2,      3,      5],
                   [np.nan, 4,      6]])
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [98]:
df.fillna(method="ffill", axis=0) # columns

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,2.0,4.0,6


In [100]:
df.fillna(method="ffill", axis=1) # rows

Unnamed: 0,0,1,2
0,1.0,1.0,2.0
1,2.0,3.0,5.0
2,,4.0,6.0


## Dropping Missing Values

In [105]:
# preveri koliko je v DF manjkajocih vrednostih (sum na koncu poklicemo da prikaze po posameznih stolpcih)
laptops.isnull().sum()
laptops

Unnamed: 0,manufacturer,model_name,category,screen_size,screen,cpu,ram_gb,storage,gpu,os,os_version,weight,price_euros
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,89894
2,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,No OS,,1.86kg,57500
3,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16,512GB SSD,AMD Radeon Pro 455,macOS,,1.83kg,253745
4,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,macOS,,1.37kg,180360
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1298,Lenovo,Yoga 500-14ISK,2 in 1 Convertible,14.0,IPS Panel Full HD / Touchscreen 1920x1080,Intel Core i7 6500U 2.5GHz,4,128GB SSD,Intel HD Graphics 520,Windows,10,1.8kg,63800
1299,Lenovo,Yoga 900-13ISK,2 in 1 Convertible,13.3,IPS Panel Quad HD+ / Touchscreen 3200x1800,Intel Core i7 6500U 2.5GHz,16,512GB SSD,Intel HD Graphics 520,Windows,10,1.3kg,149900
1300,Lenovo,IdeaPad 100S-14IBR,Notebook,14.0,1366x768,Intel Celeron Dual Core N3050 1.6GHz,2,64GB Flash Storage,Intel HD Graphics,Windows,10,1.5kg,22900
1301,HP,15-AC110nv (i7-6500U/6GB/1TB/Radeon,Notebook,15.6,1366x768,Intel Core i7 6500U 2.5GHz,6,1TB HDD,AMD Radeon R5 M330,Windows,10,2.19kg,76400


In [106]:
laptops.shape

(1303, 13)

In [107]:
laptops.dropna(axis=1).shape # odstranimo stolpec (v tem primeru os_version)

(1303, 12)

In [108]:
laptops.dropna(axis=0).shape # odstranimo vse vrstice kjer je podatek enako nan (v tem primeru so to vrstice kjer os_version = NaN)

(1133, 13)

## Filling Missing Values

In [109]:
laptops["os_version"].value_counts(dropna=False) # dropna=False --> prikaze nam tudi sestevek nan vrednosti

10      1072
NaN      170
7         45
X          8
10 S       8
Name: os_version, dtype: int64

In [None]:
# prestej frekvence os, kjer je os_version manjkajoc
laptops.loc[laptops["os_version"].isnull(), "os"].value_counts()

In [None]:
# med manjkajocimi vrednostmi os poiscemo te, ki morda imajo vpisan podatek za os_version
laptops.loc[laptops["os"] == "macOS", ["os", "os_version"]].head(10)

In [None]:
# odvisno od vsebine ... recimo da se odlocimo da naj bodo vsi neznani macOS sedaj X (ker smo nasli tak primer: celica zgoraj)
laptops.loc[laptops["os"] == "macOS", "os_version"] = "X"


In [None]:
laptops.loc[laptops["os_version"].isnull(), "os_version"] = "Version unknown"

In [None]:
laptops

In [110]:
# preverimo ce smo odstranili vse manjkajoce vrednosti iz nasega dataframe-a
laptops.isnull().sum()

manufacturer      0
model_name        0
category          0
screen_size       0
screen            0
cpu               0
ram_gb            0
storage           0
gpu               0
os                0
os_version      170
weight            0
price_euros       0
dtype: int64

## Removing Duplicates

In [112]:
# prikazi stevilo podvojenih vrstic (True) ... celoten entry je duplicated
laptops.duplicated().value_counts()

0       False
1       False
2       False
3       False
4       False
        ...  
1298     True
1299     True
1300     True
1301     True
1302     True
Length: 1303, dtype: bool

In [117]:
laptops[laptops.duplicated()].sort_values("model_name")

Unnamed: 0,manufacturer,model_name,category,screen_size,screen,cpu,ram_gb,storage,gpu,os,os_version,weight,price_euros
1287,HP,15-AC110nv (i7-6500U/6GB/1TB/Radeon,Notebook,15.6,1366x768,Intel Core i7 6500U 2.5GHz,6,1TB HDD,AMD Radeon R5 M330,Windows,10.0,2.19kg,76400
1301,HP,15-AC110nv (i7-6500U/6GB/1TB/Radeon,Notebook,15.6,1366x768,Intel Core i7 6500U 2.5GHz,6,1TB HDD,AMD Radeon R5 M330,Windows,10.0,2.19kg,76400
1277,Acer,Aspire ES1-531,Notebook,15.6,1366x768,Intel Celeron Dual Core N3060 1.6GHz,4,500GB HDD,Intel HD Graphics 400,Linux,,2.4kg,28900
1291,Acer,Aspire ES1-531,Notebook,15.6,1366x768,Intel Celeron Dual Core N3060 1.6GHz,4,500GB HDD,Intel HD Graphics 400,Linux,,2.4kg,28900
1300,Lenovo,IdeaPad 100S-14IBR,Notebook,14.0,1366x768,Intel Celeron Dual Core N3050 1.6GHz,2,64GB Flash Storage,Intel HD Graphics,Windows,10.0,1.5kg,22900
1286,Lenovo,IdeaPad 100S-14IBR,Notebook,14.0,1366x768,Intel Celeron Dual Core N3050 1.6GHz,2,64GB Flash Storage,Intel HD Graphics,Windows,10.0,1.5kg,22900
1293,Lenovo,IdeaPad Y700-15ISK,Notebook,15.6,IPS Panel Full HD 1920x1080,Intel Core i7 6700HQ 2.6GHz,8,1TB HDD,Nvidia GeForce GTX 960M,Windows,10.0,2.6kg,89900
1279,Lenovo,IdeaPad Y700-15ISK,Notebook,15.6,IPS Panel Full HD 1920x1080,Intel Core i7 6700HQ 2.6GHz,8,1TB HDD,Nvidia GeForce GTX 960M,Windows,10.0,2.6kg,89900
1278,Dell,Inspiron 3552,Notebook,15.6,1366x768,Intel Celeron Dual Core N3050 1.6GHz,2,500GB HDD,Intel HD Graphics,Windows,10.0,2.20kg,37900
1292,Dell,Inspiron 3552,Notebook,15.6,1366x768,Intel Celeron Dual Core N3050 1.6GHz,2,500GB HDD,Intel HD Graphics,Windows,10.0,2.20kg,37900


In [None]:
zenbook_model = 'ZenBook UX305CA-UBM1'
laptops[laptops["model_name"] == zenbook_model]

In [None]:
laptops.shape

In [None]:
# odstrani podvojene vrednosti (celotni entryji so enaki)
laptops.drop_duplicates(inplace=True)

In [None]:
laptops.shape

## Replacing Values
Za razliko od metode map, replace metoda po defaultu nadomesti samo izrecno zapisane vrednosti

In [None]:
laptops["manufacturer"].value_counts()

In [None]:
laptops.replace("MSI", "Micro-Star", inplace=True)

In [None]:
laptops["manufacturer"].value_counts()

In [120]:
# dodaten primer:
df = pd.DataFrame({'A': [0, 1, 2, 3, 4],
                   'B': [5, 6, 0, 8, 9],
                   'C': ['a', 'b', 'c', 'd', 'e']})

In [121]:
df.replace({0: 10, 1: 100, 'a':'z'}, inplace=True)
df

Unnamed: 0,A,B,C
0,10,5,z
1,100,6,b
2,2,10,c
3,3,8,d
4,4,9,e


## Dropping Columns 

In [None]:
laptops.head()

In [None]:
laptops.drop(columns=["category", "gpu"], inplace=True)

In [None]:
laptops.head()

# Vaja

In [None]:
# weight pretvori v stevilske vrednosti
laptops["weight"]

In [None]:
laptops["weight"] = laptops["weight"].str.replace("kgs", "").str.replace("kg", "").astype("float")

In [None]:
laptops.rename({"weight":"weight_kg"}, axis=1, inplace=True)

In [None]:
laptops

### Convert the price_euros column to a numeric dtype.

In [52]:
# pretvori stolpec euro v stevilske vrednosti
laptops["price_euros"] = laptops["price_euros"].str.replace(",",".").astype("float")

In [53]:
laptops["price_euros"].mean()

1123.6869915579432

### Extract the screen resolution from the screen column.

In [54]:
# regex metoda
# laptops["screen"].str.extract('(\d+x\d+)').head()

# split metoda
resolution = laptops["screen"].str.split(" ").str[-1].str.split("x")
laptops["screen_width_px"] = resolution.str[0].astype("int")
laptops["screen_height_px"] = resolution.str[1].astype("int")
laptops.drop(columns=["screen"], inplace=True)

In [55]:
laptops

Unnamed: 0,manufacturer,model_name,category,screen_size_inch,cpu,ram_gb,storage,gpu,os,os_version,weight,price_euros,cpu_manufacturer,gpu_manufacturer,screen_width_px,screen_height_px
0,Apple,MacBook Pro,Ultrabook,13.3,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,1339.69,Intel,Intel,2560,1600
1,Apple,Macbook Air,Ultrabook,13.3,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,898.94,Intel,Intel,1440,900
2,HP,250 G6,Notebook,15.6,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,No OS,,1.86kg,575.00,Intel,Intel,1920,1080
3,Apple,MacBook Pro,Ultrabook,15.4,Intel Core i7 2.7GHz,16,512GB SSD,AMD Radeon Pro 455,macOS,,1.83kg,2537.45,Intel,AMD,2880,1800
4,Apple,MacBook Pro,Ultrabook,13.3,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,macOS,,1.37kg,1803.60,Intel,Intel,2560,1600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1298,Lenovo,Yoga 500-14ISK,2 in 1 Convertible,14.0,Intel Core i7 6500U 2.5GHz,4,128GB SSD,Intel HD Graphics 520,Windows,10,1.8kg,638.00,Intel,Intel,1920,1080
1299,Lenovo,Yoga 900-13ISK,2 in 1 Convertible,13.3,Intel Core i7 6500U 2.5GHz,16,512GB SSD,Intel HD Graphics 520,Windows,10,1.3kg,1499.00,Intel,Intel,3200,1800
1300,Lenovo,IdeaPad 100S-14IBR,Notebook,14.0,Intel Celeron Dual Core N3050 1.6GHz,2,64GB Flash Storage,Intel HD Graphics,Windows,10,1.5kg,229.00,Intel,Intel,1366,768
1301,HP,15-AC110nv (i7-6500U/6GB/1TB/Radeon,Notebook,15.6,Intel Core i7 6500U 2.5GHz,6,1TB HDD,AMD Radeon R5 M330,Windows,10,2.19kg,764.00,Intel,AMD,1366,768


### Extract the processor speed from the cpu column.

In [83]:
cpu_speed = laptops["cpu"].str.split(" ").str[-1].str.replace("GHz", "").astype("float")
laptops["cpu_speed_GHz"] = cpu_speed
# laptops.insert(4, 'cpu_speed_GHz', cpu_speed)
laptops

Unnamed: 0,manufacturer,model_name,category,screen_size_inch,cpu,ram_gb,storage,gpu,os,os_version,weight,price_euros,cpu_manufacturer,gpu_manufacturer,screen_width_px,screen_height_px,cpu_speed_GHz
0,Apple,MacBook Pro,Ultrabook,13.3,Intel Core i5 2.3GHz,8,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,1339.69,Intel,Intel,2560,1600,2.3
1,Apple,Macbook Air,Ultrabook,13.3,Intel Core i5 1.8GHz,8,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,898.94,Intel,Intel,1440,900,1.8
2,HP,250 G6,Notebook,15.6,Intel Core i5 7200U 2.5GHz,8,256GB SSD,Intel HD Graphics 620,No OS,,1.86kg,575.00,Intel,Intel,1920,1080,2.5
3,Apple,MacBook Pro,Ultrabook,15.4,Intel Core i7 2.7GHz,16,512GB SSD,AMD Radeon Pro 455,macOS,,1.83kg,2537.45,Intel,AMD,2880,1800,2.7
4,Apple,MacBook Pro,Ultrabook,13.3,Intel Core i5 3.1GHz,8,256GB SSD,Intel Iris Plus Graphics 650,macOS,,1.37kg,1803.60,Intel,Intel,2560,1600,3.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1298,Lenovo,Yoga 500-14ISK,2 in 1 Convertible,14.0,Intel Core i7 6500U 2.5GHz,4,128GB SSD,Intel HD Graphics 520,Windows,10,1.8kg,638.00,Intel,Intel,1920,1080,2.5
1299,Lenovo,Yoga 900-13ISK,2 in 1 Convertible,13.3,Intel Core i7 6500U 2.5GHz,16,512GB SSD,Intel HD Graphics 520,Windows,10,1.3kg,1499.00,Intel,Intel,3200,1800,2.5
1300,Lenovo,IdeaPad 100S-14IBR,Notebook,14.0,Intel Celeron Dual Core N3050 1.6GHz,2,64GB Flash Storage,Intel HD Graphics,Windows,10,1.5kg,229.00,Intel,Intel,1366,768,1.6
1301,HP,15-AC110nv (i7-6500U/6GB/1TB/Radeon,Notebook,15.6,Intel Core i7 6500U 2.5GHz,6,1TB HDD,AMD Radeon R5 M330,Windows,10,2.19kg,764.00,Intel,AMD,1366,768,2.5


### Extract storage size numbers from storage

In [82]:
# https://regexr.com/ -> orodje za pisanje regexov
laptops["storage"].unique()
storage_regex = "(?P<size_disk_1>\d+)(?P<unit_disk_1>[T,G]B)\s*([\w\s+]+)"
laptops["storage"].str.extract(storage_regex).tail(30)

# laptops["storage_type"]

Unnamed: 0,size_disk_1,unit_disk_1,2
1273,1,TB,HDD
1274,500,GB,HDD
1275,512,GB,SSD
1276,1,TB,HDD
1277,500,GB,HDD
1278,500,GB,HDD
1279,1,TB,HDD
1280,1,TB,Hybrid
1281,1,TB,HDD
1282,32,GB,Flash Storage


## Save clean data to CSV file

In [84]:
# izvozi podatke v CSV
laptops.to_csv("data/clean_laptops.csv")

## Analiza

### Are laptops made by Apple more expensive than those made by other manufacturers?


In [91]:
laptops.groupby("manufacturer")["price_euros"].mean().sort_values(ascending=False)

manufacturer
Razer        3346.142857
LG           2099.000000
MSI          1728.908148
Google       1677.666667
Microsoft    1612.308333
Apple        1564.198571
Huawei       1424.000000
Samsung      1413.444444
Toshiba      1267.812500
Dell         1186.068990
Xiaomi       1133.462500
Asus         1104.169367
Lenovo       1086.384444
HP           1067.774854
Fujitsu       729.000000
Acer          626.775825
Chuwi         314.296667
Mediacom      295.000000
Vero          217.425000
Name: price_euros, dtype: float64

### What is the best value laptop with a screen size of 15" or more?
            

In [106]:
laptops.loc[laptops["screen_size_inch"] >= 15.0, ["model_name", "screen_size_inch","price_euros"]]\
            .sort_values(by="price_euros").head(5)

Unnamed: 0,model_name,screen_size_inch,price_euros
290,Chromebook C910-C2ST,15.6,199.0
1102,Chromebook 15,15.6,209.0
555,A541NA-GO342 (N3350/4GB/500GB/Linux),15.6,224.0
30,"LapBook 15.6""",15.6,244.99
483,"Lapbook 15,6",15.6,248.9


### Which laptop has the most RAM?

In [111]:
laptops.loc[laptops["ram_gb"] == laptops["ram_gb"].max()]

Unnamed: 0,manufacturer,model_name,category,screen_size_inch,cpu,ram_gb,storage,gpu,os,os_version,weight,price_euros,cpu_manufacturer,gpu_manufacturer,screen_width_px,screen_height_px,cpu_speed_GHz
1066,Asus,ROG G701VO,Gaming,17.3,Intel Core i7 6820HK 2.7GHz,64,1TB SSD,Nvidia GeForce GTX 980,Windows,10,3.58kg,3975.0,Intel,Nvidia,1920,1080,2.7
