# Data Cleaning Basics
We'll be working with laptops.csv, a CSV file containing information on about 1,300 laptop computers.

In [1]:
import pandas as pd

In [2]:
laptops = pd.read_csv('laptops.csv', encoding="Latin-1")
laptops.head()

Unnamed: 0,Manufacturer,Model Name,Category,Screen Size,Screen,CPU,RAM,Storage,GPU,Operating System,Operating System Version,Weight,Price (Euros)
0,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969
1,Apple,Macbook Air,Ultrabook,"13.3""",1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,89894
2,HP,250 G6,Notebook,"15.6""",Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,,1.86kg,57500
3,Apple,MacBook Pro,Ultrabook,"15.4""",IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,,1.83kg,253745
4,Apple,MacBook Pro,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,,1.37kg,180360


In [3]:
laptops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1303 entries, 0 to 1302
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Manufacturer              1303 non-null   object
 1   Model Name                1303 non-null   object
 2   Category                  1303 non-null   object
 3   Screen Size               1303 non-null   object
 4   Screen                    1303 non-null   object
 5   CPU                       1303 non-null   object
 6   RAM                       1303 non-null   object
 7    Storage                  1303 non-null   object
 8   GPU                       1303 non-null   object
 9   Operating System          1303 non-null   object
 10  Operating System Version  1133 non-null   object
 11  Weight                    1303 non-null   object
 12  Price (Euros)             1303 non-null   object
dtypes: object(13)
memory usage: 132.5+ KB


In [4]:
laptops.columns

Index(['Manufacturer', 'Model Name', 'Category', 'Screen Size', 'Screen',
       'CPU', 'RAM', ' Storage', 'GPU', 'Operating System',
       'Operating System Version', 'Weight', 'Price (Euros)'],
      dtype='object')

In [5]:
laptops.index.values

array([   0,    1,    2, ..., 1300, 1301, 1302])

## class pandas.Index
- Immutable ndarray implementing an ordered, sliceable set. The basic object storing axis labels for all pandas objects.
- If you want learn more, you can read at [here](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Index.html)

In [6]:
print(type(laptops.columns))
laptops.columns.dtype

<class 'pandas.core.indexes.base.Index'>


dtype('O')

In [7]:
laptops_test = laptops.copy()
laptops_test.columns = ['A', 'B', 'C', 'D', 'E', 'G', 'F', 'H', 'I', 'J', 'K', 'L', 'M']
laptops_test.columns

Index(['A', 'B', 'C', 'D', 'E', 'G', 'F', 'H', 'I', 'J', 'K', 'L', 'M'], dtype='object')

In [8]:
def clean_col(col):
    col = col.strip()
    col = col.replace("Operating System", "os")
    col = col.replace(" ","_")
    col = col.replace("(","")
    col = col.replace(")","")
    col = col.lower()
    return col
laptops.columns = [clean_col(c) for c in laptops.columns]
laptops.columns

Index(['manufacturer', 'model_name', 'category', 'screen_size', 'screen',
       'cpu', 'ram', 'storage', 'gpu', 'os', 'os_version', 'weight',
       'price_euros'],
      dtype='object')

In [9]:
laptops.iloc[:5, 2:5]

Unnamed: 0,category,screen_size,screen
0,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600
1,Ultrabook,"13.3""",1440x900
2,Notebook,"15.6""",Full HD 1920x1080
3,Ultrabook,"15.4""",IPS Panel Retina Display 2880x1800
4,Ultrabook,"13.3""",IPS Panel Retina Display 2560x1600


The first character specifies the kind of data and the remaining characters specify the number of bytes per item, except for Unicode, where it is interpreted as the number of characters. The item size must correspond to an existing type, or an error will be raised. The supported kinds are to an existing type, or an error will be raised. The supported kinds are:
- 'b'       boolean
- 'i'       (signed) integer
- 'u'       unsigned integer
- 'f'       floating-point
- 'c'       complex-floating point
- 'O'       (Python) objects
- 'S', 'a'  (byte-)string
- 'U'       Unicode
- 'V'       raw data (void)

In [10]:
laptops['screen_size'].dtype

dtype('O')

In [11]:
laptops['screen_size'].unique()

array(['13.3"', '15.6"', '15.4"', '14.0"', '12.0"', '11.6"', '17.3"',
       '10.1"', '13.5"', '12.5"', '13.0"', '18.4"', '13.9"', '12.3"',
       '17.0"', '15.0"', '14.1"', '11.3"'], dtype=object)

In [12]:
laptops['screen_size'] = laptops['screen_size'].str.replace('"','')
laptops['screen_size'].unique()

array(['13.3', '15.6', '15.4', '14.0', '12.0', '11.6', '17.3', '10.1',
       '13.5', '12.5', '13.0', '18.4', '13.9', '12.3', '17.0', '15.0',
       '14.1', '11.3'], dtype=object)

In [13]:
laptops['screen_size'] = laptops['screen_size'].astype(float)
laptops['screen_size'].unique()

array([13.3, 15.6, 15.4, 14. , 12. , 11.6, 17.3, 10.1, 13.5, 12.5, 13. ,
       18.4, 13.9, 12.3, 17. , 15. , 14.1, 11.3])

In [14]:
laptops['screen_size'].dtype

dtype('float64')

In [15]:
laptops.columns

Index(['manufacturer', 'model_name', 'category', 'screen_size', 'screen',
       'cpu', 'ram', 'storage', 'gpu', 'os', 'os_version', 'weight',
       'price_euros'],
      dtype='object')

## pandas.DataFrame.rename
DataFrame.rename(self, mapper=None, index=None, columns=None, axis=None, copy=True, inplace=False, level=None, errors='ignore')
You can refer at document [here](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rename.html)

In [16]:
laptops.rename({'screen_size': "screen_size_inches"}, axis=1, inplace=True)
laptops.columns

Index(['manufacturer', 'model_name', 'category', 'screen_size_inches',
       'screen', 'cpu', 'ram', 'storage', 'gpu', 'os', 'os_version', 'weight',
       'price_euros'],
      dtype='object')

In [17]:
laptops.index

RangeIndex(start=0, stop=1303, step=1)

In [18]:
laptops.rename(index=str).index

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '1293', '1294', '1295', '1296', '1297', '1298', '1299', '1300', '1301',
       '1302'],
      dtype='object', length=1303)

In [19]:
laptops.head()

Unnamed: 0,manufacturer,model_name,category,screen_size_inches,screen,cpu,ram,storage,gpu,os,os_version,weight,price_euros
0,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,,1.37kg,133969
1,Apple,Macbook Air,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,,1.34kg,89894
2,HP,250 G6,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,,1.86kg,57500
3,Apple,MacBook Pro,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,,1.83kg,253745
4,Apple,MacBook Pro,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,,1.37kg,180360


In [20]:
laptops["ram"].unique()

array(['8GB', '16GB', '4GB', '2GB', '12GB', '6GB', '32GB', '24GB', '64GB'],
      dtype=object)

In [21]:
laptops['ram'] = laptops['ram'].str.replace("GB","").astype(int)
laptops['ram']

0        8
1        8
2        8
3       16
4        8
        ..
1298     4
1299    16
1300     2
1301     6
1302     4
Name: ram, Length: 1303, dtype: int64

In [22]:
laptops.rename({'ram':'ram_gb'}, axis=1, inplace=True)
laptops['ram'].head()

NameError: name 'laptop' is not defined

In [None]:
laptops['weight'].unique()

In [None]:
laptops.loc[laptops['weight'].str.contains('s'), 'weight']

In [None]:
laptops.loc[1061, 'weight'] = '4kg'

In [None]:
laptops.loc[1061, 'weight']

In [None]:
laptops['weight'] = laptops['weight'].str.replace('kg','').astype(float)

In [None]:
laptops["price_euros"].unique()[:5]

In [None]:
laptops["price_euros"].unique()[-5:]

In [None]:
laptops['price_euros'] = laptops['price_euros'].str.replace(',','.').astype(float)

In [None]:
laptops['price_euros']

In [None]:
laptops["price_euros"].unique()[:5]

In [None]:
laptops["price_euros"].unique()[-5:]

In [None]:
laptops.rename({'weight':'weights_kg'}, axis=1, inplace=True)

In [None]:
laptops['weights_kg'].describe()

In [None]:
laptops["price_euros"].describe()

In [None]:
laptops['gpu'].head(10)

In [None]:
laptops['gpu'].head().str.split()

In [None]:
laptops['gpu'].head().str.split(n = 1)

In [None]:
laptops['gpu'].head().str.split(n = 1, expand = True)

In [None]:
laptops['gpu'].head().str.split(n=1, expand=True).iloc[:,0]

In [None]:
laptops["gpu_manufacturer"] = laptops["gpu"].str.split(n=1,expand=True).iloc[:,0]

In [None]:
laptops['cpu'].head(10)

In [None]:
laptops["cpu_manufacturer"] = laptops["cpu"].str.split(n=1,expand=True).iloc[:,0]

In [None]:
laptops['screen'].unique().shape

In [None]:
laptops['screen'].unique()[:10]

## pandas.Series.str.rsplit
You can refer document about built-in rsplit() at [here](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.str.rsplit.html?highlight=rsplit#pandas.Series.str.rsplit)

In [None]:
laptops['screen'].str.split(expand=True)

In [None]:
laptops.loc[:9, "screen"].str.rsplit(n=1,expand=True)

In [None]:
screen_res = laptops["screen"].str.rsplit(n=1, expand=True)
screen_res.columns = ['A', 'B']
screen_res.head()

In [None]:
screen_res.loc[screen_res['B'].isnull(), 'B'] = screen_res['A']
screen_res.head(10)

In [None]:
laptops['screen_resolution'] = screen_res['B']
laptops['screen_resolution']

In [None]:
laptops['screen_resolution'].unique().shape

In [None]:
laptops['screen_resolution'].unique()

In [None]:
laptops["cpu"].unique()[:5]

In [None]:
laptops["cpu"].str.replace("GHz","").head().str.rsplit(n=1,expand=True)

In [None]:
laptops["cpu_speed_ghz"] = laptops["cpu"].str.replace("GHz","").str.rsplit(n=1,expand=True).iloc[:,1].astype(float)
laptops['cpu_speed_ghz'].head()

In [None]:
laptops['os'].value_counts()

## pandas.Series.map
Used for substituting each value in a Series with another value, that may be derived from a function, a dict or a Series. Document at [here](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.map.html)

In [None]:
mapping_dict = {
    'Android': 'Android',
    'Chrome OS': 'Chrome OS',
    'Linux': 'Linux',
    'Mac OS': 'macOS',
    'No OS': 'No OS',
    'Windows': 'Windows',
    'macOS': 'macOS'
}
laptops["os"] = laptops["os"].map(mapping_dict)

In [None]:
laptops['os'].head(10)

In [None]:
laptops.info()

In [None]:
laptops.isnull().sum()

In [None]:
laptops_no_null_rows = laptops.dropna(axis=0)
laptops_no_null_cols = laptops.dropna(axis=1)

In [None]:
laptops["os_version"].value_counts(dropna=False)

In [None]:
os_with_null_v = laptops.loc[laptops['os_version'].isnull(), 'os']
os_with_null_v.value_counts()

In [None]:
mac_os_versions = laptops.loc[laptops["os"] == "macOS", "os_version"]
mac_os_versions.value_counts(dropna=False)

In [None]:
laptops.loc[laptops["os"] == "macOS", "os_version"] = "X"

In [None]:
value_counts_before = laptops.loc[laptops["os_version"].isnull(),"os"].value_counts()
value_counts_before

In [None]:
laptops.loc[laptops["os"] == "macOS","os_version"] = "X"
laptops.loc[laptops["os"] == "No OS","os_version"] = "No OS"

In [None]:
value_counts_after = laptops.loc[laptops["os_version"].isnull(),"os"].value_counts()
value_counts_after

In [None]:
laptops.loc[76:81, "storage"]

In [None]:
laptops["storage"] = laptops["storage"].str.replace('GB','').str.replace('TB',"000")
laptops["storage"].head(10)

In [None]:
laptops[["storage_1","storage_2"]] = laptops["storage"].str.split("+",expand=True)
laptops[["storage_1","storage_2"]]

In [None]:
for s in ["storage_1","storage_2"]:
    s_capacity = s + "_capacity_gb"
    s_type = s + "_type"
    laptops[[s_capacity, s_type]] = laptops[s].str.split(n=1,expand=True)
    laptops[s_capacity] = laptops[s_capacity].astype(float)
    
laptops.drop(["storage","storage_1","storage_2"],axis=1,inplace=False)

In [None]:
laptops.columns

In [None]:
laptops.dtypes

In [None]:
cols = ['manufacturer', 'model_name', 'category', 'screen_size_inches','screen', 'cpu', 'cpu_manufacturer', 'cpu_speed_ghz', 'ram_gb', 'storage_1_type', 'storage_1_capacity_gb', 'storage_2_type', 'storage_2_capacity_gb', 'gpu', 'gpu_manufacturer', 'os', 'os_version', 'weights_kg', 'price_euros']
laptops = laptops[cols]
laptops.to_csv('laptops_cleaned.scv',index=False)
laptops_cleaned = pd.read_csv('laptops_cleaned.scv')
laptops_cleaned_dtypes = laptops_cleaned.dtypes