# Basics of Data Wrangling
Data wrangling is the process of converting data from its initial format into a format that is better suited for analysis.

In [1]:
import matplotlib.pylab as plt
import numpy as np
import pandas as pd

In [2]:
# Read the automobile dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="automobile.csv")

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [4]:
# The raw file marks missing entries with "?". Swap that placeholder for NaN,
# the missing-value marker used by NumPy and pandas, so that the pandas
# missing-data tools can detect these entries.
# numpy (np) is imported in the import cell at the top of the notebook.
df = df.replace("?", np.nan)
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [None]:
# Tally the number of missing (NaN) entries per column.
df.isna().sum()


In [10]:
# No column is sparse enough to drop outright, so the missing values in
# "normalized-losses", "stroke", "bore", "horsepower" and "peak-rpm" are
# replaced with the mean of the respective column.
# The float cast is used only to compute the mean (NaN entries are skipped);
# the stored column is not cast here.
avg_norm_loss = df["normalized-losses"].astype("float").mean()

# Assign the filled column back to the frame. Calling an inplace method on
# df["normalized-losses"] acts on a temporary Series and, under pandas
# Copy-on-Write, leaves df unchanged.
df["normalized-losses"] = df["normalized-losses"].fillna(avg_norm_loss)

In [12]:
# Mean-impute "bore": compute the column mean on a float view of the data
# (NaN entries are skipped), then fill the missing entries with it.
avg_bore = df["bore"].astype("float").mean()

# Assign the result back instead of calling an inplace method on df["bore"];
# the chained inplace form acts on a temporary Series and, under pandas
# Copy-on-Write, leaves df unchanged.
df["bore"] = df["bore"].fillna(avg_bore)

In [13]:
# Mean-impute "stroke": compute the column mean on a float view of the data
# (NaN entries are skipped), then fill the missing entries with it.
avg_stroke = df["stroke"].astype("float").mean()

# Assign the result back instead of calling an inplace method on df["stroke"];
# the chained inplace form acts on a temporary Series and, under pandas
# Copy-on-Write, leaves df unchanged.
df["stroke"] = df["stroke"].fillna(avg_stroke)

In [15]:
# Mean-impute "horsepower": compute the column mean on a float view of the
# data (NaN entries are skipped), then fill the missing entries with it.
avg_horsepower = df["horsepower"].astype("float").mean()

# Assign the result back instead of calling an inplace method on
# df["horsepower"]; the chained inplace form acts on a temporary Series and,
# under pandas Copy-on-Write, leaves df unchanged.
df["horsepower"] = df["horsepower"].fillna(avg_horsepower)

In [16]:
# Mean-impute "peak-rpm": compute the column mean on a float view of the data
# (NaN entries are skipped), then fill the missing entries with it.
avg_peakrpm = df["peak-rpm"].astype("float").mean()

# Assign the result back instead of calling an inplace method on
# df["peak-rpm"]; the chained inplace form acts on a temporary Series and,
# under pandas Copy-on-Write, leaves df unchanged.
df["peak-rpm"] = df["peak-rpm"].fillna(avg_peakrpm)

5125.36945813
