# Python untuk Data Analysis


## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns                       #visualisation
import matplotlib.pyplot as plt             #visualisation
%matplotlib inline     
sns.set(color_codes=True)

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by pandas/seaborn.
warnings.filterwarnings('ignore')

## Importing Dataset

In [None]:
# Load the dataset from the remote CSV file into a DataFrame
url = 'https://raw.githubusercontent.com/marwanmusa/dataset/main/data.csv'
df = pd.read_csv(url)
# Display the first 10 rows
df.head(10) 

In [None]:
# Display the bottom 5 rows
df.tail(5)                        

## Data Wrangling

### *Checking Tipe Data*

In [None]:
# Show the data type of each column
df.dtypes

### *Hapus Kolom yang Irrelevant*

In [None]:
# Columns that are not needed for the rest of the analysis
irrelevant_columns = [
    'Engine Fuel Type',
    'Market Category',
    'Vehicle Style',
    'Popularity',
    'Number of Doors',
    'Vehicle Size',
]
df = df.drop(columns=irrelevant_columns)
# Preview the first 5 rows of the trimmed frame
df.head()

### *Mengubah Nama Kolom*

In [None]:
# Map each original column name to its shorter replacement
column_aliases = {
    "Engine HP": "HP",
    "Engine Cylinders": "Cylinders",
    "Transmission Type": "Transmission",
    "Driven_Wheels": "Drive Mode",
    "highway MPG": "MPG-H",
    "city mpg": "MPG-C",
    "MSRP": "Price",
}
df = df.rename(columns=column_aliases)
# Preview the first 5 rows with the new column names
df.head()

### *Menghapus Baris Duplikat*

In [None]:
# (rows, columns) of the frame before duplicates are removed
df.shape

In [None]:
# Flag every row that exactly repeats an earlier row, then show those rows
duplicate_mask = df.duplicated()
df[duplicate_mask]

In [None]:
# Count the non-null values in each column (not the total number of rows)
df.count()

As seen above, there are 11914 rows, and we are removing 989 rows of duplicate data.

In [None]:
# Remove duplicated rows, keeping the first occurrence of each
df = df.drop_duplicates()
# Non-null value count per column after dropping the duplicates
df.count()

### *Menghapus Missing Values*

In [None]:
# Count the missing (null) values in each column
print(df.isnull().sum())

In [None]:
# Drop every row that contains at least one missing value
df = df.dropna()
# Non-null value count per column after the drop
df.count()

Now we have removed all the rows which contain the Null or N/A values (Cylinders and Horsepower (HP)).

In [None]:
# Re-check the missing values per column after dropping; each count should now be zero
print(df.isnull().sum())

### *Mendeteksi Outliers*

In [None]:
# Box plot of Price; points drawn beyond the whiskers are candidate outliers
sns.boxplot(x=df['Price'])

In [None]:
# Box plot of HP; points drawn beyond the whiskers are candidate outliers
sns.boxplot(x=df['HP'])

In [None]:
# Box plot of Cylinders; points drawn beyond the whiskers are candidate outliers
sns.boxplot(x=df['Cylinders'])

In [None]:
# Compute the interquartile range (IQR = Q3 - Q1) of every numeric column.
# numeric_only=True keeps the text columns out of the calculation; without it,
# pandas 2.0+ tries to take quantiles of those columns as well and raises
# TypeError.
Q1 = df.quantile(0.25, numeric_only=True)
Q3 = df.quantile(0.75, numeric_only=True)
IQR = Q3 - Q1
print(IQR)

In [None]:
# Remove every row that has an outlier in at least one numeric column, using
# the 1.5 * IQR rule.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Compare only the columns that have an IQR. Comparing the full frame would
# also compare the text columns against the numeric bounds, which can raise
# TypeError on recent pandas versions.
numeric_part = df[IQR.index]
is_outlier = ((numeric_part < lower_bound) | (numeric_part > upper_bound)).any(axis=1)

df = df[~is_outlier]
# (rows, columns) after removing the outlier rows
df.shape

## Exploratory Data Analysis

### *Plot Korelasi antar Feature*

**Histogram**

In [None]:
# For frequency data, use a bar chart: count the cars per make and plot the
# 40 most frequent makes.
make_counts = df['Make'].value_counts().nlargest(40)
make_axes = make_counts.plot.bar(figsize=(10, 5))
make_axes.set_title("Number of cars by make")
make_axes.set_ylabel('Number of cars')
make_axes.set_xlabel('Make');

**Heat Maps**

In [None]:
# To analyse the correlation between all variables, use a heatmap.
plt.figure(figsize=(10,5))
# Correlation is only defined for numeric data, so select the numeric columns
# first. Calling df.corr() on a frame that still holds text columns raises on
# pandas 2.0+.
c = df.select_dtypes(include='number').corr()
sns.heatmap(c, cmap="BrBG", annot=True)
# Display the correlation matrix itself below the plot
c

**Scatterplot**

In [None]:
# To inspect the relationship between two variables, use a scatter plot.
fig, ax = plt.subplots(figsize=(10, 6))
horsepower = df['HP']
price = df['Price']
ax.scatter(horsepower, price)
ax.set(xlabel='HP', ylabel='Price')
plt.show()

## Grouping Data

### *Categorical Data*

In [None]:
# Select only the object-dtype columns, i.e. the categorical data
df.select_dtypes(include=[object])

### *Numerical Data*

In [None]:
# Select every column that is not object-dtype, treated here as the numerical data
df.select_dtypes(exclude=[object])

## Descriptive statistics using `describe()`

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

### *Data Distribution*

In [None]:
# Numerical columns as a variable
data_num_columns = df.select_dtypes(exclude=['object'])

# Size the subplot grid from the number of numeric columns instead of
# hard-coding 3x2, so the cell keeps working if the set of columns changes.
n_grid_cols = 2
n_grid_rows = (data_num_columns.shape[1] + n_grid_cols - 1) // n_grid_cols  # ceiling division

# Visualizing the data distribution: one histogram with a KDE curve per column
x = plt.figure(figsize=(18, 16))

# Iterating a DataFrame yields its column names: i is the position, j the name
for i, j in enumerate(data_num_columns):
    axis = x.add_subplot(n_grid_rows, n_grid_cols, i + 1)
    # histplot(kde=True, stat="density") is seaborn's replacement for the
    # deprecated distplot
    sns.histplot(df[j], bins=15, kde=True, stat="density", ax=axis)

# Adjust the layout once, after all subplots have been created
plt.tight_layout()

In [None]:
# Print each numeric column name together with its position
for position, column_name in enumerate(data_num_columns.columns):
    print(position, column_name)