<a href="https://colab.research.google.com/github/luqyz/Mudah.my/blob/main/Web_Scrapping_(Mudah_my).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Data Scraping

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Scrape the "Region" and "Condition" fields from the Mudah.my car listing
# page and collect them into a two-column DataFrame named `df`.

# URL of the page to scrape
url = 'https://www.mudah.my/malaysia/cars-for-sale'

# Send the request with a browser-like User-Agent header.
# NOTE(review): presumably the site rejects the default python-requests agent - confirm.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36'
}

# The timeout stops the cell from hanging forever on a stalled connection;
# raise_for_status() fails loudly on 4xx/5xx instead of letting an error page
# be parsed into an empty or misleading DataFrame.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()

# Parse the HTML content with BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Scraping "Region" data: every <span title="Region"> on the page
region_elements = soup.find_all('span', {'title': 'Region'})
regions = [element.get_text(strip=True) for element in region_elements]

# Scraping "Condition" data: every <div title="Condition"> on the page
condition_elements = soup.find_all('div', title='Condition')
conditions = [element.get_text(strip=True) for element in condition_elements]

# The two lists are gathered independently and paired by position, so they
# must be the same length. Fail with the actual counts rather than with the
# generic pandas length error.
# NOTE(review): equal counts do not prove the rows are aligned; if a listing
# can omit one of the two fields, scrape per listing container instead.
if len(regions) != len(conditions):
    raise ValueError(
        f"Scraped {len(regions)} Region values but {len(conditions)} Condition "
        "values; the page layout may have changed."
    )

# Create a DataFrame with both Condition and Region data
df = pd.DataFrame({
    "Region": regions,
    "Condition": conditions,
})

# Display the combined DataFrame
print("DataFrame:")
print(df)



DataFrame:
          Region Condition
0       Selangor      Used
1          Johor       New
2          Johor       New
3       Selangor      Used
4   Kuala Lumpur      Used
5   Kuala Lumpur      Used
6   Kuala Lumpur      Used
7   Kuala Lumpur      Used
8   Kuala Lumpur     Recon
9   Kuala Lumpur      Used
10  Kuala Lumpur      Used
11  Kuala Lumpur     Recon
12      Selangor      Used
13  Kuala Lumpur     Recon
14      Selangor      Used
15  Kuala Lumpur     Recon
16      Selangor      Used
17      Selangor     Recon
18  Kuala Lumpur     Recon
19      Selangor     Recon
20      Selangor      Used
21      Selangor      Used
22      Selangor     Recon
23      Selangor      Used
24  Kuala Lumpur      Used
25  Kuala Lumpur     Recon
26      Selangor      Used
27      Selangor      Used
28        Pahang      Used
29      Selangor      Used
30         Sabah      Used
31      Selangor      Used
32      Selangor      Used
33      Selangor      Used
34  Kuala Lumpur      Used
35      Selangor 

Data Cleaning

In [None]:
# 1. Data Formatting: normalise both text columns
#    (lowercase first, then trim surrounding whitespace)
for column_name in ('Region', 'Condition'):
    df[column_name] = df[column_name].str.lower().str.strip()
print(df)

          Region Condition
0       selangor      used
1          johor       new
2          johor       new
3       selangor      used
4   kuala lumpur      used
5   kuala lumpur      used
6   kuala lumpur      used
7   kuala lumpur      used
8   kuala lumpur     recon
9   kuala lumpur      used
10  kuala lumpur      used
11  kuala lumpur     recon
12      selangor      used
13  kuala lumpur     recon
14      selangor      used
15  kuala lumpur     recon
16      selangor      used
17      selangor     recon
18  kuala lumpur     recon
19      selangor     recon
20      selangor      used
21      selangor      used
22      selangor     recon
23      selangor      used
24  kuala lumpur      used
25  kuala lumpur     recon
26      selangor      used
27      selangor      used
28        pahang      used
29      selangor      used
30         sabah      used
31      selangor      used
32      selangor      used
33      selangor      used
34  kuala lumpur      used
35      selangor      used
3

In [None]:
# 2. Remove duplicates: flag every row that repeats an earlier row,
#    then keep only the unflagged (first-seen) rows. Original index labels
#    are preserved.
is_repeat = df.duplicated(keep='first')
df_cleaned = df[~is_repeat]

# Display the cleaned DataFrame (heading and frame on separate lines)
print("\nCleaned DataFrame:", df_cleaned, sep="\n")


Cleaned DataFrame:
          Region Condition
0       selangor      used
1          johor       new
4   kuala lumpur      used
8   kuala lumpur     recon
17      selangor     recon
28        pahang      used
30         sabah      used
38         johor      used


Save the cleaned DataFrame to a CSV file

In [None]:
# Write the de-duplicated rows to disk; index=False leaves the row index
# out of the CSV.
output_csv = 'scraped_car_data_cleaned.csv'
df_cleaned.to_csv(output_csv, index=False)

# Confirm where the data was written
print(f"\nData saved to '{output_csv}'")


Data saved to 'scraped_car_data_cleaned.csv'
