<a href="https://colab.research.google.com/github/omeryldzk/WebScraping-Dataset-/blob/main/fee_2024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import pandas as pd
import re

# Read the raw input table.
df = pd.read_csv('university_result.csv')

# Step 1: remove the fee/scholarship title text embedded in the "Üniversite"
# values. The two phrasings are stripped one after the other, in this order.
title_phrases = (
    r' Eğitim Ücretleri 2025 ve Bursları',
    r' Eğitim Ücretleri ve Bursları 2024 2025',
)
for title_phrase in title_phrases:
    df['Üniversite'] = df['Üniversite'].str.replace(title_phrase, '', regex=True)

# Step 2: Extract "burs_oranı" and clean up the "ücret" column
def extract_burs_oranı(row):
    """Return the first ``%NN`` token found in a row, or ``'Ücretli'``.

    The "Bölüm/Fakülte" value is searched first, then the "Ücret" value; both
    are converted with ``str()`` so non-string cells do not break the search.

    Parameters
    ----------
    row : mapping-like
        Must provide the keys ``'Bölüm/Fakülte'`` and ``'Ücret'``.

    Returns
    -------
    str
        The matched text including the percent sign (e.g. ``'%50'``), or
        ``'Ücretli'`` when neither field contains a percentage.
    """
    searched_text = ' '.join([str(row['Bölüm/Fakülte']), str(row['Ücret'])])
    found = re.search(r'%\d+', searched_text)
    if found is None:
        return 'Ücretli'
    return found.group(0)

# Label every row with its scholarship rate ('%NN' or 'Ücretli').
burs_labels = df.apply(extract_burs_oranı, axis=1)
df['burs_oranı'] = burs_labels

# Step 3: drop the '%NN' token from "Bölüm/Fakülte" now that the rate has
# its own column.
stripped_departments = df['Bölüm/Fakülte'].str.replace(r'%\d+', '', regex=True)
df['Bölüm/Fakülte'] = stripped_departments
# Step 4 ("TL" removal) has no code in this cell: "Ücret" is written unchanged.

# Persist the intermediate table, leaving the index out of the file.
df.to_csv('cleaned_data.csv', index=False)

print("CSV file has been formatted successfully and saved as 'cleaned_data.csv'")

CSV file has been formatted successfully and saved as 'cleaned_data.csv'


In [21]:
# Start this cell from the intermediate file rather than from in-memory state.
intermediate_csv = 'cleaned_data.csv'
df = pd.read_csv(intermediate_csv)

# Step 3: Handle rows with two `ücret` values
def split_duplicated_ucret(row):
    """Expand one input row into one output row per fee amount in ``Ücret``.

    Fee amounts are recognised as digit groups joined by dots (``150.000``,
    ``1.250.000``). Percentages written as ``%NN`` (whitespace after ``%`` is
    allowed) in the same cell are paired with the amounts by position.

    Parameters
    ----------
    row : pandas.Series
        A row providing at least the ``'Ücret'`` and ``'burs_oranı'`` fields.

    Returns
    -------
    pandas.DataFrame
        One row per matched amount. ``Ücret`` holds the matched amount text;
        ``burs_oranı`` holds ``'%NN'`` for the percentage at the same position,
        or the row's existing ``burs_oranı`` when there is no percentage at
        that position. The frame is empty when the cell has no dotted amount.
    """
    fee_text = str(row['Ücret'])
    # Accept one or more ".digits" groups so that amounts with several
    # separators ("1.250.000") are captured whole instead of cut at "1.250".
    ucret_matches = re.findall(r'\d+(?:\.\d+)+', fee_text)
    # Percentage values (burs oranı) found in the same cell, digits only.
    burs_matches = re.findall(r'%\s*(\d+)', fee_text)

    rows = []
    for i, ucret in enumerate(ucret_matches):
        new_row = row.copy()
        new_row['Ücret'] = ucret
        # Pair by position; fall back to the value already on the row.
        new_row['burs_oranı'] = f'%{burs_matches[i]}' if i < len(burs_matches) else row['burs_oranı']
        rows.append(new_row)

    return pd.DataFrame(rows)

# Expand every row into one row per fee amount and stack the pieces into a
# single frame with a fresh 0..n-1 index.
df = pd.concat(df.apply(split_duplicated_ucret, axis=1).tolist(), ignore_index=True)

# Reduce each 'Ücret' value to its digits only (drops the dot separators and
# any other non-digit character). The values stay strings.
df['Ücret'] = (
    df['Ücret']
    .replace(r'[^\d]', '', regex=True)   # raw string: '\d' is not a valid str escape
    .astype(str)                         # keeps text; this is not a numeric conversion
)

df.to_csv('final_cleaned_data.csv', index=False)

In [28]:
import pandas as pd
import re

# Read 'final_cleaned_data.csv' from the working directory (in Colab the file
# has to be present in the runtime; edit the path if it lives elsewhere).
final_cleaned_csv = 'final_cleaned_data.csv'
df = pd.read_csv(final_cleaned_csv)

# Function to convert burs_oranı
def convert_burs_oranı(row):
    """Convert a row's ``burs_oranı`` label to an integer percentage.

    The first run of digits in the label is parsed as the rate, so ``'%50'``
    gives 50, ``'%25'`` gives 25 and any other rate (``'%75'``, ``'%100'``)
    keeps its own value. Labels without digits, such as ``'Ücretli'``, give 0.
    The value is passed through ``str()`` first, so non-string cells do not
    raise.

    Parameters
    ----------
    row : mapping-like
        Must provide the key ``"burs_oranı"``.

    Returns
    -------
    pandas.Series
        A one-element Series holding the integer rate.
    """
    digits = re.search(r'\d+', str(row["burs_oranı"]))
    # No digits means no scholarship percentage was recorded (e.g. 'Ücretli').
    rate = int(digits.group(0)) if digits else 0

    return pd.Series([rate])

# Replace the 'burs_oranı' labels with the numeric values computed per row.
df[["burs_oranı"]] = df.apply(convert_burs_oranı, axis=1)

# Write the result; the path is named once so the message below cannot
# drift from the file that is actually written.
result_csv = 'result_2024.csv'
df.to_csv(result_csv, index=False)

print(f"burs_oranı values have been updated successfully and saved as '{result_csv}'")


burs_oranı values have been updated successfully and saved as 'updated_data.csv'
