In [1]:
import pandas as pd
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz, process
import warnings
from sklearn.preprocessing import OneHotEncoder, LabelEncoder




## Load YÖK ATLAS Data and add standard deviation

In [2]:


# Load the dataset (no dtype overrides are passed, so pandas infers every column's type)
df = pd.read_csv('FINALDATA.csv')

print(df.isnull().sum())

# Convert "baseRanking" and "topRanking" columns to numeric, forcing errors to NaN
df["baseRanking"] = pd.to_numeric(df["baseRanking"], errors='coerce')
df["topRanking"] = pd.to_numeric(df["topRanking"], errors='coerce')

# Row-level spread between the base and top admission rankings
def calculate_std(row):
    """Return the population standard deviation of a row's two ranking values.

    Parameters
    ----------
    row : mapping-like
        Must provide the keys "baseRanking" and "topRanking".

    Returns
    -------
    float
        ``np.std`` of the two values (NaN if either value is NaN).
    """
    ranking_pair = [row[column] for column in ("baseRanking", "topRanking")]
    return np.std(ranking_pair)

# Apply the function to each row
df["stdDeviationStudents"] = df.apply(calculate_std, axis=1)

print(df.describe().round(3))

  df = pd.read_csv('FINALDATA.csv')


academicYear                       0
universityName                     0
universityType                     0
faculty                            0
departmentName                     0
idOSYM                             0
programType                        0
language                           0
scholarshipRate                36928
quota                              0
occupiedSlots                      0
tuitionFee                     80505
universityFoundingYear         80505
facultyFoundingYear            80505
universityLocation                 0
universityRegion                   0
profCount                          0
assoCount                          0
docCount                           0
baseRanking                        0
topRanking                         0
avgAdmissionRanking(TYT)           0
baseAdmissionRanking(TYT)          0
stdDeviationStudents           80505
revenue                        80505
outOfCityStudentRate               0
totalPreference                    0
w

## Foreign Students Features

In [3]:

df2 = pd.read_csv('foreignStudentDataFinal.csv')

# Create lookup dictionaries
foreign_students_lookup = df2.set_index('universityName')['totalForeignStudents'].to_dict()
total_students_lookup = df2.set_index('universityName')['totalStudentNumber'].to_dict()

# Add new columns to df based on the lookup dictionaries
df['totalForeignStudents'] = df['universityName'].map(foreign_students_lookup)
df['totalStudentNumber'] = df['universityName'].map(total_students_lookup)

# Display the updated DataFrame
print(df.describe().round(3))



       academicYear        idOSYM      quota  occupiedSlots  tuitionFee  \
count     80505.000  8.050500e+04  80505.000      80505.000         0.0   
mean       2021.320  1.575328e+08     41.231         37.401         NaN   
std           1.512  4.911195e+07     76.450         70.647         NaN   
min        2019.000  1.001100e+08      1.000          0.000         NaN   
25%        2020.000  1.061107e+08     10.000          8.000         NaN   
50%        2021.000  2.005106e+08     31.000         25.000         NaN   
75%        2022.000  2.037114e+08     60.000         58.000         NaN   
max        2024.000  2.104028e+08   5800.000       5945.000         NaN   

       universityFoundingYear  facultyFoundingYear  profCount  assoCount  \
count                     0.0                  0.0  80505.000  80505.000   
mean                      NaN                  NaN      4.586      2.328   
std                       NaN                  NaN     16.807      6.185   
min                 

## URAP Features

In [5]:
import pandas as pd
import numpy as np

# Load the datasets
df_2024 = pd.read_csv('2024.csv')
df_2023 = pd.read_csv('2023.csv')
df_2022 = pd.read_csv('2022.csv')
df_2021 = pd.read_csv('2021.csv')
df_2020 = pd.read_csv('2020.csv')
df_2019 = pd.read_csv('2019.csv')

# Convert "academicYear" column to numeric, forcing errors to NaN
df["academicYear"] = pd.to_numeric(df["academicYear"], errors='coerce')

# Ensure 'universityName' column exists in all DataFrames
required_columns = ['universityName', 'No', 'Toplam Puan']
for year_df in [df_2024, df_2023, df_2022, df_2021, df_2020, df_2019]:
    missing_columns = [col for col in required_columns if col not in year_df.columns]
    if missing_columns:
        raise ValueError(f"Missing columns {missing_columns} in one of the year DataFrames")

# Initialize an empty DataFrame to store the merged results
merged_df = pd.DataFrame()

# Join a single row with the URAP table that belongs to its academic year
def merge_by_year(row):
    """Left-merge one row with the URAP ranking table of the row's academic year.

    The row is wrapped in a one-row DataFrame and merged on 'universityName'
    with the 'No' and 'Toplam Puan' columns of the matching year table
    (2019-2024). Rows whose "academicYear" is not one of those years are
    returned as a one-row DataFrame without the URAP columns.
    """
    urap_tables = {
        2024: df_2024,
        2023: df_2023,
        2022: df_2022,
        2021: df_2021,
        2020: df_2020,
        2019: df_2019,
    }
    single_row = pd.DataFrame([row])
    urap_table = urap_tables.get(row["academicYear"])
    if urap_table is None:
        return single_row
    return pd.merge(
        single_row,
        urap_table[['universityName', 'No', 'Toplam Puan']],
        on='universityName',
        how='left',
    )

# Attach the URAP rank/score of the matching academic year to every row.
# All yearly URAP tables are stacked into one lookup tagged with their year, and
# a single left merge on (academicYear, universityName) replaces the previous
# one-merge-per-row loop. A left merge keeps the row order of `df`; rows whose
# year or university has no URAP entry get NaN in the URAP columns, and a
# university listed more than once in a year's table still yields one output
# row per listing.
urap_by_year = pd.concat(
    [
        year_df[['universityName', 'No', 'Toplam Puan']].assign(academicYear=year)
        for year, year_df in (
            (2024, df_2024),
            (2023, df_2023),
            (2022, df_2022),
            (2021, df_2021),
            (2020, df_2020),
            (2019, df_2019),
        )
    ],
    ignore_index=True,
)
merged_df = df.merge(urap_by_year, on=['academicYear', 'universityName'], how='left')

# Rename the columns
merged_df.rename(columns={'No': 'Urap_Rank', 'Toplam Puan': 'Urap_Score'}, inplace=True)



In [6]:
print(merged_df.describe().round(3))

       academicYear        idOSYM      quota  occupiedSlots  tuitionFee  \
count     80505.000  8.050500e+04  80505.000      80505.000         0.0   
mean       2021.320  1.575328e+08     41.231         37.401         NaN   
std           1.512  4.911195e+07     76.450         70.647         NaN   
min        2019.000  1.001100e+08      1.000          0.000         NaN   
25%        2020.000  1.061107e+08     10.000          8.000         NaN   
50%        2021.000  2.005106e+08     31.000         25.000         NaN   
75%        2022.000  2.037114e+08     60.000         58.000         NaN   
max        2024.000  2.104028e+08   5800.000       5945.000         NaN   

       universityFoundingYear  facultyFoundingYear  profCount  assoCount  \
count                     0.0                  0.0  80505.000  80505.000   
mean                      NaN                  NaN      4.586      2.328   
std                       NaN                  NaN     16.807      6.185   
min                 

## Remove duplicate rows and save a checkpoint as before_fee.csv

In [7]:
df_cleaned = merged_df.drop_duplicates(subset=['academicYear', 'idOSYM' , 'scholarshipRate', 'language'], keep='first')
df_cleaned.to_csv("before_fee.csv", index=False)

In [9]:
print(df_cleaned.describe().round(3))

       academicYear        idOSYM      quota  occupiedSlots  tuitionFee  \
count     52221.000  5.222100e+04  52221.000      52221.000         0.0   
mean       2021.732  1.446714e+08     46.849         43.973         NaN   
std           1.664  4.822705e+07     87.653         82.352         NaN   
min        2019.000  1.001100e+08      1.000          0.000         NaN   
25%        2020.000  1.048102e+08     14.000         10.000         NaN   
50%        2022.000  1.094106e+08     40.000         34.000         NaN   
75%        2023.000  2.027123e+08     62.000         62.000         NaN   
max        2024.000  2.104028e+08   5800.000       5945.000         NaN   

       universityFoundingYear  facultyFoundingYear  profCount  assoCount  \
count                     0.0                  0.0  52221.000  52221.000   
mean                      NaN                  NaN      4.881      2.551   
std                       NaN                  NaN     17.307      6.368   
min                 

## Fee Estimation

In [10]:


# Load data
yok_df = pd.read_csv("before_fee.csv")
fee_df = pd.read_csv("fee_updated.csv")

# Convert the 'Ücret' column to numeric, handling non-numeric values
fee_df['Ücret'] = pd.to_numeric(fee_df['Ücret'], errors='coerce')
yok_df['scholarshipRate'] = pd.to_numeric(yok_df['scholarshipRate'], errors='coerce')
yok_df['academicYear'] = yok_df['academicYear'].astype(int)
fee_df = fee_df.dropna(subset=['Ücret'])  # Remove rows with NaN in 'Ücret'

# Define inflation rates
inflation_rates = {
    2019: 1.1,
    2020: 1.3,
    2021: 1.8,
    2022: 2.1,
    2023: 2,
    2024: 1.9,
    2025: 2,
}

# Canonical form used for name comparison: no spaces, all lowercase
def normalize_string(value):
    """Return `value` with every space character removed and lowercased.

    Anything that is not a ``str`` (for example NaN or None) is returned unchanged.
    """
    if not isinstance(value, str):
        return value
    without_spaces = value.replace(" ", "")
    return without_spaces.lower()

# Apply normalization to relevant columns in both dataframes
yok_df['departmentName'] = yok_df['departmentName'].apply(normalize_string)
yok_df['faculty'] = yok_df['faculty'].apply(normalize_string)
fee_df['Bölüm/Fakülte'] = fee_df['Bölüm/Fakülte'].apply(normalize_string)

# Move a fee from one academic year to another using the yearly inflation factors
def adjust_fee(base_fee, base_year, target_year):
    """Scale `base_fee` from `base_year` to `target_year` with `inflation_rates`.

    Going forward in time the fee is multiplied by the factor of every year in
    ``[base_year, target_year)``; going backward it is divided by the factor of
    every year in ``[target_year, base_year)``. Equal years leave the fee
    unchanged. A year missing from `inflation_rates` raises ``KeyError``.
    """
    if base_year < target_year:
        years, forward = range(base_year, target_year), True
    elif base_year > target_year:
        years, forward = range(target_year, base_year), False
    else:
        years, forward = (), True

    factor = 1.0
    for yr in years:
        rate = inflation_rates[yr]
        factor = factor * rate if forward else factor / rate
    return float(base_fee) * factor

# Best fuzzy candidate for a name, or None when nothing is similar enough
def fuzzy_match(value, choices, threshold=70):
    """Return the entry of `choices` most similar to `value`.

    Similarity is scored with ``fuzz.token_set_ratio`` through
    ``process.extractOne``. ``None`` is returned when no candidate is produced
    or when the best score is below `threshold`.
    """
    best = process.extractOne(value, choices, scorer=fuzz.token_set_ratio)
    if not best:
        return None
    candidate, similarity = best
    if similarity >= threshold:
        return candidate
    return None

# Adjust fee based on scholarship difference
def adjust_for_scholarship(base_fee, base_scholarship, target_scholarship):
    """Convert a fee quoted at one scholarship rate to another scholarship rate.

    The fee is scaled by the ratio of the paid shares:
    ``(100 - target_scholarship) / (100 - base_scholarship)``.

    Parameters
    ----------
    base_fee : number or numeric string
        Fee observed at `base_scholarship`; converted with ``float``.
    base_scholarship, target_scholarship : number
        Scholarship percentages (0-100).

    Returns
    -------
    float
        The rescaled fee. When both rates are equal the fee is returned
        unchanged. When `base_scholarship` is 100 and the target differs, the
        full price cannot be recovered from the observed fee, so NaN is
        returned instead of dividing by zero.
    """
    base_fee = float(base_fee)
    if base_scholarship == target_scholarship:
        return base_fee

    paid_share_at_base = 100 - base_scholarship
    if paid_share_at_base == 0:
        # A 100% scholarship row carries no information about the list price.
        return float("nan")

    adjustment_factor = (100 - target_scholarship) / paid_share_at_base
    return base_fee * adjustment_factor

# Mean zero-scholarship fee over all fee records of one academic year
def calculate_yearly_average_fee(year):
    """Return the mean fee of `year` after rescaling every record to a 0% scholarship.

    Records are taken from `fee_df` where 'Akademik yıl' equals `year`; each
    'Ücret' is converted with `adjust_for_scholarship` from its 'burs_oranı' to
    0. Returns ``None`` when `fee_df` has no record for that year.
    """
    records_for_year = fee_df[fee_df['Akademik yıl'] == year]
    if records_for_year.empty:
        return None

    def full_price(record):
        return adjust_for_scholarship(record['Ücret'], record['burs_oranı'], 0)

    full_prices = records_for_year.apply(full_price, axis=1)
    return full_prices.mean()

# Main function to find the best fee
def _nearest_year_position(candidates, year):
    """Return the position of the row in `candidates` whose 'Akademik yıl' is closest to `year`.

    Rows with a missing year rank last and ties resolve to the earliest row, so
    a frame without any usable year yields position 0.
    """
    gaps = (candidates['Akademik yıl'] - year).abs().astype(float)
    return int(gaps.fillna(float('inf')).to_numpy().argmin())


def find_best_fee(yok_row):
    """Estimate the tuition fee for one YÖK program row from the records in `fee_df`.

    Returns 0 for "devlet" universities and for 100% scholarship programs.
    Otherwise the university name is fuzzy-matched against
    ``fee_df['universityName']``, the department (then the faculty) is
    fuzzy-matched against that university's 'Bölüm/Fakülte' values, and the
    first of these fallbacks that finds data is used:

    1. same university, same department/faculty, same year;
    2. same university, same year (median over its records);
    3. same university, same department/faculty, record of the nearest year,
       moved to the target year with `adjust_fee`;
    4. a similarly named department at any university in the same year (median);
    5. same university, record of the nearest year, moved with `adjust_fee`;
    6. the yearly average from `calculate_yearly_average_fee` (may be ``None``).

    Every fee found is rescaled from the record's 'burs_oranı' to the row's
    'scholarshipRate' with `adjust_for_scholarship`.

    NOTE(review): `uni` is space-stripped and lowercased, while
    ``fee_df['universityName']`` is compared as loaded; confirm the fee file
    stores names in a comparable form.
    """
    uni = normalize_string(yok_row['universityName'])
    dept = normalize_string(yok_row['departmentName'])
    faculty = normalize_string(yok_row['faculty'])
    scholarship = yok_row['scholarshipRate']
    year = yok_row['academicYear']

    if yok_row['universityType'].lower() == "devlet":
        return 0
    # If the scholarship is 100%, set fee to 0
    if scholarship == 100:
        return 0
    # Find closest university name in fee.csv
    uni_match = fuzzy_match(uni, fee_df['universityName'].unique())

    # Attempt matches with both department and faculty names; the department
    # is tried first and wins when it matches.
    dept_or_faculty_matches = [dept, faculty]

    name_match = None
    for name_option in dept_or_faculty_matches:
        name_match = fuzzy_match(
            name_option,
            fee_df[fee_df['universityName'] == uni_match]['Bölüm/Fakülte'].unique()
        )
        if name_match:
            break

    # Step 1: Exact match on department/faculty and year, ignoring scholarship initially
    exact_match = fee_df[
        (fee_df['universityName'] == uni_match) &
        (fee_df['Bölüm/Fakülte'] == name_match) &
        (fee_df['Akademik yıl'] == year)
    ]
    if not exact_match.empty:
        found_fee = exact_match['Ücret'].iloc[0]
        found_scholarship = exact_match['burs_oranı'].iloc[0]
        return adjust_for_scholarship(found_fee, found_scholarship, scholarship)

    # Step 2: Check other departments/faculties within the same year
    other_dept_same_year = fee_df[
        (fee_df['universityName'] == uni_match) &
        (fee_df['Akademik yıl'] == year)
    ]
    if not other_dept_same_year.empty:
        found_fee = other_dept_same_year['Ücret'].median()
        found_scholarship = other_dept_same_year['burs_oranı'].median()
        return adjust_for_scholarship(found_fee, found_scholarship, scholarship)

    # Step 3: Check the same department/faculty in other years
    same_dept_other_years = fee_df[
        (fee_df['universityName'] == uni_match) &
        (fee_df['Bölüm/Fakülte'] == name_match)
    ]
    if not same_dept_other_years.empty:
        # Use the record nearest in time so the inflation adjustment spans as
        # few years as possible (previously the first row in file order).
        nearest = _nearest_year_position(same_dept_other_years, year)
        closest_year = same_dept_other_years['Akademik yıl'].iloc[nearest]
        closest_fee = same_dept_other_years['Ücret'].iloc[nearest]
        closest_scholarship = same_dept_other_years['burs_oranı'].iloc[nearest]
        adjusted_fee = adjust_fee(closest_fee, closest_year, year)
        return adjust_for_scholarship(adjusted_fee, closest_scholarship, scholarship)

    # Step 4: Fallback to similar department/faculty at other universities if no match found
    similar_dept = fuzzy_match(dept, fee_df['Bölüm/Fakülte'].unique())
    if similar_dept:
        dept_match_same_year = fee_df[
            (fee_df['Bölüm/Fakülte'] == similar_dept) &
            (fee_df['Akademik yıl'] == year)
        ]
        if not dept_match_same_year.empty:
            found_fee = dept_match_same_year['Ücret'].median()
            found_scholarship = dept_match_same_year['burs_oranı'].median()
            return adjust_for_scholarship(found_fee, found_scholarship, scholarship)

    # Step 5: Search other departments/faculties in other years
    other_dept_other_years = fee_df[
        (fee_df['universityName'] == uni_match)
    ]
    if not other_dept_other_years.empty:
        nearest = _nearest_year_position(other_dept_other_years, year)
        closest_year = other_dept_other_years['Akademik yıl'].iloc[nearest]
        closest_fee = other_dept_other_years['Ücret'].iloc[nearest]
        closest_scholarship = other_dept_other_years['burs_oranı'].iloc[nearest]
        adjusted_fee = adjust_fee(closest_fee, closest_year, year)
        return adjust_for_scholarship(adjusted_fee, closest_scholarship, scholarship)

    # If all else fails, use the average fee for the year
    year_avg_fee = calculate_yearly_average_fee(year)
    return year_avg_fee

# Apply function to each row in yök.csv
yok_df['Estimated Fee'] = yok_df.apply(find_best_fee, axis=1)
# Save the updated yök data with estimated fees
yok_df.to_csv("fee_yök.csv", index=False)


  yok_df = pd.read_csv("before_fee.csv")


## TUİK Features

In [11]:

# Load the datasets
ilk_is_bulma_suresi_df = pd.read_csv('ilk_iş_bulma_süresi_cleraed.csv')
istihdam_orani_df = pd.read_csv('istihdam_oranı_cleared.csv')
kazanc_grubu_df = pd.read_csv('kazanç_grubu_cleared.csv')
yok_with_fees_df = pd.read_csv('fee_yök.csv')

def get_avarage(row,year):
    """Return the mean of the row's '2022' entry for years up to 2022, else of its '2023' entry."""
    column = '2022' if year <= 2022 else '2023'
    return row[column].mean()

# Strip spaces and lowercase text so names can be compared; non-strings pass through
def normalize_string(value):
    """Return a space-free, lowercase copy of a ``str``; return any other value as is."""
    return value.replace(" ", "").lower() if isinstance(value, str) else value

# Apply normalization to relevant columns in both dataframes
yok_with_fees_df['departmentName'] = yok_with_fees_df['departmentName'].apply(normalize_string)
yok_with_fees_df['faculty'] = yok_with_fees_df['faculty'].apply(normalize_string)
kazanc_grubu_df['Program'] = kazanc_grubu_df['Program'].apply(normalize_string)
istihdam_orani_df['Program'] = istihdam_orani_df['Program'].apply(normalize_string)
ilk_is_bulma_suresi_df['Program'] = ilk_is_bulma_suresi_df['Program'].apply(normalize_string)

# Closest fuzzy match for a program/faculty/department name
def get_best_match(name, choices, threshold=70):
    """Return the element of `choices` that best matches `name`, or ``None``.

    Candidates are scored with ``fuzz.token_set_ratio`` via
    ``process.extractOne``; a best score below `threshold`, or no candidate at
    all, gives ``None``.
    """
    top_hit = process.extractOne(name, choices, scorer=fuzz.token_set_ratio)
    if top_hit:
        best_choice, best_score = top_hit
        if best_score >= threshold:
            return best_choice
    return None


# Attach the TÜİK income-group value to each YÖK row via fuzzy name matching
def merge_datasets_income(df1, df2, dept_or_faculty_matches):
    """Fill ``df2['avg_monthly_income_group']`` from the '2022'/'2023' columns of `df1`.

    For every column name in `dept_or_faculty_matches`, each ``df1['Program']``
    is fuzzy-matched (`get_best_match`) against the distinct values of that
    `df2` column. A `df2` row whose value was chosen as a match receives the
    value of the first matching `df1` row: its '2022' entry when
    ``academicYear <= 2022`` and its '2023' entry when ``academicYear >= 2023``.

    Columns are processed in the given order, so a match on a later column
    overwrites a value written for an earlier one.

    `df2` is modified in place and returned. `df1` is left untouched (no
    temporary column is added to it).
    """
    for dept_or_faculty in dept_or_faculty_matches:
        # The candidate list does not change per program, so compute it once.
        choices = df2[dept_or_faculty].unique()
        matched_names = df1['Program'].apply(lambda program: get_best_match(program, choices))

        # First df1 row per matched name wins, mirroring a "take row 0 of the matches" lookup.
        values_by_name = {}
        for matched, value_2022, value_2023 in zip(matched_names, df1['2022'].values, df1['2023'].values):
            if pd.notna(matched) and matched not in values_by_name:
                values_by_name[matched] = (value_2022, value_2023)

        for index, name, year in zip(df2.index, df2[dept_or_faculty].values, df2['academicYear'].values):
            hit = values_by_name.get(name)
            if hit is None:
                continue
            if year <= 2022:
                df2.at[index, 'avg_monthly_income_group'] = hit[0]
            elif year >= 2023:
                df2.at[index, 'avg_monthly_income_group'] = hit[1]

    return df2
# Attempt matches with both department and faculty names
dept_or_faculty_matches = ['departmentName', 'faculty']

# Merge the datasets
merged_kazanc_grubu_df = merge_datasets_income(kazanc_grubu_df, yok_with_fees_df, dept_or_faculty_matches)

# Save the updated DataFrames to new CSV files
merged_kazanc_grubu_df.to_csv("final_tuik.csv", index=False)

In [12]:
# Attach the TÜİK time-to-first-job value to each YÖK row via fuzzy name matching
def merge_datasets_graduate(df1, df2, dept_or_faculty_matches):
    """Fill ``df2['Time for Graduates to Find a Job']`` from the '2022'/'2023' columns of `df1`.

    For every column name in `dept_or_faculty_matches`, each ``df1['Program']``
    is fuzzy-matched (`get_best_match`) against the distinct values of that
    `df2` column. A `df2` row whose value was chosen as a match receives the
    value of the first matching `df1` row: its '2022' entry when
    ``academicYear <= 2022`` and its '2023' entry when ``academicYear >= 2023``.

    Columns are processed in the given order, so a match on a later column
    overwrites a value written for an earlier one.

    `df2` is modified in place and returned. `df1` is left untouched (no
    temporary column is added to it).
    """
    for dept_or_faculty in dept_or_faculty_matches:
        # The candidate list does not change per program, so compute it once.
        choices = df2[dept_or_faculty].unique()
        matched_names = df1['Program'].apply(lambda program: get_best_match(program, choices))

        # First df1 row per matched name wins, mirroring a "take row 0 of the matches" lookup.
        values_by_name = {}
        for matched, value_2022, value_2023 in zip(matched_names, df1['2022'].values, df1['2023'].values):
            if pd.notna(matched) and matched not in values_by_name:
                values_by_name[matched] = (value_2022, value_2023)

        for index, name, year in zip(df2.index, df2[dept_or_faculty].values, df2['academicYear'].values):
            hit = values_by_name.get(name)
            if hit is None:
                continue
            if year <= 2022:
                df2.at[index, 'Time for Graduates to Find a Job'] = hit[0]
            elif year >= 2023:
                df2.at[index, 'Time for Graduates to Find a Job'] = hit[1]

    return df2
merge_ilk_is_bulma_suresi_df = merge_datasets_graduate(ilk_is_bulma_suresi_df, merged_kazanc_grubu_df, dept_or_faculty_matches)
merge_ilk_is_bulma_suresi_df.to_csv("final_ilk_is_bulma_suresi.csv", index=False)


In [13]:
# Attach the TÜİK employment-rate value to each YÖK row via fuzzy name matching
def merge_datasets_employment(df1, df2, dept_or_faculty_matches):
    """Fill ``df2['employment_rate']`` from the '2022'/'2023' columns of `df1`.

    For every column name in `dept_or_faculty_matches`, each ``df1['Program']``
    is fuzzy-matched (`get_best_match`) against the distinct values of that
    `df2` column. A `df2` row whose value was chosen as a match receives the
    value of the first matching `df1` row: its '2022' entry when
    ``academicYear <= 2022`` and its '2023' entry when ``academicYear >= 2023``.

    Columns are processed in the given order, so a match on a later column
    overwrites a value written for an earlier one.

    `df2` is modified in place and returned. `df1` is left untouched (no
    temporary column is added to it).
    """
    for dept_or_faculty in dept_or_faculty_matches:
        # The candidate list does not change per program, so compute it once.
        choices = df2[dept_or_faculty].unique()
        matched_names = df1['Program'].apply(lambda program: get_best_match(program, choices))

        # First df1 row per matched name wins, mirroring a "take row 0 of the matches" lookup.
        values_by_name = {}
        for matched, value_2022, value_2023 in zip(matched_names, df1['2022'].values, df1['2023'].values):
            if pd.notna(matched) and matched not in values_by_name:
                values_by_name[matched] = (value_2022, value_2023)

        for index, name, year in zip(df2.index, df2[dept_or_faculty].values, df2['academicYear'].values):
            hit = values_by_name.get(name)
            if hit is None:
                continue
            if year <= 2022:
                df2.at[index, 'employment_rate'] = hit[0]
            elif year >= 2023:
                df2.at[index, 'employment_rate'] = hit[1]
    return df2
merge_istihdam_orani_df = merge_datasets_employment(istihdam_orani_df, merge_ilk_is_bulma_suresi_df, dept_or_faculty_matches)
merge_istihdam_orani_df.to_csv("final_tuik.csv", index=False)

In [14]:
# Load the CSV file into a DataFrame
df = pd.read_csv('final_tuik.csv')

# Find columns with all empty values
empty_columns = df.columns[df.isnull().all()]

# Print the empty columns
print("Columns with all empty values:", empty_columns.tolist())

Columns with all empty values: ['tuitionFee', 'universityFoundingYear', 'facultyFoundingYear', 'revenue', 'weightedPreference', 'tuitionFeeIncrease', 'admittedPrefTrendRatio']


## Economic Features

In [15]:

# Define the data for base salary, inflation, and growth by year
base_salary_by_year = {
    2019: 2103,
    2020: 2103,   
    2021: 2825,
    2022: 5500,
    2023: 11402,
    2024: 17002
}

inflation_by_year = {
    2019: 11.8,
    2020: 14,
    2021: 36,
    2022: 64,
    2023: 64,
    2024: 64
}

growth_by_year = {
    2019: 0.7,
    2020: 1.8,    
    2021: 11.6,
    2022: 5.4,
    2023: 3.8,
    2024: 2.7
}

# Open the CSV file
df = pd.read_csv('final_tuik.csv')

# Add the new columns based on the academic year
df['base_salary_by_year'] = df['academicYear'].map(base_salary_by_year)
df['inflation_by_year'] = df['academicYear'].map(inflation_by_year)
df['growth_by_year'] = df['academicYear'].map(growth_by_year)

# Save the updated DataFrame to a new CSV file
df.to_csv('updated_final_tuik.csv', index=False)

# Display the updated DataFrame
df.head()

Unnamed: 0,academicYear,universityName,universityType,faculty,departmentName,idOSYM,programType,language,scholarshipRate,quota,...,totalStudentNumber,Urap_Rank,Urap_Score,Estimated Fee,avg_monthly_income_group,Time for Graduates to Find a Job,employment_rate,base_salary_by_year,inflation_by_year,growth_by_year
0,2023,ABDULLAH GÜL ÜNİVERSİTESİ,devlet,mimarlıkfakültesi,mimarlık,106510014.0,SAY,İngilizce,,70,...,2670.0,50.0,705.46,0.0,-1.0,11.3,79.2,11402,64.0,3.8
1,2022,ABDULLAH GÜL ÜNİVERSİTESİ,devlet,mimarlıkfakültesi,mimarlık,106510014.0,SAY,İngilizce,,70,...,2670.0,43.0,716.794001,0.0,-2.0,11.8,76.9,5500,64.0,5.4
2,2021,ABDULLAH GÜL ÜNİVERSİTESİ,devlet,mimarlıkfakültesi,mimarlık,106510014.0,SAY,İngilizce,,70,...,2670.0,40.0,506.88,0.0,-2.0,11.8,76.9,2825,36.0,11.6
3,2024,ABDULLAH GÜL ÜNİVERSİTESİ,devlet,mimarlıkfakültesi,mimarlık,106510014.0,SAY,İngilizce,,60,...,2670.0,58.0,617.88,0.0,-1.0,11.3,79.2,17002,64.0,2.7
4,2020,ABDULLAH GÜL ÜNİVERSİTESİ,devlet,mimarlıkfakültesi,mimarlık,106510014.0,SAY,İngilizce,,62,...,2670.0,36.0,521.63,0.0,-2.0,11.8,76.9,2103,14.0,1.8


## Fee Features

In [16]:
# Copy the values from 'Estimated Fee' to 'tuitionFee'
df['tuitionFee'] = df['Estimated Fee']

# Drop the 'Estimated Fee' column
df.drop(columns=['Estimated Fee'], inplace=True)

# Calculate revenue for 'vakıf' university types
df.loc[df['universityType'] == 'vakıf', 'revenue'] = df['tuitionFee'] * df['occupiedSlots']

# Find and drop columns with all empty values
empty_columns = df.columns[df.isnull().all()]
df.drop(columns=empty_columns, inplace=True)

print(df.isnull().sum())

academicYear                            0
universityName                          0
universityType                          0
faculty                                 0
departmentName                          0
idOSYM                                  0
programType                             0
language                                0
scholarshipRate                     31050
quota                                   0
occupiedSlots                           0
tuitionFee                              0
universityLocation                      0
universityRegion                        0
profCount                               0
assoCount                               0
docCount                                0
baseRanking                             0
topRanking                              0
avgAdmissionRanking(TYT)                0
baseAdmissionRanking(TYT)               0
stdDeviationStudents                    0
revenue                             31608
outOfCityStudentRate              

## Data Manipulation and Imputation

In [17]:

# TÜİK-derived features that are coerced to numbers and then imputed
tuik_feature_columns = [
    'employment_rate',
    'Time for Graduates to Find a Job',
    'avg_monthly_income_group',
]

# Coerce each feature to a numeric dtype; unparseable entries become NaN
for feature in tuik_feature_columns:
    df[feature] = pd.to_numeric(df[feature], errors='coerce')

# Replace the missing entries of each feature with the mean of that feature
# over the rows sharing the same academicYear
for feature in tuik_feature_columns:
    yearly_mean = df.groupby('academicYear')[feature].transform('mean')
    df[feature] = df[feature].fillna(yearly_mean)

print(df['avg_monthly_income_group'].isnull().sum())

0


In [18]:
print(df.isnull().sum())

academicYear                            0
universityName                          0
universityType                          0
faculty                                 0
departmentName                          0
idOSYM                                  0
programType                             0
language                                0
scholarshipRate                     31050
quota                                   0
occupiedSlots                           0
tuitionFee                              0
universityLocation                      0
universityRegion                        0
profCount                               0
assoCount                               0
docCount                                0
baseRanking                             0
topRanking                              0
avgAdmissionRanking(TYT)                0
baseAdmissionRanking(TYT)               0
stdDeviationStudents                    0
revenue                             31608
outOfCityStudentRate              

In [19]:

# Replace every remaining missing value with 0. The visible part of the previous
# cell's output lists scholarshipRate and revenue as columns that still hold NaN
# (presumably the "devlet" state-university rows — TODO confirm).
df.fillna(0, inplace=True)

# Sanity check: computed after the fill above, so every count is expected to be 0.
missing_values = df.isnull().sum()
print("Missing values in each column:\n", missing_values)

# Give the employment-time feature an identifier-friendly column name.
df.rename(columns={'Time for Graduates to Find a Job': 'Time_for_employment'}, inplace=True)

# Persist the imputed dataset; later cells reload it from this file.
df.to_csv('final_dataset.csv', index=False)

Missing values in each column:
 academicYear                        0
universityName                      0
universityType                      0
faculty                             0
departmentName                      0
idOSYM                              0
programType                         0
language                            0
scholarshipRate                     0
quota                               0
occupiedSlots                       0
tuitionFee                          0
universityLocation                  0
universityRegion                    0
profCount                           0
assoCount                           0
docCount                            0
baseRanking                         0
topRanking                          0
avgAdmissionRanking(TYT)            0
baseAdmissionRanking(TYT)           0
stdDeviationStudents                0
revenue                             0
outOfCityStudentRate                0
totalPreference                     0
top1PreferenceRati

In [5]:
import pandas as pd
df = pd.read_csv('final_dataset.csv', low_memory=False)

numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
print(numeric_columns)

df.head()

Index(['academicYear', 'idOSYM', 'scholarshipRate', 'quota', 'occupiedSlots',
       'tuitionFee', 'profCount', 'assoCount', 'docCount', 'baseRanking',
       'topRanking', 'avgAdmissionRanking(TYT)', 'baseAdmissionRanking(TYT)',
       'stdDeviationStudents', 'revenue', 'outOfCityStudentRate',
       'totalPreference', 'top1PreferenceRatio', 'avgOrderofPreference',
       'avgAdmittedStudentPrefOrder', 'top1AdmittedRatio', 'top3AdmittedRatio',
       'top10AdmittedRatio', 'admittedTotalPref',
       'admittedTotalDepartmentPref', 'currentStudentCount', 'baseScore',
       'topScore', 'totalForeignStudents', 'totalStudentNumber', 'Urap_Rank',
       'Urap_Score', 'avg_monthly_income_group', 'Time_for_employment',
       'employment_rate', 'base_salary_by_year', 'inflation_by_year',
       'growth_by_year'],
      dtype='object')


Unnamed: 0,academicYear,universityName,universityType,faculty,departmentName,idOSYM,programType,language,scholarshipRate,quota,...,totalForeignStudents,totalStudentNumber,Urap_Rank,Urap_Score,avg_monthly_income_group,Time_for_employment,employment_rate,base_salary_by_year,inflation_by_year,growth_by_year
0,2023,ABDULLAH GÜL ÜNİVERSİTESİ,devlet,mimarlıkfakültesi,mimarlık,106510014.0,SAY,İngilizce,0.0,70,...,235.0,2670.0,50.0,705.46,-1.0,11.3,79.2,11402,64.0,3.8
1,2022,ABDULLAH GÜL ÜNİVERSİTESİ,devlet,mimarlıkfakültesi,mimarlık,106510014.0,SAY,İngilizce,0.0,70,...,235.0,2670.0,43.0,716.794001,-2.0,11.8,76.9,5500,64.0,5.4
2,2021,ABDULLAH GÜL ÜNİVERSİTESİ,devlet,mimarlıkfakültesi,mimarlık,106510014.0,SAY,İngilizce,0.0,70,...,235.0,2670.0,40.0,506.88,-2.0,11.8,76.9,2825,36.0,11.6
3,2024,ABDULLAH GÜL ÜNİVERSİTESİ,devlet,mimarlıkfakültesi,mimarlık,106510014.0,SAY,İngilizce,0.0,60,...,235.0,2670.0,58.0,617.88,-1.0,11.3,79.2,17002,64.0,2.7
4,2020,ABDULLAH GÜL ÜNİVERSİTESİ,devlet,mimarlıkfakültesi,mimarlık,106510014.0,SAY,İngilizce,0.0,62,...,235.0,2670.0,36.0,521.63,-2.0,11.8,76.9,2103,14.0,1.8


## Conversion to Numeric Columns

In [6]:
# Columns pandas already stores as float64/int64
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns

non_numeric_counts = {}
missing_counts = {}

for col in numeric_columns:
    # Vectorised coercion of the whole column (instead of one pd.to_numeric
    # call per cell); entries that cannot be parsed as numbers become NaN.
    non_numeric_counts[col] = pd.to_numeric(df[col], errors='coerce').isnull().sum()
    missing_counts[col] = df[col].isnull().sum()

# Print non-numeric and missing value counts
print("Non-numeric value counts in numeric columns:")
print(non_numeric_counts)
print("\nMissing value counts in numeric columns:")
print(missing_counts)

# Get summary statistics for numeric columns
summary_statistics = df[numeric_columns].describe()
print("\nSummary statistics for numeric columns:")
print(summary_statistics)

Non-numeric value counts in numeric columns:
{'academicYear': np.int64(0), 'idOSYM': np.int64(0), 'scholarshipRate': np.int64(0), 'quota': np.int64(0), 'occupiedSlots': np.int64(0), 'tuitionFee': np.int64(0), 'profCount': np.int64(0), 'assoCount': np.int64(0), 'docCount': np.int64(0), 'baseRanking': np.int64(0), 'topRanking': np.int64(0), 'avgAdmissionRanking(TYT)': np.int64(0), 'baseAdmissionRanking(TYT)': np.int64(0), 'stdDeviationStudents': np.int64(0), 'revenue': np.int64(0), 'outOfCityStudentRate': np.int64(0), 'totalPreference': np.int64(0), 'top1PreferenceRatio': np.int64(0), 'avgOrderofPreference': np.int64(0), 'avgAdmittedStudentPrefOrder': np.int64(0), 'top1AdmittedRatio': np.int64(0), 'top3AdmittedRatio': np.int64(0), 'top10AdmittedRatio': np.int64(0), 'admittedTotalPref': np.int64(0), 'admittedTotalDepartmentPref': np.int64(0), 'currentStudentCount': np.int64(0), 'baseScore': np.int64(0), 'topScore': np.int64(0), 'totalForeignStudents': np.int64(0), 'totalStudentNumber': np

## Encoding

In [7]:
from category_encoders.target_encoder import TargetEncoder
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# One-hot encoding: each listed column is replaced by one indicator column per category
one_hot_columns = ['universityType', 'programType', 'language']
df = pd.get_dummies(df, columns=one_hot_columns)

# Label encoding: every column gets its own fitted encoder, stored by column
# name, so the integer codes can be mapped back to the original labels later.
# (Refitting one shared encoder would keep only the last column's classes.)
label_columns = ['universityLocation', 'universityRegion','universityName', 'departmentName', 'faculty']
label_encoders = {}
for col in label_columns:
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])
    label_encoders[col] = label_encoder

# Ordinal encoding for ordinal features
ordinal_mapping = {
    'scholarshipRate': {0.0: 0, 25.0: 1, 50.0: 2, 75.0: 3, 100.0: 4},
}
for col, mapping in ordinal_mapping.items():
    if col in df.columns:
        # Series.map turns every value that is not a key of `mapping` into NaN;
        # report such values instead of letting them vanish silently.
        unmapped = df[col].notna() & ~df[col].isin(list(mapping))
        if unmapped.any():
            print(f"Warning: {int(unmapped.sum())} value(s) in '{col}' are not in the ordinal mapping and become NaN")
        df[col] = df[col].map(mapping)

# Columns that are still non-numeric after encoding (the boolean dummy columns)
non_numeric_columns = df.select_dtypes(exclude=['number']).columns

print(non_numeric_columns)


Index(['universityType_devlet', 'universityType_vakıf', 'programType_DİL',
       'programType_EA', 'programType_SAY', 'programType_SÖZ',
       'language_Almanca', 'language_Arapça', 'language_Bulgarca',
       'language_Ermenice', 'language_Fransızca', 'language_Korece',
       'language_Lehçe', 'language_Rusça', 'language_Türkçe', 'language_Çince',
       'language_İngilizce', 'language_İspanyolca', 'language_İtalyanca'],
      dtype='object')


In [8]:
# Cast the remaining non-numeric columns (found by the previous cell) to int in
# one astype call driven by a column -> dtype mapping.
df = df.astype({col: int for col in non_numeric_columns})

# Re-check: an empty Index means every column is numeric now.
non_numeric_columns = df.select_dtypes(exclude=['number']).columns
print(non_numeric_columns)

# Persist the fully encoded dataset.
df.to_csv('final_dataset_encoded.csv', index=False)

Index([], dtype='object')
