In [5]:
import pandas as pd
from fuzzywuzzy import process, fuzz

# Read the four input tables from CSV files in the working directory.
# The first three are later matched on their 'Program' column; the last one
# is matched on its 'departmentName' and 'faculty' columns.
ilk_is_bulma_suresi_df = pd.read_csv(
    filepath_or_buffer='ilk_iş_bulma_süresi_cleraed.csv'
)
istihdam_orani_df = pd.read_csv(
    filepath_or_buffer='istihdam_oranı_cleared.csv'
)
kazanc_grubu_df = pd.read_csv(
    filepath_or_buffer='kazanç_grubu_cleared.csv'
)
yok_with_fees_df = pd.read_csv(
    filepath_or_buffer='fee_yök.csv'
)



def normalize_string(value):
    """Return ``value`` with every space character removed and lower-cased.

    Only the ASCII space (" ") is stripped; tabs and other whitespace are
    kept. Lower-casing uses ``str.lower``, which is locale-independent.
    Anything that is not a ``str`` (for example NaN or a number) is returned
    unchanged.
    """
    if not isinstance(value, str):
        return value
    without_spaces = "".join(value.split(" "))
    return without_spaces.lower()

# Bring the name columns of every table into the same space-free, lower-case
# form so that they can be compared with each other.
yok_with_fees_df['departmentName'] = yok_with_fees_df['departmentName'].map(normalize_string)
yok_with_fees_df['faculty'] = yok_with_fees_df['faculty'].map(normalize_string)

# The three per-program tables all keep the name in a 'Program' column.
kazanc_grubu_df['Program'] = kazanc_grubu_df['Program'].map(normalize_string)
istihdam_orani_df['Program'] = istihdam_orani_df['Program'].map(normalize_string)
ilk_is_bulma_suresi_df['Program'] = ilk_is_bulma_suresi_df['Program'].map(normalize_string)

def get_best_match(name, choices, threshold=70):
    """Return the entry of ``choices`` that best fuzzy-matches ``name``.

    Candidates are scored by ``process.extractOne`` using
    ``fuzz.token_set_ratio``; the best candidate is returned only when its
    score is at least ``threshold``.

    Args:
        name: The string to look up. Non-string values (e.g. NaN or None
            coming from a CSV) are treated as "no match".
        choices: Collection of candidate strings.
        threshold: Minimum score (inclusive) required to accept the match.

    Returns:
        The best-matching candidate, or ``None`` when ``name`` is not a
        string, no candidate is available, or the best score is below
        ``threshold``.
    """
    # Missing values and other non-strings cannot be fuzzy-matched; bail out
    # before handing them to the string scorer.
    if not isinstance(name, str):
        return None

    result = process.extractOne(name, choices, scorer=fuzz.token_set_ratio)
    if not result:
        return None

    # extractOne returns (match, score) for list-like choices and
    # (match, score, key) for dict-like ones; only the first two are needed.
    match, score = result[0], result[1]
    return match if score >= threshold else None


def merge_datasets_income(df1, df2, dept_or_faculty_matches):
    """Fill ``df2['avg_monthly_income_group']`` from the per-program values in ``df1``.

    For each column name in ``dept_or_faculty_matches``, every
    ``df1['Program']`` is fuzzy-matched (through ``get_best_match``) against
    the distinct values of that ``df2`` column. Rows of ``df2`` whose value
    was picked as a match receive the matching program's ``'2022'`` value
    when ``academicYear <= 2022`` and its ``'2023'`` value when
    ``academicYear >= 2023``. If several programs map to the same name, the
    first one in ``df1`` is used.

    Columns are processed in the order given, and a later column overwrites
    values written for an earlier one.
    NOTE(review): with ['departmentName', 'faculty'] a faculty-level match
    therefore replaces a department-level match -- confirm this precedence
    is intended.

    Args:
        df1: DataFrame with 'Program', '2022' and '2023' columns. It is not
            modified.
        df2: DataFrame with 'academicYear' and the columns listed in
            ``dept_or_faculty_matches``. It is updated in place.
        dept_or_faculty_matches: Iterable of ``df2`` column names to match on.

    Returns:
        ``df2`` itself. The target column is only created when at least one
        row received a value.
    """
    target_col = 'avg_monthly_income_group'
    for dept_or_faculty in dept_or_faculty_matches:
        names = df2[dept_or_faculty]
        # The candidate list is the same for every program, so build it once
        # per column; missing names are left out as they cannot be matched.
        choices = names.dropna().unique()
        matched = df1['Program'].map(
            lambda program: get_best_match(program, choices),
            na_action='ignore',  # missing programs stay unmatched
        )

        # One lookup per year column; the first df1 row for a name wins.
        values_2022, values_2023 = {}, {}
        for name, value_2022, value_2023 in zip(matched, df1['2022'], df1['2023']):
            if pd.isna(name) or name in values_2022:
                continue
            values_2022[name] = value_2022
            values_2023[name] = value_2023
        if not values_2022:
            continue

        has_match = names.isin(list(values_2022))
        year = df2['academicYear']
        for year_mask, lookup in ((year <= 2022, values_2022),
                                  (year >= 2023, values_2023)):
            rows = (has_match & year_mask).to_numpy()
            if rows.any():
                # Assign a plain array so values are placed by position
                # rather than aligned on index labels.
                df2.loc[rows, target_col] = names[rows].map(lookup).to_numpy()
    return df2
# Columns of the fee table that program names are matched against, in the
# order they are tried.
dept_or_faculty_matches = ['departmentName', 'faculty']

# Attach the income-group values to the fee table.
merged_kazanc_grubu_df = merge_datasets_income(
    df1=kazanc_grubu_df,
    df2=yok_with_fees_df,
    dept_or_faculty_matches=dept_or_faculty_matches,
)

# Write the enriched table to disk without the index column.
merged_kazanc_grubu_df.to_csv(path_or_buf="final_tuik.csv", index=False)

In [6]:
def merge_datasets_graduate(df1, df2, dept_or_faculty_matches):
    """Fill ``df2['Time for Graduates to Find a Job']`` from the per-program values in ``df1``.

    For each column name in ``dept_or_faculty_matches``, every
    ``df1['Program']`` is fuzzy-matched (through ``get_best_match``) against
    the distinct values of that ``df2`` column. Rows of ``df2`` whose value
    was picked as a match receive the matching program's ``'2022'`` value
    when ``academicYear <= 2022`` and its ``'2023'`` value when
    ``academicYear >= 2023``. If several programs map to the same name, the
    first one in ``df1`` is used.

    Columns are processed in the order given, and a later column overwrites
    values written for an earlier one.
    NOTE(review): with ['departmentName', 'faculty'] a faculty-level match
    therefore replaces a department-level match -- confirm this precedence
    is intended.

    Args:
        df1: DataFrame with 'Program', '2022' and '2023' columns. It is not
            modified.
        df2: DataFrame with 'academicYear' and the columns listed in
            ``dept_or_faculty_matches``. It is updated in place.
        dept_or_faculty_matches: Iterable of ``df2`` column names to match on.

    Returns:
        ``df2`` itself. The target column is only created when at least one
        row received a value.
    """
    target_col = 'Time for Graduates to Find a Job'
    for dept_or_faculty in dept_or_faculty_matches:
        names = df2[dept_or_faculty]
        # The candidate list is the same for every program, so build it once
        # per column; missing names are left out as they cannot be matched.
        choices = names.dropna().unique()
        matched = df1['Program'].map(
            lambda program: get_best_match(program, choices),
            na_action='ignore',  # missing programs stay unmatched
        )

        # One lookup per year column; the first df1 row for a name wins.
        values_2022, values_2023 = {}, {}
        for name, value_2022, value_2023 in zip(matched, df1['2022'], df1['2023']):
            if pd.isna(name) or name in values_2022:
                continue
            values_2022[name] = value_2022
            values_2023[name] = value_2023
        if not values_2022:
            continue

        has_match = names.isin(list(values_2022))
        year = df2['academicYear']
        for year_mask, lookup in ((year <= 2022, values_2022),
                                  (year >= 2023, values_2023)):
            rows = (has_match & year_mask).to_numpy()
            if rows.any():
                # Assign a plain array so values are placed by position
                # rather than aligned on index labels.
                df2.loc[rows, target_col] = names[rows].map(lookup).to_numpy()
    return df2
# Add the time-to-first-job values on top of the income-enriched table,
# then store the result without the index column.
merge_ilk_is_bulma_suresi_df = merge_datasets_graduate(
    df1=ilk_is_bulma_suresi_df,
    df2=merged_kazanc_grubu_df,
    dept_or_faculty_matches=dept_or_faculty_matches,
)
merge_ilk_is_bulma_suresi_df.to_csv(
    path_or_buf="final_ilk_is_bulma_suresi.csv",
    index=False,
)


In [3]:
def merge_datasets_employment(df1, df2, dept_or_faculty_matches):
    """Fill ``df2['employment_rate']`` from the per-program values in ``df1``.

    For each column name in ``dept_or_faculty_matches``, every
    ``df1['Program']`` is fuzzy-matched (through ``get_best_match``) against
    the distinct values of that ``df2`` column. Rows of ``df2`` whose value
    was picked as a match receive the matching program's ``'2022'`` value
    when ``academicYear <= 2022`` and its ``'2023'`` value when
    ``academicYear >= 2023``. If several programs map to the same name, the
    first one in ``df1`` is used.

    Columns are processed in the order given, and a later column overwrites
    values written for an earlier one.
    NOTE(review): with ['departmentName', 'faculty'] a faculty-level match
    therefore replaces a department-level match -- confirm this precedence
    is intended.

    Args:
        df1: DataFrame with 'Program', '2022' and '2023' columns. It is not
            modified.
        df2: DataFrame with 'academicYear' and the columns listed in
            ``dept_or_faculty_matches``. It is updated in place.
        dept_or_faculty_matches: Iterable of ``df2`` column names to match on.

    Returns:
        ``df2`` itself. The target column is only created when at least one
        row received a value.
    """
    target_col = 'employment_rate'
    for dept_or_faculty in dept_or_faculty_matches:
        names = df2[dept_or_faculty]
        # The candidate list is the same for every program, so build it once
        # per column; missing names are left out as they cannot be matched.
        choices = names.dropna().unique()
        matched = df1['Program'].map(
            lambda program: get_best_match(program, choices),
            na_action='ignore',  # missing programs stay unmatched
        )

        # One lookup per year column; the first df1 row for a name wins.
        values_2022, values_2023 = {}, {}
        for name, value_2022, value_2023 in zip(matched, df1['2022'], df1['2023']):
            if pd.isna(name) or name in values_2022:
                continue
            values_2022[name] = value_2022
            values_2023[name] = value_2023
        if not values_2022:
            continue

        has_match = names.isin(list(values_2022))
        year = df2['academicYear']
        for year_mask, lookup in ((year <= 2022, values_2022),
                                  (year >= 2023, values_2023)):
            rows = (has_match & year_mask).to_numpy()
            if rows.any():
                # Assign a plain array so values are placed by position
                # rather than aligned on index labels.
                df2.loc[rows, target_col] = names[rows].map(lookup).to_numpy()
    return df2
# Add the employment-rate values to the table that already carries the
# time-to-first-job column, then write the combined result to
# "final_tuik.csv" without the index column.
merge_istihdam_orani_df = merge_datasets_employment(
    df1=istihdam_orani_df,
    df2=merge_ilk_is_bulma_suresi_df,
    dept_or_faculty_matches=dept_or_faculty_matches,
)
merge_istihdam_orani_df.to_csv(path_or_buf="final_tuik.csv", index=False)

In [8]:
# Number of missing values per column of the final merged table.
print(merge_istihdam_orani_df.isna().sum())

academicYear                            0
universityName                          0
universityType                          0
faculty                                 0
departmentName                          0
idOSYM                                  0
programType                             0
language                                0
scholarshipRate                     31241
quota                                   0
occupiedSlots                           0
tuitionFee                          65281
universityFoundingYear              65281
facultyFoundingYear                 65281
universityLocation                      0
universityRegion                        0
profCount                               0
assoCount                               0
docCount                                0
baseRanking                             0
topRanking                              0
avgAdmissionRanking(TYT)                0
baseAdmissionRanking(TYT)               0
stdDeviationStudents              