In [1]:
import pandas as pd
import numpy as np

# Load the cleaned department records.
# NOTE(review): an earlier comment said "ucret" is read as a string, but no
# dtype override is passed here, so pandas infers that column's type — confirm
# whether dtype={"ucret": str} was intended.
df = pd.read_csv('alldepartments(cleaned).csv')

# Coerce both ranking columns to numeric; entries that cannot be parsed become NaN
for ranking_col in ("baseRanking", "topRanking"):
    df[ranking_col] = pd.to_numeric(df[ranking_col], errors='coerce')

# Row-level helper: spread between a row's baseRanking and topRanking
def calculate_std(row):
    """Return the standard deviation of a row's two ranking values.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide the keys "baseRanking" and "topRanking".

    Returns
    -------
    float
        ``np.std`` of the two values, using NumPy's default ``ddof=0``
        (population standard deviation). NaN if either value is NaN.
    """
    pair = [row[column] for column in ("baseRanking", "topRanking")]
    return np.std(pair)

# Evaluate calculate_std once per row (axis=1) and store the result as a new column
row_std = df.apply(calculate_std, axis=1)
df["stdDeviationStudents"] = row_std

# Write the augmented frame to disk without the index column
df.to_csv("updated_data.csv", index=False)

In [2]:
# Read the student-count table and keep only the join key plus the two count columns
df2 = pd.read_csv('foreignStudentDataFinal.csv')
student_counts = df2[['universityName', 'totalForeignStudents', 'totalStudentNumber']]

# Left join on 'universityName': every row of df is kept, and rows without a
# matching university get NaN in the two count columns
merged_df = df.merge(student_counts, how='left', on='universityName')

# Write the merged frame to "updated_data.csv" (same path the previous cell wrote to)
merged_df.to_csv("updated_data.csv", index=False)

In [3]:
import pandas as pd
from fuzzywuzzy import process,fuzz

# Faculty reference table (provides 'universityName', 'faculty', 'facultyFoundingYear')
df3 = pd.read_csv('csv_files/facultyInfo.csv')

# Re-read the working dataset from the CSV written by the earlier cells;
# this rebinds df to the on-disk version rather than the in-memory frame
df = pd.read_csv('updated_data.csv')
# Function to get the best fuzzy match for a faculty name
def get_best_match(faculty_name, choices, threshold=70):
    """Return the entry of ``choices`` that best matches ``faculty_name``.

    Matching uses ``fuzz.token_set_ratio`` via ``process.extractOne``.

    Parameters
    ----------
    faculty_name : str
        Name to look up. Anything that is not a string (None, NaN coming from
        a pandas column with missing values, numbers) yields ``None``.
    choices : iterable of str
        Candidate names to match against.
    threshold : int, default 70
        Minimum score required to accept the best candidate.

    Returns
    -------
    The best-scoring choice, or ``None`` when the input is not a string,
    no candidate is returned, or the best score is below ``threshold``.
    """
    # Missing values reach this function as NaN/None when it is applied to a
    # DataFrame column; they cannot be fuzzy-matched, so report "no match"
    # instead of passing them to the string matcher.
    if not isinstance(faculty_name, str):
        return None

    result = process.extractOne(faculty_name, choices, scorer=fuzz.token_set_ratio)
    if not result:
        return None

    # Read by position instead of unpacking exactly two items: extractOne
    # returns (match, score) for list-like choices but (match, score, key)
    # for dict-like choices.
    match, score = result[0], result[1]
    return match if score >= threshold else None

# Candidate faculty names to match against (distinct values from df3)
faculty_choices = df3['faculty'].unique()

# Fuzzy-match each DISTINCT faculty name once and reuse the result for every
# row that shares it. Matching row by row repeats the same expensive search for
# every duplicate name. Rows whose faculty is missing are left unmatched (NaN).
# NOTE(review): candidates are pooled across all universities, so the chosen
# name may not exist at the row's own university, in which case the merge
# below finds nothing for that row — confirm this is acceptable.
match_by_faculty = {
    name: get_best_match(name, faculty_choices)
    for name in df['faculty'].dropna().unique()
}
df['matched_faculty'] = df['faculty'].map(match_by_faculty)

# Rename the lookup's 'faculty' key to 'matched_faculty' before joining.
# Joining with right_on='faculty' would collide with df's own 'faculty' column
# and make pandas emit 'faculty_x' / 'faculty_y' in the result.
faculty_lookup = df3[['universityName', 'faculty', 'facultyFoundingYear']].rename(
    columns={'faculty': 'matched_faculty'}
)

# Left join on university + matched name; unmatched rows get NaN founding year
merged_df = df.merge(faculty_lookup, on=['universityName', 'matched_faculty'], how='left')

# The helper key is no longer needed once the founding year is attached
merged_df = merged_df.drop(columns=['matched_faculty'])

# Save the updated DataFrame to a new CSV
merged_df.to_csv("updated_data.csv", index=False)



KeyboardInterrupt: 