In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator, TransformerMixin


# Custom transformer for LabelEncoder
class MultiColumnLabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns  # array of column names to encode
        self.encoders = {col: LabelEncoder() for col in columns}

    def fit(self, X, y=None):
        for col in self.columns:
            self.encoders[col].fit(X[col])
        return self

    def transform(self, X):
        X_copy = X.copy()
        for col in self.columns:
            X_copy[col] = self.encoders[col].transform(X[col])
        return X_copy

# Load dataset
data = pd.read_csv('final_dataset.csv')


# Fill missing values in the baseRanking column with the mean
data['baseRanking'] = data['baseRanking'].fillna(data['baseRanking'].mean())

# Fill missing values in scholarshipRate and revenue with 0
data['scholarshipRate'] = data['scholarshipRate'].fillna(0)
data['revenue'] = data['revenue'].fillna(0)


# Target variable and feature separation
target = 'baseRanking'  # Replace with your actual target column name
X = data.drop(columns=[target])
y = data[target]

data.head()


Unnamed: 0,academicYear,universityName,universityType,faculty,departmentName,idOSYM,programType,language,scholarshipRate,quota,...,admittedTotalPref,admittedTotalDepartmentPref,currentStudentCount,totalForeignStudents,totalStudentNumber,Urap_Rank,Urap_Score,Time_for_Graduates_Find_Job,employment_rate,avg_monthly_income_group
0,2021,ABDULLAH GÜL ÜNİVERSİTESİ,devlet,Mühendislik Fakültesi,Bilgisayar Mühendisliği,106510077,SAY,İngilizce,0.0,70,...,1044.0,622.0,243.0,251.0,3472.0,40.0,506.88,11.2,90.1,2.0
1,2021,ABDULLAH GÜL ÜNİVERSİTESİ,devlet,Mühendislik Fakültesi,Bilgisayar Mühendisliği,106510077,SAY,İngilizce,0.0,70,...,1044.0,622.0,243.0,235.0,2670.0,40.0,506.88,11.2,90.1,2.0
2,2022,ABDULLAH GÜL ÜNİVERSİTESİ,devlet,Mühendislik Fakültesi,Bilgisayar Mühendisliği,106510077,SAY,İngilizce,0.0,75,...,1017.0,696.0,362.0,251.0,3472.0,43.0,716.794001,11.2,90.1,2.0
3,2022,ABDULLAH GÜL ÜNİVERSİTESİ,devlet,Mühendislik Fakültesi,Bilgisayar Mühendisliği,106510077,SAY,İngilizce,0.0,75,...,1017.0,696.0,362.0,235.0,2670.0,43.0,716.794001,11.2,90.1,2.0
4,2023,ABDULLAH GÜL ÜNİVERSİTESİ,devlet,Mühendislik Fakültesi,Bilgisayar Mühendisliği,106510077,SAY,İngilizce,0.0,75,...,1156.0,602.0,383.0,251.0,3472.0,50.0,705.46,10.8,90.8,2.0


In [3]:
# Identify categorical and numerical columns
categorical_columns_label = ['universityName', 'universityLocation', 'universityRegion', 'faculty', 'departmentName', 'language']
categorical_columns_one_hot = ['universityType', 'programType']
categorical_columns = categorical_columns_label + categorical_columns_one_hot
numerical_columns = X.select_dtypes(include=['float64', 'int64']).columns.difference(categorical_columns).tolist()

# Preprocessing pipeline for categorical and numerical features
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

label_encoder_transformer = Pipeline(steps=[
    ('label_encoder', MultiColumnLabelEncoder(columns=categorical_columns_label))
])


numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns_one_hot),
        ('label', label_encoder_transformer, categorical_columns_label)
    ]
)



In [5]:

# Full pipeline with preprocessor and regressor
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model_pipeline.fit(X_train, y_train)

y_pred = model_pipeline.predict(X_test)

data.head()


ValueError: y contains previously unseen labels: 'Ali Fuad Başgil Hukuk Fakültesi'

In [None]:
# for col in data.columns:
#     if data[col].dtype == 'object' and col not in exclude_columns:
#         # Replace commas with dots
#         data[col] = data[col].str.replace(',', '.', regex=False)
#         # Remove dots that are not used as decimal separators
#         data[col] = data[col].str.replace(r'(?<=\d)\.(?=\d{3})', '', regex=True)
#         # Convert to float and handle errors
#         data[col] = pd.to_numeric(data[col], errors='coerce')
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")

ValueError: y contains previously unseen labels: 'Ali Fuad Başgil Hukuk Fakültesi'