In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer



# Load dataset
data = pd.read_csv('final_dataset.csv')

# List of categorical columns to encode
categorical_columns = ['universityName', 'universityLocation', 'universityRegion', 'faculty', 'departmentName', 'language']

# Apply LabelEncoder to each categorical column
le = LabelEncoder()
for col in categorical_columns:
    data[col] = le.fit_transform(data[col])

# Convert percentage columns to float
percentage_columns = ['top1AdmittedRatio', 'top3AdmittedRatio', 'top10AdmittedRatio']
for col in percentage_columns:
    data[col] = data[col].str.replace('%', '').str.replace(',', '.').astype(float) / 100
    
# Convert categorical data
categorical_columns = ['universityType', 'programType']
data = pd.get_dummies(data, columns=categorical_columns)


# Fill missing values in the baseRanking column with the mean
data['baseRanking'] = data['baseRanking'].fillna(data['baseRanking'].mean())

# Option 1: Fill with a specific value (e.g., 0)
data.fillna(0, inplace=True)

data.head()


Unnamed: 0,academicYear,universityName,faculty,departmentName,idOSYM,language,scholarshipRate,quota,occupiedSlots,tuitionFee,...,Urap_Score,Time_for_Graduates_Find_Job,employment_rate,avg_monthly_income_group,universityType_devlet,universityType_vakıf,programType_DİL,programType_EA,programType_SAY,programType_SÖZ
0,2021,0,186,52,106510077,12,0.0,70,70,0.0,...,506.88,11.2,90.1,2.0,True,False,False,False,True,False
1,2021,0,186,52,106510077,12,0.0,70,70,0.0,...,506.88,11.2,90.1,2.0,True,False,False,False,True,False
2,2022,0,186,52,106510077,12,0.0,75,75,0.0,...,716.794001,11.2,90.1,2.0,True,False,False,False,True,False
3,2022,0,186,52,106510077,12,0.0,75,75,0.0,...,716.794001,11.2,90.1,2.0,True,False,False,False,True,False
4,2023,0,186,52,106510077,12,0.0,75,75,0.0,...,705.46,10.8,90.8,2.0,True,False,False,False,True,False


In [None]:
# Ensure all numeric columns are properly formatted
exclude_columns = ['universityType_devlet', 'universityType_vakıf', 'programType_DİL', 'programType_EA', 'programType_SAY', 'programType_SÖZ']
for col in data.columns:
    if data[col].dtype == 'object' and col not in exclude_columns:
        # Replace commas with dots
        data[col] = data[col].str.replace(',', '.', regex=False)
        # Remove dots that are not used as decimal separators
        data[col] = data[col].str.replace(r'(?<=\d)\.(?=\d{3})', '', regex=True)
        # Convert to float and handle errors
        data[col] = pd.to_numeric(data[col], errors='coerce')
# numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
# scaler = StandardScaler()
# data[numeric_columns] = scaler.fit_transform(data[numeric_columns])

# Define preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), X.select_dtypes(include=['float64', 'int64']).columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
    ]
)


In [49]:
data.to_csv('raw_data.csv', index=False)
# Define target column
target_column = 'baseRanking'  # Replace with the actual target column name if different

# Ensure the target column is not included in the features
X = data.drop(target_column, axis=1)
y = data[target_column]

data.head()



Unnamed: 0,academicYear,universityName,faculty,departmentName,idOSYM,language,scholarshipRate,quota,occupiedSlots,tuitionFee,...,Urap_Score,Time_for_Graduates_Find_Job,employment_rate,avg_monthly_income_group,universityType_devlet,universityType_vakıf,programType_DİL,programType_EA,programType_SAY,programType_SÖZ
0,-1.277855,-1.77387,-0.110523,-1.602582,-0.788578,1.246873,-0.673148,0.280591,0.325821,-0.369272,...,-0.165574,-0.582149,1.19097,1.757279,True,False,False,False,True,False
1,-1.277855,-1.77387,-0.110523,-1.602582,-0.788578,1.246873,-0.673148,0.280591,0.325821,-0.369272,...,-0.165574,-0.582149,1.19097,1.757279,True,False,False,False,True,False
2,-0.050603,-1.77387,-0.110523,-1.602582,-0.788578,1.246873,-0.673148,0.340436,0.387361,-0.369272,...,0.710647,-0.582149,1.19097,1.757279,True,False,False,False,True,False
3,-0.050603,-1.77387,-0.110523,-1.602582,-0.788578,1.246873,-0.673148,0.340436,0.387361,-0.369272,...,0.710647,-0.582149,1.19097,1.757279,True,False,False,False,True,False
4,1.17665,-1.77387,-0.110523,-1.602582,-0.788578,1.246873,-0.673148,0.340436,0.387361,-0.369272,...,0.663337,-0.679627,1.260698,1.757279,True,False,False,False,True,False


In [None]:
# Full pipeline with preprocessor and regressor
model_pipeline = Pipeline(steps=[
    ('regressor', RandomForestRegressor(random_state=42))
])

In [None]:
# Perform 5-fold cross-validation
cv_scores = cross_val_score(model_pipeline, X, y, cv=5, scoring='neg_mean_squared_error')
print(f"Cross-validation MSE: {-cv_scores.mean():.2f}")

In [51]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model_pipeline.fit(X_train, y_train)

y_pred = model_pipeline.predict(X_test)

data.head()
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")

Mean Squared Error: 0.00


In [56]:
# Full pipeline with preprocessor and Ridge regressor
ridge_pipeline = Pipeline(steps=[
    ('regressor', Ridge(alpha=1.0, random_state=42))  # Adjust alpha for regularization strength
])

In [57]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
ridge_pipeline.fit(X_train, y_train)

# Predict
y_pred = ridge_pipeline.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Ridge Regression Mean Squared Error: {mse:.2f}")

ValueError: Input X contains NaN.
Ridge does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values