In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import  GaussianNB
from sklearn.linear_model import LinearRegression , LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score , classification_report
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the university ranking table used for the rank classification below.
# The CSV location can be overridden through the UNI_DATA_CSV environment
# variable so the notebook also runs on machines other than the author's;
# when the variable is unset, the original hard-coded location is used.
UNI_DATA_CSV = os.environ.get(
    'UNI_DATA_CSV',
    'C:/Users/Mudassir Raza/Desktop/AI Project/Daata-set Univeristy comparison/top universities.csv',
)
uni_data = pd.read_csv(UNI_DATA_CSV)

In [3]:
# Label every university by its global rank: 'High' when the rank is at most
# 3000, 'Low' otherwise.
# The comparison is evaluated on the whole 'Global Rank' column at once and
# gives one True/False value per row; map() then swaps each boolean for its label.
uni_data['Rank Category'] = (uni_data['Global Rank'] <= 3000).map({True: 'High', False: 'Low'})

In [4]:
# Target: the 'High'/'Low' label column; .values hands back the column's underlying array.
y = uni_data['Rank Category'].values
# Feature: the country name of each university (still text at this point).
X = uni_data['Country']

In [5]:
# Turn the country names into numeric features with a bag-of-words encoding.
# fit() scans X and builds the vocabulary of distinct word tokens;
# transform() then counts, for every row, how often each vocabulary token occurs.
# toarray() expands the resulting sparse count matrix into a dense NumPy array.
vectorizer = CountVectorizer()
vectorizer.fit(X)
X_vectorized = vectorizer.transform(X).toarray()

In [6]:

# Inputs for the linear-regression experiment: the single feature is the number
# of characters in the country name, and the target is the numeric global rank.
uni_data['Country Length'] = [len(country) for country in uni_data['Country']]
y_linear = uni_data['Global Rank']
X_linear = uni_data[['Country Length']]  # double brackets select a one-column DataFrame

In [7]:
# Hold out 30% of the rows for evaluating the classifiers; the fixed seed keeps
# the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_vectorized,
    y,
    random_state=42,
    test_size=0.3,
)

In [8]:
# Same 70/30 split and seed, applied to the linear-regression feature and target.
X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(
    X_linear,
    y_linear,
    random_state=42,
    test_size=0.3,
)


In [9]:
# Create the (still unfitted) estimators; each one is fitted in its own cell.
# The decision tree and the random forest draw random numbers while fitting, so
# they get a fixed seed (the same value used for the train/test splits) to make
# their scores repeatable from run to run.
nb_model = GaussianNB()
lin_model = LinearRegression()
log_model = LogisticRegression()
svm_model = SVC()
tree_model = DecisionTreeClassifier(random_state=42)
random_model = RandomForestClassifier(random_state=42)


In [10]:

# Gaussian Naive Bayes: fit on the training split, then score the held-out rows.
# fit() returns the estimator itself, so the prediction is chained onto it.
y_pred = nb_model.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Model Accuracy: {accuracy * 100:.2f}%')
# Per-class precision, recall and F1 for the two rank categories.
print(classification_report(y_test , y_pred))

Model Accuracy: 38.93%
              precision    recall  f1-score   support

        High       0.27      0.97      0.42       890
         Low       0.96      0.22      0.35      2955

    accuracy                           0.39      3845
   macro avg       0.61      0.59      0.39      3845
weighted avg       0.80      0.39      0.37      3845



In [11]:
# Linear regression predicts a numeric rank, so both the predictions and the
# true ranks are bucketed with the same 3000 cut-off used for 'Rank Category'
# before being scored as a classification.
lin_model.fit(X_train_lr, y_train_lr)
y_pred_lr = lin_model.predict(X_test_lr)
y_pred_lr_category = ['High' if rank <= 3000 else 'Low' for rank in y_pred_lr]
y_test_lr_category = ['High' if rank <= 3000 else 'Low' for rank in y_test_lr]
accuracy = accuracy_score(y_test_lr_category, y_pred_lr_category)
print('LinearRegression')
print(f'accuracy:{accuracy*100:.2f}%')
# Precision is undefined for a class that is never predicted. zero_division=0
# reports it as 0 explicitly instead of raising an UndefinedMetricWarning for
# each averaged metric.
print(classification_report(y_test_lr_category, y_pred_lr_category, zero_division=0))

LinearRegression
accuracy:76.85%
              precision    recall  f1-score   support

        High       0.00      0.00      0.00       890
         Low       0.77      1.00      0.87      2955

    accuracy                           0.77      3845
   macro avg       0.38      0.50      0.43      3845
weighted avg       0.59      0.77      0.67      3845



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
# Logistic regression on the vectorized country features.
# fit() returns the fitted estimator, so predict() is chained directly.
y_pred = log_model.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Logistic Regression')
print(f'accuracy:{accuracy*100:.2f}%')
# Per-class precision, recall and F1 on the held-out rows.
print(classification_report(y_test, y_pred))

Logistic Regression
accuracy:81.46%
              precision    recall  f1-score   support

        High       0.62      0.50      0.56       890
         Low       0.86      0.91      0.88      2955

    accuracy                           0.81      3845
   macro avg       0.74      0.71      0.72      3845
weighted avg       0.80      0.81      0.81      3845



In [13]:
# Decision tree on the vectorized country features: train, predict, report.
y_pred = tree_model.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Decision Tree')
print(f'accuracy:{accuracy*100:.2f}%')
# Per-class precision, recall and F1 on the held-out rows.
print(classification_report(y_test, y_pred))

Decision Tree
accuracy:81.59%
              precision    recall  f1-score   support

        High       0.62      0.52      0.57       890
         Low       0.86      0.90      0.88      2955

    accuracy                           0.82      3845
   macro avg       0.74      0.71      0.73      3845
weighted avg       0.81      0.82      0.81      3845



In [14]:
# Random forest on the vectorized country features: train, predict, report.
y_pred = random_model.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Random Forest')
print(f'accuracy:{accuracy*100:.2f}%')
# Per-class precision, recall and F1 on the held-out rows.
print(classification_report(y_test, y_pred))

Random Forest
accuracy:81.66%
              precision    recall  f1-score   support

        High       0.62      0.52      0.57       890
         Low       0.86      0.90      0.88      2955

    accuracy                           0.82      3845
   macro avg       0.74      0.71      0.73      3845
weighted avg       0.81      0.82      0.81      3845



In [15]:
# Support vector classifier on the vectorized country features: train, predict, report.
y_pred = svm_model.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('SVM')
print(f'accuracy:{accuracy*100:.2f}%')
# Per-class precision, recall and F1 on the held-out rows.
print(classification_report(y_test, y_pred))

SVM
accuracy:81.66%
              precision    recall  f1-score   support

        High       0.62      0.52      0.57       890
         Low       0.86      0.91      0.88      2955

    accuracy                           0.82      3845
   macro avg       0.74      0.71      0.73      3845
weighted avg       0.81      0.82      0.81      3845

