## Imports

In [2]:
import pandas as pd
import numpy as np
from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

ModuleNotFoundError: No module named 'sklearn'

## Create data frame

In [None]:
# Load the three ranking sources. The 2020 and student tables are indexed by
# their 'University' column; the whitespace-delimited table is read without an
# explicit index column.
df_uni_rank_2020 = pd.read_csv('university_rank_2020.csv', index_col='University')
# sep=r'\s+' is the documented replacement for the deprecated delim_whitespace=True.
df_uni_rank = pd.read_csv('university_rank_other.csv', sep=r'\s+')
df_uni_rank_students = pd.read_excel('university_rank_students.xlsx', index_col='University', sheet_name="Sheet1")

# Column-wise concatenation aligns rows on the index.
# NOTE(review): df_uni_rank is read without index_col='University' - confirm its
# index matches the other two frames, otherwise the rows will not line up.
df = pd.concat([df_uni_rank_2020, df_uni_rank, df_uni_rank_students], axis=1)

In [None]:
# Show the merged frame's dimensions as a (rows, columns) tuple.
print((len(df.index), len(df.columns)))

In [None]:
# List the dtype pandas inferred for every column.
print(str(df.dtypes))

In [None]:
# Preview the first ten rows of the merged frame.
print(df.iloc[:10])

In [None]:
# Count the missing values in each column before any cleaning.
print(df.isna().sum())

# Clean dataset

Fill NaN values in the numeric columns with the column mean

In [None]:
# Pick out the numeric columns and replace each missing entry with the mean
# of its column.
numeric_columns = df.select_dtypes(include=np.number).columns
imp = SimpleImputer(strategy='mean', missing_values=np.nan)
imp.fit(df[numeric_columns])
df[numeric_columns] = imp.transform(df[numeric_columns])

# Per-column count of the values that are still missing.
print(df.isna().sum())

Fill NaN values in the non-numeric columns with the most frequent value of the column

In [None]:
# Pick out the object-typed columns and replace each missing entry with the
# most frequent value of its column.
categoric_columns = df.select_dtypes(include=object).columns
imp = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
imp.fit(df[categoric_columns])
df[categoric_columns] = imp.transform(df[categoric_columns])

# Per-column count of the values that are still missing.
print(df.isna().sum())

Save df to .csv

In [None]:
# Persist the imputed frame as a semicolon-separated, UTF-8 encoded CSV.
df.to_csv(
    "university_rank_no_nan.csv",
    encoding='utf-8',
    decimal='.',
    sep=';',
)

## Divide df into train and test datasets

   - the feature dataset should only contain numeric values
   - we will predict the "International_Outlook" column

In [None]:
# Target: the column the classifiers will learn to predict.
y = df["International_Outlook"]
# Features: the numeric columns only. The target is dropped defensively in case
# it is itself numeric, so the model is never trained on its own label
# (target leakage). errors="ignore" keeps this a no-op when it is not numeric.
X = df[numeric_columns].drop(columns="International_Outlook", errors="ignore")
print(X.head())

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=789,
    test_size=0.3,
)

# Decision tree classifier

- print first 4 predictions
- print model accuracy

In [None]:
# Fit a decision tree on the training split and score it on the held-out rows.
# A fixed random_state makes the fitted tree, and therefore the reported
# accuracy, reproducible across runs.
model = DecisionTreeClassifier(random_state=789)
model.fit(x_train, y_train)
predictions = model.predict(x_test)
print(predictions[:4])
# scikit-learn metrics take the ground truth first: (y_true, y_pred).
print("Accuracy: ", metrics.accuracy_score(y_test, predictions))

# Prediction accuracy metrics

### Precision
The precision is the ratio tp / (tp + fp) where tp is the number of true positives and fp the number of false positives.
The precision is intuitively the ability of the classifier not to label as positive a sample that is negative. The best value is 1 and the worst value is 0.

In [None]:
# precision_score expects (y_true, y_pred): ground truth first, predictions second.
print('Precision:', metrics.precision_score(y_test, predictions, average='micro'))

### Recall
The recall is the ratio tp / (tp + fn) where tp is the number of true positives and fn the number of false negatives. It measures the ability of the classifier to find all of the actual positive samples. The best value is 1 and the worst value is 0. Precision-Recall is a useful measure of success of prediction when the classes are very imbalanced.

In [None]:
# recall_score expects (y_true, y_pred): ground truth first, predictions second.
print('Recall:', metrics.recall_score(y_test, predictions, average='micro'))

### F1-score
The F1-score combines the precision and recall of a classifier into a single metric by taking their harmonic mean. It is primarily used to compare the performance of two classifiers. Suppose that classifier A has a higher recall, and classifier B has higher precision. In that case, the F1-scores of the two classifiers can be used to decide which one produces the better results overall.

In [None]:
# f1_score expects (y_true, y_pred): ground truth first, predictions second.
print('F1-score:', metrics.f1_score(y_test, predictions, average='micro'))

## Graph to show relationship between International_Outlook and Teaching values

In [None]:
# Categorical plot of Teaching values for each International_Outlook value.
# The plot call is made directly: wrapping it in print() only emitted the
# returned grid object's repr alongside the figure.
sns.catplot(x="International_Outlook", y="Teaching", data=df)
plt.show()

# Repeat the same process but without Teaching column

In [None]:
# Keep every numeric feature except "Teaching". The column is removed by name
# rather than by a hard-coded position, so the selection stays correct if the
# column order changes. The target is excluded as well in case it is numeric.
excluded_columns = ("Teaching", "International_Outlook")
columns_list = [column for column in numeric_columns if column not in excluded_columns]

X = df[columns_list]
y = df["International_Outlook"]

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=789)

# A fixed random_state makes the fitted tree and its accuracy reproducible.
model = DecisionTreeClassifier(random_state=789)
model.fit(x_train, y_train)
predictions = model.predict(x_test)
# scikit-learn metrics take the ground truth first: (y_true, y_pred).
print("Accuracy: ", metrics.accuracy_score(y_test, predictions))

# Using K-fold validation and other classifiers

In [None]:
# Candidate models. Estimators whose fit involves randomness get a fixed
# random_state so the cross-validated scores are reproducible.
classifiers = [DecisionTreeClassifier(random_state=123), LogisticRegression(), LinearSVC(random_state=123),
               KNeighborsClassifier(), GaussianNB(), RandomForestClassifier(random_state=123),
               ExtraTreesClassifier(random_state=123), AdaBoostClassifier(random_state=123),
               GradientBoostingClassifier(random_state=123)]

numeric_columns = df.select_dtypes(include=np.number).columns.tolist()
# Never cross-validate on the label itself, should it happen to be numeric.
feature_columns = [column for column in numeric_columns if column != "International_Outlook"]

# The splitter is identical for every model, so it is built once outside the loop.
kfold = StratifiedKFold(n_splits=8, random_state=123, shuffle=True)

# Maps classifier class name -> mean accuracy over the folds.
accuracy = {}
# Dedicated loop names keep the fitted `model` and its `predictions` from the
# earlier cells intact instead of overwriting them with unrelated objects.
for clf in classifiers:
    model_name = type(clf).__name__
    # cross_val_score returns one accuracy value per fold.
    fold_scores = cross_val_score(clf, df[feature_columns], y, cv=kfold, scoring='accuracy')
    accuracy[model_name] = fold_scores.mean()
    print(f'{model_name: >30}: {fold_scores.mean():1.4f}')

In [None]:
# One-row frame with a column per classifier, so seaborn draws one bar per model.
accuracy = pd.DataFrame(accuracy, index=['i', ])

# Draw on an explicit Axes and label it; the plot call is no longer wrapped in
# print(), which only emitted the Axes repr.
fig, ax = plt.subplots()
sns.barplot(data=accuracy, ax=ax)
ax.set(xlabel="Classifier", ylabel="Mean cross-validated accuracy")
# Long class names overlap when horizontal, so rotate the tick labels.
plt.xticks(rotation=90)
plt.show()