<strong><h1>Cross validation</h1></strong>

<strong> <h2>Carregar libs</h2></strong>

In [1]:
import pandas as pd
from sklearn import tree
from sklearn.model_selection import cross_val_score

<strong><h2>Carregar dataset</h2></strong>

<p>O dataset a ser utilisado ranquea as universidades do mundo do ano de 2012, e inclue os seguintes dados:</p>
<ul>
    <li>world_rank</li>
    <li>institution</li>
    <li>country</li>
    <li>national_rank</li>
    <li>quality_of_education</li>
    <li>alumni_employment</li>
    <li>quality_of_faculty</li>
    <li>publications</li>
    <li>influence</li>
    <li>citations</li>
    <li>broad_impact</li>
    <li>patents</li>
    <li>score</li>
    <li>year</li>
</ul>

In [2]:
#Ler o dataset
df = pd.read_csv('./data/datasets_27_792993_cwurData.csv')
df.head()

Unnamed: 0,world_rank,institution,country,national_rank,quality_of_education,alumni_employment,quality_of_faculty,publications,influence,citations,broad_impact,patents,score,year
0,1,Harvard University,USA,1,7,9,1,1,1,1,,5,100.0,2012
1,2,Massachusetts Institute of Technology,USA,2,9,17,3,12,4,4,,1,91.67,2012
2,3,Stanford University,USA,3,17,11,5,4,2,2,,15,89.5,2012
3,4,University of Cambridge,United Kingdom,1,10,24,4,16,16,11,,50,86.17,2012
4,5,California Institute of Technology,USA,4,2,29,7,37,22,22,,18,85.21,2012


In [3]:
pd.isnull(df).any()

world_rank              False
institution             False
country                 False
national_rank           False
quality_of_education    False
alumni_employment       False
quality_of_faculty      False
publications            False
influence               False
citations               False
broad_impact             True
patents                 False
score                   False
year                    False
dtype: bool

<Strong><h2>Preparo dos dados</h2></Strong>

In [4]:
df_selected = df.drop(['broad_impact', 'year', 'institution'], axis=1)
df_selected.head()

Unnamed: 0,world_rank,country,national_rank,quality_of_education,alumni_employment,quality_of_faculty,publications,influence,citations,patents,score
0,1,USA,1,7,9,1,1,1,1,5,100.0
1,2,USA,2,9,17,3,12,4,4,1,91.67
2,3,USA,3,17,11,5,4,2,2,15,89.5
3,4,United Kingdom,1,10,24,4,16,16,11,50,86.17
4,5,USA,4,2,29,7,37,22,22,18,85.21


In [5]:
# encode "country"
from sklearn.preprocessing import LabelEncoder
labelEncoder_X = LabelEncoder()
df_selected.country = labelEncoder_X.fit_transform(df_selected.country)

df_selected.head()

Unnamed: 0,world_rank,country,national_rank,quality_of_education,alumni_employment,quality_of_faculty,publications,influence,citations,patents,score
0,1,54,1,7,9,1,1,1,1,5,100.0
1,2,54,2,9,17,3,12,4,4,1,91.67
2,3,54,3,17,11,5,4,2,2,15,89.5
3,4,57,1,10,24,4,16,16,11,50,86.17
4,5,54,4,2,29,7,37,22,22,18,85.21


In [6]:
df_selected['score'] = df_selected['score'].astype('int')
df_selected.head()

Unnamed: 0,world_rank,country,national_rank,quality_of_education,alumni_employment,quality_of_faculty,publications,influence,citations,patents,score
0,1,54,1,7,9,1,1,1,1,5,100
1,2,54,2,9,17,3,12,4,4,1,91
2,3,54,3,17,11,5,4,2,2,15,89
3,4,57,1,10,24,4,16,16,11,50,86
4,5,54,4,2,29,7,37,22,22,18,85


In [7]:
from sklearn.model_selection import train_test_split
X = df_selected.drop(['score'], axis=1)
Y = df_selected['score']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,test_size=0.3,random_state=1)
clf = tree.DecisionTreeClassifier()
clf.fit(X_train, Y_train)

# Testing Data
predictions = clf.predict(X_test)
    
# Evaluating score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
print("Test Accuracy score: ",accuracy_score(Y_test,predictions))
print("Classification report: ")
print(classification_report(Y_test,predictions))

Test Accuracy score:  0.8242424242424242
Classification report: 
              precision    recall  f1-score   support

          43       0.00      0.00      0.00         1
          44       0.98      0.99      0.99       293
          45       0.94      0.97      0.95       117
          46       0.85      0.83      0.84        48
          47       0.72      0.90      0.80        42
          48       0.79      0.81      0.80        27
          49       0.75      0.78      0.77        23
          50       0.50      0.50      0.50        10
          51       0.50      0.16      0.24        19
          52       0.33      0.25      0.29        12
          53       0.33      0.30      0.32        10
          54       0.50      0.17      0.25         6
          55       0.20      0.50      0.29         2
          56       0.25      0.20      0.22         5
          57       0.50      0.50      0.50         2
          58       0.00      0.00      0.00         4
          59    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
# K-fold (k=5)
scores = cross_val_score(clf, X, Y, cv=5, scoring='accuracy')
scores



array([0.65681818, 0.89772727, 0.44772727, 0.65227273, 0.80227273])

In [9]:
# Results
print("Accuracy: %0.2f" % (scores.mean()))
print("------------")
print()

Accuracy: 0.69
------------



<Strong><h2>Exemplo de LOOCV</h2></Strong>

In [17]:
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
scores = cross_val_score(clf, X, Y, cv=loo, scoring='accuracy', verbose=1)
print("Accuracy: %0.2f" % (scores.mean()))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Accuracy: 0.84


[Parallel(n_jobs=1)]: Done 2200 out of 2200 | elapsed:   29.6s finished


<Strong><h2>Exemplo de K-fold Repeated</h2></Strong>

In [13]:
from sklearn.model_selection import ShuffleSplit
kfold2 = ShuffleSplit(n_splits=10, test_size=0.30, random_state=1)
scores = cross_val_score(clf, X, Y, cv=kfold2, scoring='accuracy')
scores

array([0.83333333, 0.85757576, 0.81515152, 0.85151515, 0.82727273,
       0.85757576, 0.8       , 0.85      , 0.83181818, 0.83333333])

In [14]:
print("Accuracy: %0.2f" % (scores.mean()))

Accuracy: 0.84
