## Teste 2 Modelo para Todo DataSet

In [2]:
%run ../src/feature_engineering.py
%run ../src/model_evaluation.py
%run ../src/plot.py
%run ../src/pipeline.py

import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from keras import models
from keras import layers
from keras.utils import to_categorical
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR, SVC
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, precision_score,f1_score

%matplotlib inline
#Alterando configurações padrão dos plots
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (20, 160)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
course_df = pd.read_csv('../data/preprocessed/edx_2014_train.csv',low_memory=False)
course_df['grade'].replace(np.nan, -1, inplace=True)
course_df['grade'] = course_df['grade'].map(lambda g: -1 if g <0 else 0 if g < 0.6 else 1 if g < 0.7 else 2 if g < 0.8 else 3 if g < 0.9 else 4)    
 

In [3]:
class ModelDecorator:
    def __init__(self, model):
        self._model = model
        
    def fit(self, X, y):
        y = to_categorical(y)
        return self._model.fit(X, y, epochs=50, verbose=1)
    
    def predict(self, X):
        #flatten = lambda l: [item for sublist in l for item in sublist]
        return self._model.predict(X)

class _DeepLearning:
    def provide(self):
        # create model
        model = models.Sequential()
        model.add(layers.Dense(36, input_dim=13, activation='relu'))
        model.add(layers.Dense(5, activation='softmax'))
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        return ModelDecorator(model)
    def sample_size(self):
        return 50000    

class _RandonForest:
    def provide(self):
        return Pipeline([('scaler', StandardScaler()),
                  ('rfr',RandomForestClassifier(n_jobs=2, random_state=42))])
    def sample_size(self):
        return None

class _SuppotVectorMachine:
    def provide(self):
        return Pipeline([('scaler', StandardScaler()),
                  ('poly',SVC(kernel='poly', degree=3, random_state=42))])
    def sample_size(self):
        return 20000

class _LogisticRegression:
    def provide(self):
        return Pipeline([('scaler', StandardScaler()),
                  ('poly',LogisticRegression(solver='sag', random_state=42))])
    def sample_size(self):
        return None
    
   
_models = [    
 #   _DeepLearning(),
    _LogisticRegression(),
    _RandonForest(),
    _SuppotVectorMachine(),
]
    
def feat_extract_selected(df):
    data = feature_extract(df)
    return data[['viewed',
                 'explored',
                 'certified',
                 'nevents',
                 'ndays_act',
                 'nplay_video',
                 'nchapters',
                 'grade',
                 'nforum_posts']]

def evaluate(y_pred, y_test):
    return {'f1': f1_score(y_test, y_pred),
            'accuracy':accuracy_score(y_test, y_pred),
            'precision':precision_score(y_test, y_pred)}

results, plt = train_evaluate_by_course(_models, course_df, feature_extractor= feature_extract, 
                              label='grade', split_criteria='course_id', evaluation=evaluate, 
                              plots= [confusion_matrix])

#plt.show()

NameError: name 'course_df' is not defined

In [8]:
result2 = []
for modelname, courses in results.items():
    for course_id, metrics in courses.items():
        result2.append({'model': modelname, 'course_id': course_id, 'f1':metrics['f1']})
        
pd.DataFrame(result2)

Unnamed: 0,course_id,f1,model
0,CB22x,0.717637,_LogisticRegression
1,CS50x,0.98893,_LogisticRegression
2,ER22x,0.755661,_LogisticRegression
3,PH207x,0.757784,_LogisticRegression
4,6.002x,0.699764,_LogisticRegression
5,2.01x,0.839399,_LogisticRegression
6,6.00x,0.801145,_LogisticRegression
7,7.00x,0.87205,_LogisticRegression
8,8.02x,0.869232,_LogisticRegression
9,8.MReV,0.864541,_LogisticRegression


In [54]:
results

{'_RandonForest': {'2.01x': {'f1': 0.8461569814348606},
  '6.002x': {'f1': 0.7121978797072349},
  '6.00x': {'f1': 0.8027413062507601},
  '7.00x': {'f1': 0.8616496103817153},
  '8.02x': {'f1': 0.8671191956235043},
  '8.MReV': {'f1': 0.8756150046320311},
  'CB22x': {'f1': 0.7294231622410557},
  'CS50x': {'f1': 0.9864312747162091},
  'ER22x': {'f1': 0.7523598951347575},
  'PH207x': {'f1': 0.7633464455991051}}}

In [13]:
reform = {(level1_key, level2_key, level3_key): values
              for level1_key, level2_dict in results.items()
              for level2_key, level3_dict in level2_dict.items()
              for level3_key, values      in level3_dict.items()}

result2 = []
for modelname, courses in results.items():
    for course_id, metrics in courses.items():
        result2.append({'model': modelname, 'course_id': course_id, 'r2_score':metrics['r2_score']})
    
pd.options.display.float_format = '{:,.4f}'.format
pd.DataFrame(result2)

Unnamed: 0,course_id,model,r2_score
0,CB22x,_LinearRegression,-0.1357
1,CS50x,_LinearRegression,0.5164
2,ER22x,_LinearRegression,0.1133
3,PH207x,_LinearRegression,0.1796
4,6.002x,_LinearRegression,0.0786
5,2.01x,_LinearRegression,0.2255
6,6.00x,_LinearRegression,0.0316
7,7.00x,_LinearRegression,0.2511
8,8.02x,_LinearRegression,0.1794
9,8.MReV,_LinearRegression,0.1556


In [34]:
course_df['grade'].unique()

array([0, 4, 3, 1, 2], dtype=int64)